-rw-r--r--  llvm/lib/CodeGen/MachineScheduler.cpp                         2
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll             20
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll                2
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-memset-inline.ll              2
-rw-r--r--  llvm/test/CodeGen/AArch64/expand-select.ll                   10
-rw-r--r--  llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll        4
-rw-r--r--  llvm/test/CodeGen/AArch64/machine-scheduler.mir               2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll        4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll                 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-argument-types.ll              11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll                   13
-rw-r--r--  llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll                    72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ds_read2.ll                          2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll           2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll   4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll                 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/kernel-args.ll                       2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll      2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/max.i16.ll                          16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll       22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shift-i128.ll                        6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll  21
22 files changed, 117 insertions(+), 114 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index a148a891a86..e42701b9c6c 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1573,6 +1573,8 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
+    if (SUa->NodeNum > SUb->NodeNum)
+      std::swap(SUa, SUb);
     if (TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
                                  *MemOpRecords[Idx + 1].BaseOp,
                                  ClusterLength) &&
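
The two added lines are the whole functional change: MemOpRecords is sorted by base and offset, so a neighboring pair can arrive in the reverse of program order, and the swap canonicalizes the pair by NodeNum before the cluster edge is added. That is why every "Cluster ld/st SU(a) - SU(b)" expectation in the tests below now lists the lower node number first. As a minimal standalone sketch of the same canonicalization (stand-in types and a hypothetical clusterPair() helper, not LLVM's actual SUnit or clusterNeighboringMemOps signatures):

// Minimal sketch (not LLVM code) of the canonicalization the hunk above
// performs before clustering a pair of memory operations.
#include <cstdio>
#include <utility>

struct SUnit {       // stand-in for llvm::SUnit
  unsigned NodeNum;  // position in the original instruction order
};

// Hypothetical helper mirroring the added lines: whichever op comes later
// in program order becomes SUb, so the cluster edge always points from the
// lower NodeNum to the higher one.
void clusterPair(SUnit *SUa, SUnit *SUb) {
  if (SUa->NodeNum > SUb->NodeNum)
    std::swap(SUa, SUb);
  std::printf("Cluster ld/st SU(%u) - SU(%u)\n", SUa->NodeNum, SUb->NodeNum);
}

int main() {
  SUnit Store1{4}, Store2{3};    // offset order disagrees with program order
  clusterPair(&Store1, &Store2); // prints: Cluster ld/st SU(3) - SU(4)
  return 0;
}

With the pair order fixed, the direction of the weak cluster edges (and the debug output) no longer depends on how the sort happened to order same-base records, which is what the updated CHECK lines in the tests below encode.
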
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
index 6d9b7d4bb4f..5f75b4ef944 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -3,7 +3,7 @@
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_i64_scale:%bb.0
-; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:SU(4): STRXui %1:gpr64, %0:gpr64common, 1
; CHECK:SU(3): STRXui %1:gpr64, %0:gpr64common, 2
@@ -24,7 +24,7 @@ entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: stp_i32_scale:%bb.0
-; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:Cluster ld/st SU(2) - SU(5)
; CHECK:SU(4): STRWui %1:gpr32, %0:gpr64common, 1
; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 2
@@ -45,12 +45,12 @@ entry:
; CHECK:********** MI Scheduling **********
; CHECK-LABEL:stp_i64_unscale:%bb.0 entry
-; CHECK:Cluster ld/st SU(5) - SU(2)
-; CHECK:Cluster ld/st SU(4) - SU(3)
-; CHECK:SU(5): STURXi %1:gpr64, %0:gpr64common, -32
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:SU(2): STURXi %1:gpr64, %0:gpr64common, -24
-; CHECK:SU(4): STURXi %1:gpr64, %0:gpr64common, -16
; CHECK:SU(3): STURXi %1:gpr64, %0:gpr64common, -8
+; CHECK:SU(4): STURXi %1:gpr64, %0:gpr64common, -16
+; CHECK:SU(5): STURXi %1:gpr64, %0:gpr64common, -32
define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
entry:
%arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
@@ -66,12 +66,12 @@ entry:
; CHECK:********** MI Scheduling **********
; CHECK-LABEL:stp_i32_unscale:%bb.0 entry
-; CHECK:Cluster ld/st SU(5) - SU(2)
-; CHECK:Cluster ld/st SU(4) - SU(3)
-; CHECK:SU(5): STURWi %1:gpr32, %0:gpr64common, -16
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:Cluster ld/st SU(3) - SU(4)
; CHECK:SU(2): STURWi %1:gpr32, %0:gpr64common, -12
-; CHECK:SU(4): STURWi %1:gpr32, %0:gpr64common, -8
; CHECK:SU(3): STURWi %1:gpr32, %0:gpr64common, -4
+; CHECK:SU(4): STURWi %1:gpr32, %0:gpr64common, -8
+; CHECK:SU(5): STURWi %1:gpr32, %0:gpr64common, -16
define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
entry:
%arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 6da959962c0..a45373a1d21 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -36,7 +36,7 @@ define i64 @ldp_sext_int(i32* %p) nounwind {
; Test ldur clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldur_int:%bb.0
-; CHECK: Cluster ld/st SU(2) - SU(1)
+; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDURWi
; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDURWi
define i32 @ldur_int(i32* %a) nounwind {
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
index d0b1521fb8a..7a9f3b2fa97 100644
--- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -259,9 +259,9 @@ define void @memset_12_stack() {
define void @memset_16_stack() {
; CHECK-LABEL: memset_16_stack:
; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: str x8, [sp, #-32]!
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp x8, x30, [sp, #8]
+; CHECK-NEXT: str x8, [sp]
; CHECK-NEXT: bl something
%buf = alloca [16 x i8], align 1
%cast = bitcast [16 x i8]* %buf to i8*
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index d1f49e1f44e..82cf7a876cd 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -6,17 +6,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, <2 x i128> *%Out) {
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x1
; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: ldp x10, x9, [sp, #8]
; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: ldp x8, x9, [sp, #8]
+; CHECK-NEXT: ldr x10, [sp]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
; CHECK-NEXT: fmov w11, s0
; CHECK-NEXT: tst w11, #0x1
; CHECK-NEXT: csel x11, x2, x6, ne
; CHECK-NEXT: csel x12, x3, x7, ne
-; CHECK-NEXT: csel x8, x4, x8, ne
-; CHECK-NEXT: csel x10, x5, x10, ne
-; CHECK-NEXT: stp x8, x10, [x9, #16]
+; CHECK-NEXT: csel x10, x4, x10, ne
+; CHECK-NEXT: csel x8, x5, x8, ne
+; CHECK-NEXT: stp x10, x8, [x9, #16]
; CHECK-NEXT: stp x11, x12, [x9]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
diff --git a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
index 99866c84b5a..cea9d2ac057 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -65,8 +65,8 @@ define void @f3(i32 %a1, i32 %a2) #0 {
define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 {
; CHECK-NEXT: adrp x8, [[SET3]]@PAGE
; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF
-; CHECK-NEXT: stp w2, w0, [x8]
-; CHECK-NEXT: str w1, [x8, #8]
+; CHECK-NEXT: stp w0, w1, [x8, #4]
+; CHECK-NEXT: str w2, [x8]
; CHECK-NEXT: ret
store i32 %a1, i32* @m4, align 4
store i32 %a2, i32* @n4, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-scheduler.mir b/llvm/test/CodeGen/AArch64/machine-scheduler.mir
index 33cb72f8be7..b66a2eff091 100644
--- a/llvm/test/CodeGen/AArch64/machine-scheduler.mir
+++ b/llvm/test/CodeGen/AArch64/machine-scheduler.mir
@@ -18,8 +18,8 @@
---
# CHECK-LABEL: name: load_imp-def
# CHECK: bb.0.entry:
-# CHECK: LDRWui $x0, 0
# CHECK: LDRWui $x0, 1
+# CHECK: LDRWui $x0, 0
# CHECK: STRWui $w1, $x0, 2
name: load_imp-def
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
index ea5f01fbda0..763829094c8 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -1,9 +1,9 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
-; GCN: global_store_dwordx2
-; GCN: global_store_dword v
; GCN: global_store_short
+; GCN: global_store_dword v
+; GCN: global_store_dwordx2
define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(i8 addrspace(1)* nocapture %data) {
entry:
store i8 72, i8 addrspace(1)* %data, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 2fe58159694..42192289aed 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -133,10 +133,10 @@ entry:
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
@@ -331,10 +331,10 @@ entry:
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
; GCN: s_waitcnt vmcnt(0)
; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 816e6836c17..a627bf00465 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -765,16 +765,17 @@ entry:
; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
-; GCN-NOT: buffer_store_dword v33
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NOT: buffer_store_dword v33
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN: s_getpc_b64
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 93fed467114..9ec8b7573ce 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -199,15 +199,14 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NEXT: global_load_dword v4, v[2:3], off
-; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
+; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:4
+; GCN-NEXT: global_load_dword v2, v[2:3], off
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4
-; GCN-NEXT: buffer_store_short_d16_hi v4, off, s[0:3], s9 offset:6
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:4
+; GCN-NEXT: buffer_store_short_d16_hi v2, off, s[0:3], s9 offset:6
+; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:8
; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4
; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6
; GCN-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index c65d7fc02d9..941e32aae94 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -236,19 +236,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[6:7]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[6:7]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_e32 v2, v2, v4
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -414,35 +416,34 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v9, v[2:3]
; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v11, v[2:3]
; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[8:9]
-; VI-NEXT: flat_load_ubyte v7, v[6:7]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[6:7]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11
-; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT: v_or_b32_e32 v0, v3, v0
-; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_e32 v4, v3, v4
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v8
; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_or_b32_e32 v4, v4, v7
; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v4, v4, v5
; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
@@ -699,23 +700,24 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[6:7]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v6, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT: v_or_b32_e32 v1, v1, v6
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_e32 v4, v2, v0
-; VI-NEXT: v_or_b32_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9991eb3fcbe..e8a7a1ce518 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -473,8 +473,8 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
; GFX9-NOT: m0
; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index b25a40f443c..a928384457a 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -231,10 +231,10 @@ declare void @func(<4 x float> addrspace(5)* nocapture) #0
; GCN-LABEL: {{^}}undefined_stack_store_reg:
; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
bb:
%tmp = alloca <4 x float>, align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
index d29bcef05b9..9a23df97d9a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
@@ -4,8 +4,8 @@
; an unused stack slot, causing ScratchSize to be non-zero.
; GCN-LABEL: store_v3i32:
-; GCN: ds_read_b64
; GCN: ds_read_b32
+; GCN: ds_read_b64
; GCN: ds_write_b32
; GCN: ds_write_b64
; GCN: ScratchSize: 0
@@ -17,8 +17,8 @@ define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %
}
; GCN-LABEL: store_v5i32:
-; GCN: ds_read2_b64
; GCN: ds_read_b32
+; GCN: ds_read2_b64
; GCN: ds_write_b32
; GCN: ds_write2_b64
; GCN: ScratchSize: 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 957d63bbee4..4c45856ce1d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1642,10 +1642,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; SI-NEXT: v_mov_b32_e32 v13, s21
; SI-NEXT: v_mov_b32_e32 v14, s22
; SI-NEXT: v_mov_b32_e32 v15, s23
-; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; SI-NEXT: v_or_b32_e32 v16, s4, v16
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
@@ -1688,10 +1688,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v13, s21
; VI-NEXT: v_mov_b32_e32 v14, s22
; VI-NEXT: v_mov_b32_e32 v15, s23
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; VI-NEXT: v_or_b32_e32 v16, s4, v16
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a657b8d01af..d6ececb1b4e 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -855,8 +855,8 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index b7344cfb33c..6691696924b 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -75,8 +75,8 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index ff624ec1d0c..3a87b585995 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -137,22 +137,22 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT: flat_load_dword v8, v[0:1]
-; VI-NEXT: flat_load_ushort v9, v[4:5]
+; VI-NEXT: flat_load_ushort v8, v[4:5]
+; VI-NEXT: flat_load_dword v9, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_max_i16_e32 v1, v8, v2
-; VI-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_max_i16_e32 v0, v8, v0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v0, v9, v0
-; VI-NEXT: flat_store_dword v[6:7], v1
+; VI-NEXT: v_max_i16_e32 v2, v9, v1
+; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: flat_store_short v[4:5], v0
+; VI-NEXT: flat_store_dword v[6:7], v1
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index c8c118280e9..b3ceb43a50f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -221,26 +221,26 @@ define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
@@ -299,10 +299,10 @@ define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@@ -352,8 +352,8 @@ define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
@@ -454,13 +454,13 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -519,8 +519,8 @@ define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buf
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 2f3a3e53ffc..b46e6868923 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -509,8 +509,8 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm
%shift = shl <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -579,8 +579,8 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm
%shift = lshr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -653,8 +653,8 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm
%shift = ashr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 3d00d845202..d3cb4ec3ba2 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -10,21 +10,20 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) {
; GCN-NEXT: BB0_1: ; %bb0
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB0_2 Depth 2
-; GCN-NEXT: v_add_co_u32_e64 v4, vcc_lo, v0, 8
+; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8
; GCN-NEXT: s_mov_b32 s5, exec_lo
-; GCN-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
+; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; GCN-NEXT: v_readfirstlane_b32 s8, v2
-; GCN-NEXT: v_readfirstlane_b32 s9, v3
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s10, v4
-; GCN-NEXT: v_readfirstlane_b32 s11, v5
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3]
-; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5]
+; GCN-NEXT: v_readfirstlane_b32 s8, v4
+; GCN-NEXT: v_readfirstlane_b32 s9, v5
+; GCN-NEXT: v_readfirstlane_b32 s10, v2
+; GCN-NEXT: v_readfirstlane_b32 s11, v3
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GCN-NEXT: s_and_b32 s4, vcc_lo, s4
; GCN-NEXT: s_and_saveexec_b32 s4, s4
; GCN-NEXT: s_nop 0