| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-06-26 19:10:00 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-06-26 19:10:00 +0000 |
| commit | 8c4a35237a58d377eb4ef5e8156f789e40dfd837 (patch) | |
| tree | 829a341e09f6c77dbaad6bde17e9f7382950c04d /llvm/test/CodeGen | |
| parent | 7e991d30c0fc2f2fb0510dfdd0a1e7053e64170e (diff) | |
AMDGPU: Add pass to lower kernel arguments to loads
This replaces most argument uses with loads, but for
now not all.
The code in SelectionDAG for calling convention lowering
is actively harmful for amdgpu_kernel. It attempts to
split the argument types into register legal types, which
results in low-quality code for arbitrary types. Since
all kernel arguments are passed in memory, we just want the
raw types.
I've tried a couple of methods of mitigating this in SelectionDAG,
but it's easier to just bypass this problem altogether. It's
possible to hack around the problem in the initial lowering,
but the real problem is that the DAG then expects to be able to use
CopyToReg/CopyFromReg for uses of the arguments outside the block.
Exposing the argument loads in the IR also has the advantage
that the LoadStoreVectorizer can merge them.
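
To make that concrete, here is a hedged sketch (not code taken from this patch)
of the shape of the rewrite: each in-block use of a kernel argument becomes a
load from the kernarg segment pointer, and neighboring loads like these are
what the LoadStoreVectorizer can merge. The exact offsets, alignments, and
metadata below are illustrative only.

```llvm
; Before: the i32 argument is used directly.
define amdgpu_kernel void @example(i32 addrspace(1)* %out, i32 %x) {
  store i32 %x, i32 addrspace(1)* %out
  ret void
}

; After (sketch): %x is reloaded from the kernarg segment. %out would be
; handled the same way, and the original IR arguments stay in the signature.
define amdgpu_kernel void @example.lowered(i32 addrspace(1)* %out, i32 %x) {
  %kernarg = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %x.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernarg, i64 8
  %x.ptr = bitcast i8 addrspace(4)* %x.gep to i32 addrspace(4)*
  %x.load = load i32, i32 addrspace(4)* %x.ptr, align 8, !invariant.load !0
  store i32 %x.load, i32 addrspace(1)* %out
  ret void
}

declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
!0 = !{}
```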
I'm not sure what the best approach to dealing with the IR
argument list is. The patch as-is just leaves the IR arguments
in place, so all the existing code will still compute the same
kernarg size and pointlessly lower the arguments.
Arguably the frontend should emit kernels with an empty argument
list in the first place. Alternatively, a dummy array could be
inserted as a single argument just to reserve space.
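
For illustration only (the name and size are hypothetical), the dummy-argument
alternative would amount to something like this, reserving the kernarg bytes
without naming any real arguments:

```llvm
; Hypothetical: one unused aggregate argument reserves a 12-byte kernarg area
; (an 8-byte global pointer plus a 4-byte i32); the real values would be
; fetched through explicit kernarg loads as in the sketch above.
define amdgpu_kernel void @reserve_only([12 x i8]) {
  ret void
}
```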
This does have some disadvantages. Local pointer kernel arguments can
no longer have AssertZext placed on them as the equivalent !range
metadata is not valid on pointer-typed loads. This is mostly bad
for SI, which needs to know the known bits in order to use the
DS instruction offset, so in this case the lowering is not done.
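
For reference, !range can carry the same known-bits fact on an integer-typed
load, but LLVM only accepts it on loads of integer type, so there is no direct
equivalent for the pointer-typed kernarg load; the values here are illustrative:

```llvm
define void @range_example(i32 addrspace(4)* %p,
                           i32 addrspace(3)* addrspace(4)* %q) {
  ; An integer load can state the bound directly.
  %size = load i32, i32 addrspace(4)* %p, align 4, !range !0
  ; The local-pointer load cannot, so the known-zero high bits that
  ; AssertZext used to convey are lost here.
  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %q, align 4
  store volatile i32 %size, i32 addrspace(1)* undef
  store volatile i32 addrspace(3)* %lds, i32 addrspace(3)* addrspace(1)* undef
  ret void
}

!0 = !{i32 0, i32 65536}   ; value known to fit in 16 bits
```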
More importantly, this skips noalias arguments since this pass
does not yet convert them to the equivalent !alias.scope and !noalias
metadata. Producing this metadata correctly seems to be tricky,
although logically this is the same as inlining into a function that
doesn't exist. Additionally, exposing these loads to the vectorizer
may result in degraded aliasing information if a pointer load is
merged with another argument load.
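
As a rough sketch of what the pass would eventually need to synthesize
(essentially the metadata the inliner emits for a noalias parameter), the
scope names and structure below are made up for illustration:

```llvm
define amdgpu_kernel void @noalias_sketch(float addrspace(1)* noalias %p,
                                          float addrspace(1)* %other) {
  ; Accesses through the noalias argument carry its scope...
  store float 0.0, float addrspace(1)* %p, !alias.scope !2
  ; ...and unrelated accesses assert that they are outside it.
  store float 1.0, float addrspace(1)* %other, !noalias !2
  ret void
}

!0 = distinct !{!0, !"kernel argument scopes"}  ; alias scope domain
!1 = distinct !{!1, !0, !"%p"}                  ; scope for the noalias argument
!2 = !{!1}                                      ; scope list referenced above
```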
I'm also not entirely sure this is preserving the current clover
ABI, although I would greatly prefer that it stop widening
arguments and match the HSA ABI. As-is, I think it extends
sub-4-byte arguments to 4 bytes but doesn't align them to 4 bytes.
llvm-svn: 335650
Diffstat (limited to 'llvm/test/CodeGen')
126 files changed, 2828 insertions, 1539 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 2097d5119f5..ff33a6e18ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -9,11 +9,11 @@ ; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -21,11 +21,11 @@ entry: ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -36,11 +36,11 @@ entry: ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -51,11 +51,11 @@ entry: ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; XGCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd3(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -65,11 +65,11 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } @@ -79,11 +79,11 @@ entry: ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { +define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144 %1 = load i32, i32 addrspace(4)* %0 - store i32 %1, i32 addrspace(1)* %out + store i32 %1, i32 addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll index f673d91192b..894e9c7578c 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll @@ -76,7 +76,7 @@ define 
amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, ; SI-NOT: addc ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) { %add = add i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 21bb6513dd8..78d99d881cb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 60 -; TRAP-HANDLER-DISABLE: NumSgprs: 76 +; TRAP-HANDLER-DISABLE: NumSgprs: 78 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index b5b3e78a142..739e6c1c92c 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -217,7 +217,7 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) { %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -235,7 +235,7 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) { %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -381,7 +381,7 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 a ; SI-NOT: and ; SI: s_add_u32 ; SI-NEXT: s_addc_u32 -define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) { %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll index 14327e30aff..40e30156594 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -12,25 +12,17 @@ ; CIVI: s_load_dword [[LHS:s[0-9]+]] ; CIVI: s_load_dword [[RHS:s[0-9]+]] -; VI: s_ashr_i32 -; VI: s_ashr_i32 -; VI: s_sext_i32_i16 -; VI: s_sext_i32_i16 -; VI: s_ashr_i32 -; VI: s_ashr_i32 -; VI: s_lshl_b32 -; VI: s_and_b32 -; VI: s_or_b32 - -; CI: s_ashr_i32 -; CI: s_and_b32 -; CI: s_lshr_b32 -; CI: s_sext_i32_i16 -; CI: s_ashr_i32 -; CI: s_ashr_i32 -; CI: s_lshl_b32 -; CI: s_and_b32 -define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_sext_i32_i16 +; CIVI-DAG: s_sext_i32_i16 +; CIVI-DAG: s_ashr_i32 +; CIVI-DAG: s_ashr_i32 +; 
CIVI-DAG: s_lshl_b32 +; CIVI: s_and_b32 +; CIVI: s_or_b32 + +define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index 836ba764a5b..943ff9e7e7e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -7,16 +7,16 @@ ; GFX9-NOT: m0 ; SICIVI-DAG: s_mov_b32 m0 -; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -70,15 +70,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspac ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll new file mode 100644 index 00000000000..c05f29622c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s + +; FIXME: SGPR-to-SMEM requires an 
additional SGPR always to scavenge m0 + +; ALL-LABEL: {{^}}max_9_sgprs: +; ALL: SGPRBlocks: 1 +; ALL: NumSGPRsForWavesPerEU: 9 +define amdgpu_kernel void @max_9_sgprs() #0 { + %one = load volatile i32, i32 addrspace(4)* undef + %two = load volatile i32, i32 addrspace(4)* undef + %three = load volatile i32, i32 addrspace(4)* undef + %four = load volatile i32, i32 addrspace(4)* undef + %five = load volatile i32, i32 addrspace(4)* undef + %six = load volatile i32, i32 addrspace(4)* undef + %seven = load volatile i32, i32 addrspace(4)* undef + %eight = load volatile i32, i32 addrspace(4)* undef + %nine = load volatile i32, i32 addrspace(4)* undef + %ten = load volatile i32, i32 addrspace(4)* undef + call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight) + store volatile i32 %one, i32 addrspace(1)* undef + store volatile i32 %two, i32 addrspace(1)* undef + store volatile i32 %three, i32 addrspace(1)* undef + store volatile i32 %four, i32 addrspace(1)* undef + store volatile i32 %five, i32 addrspace(1)* undef + store volatile i32 %six, i32 addrspace(1)* undef + store volatile i32 %seven, i32 addrspace(1)* undef + store volatile i32 %eight, i32 addrspace(1)* undef + store volatile i32 %nine, i32 addrspace(1)* undef + store volatile i32 %ten, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index 822ea803194..b98b48d3c37 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -1,25 +1,37 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s ; If spilling to smem, additional registers are used for the resource ; descriptor. +; FIXME: Vectorization can increase required SGPR count beyond limit. 
+; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 + ; ALL-LABEL: {{^}}max_9_sgprs: ; ALL: SGPRBlocks: 1 ; ALL: NumSGPRsForWavesPerEU: 9 -define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1, - - i32 addrspace(1)* %out2, - i32 addrspace(1)* %out3, - i32 addrspace(1)* %out4, - i32 addrspace(1)* %out5, - i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 { - store i32 %one, i32 addrspace(1)* %out1 - store i32 %two, i32 addrspace(1)* %out2 - store i32 %three, i32 addrspace(1)* %out3 - store i32 %four, i32 addrspace(1)* %out4 - store i32 %five, i32 addrspace(1)* %out5 +define amdgpu_kernel void @max_9_sgprs() #0 { + %one = load volatile i32, i32 addrspace(4)* undef + %two = load volatile i32, i32 addrspace(4)* undef + %three = load volatile i32, i32 addrspace(4)* undef + %four = load volatile i32, i32 addrspace(4)* undef + %five = load volatile i32, i32 addrspace(4)* undef + %six = load volatile i32, i32 addrspace(4)* undef + %seven = load volatile i32, i32 addrspace(4)* undef + %eight = load volatile i32, i32 addrspace(4)* undef + %nine = load volatile i32, i32 addrspace(4)* undef + %ten = load volatile i32, i32 addrspace(4)* undef + call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine) + store volatile i32 %one, i32 addrspace(1)* undef + store volatile i32 %two, i32 addrspace(1)* undef + store volatile i32 %three, i32 addrspace(1)* undef + store volatile i32 %four, i32 addrspace(1)* undef + store volatile i32 %five, i32 addrspace(1)* undef + store volatile i32 %six, i32 addrspace(1)* undef + store volatile i32 %seven, i32 addrspace(1)* undef + store volatile i32 %eight, i32 addrspace(1)* undef + store volatile i32 %nine, i32 addrspace(1)* undef + store volatile i32 %ten, i32 addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll index 0607eca9cd1..057742fcfd4 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -29,7 +29,8 @@ end: ; GCN-LABEL: {{^}}test_brcc_i1: ; GCN: s_load_dword [[VAL:s[0-9]+]] -; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]] +; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}} +; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]] ; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1 ; GCN: s_cmp_eq_u32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index a33ab4e379d..69237cfabb8 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}v_ubfe_sub_i32: ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] @@ -48,10 +48,9 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, } ; GCN-LABEL: {{^}}s_ubfe_sub_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]] -; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, 
[[VWIDTH]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]] +; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -63,11 +62,10 @@ define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 } ; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] -; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] -; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] +; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -126,10 +124,9 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, } ; GCN-LABEL: {{^}}s_sbfe_sub_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]] -; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]] +; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -141,11 +138,10 @@ define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 } ; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] -; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] -; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] +; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] +; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 1e74f06bbda..50e72cddfae 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s ; BFI_INT Definition pattern from ISA docs ; (y & x) | (z & ~x) @@ -119,10 +119,10 @@ entry: ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s +; GCN-DAG: v_bfi_b32 +; GCN-DAG: v_bfi_b32 define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { %and0 = and i64 %a, %b %not.a = xor i64 %a, -1 @@ -136,10 +136,10 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN-DAG: v_bfi_b32 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s +; GCN-DAG: v_bfi_b32 +; GCN-DAG: v_bfi_b32 define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { %xor.0 = xor i64 %a, %mask %and = and i64 %xor.0, %b @@ -155,8 +155,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s ; GCN-DAG: v_bfi_b32 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s -; GCN: v_bfi_b32 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s +; GCN-DAG: v_bfi_b32 define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { %xor.0 = xor i64 %a, %mask %and = and i64 %xor.0, %b diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index f145869f458..b24f4f11182 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -11,22 +11,32 @@ ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; GCN: s_cbranch_vccnz -; GCN: one{{$}} -; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_short -; GCN: s_endpgm +; SI: one{{$}} +; SI: v_cvt_f16_f32_e32 v[[CVT:[0-9]+]], v[[A_F32]] -; GCN: two{{$}} -; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] -; GCN: buffer_store_short v[[B_F16]] -; GCN: s_endpgm +; SI: two{{$}} +; SI: v_cvt_f16_f32_e32 v[[CVT]], v[[B_F32]] + +; SI: one{{$}} +; SI: buffer_store_short v[[CVT]] +; SI: s_endpgm + + + +; VI: one{{$}} +; VI: buffer_store_short v[[A_F16]] +; VI: s_endpgm + +; VI: two{{$}} +; VI: buffer_store_short v[[B_F16]] +; VI: s_endpgm define amdgpu_kernel void @br_cc_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %fcmp = fcmp olt half %a.val, %b.val br i1 %fcmp, label %one, label %two diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index b0ee7f323f3..d4284a32c0e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -490,7 +490,7 @@ ret: ; GCN-LABEL: {{^}}long_branch_hang: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 -; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}} +; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}} ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: diff --git 
a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 8d7401125b1..2291527f777 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -9,7 +9,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -23,7 +23,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 40548ba4cb9..612f31fd3af 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -123,7 +123,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace } ; FUNC-LABEL: {{^}}s_ctlz_i64: -; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}} ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 @@ -133,7 +133,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 9796bc780ed..70a81e1b0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i } ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64: -; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}} ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 
addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll index 2d16d4034d3..def3533f49b 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll @@ -305,14 +305,14 @@ define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %o ; but there are some cases when the should be allowed. ; FUNC-LABEL: {{^}}ctpop_i32_in_br: -; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd -; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16 +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT -define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 8236ac07a68..3a7e905250c 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -308,7 +308,9 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %o ; FUNC-LABEL: {{^}}ctpop_i16_in_br: ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd ; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] + +; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[CTPOP_ARG]] ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_short [[RESULT]], ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 298280f755f..00011cdb7aa 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -13,13 +13,13 @@ declare i65 @llvm.ctpop.i65(i65) nounwind readnone declare i128 @llvm.ctpop.i128(i128) nounwind readnone ; FUNC-LABEL: {{^}}s_ctpop_i64: -; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]] ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index c84e53dfa91..2bd69b87d5c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -58,11 +58,8 @@ define 
amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace( } ; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; SI: s_load_dword s -; SI: s_load_dword s - -; GFX89: s_load_dwordx2 -; GFX89: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN: buffer_store_short ; GCN: buffer_store_short @@ -78,8 +75,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 ; FIXME: Why sometimes vector shift? ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: ; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI: s_load_dwordx2 s ; GFX89: s_load_dwordx2 s ; GFX89: s_load_dwordx2 s @@ -87,9 +84,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v - -; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index d6363b1af4d..d5337506312 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -27,7 +27,7 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out, align 2 @@ -58,12 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1 } ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; SI: s_load_dword s -; SI: s_load_dwordx2 s -; SI: s_load_dword s - -; GFX89: s_load_dwordx2 -; GFX89: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOT: {{buffer|flat|global}}_load @@ -79,8 +75,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 ; SI: buffer_store_short ; SI: buffer_store_short @@ -100,12 +95,12 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: ; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c -; GFX89-DAG: s_load_dword s +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24 +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c +; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54 ; GCN-NOT: {{buffer|flat|global}} @@ -113,17 +108,13 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x ; SICI: buffer_store_short ; SICI: 
buffer_store_short -; SICI: buffer_load_ushort -; SICI: buffer_store_short - ; GFX9-NOT: s_pack_ll_b32_b16 ; GFX9-NOT: s_pack_lh_b32_b16 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}} - +; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s ; GCN: {{buffer|global}}_store_short -define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 7f961f1d18a..bf403bf51a7 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}extract_vector_elt_v1i8: ; GCN: s_load_dword [[LOAD:s[0-9]+]] @@ -14,7 +14,8 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i ; GCN-LABEL: {{^}}extract_vector_elt_v2i8: ; GCN: s_load_dword s ; GCN-NOT: {{flat|buffer|global}} -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8 +; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8 +; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}} ; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte @@ -22,8 +23,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i %p0 = extractelement <2 x i8> %foo, i32 0 %p1 = extractelement <2 x i8> %foo, i32 1 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } @@ -38,8 +39,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i %p0 = extractelement <3 x i8> %foo, i32 0 %p1 = extractelement <3 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } @@ -54,24 +55,24 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i %p0 = extractelement <4 x i8> %foo, i32 0 %p1 = extractelement <4 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } ; GCN-LABEL: {{^}}extract_vector_elt_v8i8: +; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: s_load_dword [[VAL:s[0-9]+]] 
-; GCN-NOT: {{flat|buffer|global}} +; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16 -; GCN-NOT: {{flat|buffer|global}} +; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { %p0 = extractelement <8 x i8> %foo, i32 0 %p1 = extractelement <8 x i8> %foo, i32 2 - %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* null + store volatile i8 %p0, i8 addrspace(1)* null ret void } @@ -87,25 +88,25 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } ; GCN-LABEL: {{^}}extract_vector_elt_v32i8: -; GCN: s_load_dword [[LOAD0:s[0-9]+]] -; GCN-NOT: {{flat|buffer|global}} -; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 -; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-NOT: {{s|flat|buffer|global}}_load +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN-NOT: {{s|flat|buffer|global}}_load +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] ; GCN: buffer_store_byte [[V_ELT2]] ; GCN: buffer_store_byte [[V_LOAD0]] -define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { %p0 = extractelement <32 x i8> %foo, i32 0 %p1 = extractelement <32 x i8> %foo, i32 2 - %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* null + store volatile i8 %p0, i8 addrspace(1)* null ret void } @@ -121,8 +122,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x %p0 = extractelement <64 x i8> %foo, i32 0 %p1 = extractelement <64 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p1, i8 addrspace(1)* %out - store i8 %p0, i8 addrspace(1)* %out1 + store volatile i8 %p1, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out1 ret void } @@ -132,42 +133,36 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c ; VI-NOT: {{flat|buffer|global}} -; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 -; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} -; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]] +; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]] -; VI: buffer_store_byte [[EXTRACT]] 
-define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 { +; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]] +; VI: buffer_store_byte [[ELT]] +define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 { %elt = extractelement <2 x i8> %foo, i32 %idx - store i8 %elt, i8 addrspace(1)* %out + store volatile i8 %elt, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c ; VI-NOT: {{flat|buffer|global}} -; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8 -; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} -; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]] -; VI: buffer_store_byte [[EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { +; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]] +; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]] +; VI: buffer_store_byte [[V_ELT]] +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 { %p0 = extractelement <3 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p0, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -175,16 +170,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]] ; VI: buffer_store_byte [[V_EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 { %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr %p0 = extractelement <4 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p0, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -195,7 +190,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr %p0 = extractelement <8 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - store i8 %p0, i8 addrspace(1)* %out + store volatile i8 %p0, i8 addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 721b4f1775c..6b8eb91ca3e 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half } ; GCN-LABEL: {{^}}s_fabs_v4f16: -; CI: s_load_dword s[[LO:[0-9]+]] -; CI: s_load_dword s[[HI:[0-9]+]] +; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2 ; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 + ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]] @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half ; GCN-LABEL: {{^}}fabs_fold_f16: ; GCN: s_load_dword [[IN0:s[0-9]+]] -; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16 +; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]| ; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]] @@ -62,6 +62,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half ; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] +; GFX89-NOT: and ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]] ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]] ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index 718176b80f0..d20e74ad62e 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -53,11 +53,11 @@ define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x doub } ; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} ; SI: s_endpgm -define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) { %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -65,11 +65,11 @@ define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, } ; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} ; SI: s_endpgm -define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) { %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 3c6bdfa77dc..ba72969b826 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -70,10 +70,11 @@ define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float } ; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], 
s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 @@ -82,10 +83,11 @@ define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, fl } ; FUNC-LABEL: {{^}}fabs_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 447c253afa9..30fdb1df826 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @fadd_f16( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fadd half %a.val, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -65,10 +65,10 @@ entry: ; VI: flat_load_dword v[[B_V2_F16:[0-9]+]] ; VI: flat_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] @@ -102,13 +102,13 @@ entry: ; GCN-LABEL: {{^}}fadd_v2f16_imm_a: ; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_add_f32_e32 
v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] @@ -133,13 +133,13 @@ entry: ; GCN-LABEL: {{^}}fadd_v2f16_imm_b: ; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index aef898b1a8e..a9211f9905e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @fcmp_f16_lt( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp olt half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -42,8 +42,8 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %a.abs = call half @llvm.fabs.f16(half %a.val) %b.abs = call half @llvm.fabs.f16(half %b.val) %r.val = fcmp olt half %a.abs, %b.abs @@ -67,8 +67,8 @@ define amdgpu_kernel void @fcmp_f16_eq( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp oeq half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -90,8 +90,8 @@ define amdgpu_kernel void @fcmp_f16_le( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ole half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -113,8 +113,8 @@ define amdgpu_kernel void @fcmp_f16_gt( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half 
addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ogt half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -136,8 +136,8 @@ define amdgpu_kernel void @fcmp_f16_lg( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp one half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -159,8 +159,8 @@ define amdgpu_kernel void @fcmp_f16_ge( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp oge half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -182,8 +182,8 @@ define amdgpu_kernel void @fcmp_f16_o( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ord half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -205,8 +205,8 @@ define amdgpu_kernel void @fcmp_f16_u( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp uno half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -228,8 +228,8 @@ define amdgpu_kernel void @fcmp_f16_nge( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ult half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -251,8 +251,8 @@ define amdgpu_kernel void @fcmp_f16_nlg( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ueq half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -274,8 +274,8 @@ define amdgpu_kernel void @fcmp_f16_ngt( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp ule half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -297,8 +297,8 @@ define amdgpu_kernel void @fcmp_f16_nle( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp 
ugt half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -320,8 +320,8 @@ define amdgpu_kernel void @fcmp_f16_neq( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp une half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r @@ -343,8 +343,8 @@ define amdgpu_kernel void @fcmp_f16_nlt( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fcmp uge half %a.val, %b.val %r.val.sext = sext i1 %r.val to i32 store i32 %r.val.sext, i32 addrspace(1)* %r diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 15d4d2a3667..6f3b3ebfa41 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -30,8 +30,8 @@ define amdgpu_kernel void @test_copysign_f16( half addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag - %sign = load half, half addrspace(1)* %arg_sign + %mag = load volatile half, half addrspace(1)* %arg_mag + %sign = load volatile half, half addrspace(1)* %arg_sign %out = call half @llvm.copysign.f16(half %mag, half %sign) store half %out, half addrspace(1)* %arg_out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index e5893e5995a..196da930785 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -8,12 +8,11 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read ; Try to identify arg based on higher address. 
; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb -; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc -; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c -; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 -; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] -; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] +; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb +; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c + +; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]] +; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]] ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] ; GCN: buffer_store_dword [[RESULT]], diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 161b7ad18e5..b681e4a0da4 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -6,10 +6,10 @@ declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind r declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone ; FUNC-LABEL: {{^}}test_copysign_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2 @@ -17,15 +17,15 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} ; GCN: s_endpgm -define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind { %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, double addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}test_copysign_f64_f32: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] @@ -33,7 +33,7 @@ define 
amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double % ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind { %c = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %c) store double %result, double addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll index 68e25ee7f4b..3cb6090677d 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -64,9 +64,9 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> ; SI: v_fma_f32 ; SI: v_fma_f32 ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} -; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}} ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}} ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].X diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index 0494295fc15..199934e5fd9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -25,14 +25,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addr } ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]] +; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 4002712ab16..4d08651dcb4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -1,6 +1,6 @@ -; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't @@ -44,7 +44,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %mad = fadd fast float %mul2, %y diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index e1d33b639f6..c75c500cdfa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -17,8 +17,8 @@ define amdgpu_kernel void @fmul_f16( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fmul half %a.val, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -36,7 +36,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: - %b.val = load half, half addrspace(1)* %b + %b.val = load volatile half, half addrspace(1)* %b %r.val = fmul half 3.0, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -55,24 +55,24 @@ define amdgpu_kernel void @fmul_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: - %a.val = load half, half addrspace(1)* %a + %a.val = load volatile half, half addrspace(1)* %a %r.val = fmul half %a.val, 4.0 store half %r.val, half addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fmul_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] @@ -82,6 +82,8 @@ entry: 
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] +; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -100,13 +102,13 @@ entry: ; GCN-LABEL: {{^}}fmul_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 @@ -133,13 +135,13 @@ entry: ; GCN-LABEL: {{^}}fmul_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -164,13 +166,15 @@ entry: } ; GCN-LABEL: {{^}}fmul_v4f16: -; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}} +; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} ; VI: v_mul_f16_sdwa ; VI: v_mul_f16_e32 ; VI: v_mul_f16_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 
d5c7c8ad689..f43c2f8a6b8 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -109,8 +109,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h } ; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16: -; CI: s_load_dword s -; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 +; CI: s_load_dword [[IN:s[0-9]+]] +; CI: s_or_b32 [[FNEG_FABS:s[0-9]+]], [[IN]], 0x80008000 +; CI: s_lshr_b32 ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index bc0e5998018..45fd0f74614 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -55,14 +55,13 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}fneg_fabs_f64: -; GCN-DAG: s_load_dwordx2 ; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} -; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c +; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13 +; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} -define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) { %fabs = call double @llvm.fabs.f64(double %in) %fsub = fsub double -0.000000e+00, %fabs store double %fsub, double addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 3f20ca73e92..c72dab08e38 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs @@ -15,7 +15,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: ; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}| ; SI-NOT: and define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll index 9b4b4d6e942..508eb0bf976 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll @@ -48,11 +48,11 @@ define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { } ; GCN-LABEL: {{^}}fneg_fold_f64: -; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN-NOT: xor ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, [8 x i32], double %in) { %fsub = fsub double -0.0, %in %fmul = fmul double %fsub, %in store double %fmul, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll b/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll index e2b1950f885..8863b94ab78 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll @@ -13,18 +13,17 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64: define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 { entry: ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CHECK: s_load_dword s2, s[0:1], 0xb -; CHECK: s_load_dword s0, s[0:1], 0xc +; CHECK: s_load_dwordx2 s[0:1], s[0:1], 0xb ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; CHECK: s_mov_b32 s10, -1 -; CHECK: s_waitcnt lgkmcnt(0) -; CHECK: s_lshl_b32 s1, s2, 2 ; CHECK: v_mov_b32_e32 v0, 4 +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK: s_lshl_b32 s0, s0, 2 +; CHECK: v_add_i32_e32 v1, vcc, s0, v0 +; CHECK: s_lshl_b32 s0, s1, 2 ; CHECK: s_mov_b32 s11, 0xe8f000 -; CHECK: v_add_i32_e32 v1, vcc, s1, v0 ; CHECK: v_mov_b32_e32 v2, 7 -; CHECK: s_lshl_b32 s0, s0, 2 ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen ; CHECK: v_add_i32_e32 v0, vcc, s0, v0 ; CHECK: s_mov_b32 s7, 0xf000 @@ -35,14 +34,14 @@ entry: ; CHECK: s_endpgm %x = alloca [100 x i32], align 4, addrspace(5) - %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + %alloca.bc = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0 %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i store i32 7, i32 addrspace(5)* %arrayidx, align 4 %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j - %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4 - store i32 %1, i32 addrspace(1)* %a, align 4 - call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + %ld = load i32, i32 addrspace(5)* %arrayidx2, align 4 + store i32 %ld, i32 addrspace(1)* %a, align 4 + call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index fc055a58e75..066e49d7cac 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -17,8 +17,8 @@ define amdgpu_kernel void @fsub_f16( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = fsub half %a.val, %b.val store half %r.val, half addrspace(1)* %r ret void @@ -36,7 +36,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: - %b.val = load half, half addrspace(1)* %b + %b.val = load volatile half, half addrspace(1)* %b %r.val = fsub half 1.0, 
%b.val store half %r.val, half addrspace(1)* %r ret void @@ -54,33 +54,41 @@ define amdgpu_kernel void @fsub_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: - %a.val = load half, half addrspace(1)* %a + %a.val = load volatile half, half addrspace(1)* %a %r.val = fsub half %a.val, 2.0 store half %r.val, half addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fsub_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] + ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + +; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -101,13 +109,13 @@ entry: ; GCN-LABEL: {{^}}fsub_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] @@ -135,13 +143,13 @@ entry: ; GCN-LABEL: 
{{^}}fsub_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index 20890894c0c..59c2f1b54f0 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -5,7 +5,7 @@ ; CHECK: s_load_dwordx4 ; CHECK-NOT: flat_load_dword -define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) { +define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) { bb: %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8 %tmp3 = fadd float %tmp2, 0.000000e+00 @@ -28,7 +28,7 @@ bb: ; CHECK: flat_load_dword ; CHECK-NOT: s_load_dwordx4 -define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 { +define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1 %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp @@ -59,7 +59,7 @@ bb: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] -define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) { +define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) { store i32 0, i32 addrspace(1)* %out0 %val = load i32, i32 addrspace(1)* %in store i32 %val, i32 addrspace(1)* %out1 @@ -71,7 +71,7 @@ define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i3 ; CHECK: flat_store_dword ; CHECK: flat_load_dword [[VVAL:v[0-9]+]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] -define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) { +define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) { store i32 0, i32 addrspace(1)* %out0 %val = load i32, i32 addrspace(1)* %in store i32 %val, i32 addrspace(1)* %out1 @@ -80,19 +80,20 @@ define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0 ; uniform load from global array ; CHECK-LABEL: @global_array -; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]] +; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] +; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; 
CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] - @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4 define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) { entry: - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 - %1 = load i32, i32 addrspace(1)* %0, align 4 - store i32 %1, i32 addrspace(1)* %out, align 4 + %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 + %load1 = load i32, i32 addrspace(1)* %load0, align 4 + store i32 %load1, i32 addrspace(1)* %out, align 4 ret void } @@ -105,13 +106,13 @@ entry: ; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} ; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] -define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) { +define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) { entry: %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n store i32 12, i32 addrspace(1) * %gep - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 - %1 = load i32, i32 addrspace(1)* %0, align 4 - store i32 %1, i32 addrspace(1)* %out, align 4 + %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4 + %load1 = load i32, i32 addrspace(1)* %load0, align 4 + store i32 %load1, i32 addrspace(1)* %out, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index e2c6e616282..107a91262c4 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -22,13 +22,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha } ; GCN-LABEL: {{^}}load_v3f16_arg: -; SI: s_load_dwordx2 -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 - +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; GCN-NOT: {buffer|flat|global}}_load_ @@ -45,11 +40,7 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha ; FIXME: Why not one load? 
; GCN-LABEL: {{^}}load_v4f16_arg: -; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2 -; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3 - -; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 - +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}} ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]] ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}} @@ -86,14 +77,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; SI: s_load_dwordx2 s -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 - +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx2 s ; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -116,14 +101,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 s -; VI: s_load_dwordx2 s -; VI: s_load_dwordx2 s +; GCN: s_load_dwordx4 ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -154,7 +132,7 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -; GCN: s_load_dword +; GCN-DAG: s_load_dword s ; GCN: s_lshr_b32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -169,14 +147,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -; SI: s_load_dword -; SI: s_load_dword - -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 - -; GCN: s_lshr_b32 - +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx2 s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -191,19 +163,17 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; SI: s_load_dword s -; SI: s_load_dword s - -; VI: s_load_dwordx2 s +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx2 s -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 ; GCN: s_endpgm define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> @@ -212,14 +182,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; SI: s_load_dword s -; SI-NEXT: s_load_dword s -; SI-NEXT: s_load_dword s -; SI-NEXT: s_load_dword s -; SI-NOT: _load_ - -; VI: s_load_dwordx2 s -; VI: s_load_dwordx2 s +; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx4 s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -299,12 +263,13 @@ define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, h ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: ; GCN: 
flat_load_dword [[LOAD:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] -; SI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] + ; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] -; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm @@ -348,6 +313,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 ; GCN: flat_store_dwordx4 @@ -361,7 +327,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_sdwa @@ -430,19 +395,19 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace( ; XVI-NOT: v_cvt_f32_f16 ; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] -; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] -; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] -; SI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] -; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 - -; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] -; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; VI: v_cvt_f32_f16_sdwa +; GCN-NOT: v_cvt_f32_f16 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 ; GCN-NOT: v_cvt_f64_f32_e32 -; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[XLO]]:[[YHI]]{{\]}} -; GCN-DAG: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[Z]] +; GCN-DAG: flat_store_dwordx4 +; GCN-DAG: flat_store_dwordx2 ; GCN: s_endpgm define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 379dd336971..3dc3f320db8 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -8,7 +8,7 @@ ; CHECK: Version: [ 1, 0 ] ; CHECK: Kernels: -; CHECK: - Name: test +; CHECK-LABEL: - Name: test ; CHECK: SymbolName: 'test@kd' ; CHECK: CodeProps: ; CHECK: KernargSegmentSize: 24 @@ -16,8 +16,8 @@ ; CHECK: PrivateSegmentFixedSize: 0 ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 -; CHECK: NumSGPRs: 6 -; CHECK: NumVGPRs: 3 +; CHECK: NumSGPRs: 8 +; CHECK: NumVGPRs: 6 ; CHECK: MaxFlatWorkGroupSize: 256 define amdgpu_kernel void @test( half addrspace(1)* %r, @@ -31,18 +31,24 @@ entry: ret void } -; CHECK: - Name: num_spilled_sgprs +; CHECK-LABEL: - Name: num_spilled_sgprs ; CHECK: SymbolName: 'num_spilled_sgprs@kd' ; CHECK: CodeProps: -; CHECK: NumSpilledSGPRs: 41 +; GFX700: NumSpilledSGPRs: 40 +; 
GFX803: NumSpilledSGPRs: 24 +; GFX900: NumSpilledSGPRs: 24 define amdgpu_kernel void @num_spilled_sgprs( - i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, - i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, - i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8, - i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, - i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute, - i32 addrspace(1)* %outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4, - i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb, + i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], + i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32], + i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32], + i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32], + i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32], + i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32], + i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32], + i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32], + i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32], + i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32], + i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32], i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 { entry: store i32 %in0, i32 addrspace(1)* %out0 @@ -64,7 +70,7 @@ entry: ret void } -; CHECK: - Name: num_spilled_vgprs +; CHECK-LABEL: - Name: num_spilled_vgprs ; CHECK: SymbolName: 'num_spilled_vgprs@kd' ; CHECK: CodeProps: ; CHECK: NumSpilledVGPRs: 14 diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index c66d834ec7c..bb9da71d1a9 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -344,114 +344,114 @@ define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float ; GCN-LABEL: {{^}}add_inline_imm_0.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0.5 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; 
SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -0.5 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 1.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -1.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 2.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -2.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 4.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, -4.0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30 ; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}} ; VI: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x3fc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -461,40 +461,40 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xbfc45f306dc9c882 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 
x i32], double %x) { %y = fadd double %x, 0x0000000000000001 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_2_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000002 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000010 store double %y, double addrspace(1)* %out ret void @@ -504,7 +504,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, doub ; GCN: v_mov_b32_e32 v0, -1 ; GCN: v_mov_b32_e32 v1, v0 ; GCN: buffer_store_dwordx2 v[0:1] -define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xffffffffffffffff store double %y, double addrspace(1)* %out ret void @@ -514,7 +514,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, d ; GCN: v_mov_b32_e32 v0, -2 ; GCN: v_mov_b32_e32 v1, -1 ; GCN: buffer_store_dwordx2 v[0:1] -define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xfffffffffffffffe store double %y, double addrspace(1)* %out ret void @@ -524,29 +524,29 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, d ; GCN: v_mov_b32_e32 v0, -16 ; GCN: v_mov_b32_e32 v1, -1 ; GCN: buffer_store_dwordx2 v[0:1] -define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0xfffffffffffffff0 store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_63_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* 
%out, double %x) { +define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x000000000000003F store double %y, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_64_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64 ; GCN: buffer_store_dwordx2 [[REG]] -define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) { %y = fadd double %x, 0x0000000000000040 store double %y, double addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 3227633496a..84be1b8c715 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -310,9 +310,9 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_dword +; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 +; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; VI: v_or_b32 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 59cf494d20a..a62ad820c89 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as @@ -75,8 +75,9 @@ define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, ; GCN-LABEL: {{^}}insertelement_to_sgpr: ; GCN-NOT: v_readfirstlane -define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind { - %tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0 +define <4 x float> @insertelement_to_sgpr() nounwind { + %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef + %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) ret <4 x float> %tmp2 } @@ -154,11 
+155,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* % } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32: -; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} +; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}} ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] ; GCN: buffer_store_dwordx4 -define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -201,23 +202,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* % } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load -; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1 - -; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]] -; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]] - -; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]] -; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] -; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]] +; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] +; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]] +; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]] +; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]] ; VI: buffer_store_short [[OR]] -define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 ret void @@ -227,68 +222,51 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8: -; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load -; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8 -; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]] -; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} -; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]] -; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}} - -; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}} +; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]] - -; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] -; VI: 
v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]] -; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]] -; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]] -; VI: buffer_store_short [[BFI]] -; VI: buffer_store_byte [[HI2]] -define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { +; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] +; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]] +; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] +; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16 + +; VI-DAG: buffer_store_short [[BFI]] +; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]] +; VI: buffer_store_byte [[V_HI2]] +define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8: -; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; VI-NOT: _load -; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8 -; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] -; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}} - - -; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24 -; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16 -; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]] -; VI: v_or_b32_e32 -; VI: v_or_b32_sdwa +; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_or_b32_sdwa -; VI: s_lshl_b32 -; VI: v_bfi_b32 -define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { +; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] +; VI: buffer_store_dword [[BFI]] +define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8: -; VI-NOT: {{buffer|flat|global}} -; VI: s_load_dword [[IDX:s[0-9]]] -; VI-NOT: {{buffer|flat|global}} -; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; VI-NOT: {{buffer|flat|global}} +; VI-NOT: {{buffer|flat|global}}_load +; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10 +; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}} +; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] ; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], 
s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}} @@ -307,13 +285,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* % ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: ; GCN: s_load_dwordx2 +; GCN: s_load_dwordx4 ; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN-NOT: _load_ - ; GCN: buffer_store_byte ; GCN: buffer_store_byte @@ -368,7 +341,7 @@ endif: ; GCN-LABEL: {{^}}dynamic_insertelement_v2f64: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}} -; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} +; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}} ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} @@ -390,7 +363,7 @@ endif: ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 ret void @@ -420,19 +393,18 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* % ; space is also 2x what should be required. ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64: -; GCN: SCRATCH_RSRC_DWORD ; Stack store -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}} ; Write element -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}} ; Stack reload -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}} ; Store result ; GCN: buffer_store_dwordx4 @@ -447,19 +419,17 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1) } ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64: -; GCN-DAG: SCRATCH_RSRC_DWORD - -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], 
{{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}} -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index cb2753bcc08..b6eda9faf3a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,9 +1,9 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}} @@ -18,17 +18,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg: -; GCN: s_load_dword [[ELT0:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}} ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] 
; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] -; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[VEC]] +define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -36,29 +36,29 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* % } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg: -; GCN: s_load_dword [[ELT0:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CI-DAG: s_and_b32 [[ELT0_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}} ; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 ; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 -; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0_MASKED]], [[ELT1]] ; CI-DAG: ; use [[SHR]] ; FIXME: Should be able to void mask of upper bits -; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; VI-DAG: s_and_b32 [[ELT_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}} ; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}} -; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]] -; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; VI-DAG: s_or_b32 [[OR:s[0-9]+]], [[ELT_MASKED]], [[VEC_HIMASK]] +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 ; VI-DAG: ; use [[SHR]] ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 -; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[ELT1]] ; GFX9-DAG: ; use [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -69,16 +69,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi: -; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT_ARG:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; CIVI: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} -; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]] +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[ELT1]] ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 
%elt = trunc i32 %elt.hi to i16 @@ -88,7 +89,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1: -; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], ; GCN: s_load_dword [[VEC:s[0-9]+]], ; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 @@ -110,7 +111,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a } ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1: -; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], ; GCN: s_load_dword [[VEC:s[0-9]+]], ; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 @@ -161,15 +162,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, } ; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg: -; GCN: s_load_dword [[ELT1:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GCN-DAG: s_load_dword [[ELT1_LOAD:s[0-9]+]], s[4:5], +; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[ELT1_LOAD]], 16 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GCN-NOT: shlr -; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1_LOAD]] +define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -444,12 +446,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -473,12 +474,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -501,17 +501,18 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa } ; GCN-LABEL: {{^}}v_insertelement_v4f16_0: -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5], ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]] +; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}} ; CIVI: 
v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]] -; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]] +; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}} -define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { +define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -531,12 +532,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]] ; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]] -; VI: s_lshl_b32 [[VAL]], [[VAL]], 16 -; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]] +; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]] ; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]] -; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]] +; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL_HI]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}} define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { @@ -553,17 +555,18 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out } ; GCN-LABEL: {{^}}v_insertelement_v4f16_2: -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5], ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]] +; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}} ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]] -; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] +; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} -define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { +define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -583,12 +586,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]] ; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]] -; VI: s_lshl_b32 [[VAL]], [[VAL]], 16 -; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]] +; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]] ; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16 ; CI: v_and_b32_e32 
[[AND:v[0-9]+]], 0xffff, v[[HI]] -; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] +; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_HI]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { @@ -611,8 +615,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]] +; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}} ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]] -; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] +; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]] ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 231582513a3..da8c994c530 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -210,8 +210,10 @@ entry: ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -; GCN: s_load_dword s -; GCN-NOT: {{buffer|flat|global}}_load_ +; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb + +; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 @@ -226,8 +228,7 @@ entry: ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -; SI: s_load_dword s -; SI: s_load_dword s +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 ; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c @@ -236,6 +237,7 @@ entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 ret void } + ; FUNC-LABEL: {{^}}v3i32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 @@ -274,8 +276,8 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; GCN: s_load_dword s -; GCN-NOT: {{buffer|flat|global}}_load_ +; GCN-DAG: s_load_dwordx2 s +; GCN-DAG: s_load_dword s define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out @@ -290,12 +292,18 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c + + +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c + +; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, 
s[4:5], 0x0 +; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -348,23 +356,16 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 - -; SI: s_load_dword s -; SI: s_load_dword s +; SI-NOT: {{buffer|flat|global}}_load ; SI: s_load_dwordx2 s +; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dword s -; VI: s_load_dword s - -; VI: v_lshlrev_b16 -; VI: v_or_b32_e32 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa -; VI: v_lshlrev_b16 -; VI: s_lshr_b32 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa +; VI: s_load_dwordx2 s +; VI-NEXT: s_load_dwordx2 s +; VI-NOT: lshl +; VI-NOT: _or +; VI-NOT: _sdwa define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out @@ -383,19 +384,14 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 +; SI-NEXT: s_load_dwordx2 ; SI-NOT: {{buffer|flat|global}}_load -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c +; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -413,6 +409,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X + ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 @@ -462,33 +459,16 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 s +; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s - -; VI: s_lshr_b32 -; VI: v_lshlrev_b16 -; VI: s_lshr_b32 -; VI: s_lshr_b32 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa -; VI: v_lshlrev_b16 -; VI: v_lshlrev_b16 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa -; VI: v_lshlrev_b16 -; VI: v_lshlrev_b16 -; VI: v_or_b32_sdwa -; VI: v_or_b32_sdwa +; VI: s_load_dwordx4 s +; VI-NOT: shr +; VI-NOT: shl +; VI-NOT: _sdwa +; VI-NOT: _or_ define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out @@ -516,27 +496,14 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s -; SI: s_load_dword s - +; SI: s_load_dwordx8 s +; SI-NEXT: s_load_dwordx2 s ; SI-NOT: {{buffer|flat|global}}_load -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54 -; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c +; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 -; HSA-VI: 
s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38 +; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out @@ -600,22 +567,21 @@ entry: } ; FUNC-LABEL: {{^}}kernel_arg_i64: -; MESA-GCN: s_load_dwordx2 -; MESA-GCN: s_load_dwordx2 +; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24 +; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 + ; MESA-GCN: buffer_store_dwordx2 -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}f64_kernel_arg: -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb -; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 -; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c +; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 +; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 ; MESA-GCN: buffer_store_dwordx2 -; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 + +; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll index 8e0cf316735..5ece33f0195 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -6,7 +6,7 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) { +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll index ad5d3f58a07..af0e4e998f6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -4,8 +4,8 @@ declare half @llvm.fabs.f16(half %a) declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) ; GCN-LABEL: {{^}}class_f16: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_I32:[0-9]+]] +; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]] ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] @@ -33,7 +33,9 @@ entry: ; GCN: s_endpgm define amdgpu_kernel void @class_f16_fabs( i32 addrspace(1)* %r, + [8 x i32], half %a.val, + [8 x i32], i32 %b.val) { entry: %a.val.fabs = call half @llvm.fabs.f16(half %a.val) @@ -53,7 +55,9 @@ entry: ; GCN: s_endpgm define amdgpu_kernel void @class_f16_fneg( i32 addrspace(1)* %r, + [8 x i32], half %a.val, + [8 x i32], i32 
%b.val) { entry: %a.val.fneg = fsub half -0.0, %a.val @@ -73,7 +77,9 @@ entry: ; GCN: s_endpgm define amdgpu_kernel void @class_f16_fabs_fneg( i32 addrspace(1)* %r, + [8 x i32], half %a.val, + [8 x i32], i32 %b.val) { entry: %a.val.fabs = call half @llvm.fabs.f16(half %a.val) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index f71b9752e9a..ed4e6d39656 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 @@ -7,14 +7,14 @@ declare float @llvm.fabs.f32(float) #1 declare double @llvm.fabs.f64(double) #1 ; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -22,14 +22,14 @@ define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 } ; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -38,14 +38,14 @@ define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, } ; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void 
@test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -54,14 +54,14 @@ define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, } ; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -183,14 +183,14 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspac } ; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -198,14 +198,14 @@ define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 } ; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -214,14 +214,14 @@ define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a } ; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 
[[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -230,14 +230,14 @@ define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a } ; SI-LABEL: {{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -268,14 +268,14 @@ define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) ; Set all 9 bits of mask ; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] ; SI-NOT: vcc ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll index 241b708e7ba..4badd8b7532 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: 
s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll index 8d5c9aa9521..c264dc7ec69 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll index 822e8c2886b..414fe150240 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll index c2b8f3cb28c..cab6c8c0016 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,11 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]] 
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 9cbbdc3ffd1..06e209ab06d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -3,11 +3,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} -; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] -; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]] +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] +; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll index 9018af3633b..dcb4c63949f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -15,9 +15,9 @@ define amdgpu_kernel void @div_fixup_f16( half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -35,8 +35,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_a( half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -54,8 +54,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_b( half addrspace(1)* %a, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -73,8 +73,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_c( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load 
half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0) store half %r.val, half addrspace(1)* %r ret void @@ -90,7 +90,7 @@ define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b( half addrspace(1)* %r, half addrspace(1)* %c) { entry: - %c.val = load half, half addrspace(1)* %c + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val) store half %r.val, half addrspace(1)* %r ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll index b8fcacf46bb..4d9921fd7c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll @@ -5,18 +5,20 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone ; GCN-LABEL: {{^}}test_div_fixup_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 + ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind { %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index 25b0ad54421..34b842d8436 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s ; FIXME: Enable 
for VI. @@ -8,33 +8,36 @@ declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readno declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone ; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 + +; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1 + ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 ; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] ; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -43,26 +46,32 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: ; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + +; VI-DAG: s_load_dword 
[[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 + +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] +; GCN: buffer_store_dword [[RESULT]], +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c + +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 + +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 +; GCN: buffer_store_dword [[RESULT]], +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -77,8 +86,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double % } ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone @@ -87,8 +96,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou } ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; SI: s_mov_b64 vcc, 0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: s_mov_b64 vcc, 0 +; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, float addrspace(1)* %out, align 4 @@ -96,8 +105,8 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa } ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; SI: s_mov_b64 vcc, -1 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; 
GCN: s_mov_b64 vcc, -1 +; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, float addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll index 70b01d6977f..db7a91e0c86 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -230,13 +230,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* } ; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -244,13 +244,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* % } ; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -258,14 +258,14 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* % } ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], 
v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -273,14 +273,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* } ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index 1515a584ebb..85aaee3dd2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -7,7 +7,7 @@ ; GCN: s_load_dword s[[S_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] ; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) ret void @@ -41,7 +41,6 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] ; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen - ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 0f3cf79e257..49f100f112f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -24,14 +24,14 @@ entry: ; GCN-LABEL: {{^}}ceil_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 
v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 16f167ae9e3..7c7569c984b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -24,26 +24,31 @@ entry: } ; GCN-LABEL: {{^}}cos_v2f16 -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]] ; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] + +; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] -; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; VI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GCN-NOT: and diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 631bb50bf4b..be1920f3ca4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -2,11 +2,9 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s ; GCN-LABEL: {{^}}test_debug_value: -; NOOPT: s_load_dwordx2 s[4:5] - -; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE? -; NOOPT: ; kill: def $sgpr8_sgpr9 killed $sgpr4_sgpr5 -; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef +; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5 ; GCN: flat_store_dword ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 907fd488fa7..081dc617327 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -24,13 +24,13 @@ entry: ; GCN-LABEL: {{^}}floor_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index f59741426ba..846494f9fe4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -105,16 +105,17 @@ define amdgpu_kernel void @fma_f16_imm_c( ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] + ; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], 
v[[R_F32_0]] ; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]] @@ -145,8 +146,9 @@ define amdgpu_kernel void @fma_v2f16( } ; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] @@ -157,13 +159,14 @@ define amdgpu_kernel void @fma_v2f16( ; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] + +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]] @@ -186,8 +189,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a( } ; GCN-LABEL: {{^}}fma_v2f16_imm_b: -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] @@ -195,10 +198,10 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] @@ -229,8 +232,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b( } ; GCN-LABEL: {{^}}fma_v2f16_imm_c: -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] @@ -238,26 +241,31 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 
v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] + +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] + +; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] +; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; GCN-NOT: and +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]] ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] - ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_c( diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 478658c3acc..2d1bdaf58e0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -58,8 +58,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val) store half %r.val, half addrspace(1)* %r ret void @@ -87,56 +87,64 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val) store half %r.val, half addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fmuladd_v2f16 -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] + +; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]] + +; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] + +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, 
v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] + + +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] -; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] + +; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] +; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] +; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] ; VI-FLUSH-NOT: v_and_b32 -; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]] +; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]] +; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] +; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm - define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index c57b545dc6d..13fdd288f9d 
100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @maxnum_f16( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val) store half %r.val, half addrspace(1)* %r ret void @@ -66,17 +66,16 @@ entry: } ; GCN-LABEL: {{^}}maxnum_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] ; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] @@ -89,7 +88,7 @@ entry: ; VI-NOT: and ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] -; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -107,13 +106,13 @@ entry: ; GCN-LABEL: {{^}}maxnum_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] @@ -127,7 +126,6 @@ entry: ; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) 
{ @@ -140,13 +138,13 @@ entry: ; GCN-LABEL: {{^}}maxnum_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -162,7 +160,6 @@ entry: ; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -192,8 +189,8 @@ entry: ; GCN-LABEL: {{^}}maxnum_v4f16: ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}} define amdgpu_kernel void @maxnum_v4f16( <4 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 8c81fdb118b..b34ad6f6890 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @minnum_f16( half addrspace(1)* %a, half addrspace(1)* %b) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val) store half %r.val, half addrspace(1)* %r ret void @@ -66,32 +66,31 @@ entry: } ; GCN-LABEL: {{^}}minnum_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI: 
v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -106,22 +105,21 @@ entry: ; GCN-LABEL: {{^}}minnum_v2f16_imm_a: ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SIVI-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] + ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 ; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] @@ -139,26 +137,28 @@ entry: ; GCN-LABEL: {{^}}minnum_v2f16_imm_b: ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 
v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] - ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] + + ; SIVI-NOT: and ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 +; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] + ; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { @@ -188,8 +188,8 @@ entry: ; GCN-LABEL: {{^}}minnum_v4f16: ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] +; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}} define amdgpu_kernel void @minnum_v4f16( <4 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 0e60f273129..7c7bf3ca2d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -25,13 +25,13 @@ entry: ; GCN-LABEL: {{^}}rint_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 839ff0636f9..aa228886c7c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -1,10 +1,10 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare half @llvm.sin.f16(half %a) declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) -; GCN-LABEL: {{^}}sin_f16 +; GCN-LABEL: {{^}}sin_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{0.15915494|0x3e22f983}}, v[[A_F32]] @@ -23,16 +23,20 @@ entry: ret void } -; GCN-LABEL: {{^}}sin_v2f16 +; GCN-LABEL: {{^}}sin_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}} -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] -; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] -; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; SI: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}} + +; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] +; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]] +; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] +; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -40,12 +44,11 @@ entry: ; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]] ; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] ; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] +; VI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] +; VI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]] -; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index a8ff7cbb0de..1e9212e45b7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -24,13 +24,13 @@ entry: ; GCN-LABEL: {{^}}trunc_v2f16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 
v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] +; SI-DAG: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] +; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 5d584b63c6c..8a21f75386b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -17,7 +17,7 @@ ; GCN-NOT: load_dword ; GCN: flat_store_dwordx2 -define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64* %ptr1, i64 addrspace(1)* %ptr2) { +define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) { %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, i64* %ptr0, align 8 %tmp4 = load i64, i64* %ptr1, align 8 @@ -38,7 +38,7 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64* ; GCN: v_cndmask_b32 ; GCN: v_cndmask_b32 ; GCN: flat_store_dwordx2 -define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, i64 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1, i64 addrspace(1)* %ptr2) { +define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) { %tmp2 = icmp eq i32 %tmp, 0 %tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8 %tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll new file mode 100644 index 00000000000..83a2e584312 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -0,0 +1,1286 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; FIXME: Manually added checks for metadata nodes at bottom +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s +; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s + +define amdgpu_kernel void @kern_noargs() { +; HSA-LABEL: @kern_noargs( +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noargs( +; MESA-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @kern_i8(i8 %arg) #0 { +; HSA-LABEL: @kern_i8( +; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]] to [[KERN_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, 
!invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i8( +; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i8 %arg, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_i16(i16 %arg) #0 { +; HSA-LABEL: @kern_i16( +; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]] to [[KERN_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i16( +; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i16 %arg, i16 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_f16(half %arg) #0 { +; HSA-LABEL: @kern_f16( +; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]] to [[KERN_F16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP4]] to half +; HSA-NEXT: store 
half [[ARG_LOAD]], half addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_f16( +; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP5]] to half +; MESA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store half %arg, half addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 { +; HSA-LABEL: @kern_zeroext_i8( +; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_zeroext_i8( +; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !1, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i8 %arg, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 { +; HSA-LABEL: @kern_zeroext_i16( +; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; 
HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_zeroext_i16( +; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !2, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i16 %arg, i16 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 { +; HSA-LABEL: @kern_signext_i8( +; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_signext_i8( +; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !3, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i8 %arg, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 { +; HSA-LABEL: @kern_signext_i16( +; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* 
[[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_signext_i16( +; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store i16 %arg, i16 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) { +; HSA-LABEL: @kern_i8_i8( +; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]] to [[KERN_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i8_i8( +; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; 
MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef, align 1 +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef, align 1 + store volatile i8 %arg1, i8 addrspace(1)* undef, align 1 + ret void +} + +define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) { +; HSA-LABEL: @kern_v3i8( +; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]] to [[KERN_V3I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 +; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8> +; HSA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v3i8( +; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24 +; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP5]] to <3 x i8> +; MESA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4 + ret void +} + +define amdgpu_kernel void @kern_i24(i24 %arg0) { +; HSA-LABEL: @kern_i24( +; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]] to [[KERN_I24:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24 +; HSA-NEXT: store i24 [[TMP4]], i24 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i24( +; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I24:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; 
MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24 +; MESA-NEXT: store i24 [[TMP5]], i24 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i24 %arg0, i24 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_i32(i32 %arg0) { +; HSA-LABEL: @kern_i32( +; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]] to [[KERN_I32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i32( +; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_f32(float %arg0) { +; HSA-LABEL: @kern_f32( +; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]] to [[KERN_F32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_f32( +; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef +; MESA-NEXT: ret void +; + store float %arg0, float addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) { +; HSA-LABEL: @kern_v3i32( +; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; 
HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]] to [[KERN_V3I32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; HSA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_v3i32( +; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; MESA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4 + ret void +} + +define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) { +; HSA-LABEL: @kern_i32_v3i32( +; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]] to [[KERN_I32_V3I32:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 1 +; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; HSA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_i32_v3i32( +; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32_V3I32:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 
0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 1 +; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> +; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + store i32 %arg0, i32 addrspace(1)* undef + store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4 + ret void +} + +%struct.a = type { i32, i8, [4 x i8] } +%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> } + +define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) { +; HSA-LABEL: @kern_struct_a( +; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]] to [[KERN_STRUCT_A:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_struct_a( +; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_A:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef +; MESA-NEXT: ret void +; + store %struct.a %arg0, %struct.a addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 { +; HSA-LABEL: @kern_struct_b_packed( +; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_struct_b_packed( +; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef +; MESA-NEXT: ret void +; + store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 { +; HSA-LABEL: @kern_implicit_arg_num_bytes( +; HSA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)* +; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_implicit_arg_num_bytes( +; MESA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)* +; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef +; MESA-NEXT: ret void +; + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 { +; HSA-LABEL: @kern_lds_ptr( +; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]] to [[KERN_LDS_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_lds_ptr( +; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = 
bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4 +; MESA-NEXT: ret void +; + store i32 0, i32 addrspace(3)* %lds, align 4 + ret void +} + +define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 { +; HSA-LABEL: @kern_lds_ptr_si( +; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)* +; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_lds_ptr_si( +; MESA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)* +; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4 +; MESA-NEXT: ret void +; + store i32 0, i32 addrspace(3)* %lds, align 4 + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 { +; HSA-LABEL: @kern_realign_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = 
load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 { +; HSA-LABEL: @kern_realign_i8_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; 
MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 { +; HSA-LABEL: @kern_realign_i8_i8_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; 
HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 +; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i8 %arg3, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 { +; HSA-LABEL: @kern_realign_i8_v3i8( +; HSA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = 
getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i24 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP7]] to <3 x i8> +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_v3i8( +; MESA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i24 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP8]] to <3 x i8> +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 { +; HSA-LABEL: @kern_realign_i8_i16( +; HSA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i16( 
+; MESA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i16 %arg1, i16 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 { +; HSA-LABEL: @kern_realign_i1_i1( +; HSA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i1( +; MESA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; 
MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 { +; HSA-LABEL: @kern_realign_i1_i1_i1( +; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i1_i1( +; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: 
[[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i1 %arg2, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 { +; HSA-LABEL: @kern_realign_i1_i1_i1_i1( +; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1 +; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i1 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i1 
[[TMP16]], i1 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i1_i1_i1( +; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1 +; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 +; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i1 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i1 [[TMP17]], i1 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i1 %arg2, i1 addrspace(1)* undef + store volatile i1 %arg3, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 { +; HSA-LABEL: @kern_realign_i1_v3i1( +; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4 +; 
HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP7]] to <3 x i1> +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_v3i1( +; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i3 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP8]] to <3 x i1> +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 { +; HSA-LABEL: @kern_realign_i1_i16( +; HSA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 +; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef +; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i1_i16( +; MESA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef +; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i1 %arg0, i1 addrspace(1)* undef + store volatile i16 %arg1, i16 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 { +; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8( +; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8 +; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16 +; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 +; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* 
[[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8 +; HSA-NEXT: [[TMP17:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP17]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 8 +; HSA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8 +; HSA-NEXT: [[TMP21:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP21]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP22:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 16 +; HSA-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i8 +; HSA-NEXT: [[TMP25:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP25]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP26:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; HSA-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP26]], 24 +; HSA-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i8 +; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP20]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP24]], i8 addrspace(1)* undef +; HSA-NEXT: store volatile i8 [[TMP28]], i8 addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8( +; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8 +; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 
addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16 +; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8 +; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24 +; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +; MESA-NEXT: [[TMP18:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP18]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 8 +; MESA-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 +; MESA-NEXT: [[TMP22:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP22]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP23:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP24:%.*]] = lshr i32 [[TMP23]], 16 +; MESA-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i8 +; MESA-NEXT: [[TMP26:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4 +; MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP26]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP27:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP28:%.*]] = lshr i32 [[TMP27]], 24 +; MESA-NEXT: [[TMP29:%.*]] = trunc i32 [[TMP28]] to i8 +; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP21]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP25]], i8 addrspace(1)* undef +; MESA-NEXT: store volatile i8 [[TMP29]], i8 addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile i8 %arg1, i8 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i8 %arg3, i8 addrspace(1)* undef + store volatile i8 %arg5, i8 addrspace(1)* undef + store volatile i8 %arg6, i8 addrspace(1)* undef + store volatile i8 %arg7, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 { +; HSA-LABEL: @kern_realign_f16_f16( +; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)* +; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +; HSA-NEXT: 
[[ARG0_LOAD:%.*]] = bitcast i16 [[TMP4]] to half +; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)* +; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16 +; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP8]] to half +; HSA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef +; HSA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_realign_f16_f16( +; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)* +; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; MESA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP5]] to half +; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)* +; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16 +; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP9]] to half +; MESA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef +; MESA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile half %arg0, half addrspace(1)* undef + store volatile half %arg1, half addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 { +; HSA-LABEL: @kern_global_ptr( +; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_global_ptr( +; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)* +; 
MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 { +; HSA-LABEL: @kern_global_ptr_dereferencable( +; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable !1 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_global_ptr_dereferencable( +; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable !5 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 { +; HSA-LABEL: @kern_global_ptr_dereferencable_or_null( +; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable_or_null !2 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_global_ptr_dereferencable_or_null( +; MESA-NEXT: 
[[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable_or_null !6 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 { +; HSA-LABEL: @kern_nonnull_global_ptr( +; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !nonnull !0 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_nonnull_global_ptr( +; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !nonnull !0 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 { +; HSA-LABEL: @kern_align32_global_ptr( +; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load 
!0, !align !3 +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_align32_global_ptr( +; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)* +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !align !7 +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 { +; HSA-LABEL: @kern_noalias_global_ptr( +; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)* +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noalias_global_ptr( +; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)* +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 { +; HSA-LABEL: @kern_noalias_global_ptr_x2( +; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)* +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noalias_global_ptr_x2( +; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)* +; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: 
store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef +; MESA-NEXT: ret void +; + store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef + store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind "target-cpu"="kaveri" } +attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" } +attributes #2 = { nounwind "target-cpu"="tahiti" } + +; HSA: !0 = !{} +; HSA: !1 = !{i64 42} +; HSA: !2 = !{i64 128} +; HSA: !3 = !{i64 1024} + + +; MESA: !0 = !{} +; MESA: !1 = !{i32 0, i32 256} +; MESA: !2 = !{i32 0, i32 65536} +; MESA: !3 = !{i32 -128, i32 128} +; MESA: !4 = !{i32 -32768, i32 32768} +; MESA: !5 = !{i64 42} +; MESA: !6 = !{i64 128} +; MESA: !7 = !{i64 1024} diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 413c93d533a..e88550fb92c 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,28 +8,14 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] - -; VI: s_load_dword [[LHS:s[0-9]+]] -; VI: s_load_dword [[RHS:s[0-9]+]] -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 -; VI-DAG: s_lshl_b32 -; VI: v_or_b32_e32 - -; CI: s_load_dword s -; CI-NEXT: s_load_dword s -; CI-NOT: {{buffer|flat}} -; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; CI: s_and_b32 -; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; CI: s_and_b32 -; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 -; CI: s_lshl_b32 -; CI: v_or_b32_e32 +; CIVI: s_load_dword [[LHS:s[0-9]+]] +; CIVI: s_load_dword [[RHS:s[0-9]+]] +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; CIVI-DAG: s_lshl_b32 +; CIVI: v_or_b32_e32 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 9fe1af84be0..3528eaff818 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -206,14 +206,14 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; SIFoldOperands should not fold the SGPR copy into the instruction ; because the implicit immediate already uses the constant bus.
; GCN-LABEL: {{^}}madak_constant_bus_violation: -; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] ; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] ; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] -define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { +define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 br i1 %tmp, label %bb3, label %bb4 diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll index 3eefa600f88..7f8bb878d9c 100644 --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -83,7 +83,7 @@ define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index 6387c9ff6df..b79f477f3ea 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -216,14 +216,14 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %ou ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_UINT -define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ugt i32 %a.ext, %b.ext @@ -236,14 +236,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspac ; Make sure redundant sign_extend_inreg removed. 
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_INT -define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -262,7 +262,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace ; SI: s_max_i32 ; EG: MAX_INT -define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index b4b1ab82492..b2d62f5e13a 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 @@ -65,16 +65,14 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, < ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: s_min_i32 -define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b store i8 %val, i8 addrspace(1)* %out ret void } -; XXX - should be able to use s_min if we stop unnecessarily doing -; extloads with mubuf 
instructions. - +; FIXME: Why vector and sdwa for last element? ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: ; GCN: s_load_dword s ; GCN: s_load_dword s @@ -88,7 +86,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 % ; VI: s_min_i32 ; VI: s_min_i32 ; VI: s_min_i32 -; VI: s_min_i32 +; VI: v_min_i32_sdwa ; GFX9: v_min_i16 ; GFX9: v_min_i16 @@ -99,7 +97,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 % ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 { %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, <4 x i8> addrspace(1)* %out @@ -111,8 +109,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 ; GCN: s_load_dword s ; SI: s_ashr_i32 -; SI: s_ashr_i32 ; SI: s_sext_i32_i16 +; SI: s_ashr_i32 ; SI: s_sext_i32_i16 ; SI: s_min_i32 ; SI: s_min_i32 @@ -346,8 +344,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs } ; FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; SI: {{buffer|flat|global}}_load_ubyte +; SI: {{buffer|flat|global}}_load_ubyte ; SI: v_min_u32_e32 ; GFX89: {{flat|global}}_load_ubyte @@ -490,14 +488,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, < ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT -define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ult i32 %a.ext, %b.ext @@ -510,14 +508,17 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspac ; Make sure redundant sign_extend_inreg removed. 
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} -; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]] +; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]] + +; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT -define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp slt i32 %a.ext, %b.ext diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll index 9b00507a9c8..9746ee0d89c 100644 --- a/llvm/test/CodeGen/AMDGPU/missing-store.ll +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ -6,7 +6,7 @@ ; resulting in losing the store to gptr ; FUNC-LABEL: {{^}}missing_store_reduced: -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 ; SI: ds_read_b64 ; SI-DAG: buffer_store_dword ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll index 5bc59c9f636..2144b4b4cb2 100644 --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -19,7 +19,7 @@ ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, -define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: %tmp = icmp sgt i32 %arg3, 0 br i1 %tmp, label %bb4, label %bb17 diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll index 0250002c33a..678fc3d1daf 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll @@ -16,7 +16,8 @@ define i16 @v_mul_i16(i16 %a, i16 %b) { ; FIXME: Should emit scalar mul or maybe i16 v_mul here ; GCN-LABEL: {{^}}s_mul_i16: -; GCN: v_mul_u32_u24 +; SI: v_mul_u32_u24 +; VI: s_mul_i16 define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) { %r.val = mul i16 %a, %b store volatile i16 %r.val, i16 addrspace(1)* null diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index a3d05d10b4a..b4461586ca7 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -114,7 +114,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { %mul = mul i32 %a, %b store i32 %mul, i32 addrspace(1)* %out, 
align 4 ret void @@ -201,10 +201,8 @@ endif: ; FIXME: Load dwordx4 ; FUNC-LABEL: {{^}}s_mul_i128: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 ; SI: v_mul_hi_u32 ; SI: v_mul_hi_u32 @@ -220,18 +218,23 @@ endif: ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 -; VI: s_mul_i32 ; VI: v_mul_hi_u32 ; VI: s_mul_i32 +; VI: s_mul_i32 +; VI: v_mul_hi_u32 ; VI: v_mul_hi_u32 +; VI: s_mul_i32 ; VI: v_mad_u64_u32 +; VI: s_mul_i32 ; VI: v_mad_u64_u32 +; VI: s_mul_i32 +; VI: s_mul_i32 ; VI: v_mad_u64_u32 - +; VI: s_mul_i32 ; GCN: buffer_store_dwordx4 -define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { +define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { %mul = mul i128 %a, %b store i128 %mul, i128 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 3137569e9ca..84f6d4f8a45 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -70,7 +70,7 @@ entry: ; GCN-DAG: v_mul_i32_i24_e32 ; GCN: buffer_store_dwordx2 -define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index e538dd7a86a..81f72dd1f8f 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -18,8 +18,11 @@ entry: } ; FUNC-LABEL: {{^}}test_umul24_i16_sext: -; GCN: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 +; SI: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 + +; VI: s_mul_i32 [[MUL:s[0-9]+]] +; VI: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]] define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b @@ -46,9 +49,12 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 } ; FUNC-LABEL: {{^}}test_umul24_i16: -; GCN: s_and_b32 -; GCN: v_mul_u32_u24_e32 -; GCN: v_and_b32_e32 +; SI: s_and_b32 +; SI: v_mul_u32_u24_e32 +; SI: v_and_b32_e32 + +; VI: s_mul_i32 +; VI: s_and_b32 define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b @@ -147,7 +153,7 @@ entry: ; GCN-NOT: s_and_b32 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] -define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) { entry: %tmp0 = shl i64 %a, 40 %a.24 = lshr i64 %tmp0, 40 diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index b2cfeca703a..e436a851767 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -70,14 +70,14 @@ ; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc -; GCN: ; %Flow1 +; GCN: ; %Flow5 ; GCN-NEXT: s_or_b64 exec, exec ; GCN: 
v_cmp_ne_u32_e32 vcc, 0 ; GCN: ; %exit1 ; GCN: ds_write_b32 -; GCN: %Flow2 +; GCN: %Flow6 ; GCN-NEXT: s_or_b64 exec, exec ; GCN: v_cmp_ne_u32_e32 vcc, 0 ; GCN-NEXT: s_and_saveexec_b64 diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll index 48b9c13c752..eb16f8517ff 100644 --- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -78,7 +78,7 @@ define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %trunc = trunc i64 %arg to i32 store i32 %trunc, i32 addrspace(1)* %out ret void @@ -100,7 +100,7 @@ define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %ou ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -147,7 +147,7 @@ define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i8 store i8 %trunc, i8 addrspace(1)* %out @@ -171,7 +171,7 @@ define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind { %trunc = trunc i64 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll index bced3c408c5..763d754c1b1 100644 --- a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll +++ b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll @@ -6,7 +6,7 @@ ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) { +define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) { bb: %tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4 %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5 diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll index fc6f070b737..6fc0955b88e 100644 --- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll @@ -4,14 +4,18 @@ ; Make sure there isn't an extra space between the instruction name and first operands. 
; GCN-LABEL: {{^}}add_f32: -; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] -; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] +; SI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c +; SI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI: v_mov_b32_e32 [[VREGA:v[0-9]+]], [[SREGA]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGB]], [[VREGA]] + +; VI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; VI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] +; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] + ; GCN: buffer_store_dword [[RESULT]], -define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @add_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) { %result = fadd float %a, %b store float %result, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 8e6885c4fc5..88e01c96e79 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -63,26 +63,26 @@ define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) } ; FUNC-LABEL: {{^}}scalar_or_literal_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -92,7 +92,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou } ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64: -; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: or_b32 ; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63 ; SI-NOT: or_b32 @@ -101,7 +101,7 @@ 
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -125,7 +125,7 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -239,7 +239,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -249,7 +249,7 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i ; FUNC-LABEL: {{^}}or_i1: ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1)* %in0 %b = load float, float addrspace(1)* %in1 diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index 49f00e9447d..d31d636cc41 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -10,26 +10,26 @@ ; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs: -; GCN: def s[8:15] -; GCN: def s[16:23] -; GCN: def s[24:31] -; GCN: def s[32:39] -; GCN: def s[40:47] -; GCN: def s[48:55] -; GCN: def s[56:63] -; GCN: def s[64:71] -; GCN: def s[72:79] -; GCN: def s[80:87] -; GCN: def s[88:95] - -; GCN: v_writelane_b32 v0, s8, 0 -; GCN-NEXT: v_writelane_b32 v0, s9, 1 -; GCN-NEXT: v_writelane_b32 v0, s10, 2 -; GCN-NEXT: v_writelane_b32 v0, s11, 3 -; GCN-NEXT: v_writelane_b32 v0, s12, 4 -; GCN-NEXT: v_writelane_b32 v0, s13, 5 -; GCN-NEXT: v_writelane_b32 v0, s14, 6 -; GCN-NEXT: v_writelane_b32 v0, s15, 7 +; GCN: def s[4:11] +; GCN: def s[12:19] +; GCN: def s[20:27] +; GCN: def s[28:35] +; GCN: def s[36:43] +; GCN: def s[44:51] +; GCN: def s[52:59] +; GCN: def s[60:67] +; GCN: def s[68:75] +; GCN: def s[76:83] +; GCN: def s[84:91] + +; GCN: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}} ; GCN: 
v_writelane_b32 v0, s[[TMP_LO]], 8 @@ -37,8 +37,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12 -; GCN-NEXT: v_writelane_b32 v0, s13, 13 -; GCN-NEXT: v_writelane_b32 v0, s14, 14 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -47,8 +47,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20 -; GCN-NEXT: v_writelane_b32 v0, s13, 21 -; GCN-NEXT: v_writelane_b32 v0, s14, 22 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -57,8 +57,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28 -; GCN-NEXT: v_writelane_b32 v0, s13, 29 -; GCN-NEXT: v_writelane_b32 v0, s14, 30 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -67,8 +67,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36 -; GCN-NEXT: v_writelane_b32 v0, s13, 37 -; GCN-NEXT: v_writelane_b32 v0, s14, 38 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -77,8 +77,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44 -; GCN-NEXT: v_writelane_b32 v0, s13, 45 -; GCN-NEXT: v_writelane_b32 v0, s14, 46 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -87,90 +87,90 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52 -; GCN-NEXT: v_writelane_b32 v0, s13, 53 -; GCN-NEXT: v_writelane_b32 v0, s14, 54 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55 -; GCN-NEXT: v_writelane_b32 v0, s88, 56 -; GCN-NEXT: v_writelane_b32 v0, s89, 57 -; GCN-NEXT: v_writelane_b32 v0, s90, 58 -; GCN-NEXT: v_writelane_b32 v0, s91, 59 -; GCN-NEXT: v_writelane_b32 v0, s92, 60 -; GCN-NEXT: v_writelane_b32 v0, s93, 61 -; GCN-NEXT: v_writelane_b32 v0, s94, 62 -; GCN-NEXT: v_writelane_b32 v0, s95, 63 -; GCN-NEXT: v_writelane_b32 v1, s16, 0 -; GCN-NEXT: v_writelane_b32 v1, s17, 1 -; GCN-NEXT: v_writelane_b32 v1, s18, 2 -; GCN-NEXT: v_writelane_b32 v1, s19, 3 -; GCN-NEXT: v_writelane_b32 v1, s20, 4 -; GCN-NEXT: v_writelane_b32 v1, s21, 5 -; GCN-NEXT: v_writelane_b32 v1, s22, 6 -; GCN-NEXT: v_writelane_b32 v1, s23, 7 -; GCN-NEXT: v_writelane_b32 v1, s24, 8 -; GCN-NEXT: v_writelane_b32 v1, s25, 9 -; GCN-NEXT: v_writelane_b32 v1, s26, 10 -; GCN-NEXT: v_writelane_b32 v1, s27, 11 -; GCN-NEXT: v_writelane_b32 v1, s28, 12 -; GCN-NEXT: v_writelane_b32 v1, s29, 13 -; GCN-NEXT: v_writelane_b32 v1, s30, 14 -; GCN-NEXT: v_writelane_b32 v1, s31, 15 -; GCN-NEXT: v_writelane_b32 v1, s32, 16 -; GCN-NEXT: v_writelane_b32 v1, 
s33, 17 -; GCN-NEXT: v_writelane_b32 v1, s34, 18 -; GCN-NEXT: v_writelane_b32 v1, s35, 19 -; GCN-NEXT: v_writelane_b32 v1, s36, 20 -; GCN-NEXT: v_writelane_b32 v1, s37, 21 -; GCN-NEXT: v_writelane_b32 v1, s38, 22 -; GCN-NEXT: v_writelane_b32 v1, s39, 23 -; GCN-NEXT: v_writelane_b32 v1, s40, 24 -; GCN-NEXT: v_writelane_b32 v1, s41, 25 -; GCN-NEXT: v_writelane_b32 v1, s42, 26 -; GCN-NEXT: v_writelane_b32 v1, s43, 27 -; GCN-NEXT: v_writelane_b32 v1, s44, 28 -; GCN-NEXT: v_writelane_b32 v1, s45, 29 -; GCN-NEXT: v_writelane_b32 v1, s46, 30 -; GCN-NEXT: v_writelane_b32 v1, s47, 31 -; GCN-NEXT: v_writelane_b32 v1, s48, 32 -; GCN-NEXT: v_writelane_b32 v1, s49, 33 -; GCN-NEXT: v_writelane_b32 v1, s50, 34 -; GCN-NEXT: v_writelane_b32 v1, s51, 35 -; GCN-NEXT: v_writelane_b32 v1, s52, 36 -; GCN-NEXT: v_writelane_b32 v1, s53, 37 -; GCN-NEXT: v_writelane_b32 v1, s54, 38 -; GCN-NEXT: v_writelane_b32 v1, s55, 39 -; GCN-NEXT: v_writelane_b32 v1, s56, 40 -; GCN-NEXT: v_writelane_b32 v1, s57, 41 -; GCN-NEXT: v_writelane_b32 v1, s58, 42 -; GCN-NEXT: v_writelane_b32 v1, s59, 43 -; GCN-NEXT: v_writelane_b32 v1, s60, 44 -; GCN-NEXT: v_writelane_b32 v1, s61, 45 -; GCN-NEXT: v_writelane_b32 v1, s62, 46 -; GCN-NEXT: v_writelane_b32 v1, s63, 47 -; GCN-NEXT: v_writelane_b32 v1, s64, 48 -; GCN-NEXT: v_writelane_b32 v1, s65, 49 -; GCN-NEXT: v_writelane_b32 v1, s66, 50 -; GCN-NEXT: v_writelane_b32 v1, s67, 51 -; GCN-NEXT: v_writelane_b32 v1, s68, 52 -; GCN-NEXT: v_writelane_b32 v1, s69, 53 -; GCN-NEXT: v_writelane_b32 v1, s70, 54 -; GCN-NEXT: v_writelane_b32 v1, s71, 55 -; GCN-NEXT: v_writelane_b32 v1, s72, 56 -; GCN-NEXT: v_writelane_b32 v1, s73, 57 -; GCN-NEXT: v_writelane_b32 v1, s74, 58 -; GCN-NEXT: v_writelane_b32 v1, s75, 59 -; GCN-NEXT: v_writelane_b32 v1, s76, 60 -; GCN-NEXT: v_writelane_b32 v1, s77, 61 -; GCN-NEXT: v_writelane_b32 v1, s78, 62 -; GCN-NEXT: v_writelane_b32 v1, s79, 63 -; GCN-NEXT: v_writelane_b32 v2, s80, 0 -; GCN-NEXT: v_writelane_b32 v2, s81, 1 -; GCN-NEXT: v_writelane_b32 v2, s82, 2 -; GCN-NEXT: v_writelane_b32 v2, s83, 3 -; GCN-NEXT: v_writelane_b32 v2, s84, 4 -; GCN-NEXT: v_writelane_b32 v2, s85, 5 -; GCN-NEXT: v_writelane_b32 v2, s86, 6 -; GCN-NEXT: v_writelane_b32 v2, s87, 7 +; GCN-NEXT: v_writelane_b32 v0, s84, 56 +; GCN-NEXT: v_writelane_b32 v0, s85, 57 +; GCN-NEXT: v_writelane_b32 v0, s86, 58 +; GCN-NEXT: v_writelane_b32 v0, s87, 59 +; GCN-NEXT: v_writelane_b32 v0, s88, 60 +; GCN-NEXT: v_writelane_b32 v0, s89, 61 +; GCN-NEXT: v_writelane_b32 v0, s90, 62 +; GCN-NEXT: v_writelane_b32 v0, s91, 63 +; GCN-NEXT: v_writelane_b32 v1, s12, 0 +; GCN-NEXT: v_writelane_b32 v1, s13, 1 +; GCN-NEXT: v_writelane_b32 v1, s14, 2 +; GCN-NEXT: v_writelane_b32 v1, s15, 3 +; GCN-NEXT: v_writelane_b32 v1, s16, 4 +; GCN-NEXT: v_writelane_b32 v1, s17, 5 +; GCN-NEXT: v_writelane_b32 v1, s18, 6 +; GCN-NEXT: v_writelane_b32 v1, s19, 7 +; GCN-NEXT: v_writelane_b32 v1, s20, 8 +; GCN-NEXT: v_writelane_b32 v1, s21, 9 +; GCN-NEXT: v_writelane_b32 v1, s22, 10 +; GCN-NEXT: v_writelane_b32 v1, s23, 11 +; GCN-NEXT: v_writelane_b32 v1, s24, 12 +; GCN-NEXT: v_writelane_b32 v1, s25, 13 +; GCN-NEXT: v_writelane_b32 v1, s26, 14 +; GCN-NEXT: v_writelane_b32 v1, s27, 15 +; GCN-NEXT: v_writelane_b32 v1, s28, 16 +; GCN-NEXT: v_writelane_b32 v1, s29, 17 +; GCN-NEXT: v_writelane_b32 v1, s30, 18 +; GCN-NEXT: v_writelane_b32 v1, s31, 19 +; GCN-NEXT: v_writelane_b32 v1, s32, 20 +; GCN-NEXT: v_writelane_b32 v1, s33, 21 +; GCN-NEXT: v_writelane_b32 v1, s34, 22 +; GCN-NEXT: v_writelane_b32 v1, s35, 23 +; GCN-NEXT: v_writelane_b32 v1, s36, 
24 +; GCN-NEXT: v_writelane_b32 v1, s37, 25 +; GCN-NEXT: v_writelane_b32 v1, s38, 26 +; GCN-NEXT: v_writelane_b32 v1, s39, 27 +; GCN-NEXT: v_writelane_b32 v1, s40, 28 +; GCN-NEXT: v_writelane_b32 v1, s41, 29 +; GCN-NEXT: v_writelane_b32 v1, s42, 30 +; GCN-NEXT: v_writelane_b32 v1, s43, 31 +; GCN-NEXT: v_writelane_b32 v1, s44, 32 +; GCN-NEXT: v_writelane_b32 v1, s45, 33 +; GCN-NEXT: v_writelane_b32 v1, s46, 34 +; GCN-NEXT: v_writelane_b32 v1, s47, 35 +; GCN-NEXT: v_writelane_b32 v1, s48, 36 +; GCN-NEXT: v_writelane_b32 v1, s49, 37 +; GCN-NEXT: v_writelane_b32 v1, s50, 38 +; GCN-NEXT: v_writelane_b32 v1, s51, 39 +; GCN-NEXT: v_writelane_b32 v1, s52, 40 +; GCN-NEXT: v_writelane_b32 v1, s53, 41 +; GCN-NEXT: v_writelane_b32 v1, s54, 42 +; GCN-NEXT: v_writelane_b32 v1, s55, 43 +; GCN-NEXT: v_writelane_b32 v1, s56, 44 +; GCN-NEXT: v_writelane_b32 v1, s57, 45 +; GCN-NEXT: v_writelane_b32 v1, s58, 46 +; GCN-NEXT: v_writelane_b32 v1, s59, 47 +; GCN-NEXT: v_writelane_b32 v1, s60, 48 +; GCN-NEXT: v_writelane_b32 v1, s61, 49 +; GCN-NEXT: v_writelane_b32 v1, s62, 50 +; GCN-NEXT: v_writelane_b32 v1, s63, 51 +; GCN-NEXT: v_writelane_b32 v1, s64, 52 +; GCN-NEXT: v_writelane_b32 v1, s65, 53 +; GCN-NEXT: v_writelane_b32 v1, s66, 54 +; GCN-NEXT: v_writelane_b32 v1, s67, 55 +; GCN-NEXT: v_writelane_b32 v1, s68, 56 +; GCN-NEXT: v_writelane_b32 v1, s69, 57 +; GCN-NEXT: v_writelane_b32 v1, s70, 58 +; GCN-NEXT: v_writelane_b32 v1, s71, 59 +; GCN-NEXT: v_writelane_b32 v1, s72, 60 +; GCN-NEXT: v_writelane_b32 v1, s73, 61 +; GCN-NEXT: v_writelane_b32 v1, s74, 62 +; GCN-NEXT: v_writelane_b32 v1, s75, 63 +; GCN-NEXT: v_writelane_b32 v2, s76, 0 +; GCN-NEXT: v_writelane_b32 v2, s77, 1 +; GCN-NEXT: v_writelane_b32 v2, s78, 2 +; GCN-NEXT: v_writelane_b32 v2, s79, 3 +; GCN-NEXT: v_writelane_b32 v2, s80, 4 +; GCN-NEXT: v_writelane_b32 v2, s81, 5 +; GCN-NEXT: v_writelane_b32 v2, s82, 6 +; GCN-NEXT: v_writelane_b32 v2, s83, 7 ; GCN: s_cbranch_scc1 @@ -393,24 +393,25 @@ ret: ; into the next available VGPR. 
; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs: -; GCN: def s[24:39] - -; GCN: v_writelane_b32 v0, s24, 50 -; GCN-NEXT: v_writelane_b32 v0, s25, 51 -; GCN-NEXT: v_writelane_b32 v0, s26, 52 -; GCN-NEXT: v_writelane_b32 v0, s27, 53 -; GCN-NEXT: v_writelane_b32 v0, s28, 54 -; GCN-NEXT: v_writelane_b32 v0, s29, 55 -; GCN-NEXT: v_writelane_b32 v0, s30, 56 -; GCN-NEXT: v_writelane_b32 v0, s31, 57 -; GCN-NEXT: v_writelane_b32 v0, s32, 58 -; GCN-NEXT: v_writelane_b32 v0, s33, 59 -; GCN-NEXT: v_writelane_b32 v0, s34, 60 -; GCN-NEXT: v_writelane_b32 v0, s35, 61 -; GCN-NEXT: v_writelane_b32 v0, s36, 62 -; GCN-NEXT: v_writelane_b32 v0, s37, 63 -; GCN-NEXT: v_writelane_b32 v1, s38, 0 -; GCN-NEXT: v_writelane_b32 v1, s39, 1 +; GCN: def s[4:19] +; GCN: def s[20:35] + +; GCN: v_writelane_b32 v0, s4, 50 +; GCN-NEXT: v_writelane_b32 v0, s5, 51 +; GCN-NEXT: v_writelane_b32 v0, s6, 52 +; GCN-NEXT: v_writelane_b32 v0, s7, 53 +; GCN-NEXT: v_writelane_b32 v0, s8, 54 +; GCN-NEXT: v_writelane_b32 v0, s9, 55 +; GCN-NEXT: v_writelane_b32 v0, s10, 56 +; GCN-NEXT: v_writelane_b32 v0, s11, 57 +; GCN-NEXT: v_writelane_b32 v0, s12, 58 +; GCN-NEXT: v_writelane_b32 v0, s13, 59 +; GCN-NEXT: v_writelane_b32 v0, s14, 60 +; GCN-NEXT: v_writelane_b32 v0, s15, 61 +; GCN-NEXT: v_writelane_b32 v0, s16, 62 +; GCN-NEXT: v_writelane_b32 v0, s17, 63 +; GCN-NEXT: v_writelane_b32 v1, s18, 0 +; GCN-NEXT: v_writelane_b32 v1, s19, 1 ; GCN: v_readlane_b32 s4, v0, 50 ; GCN-NEXT: v_readlane_b32 s5, v0, 51 diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll index 76508731aca..aae8e649def 100644 --- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -40,8 +40,7 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4: ; GCN: s_load_dword s -; GCN-NEXT: s_load_dword s -; GCN-NEXT: s_load_dword s +; GCN-NEXT: s_load_dwordx2 s ; GCN-NOT: {{buffer|flat|global}} ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index b6d6006dc40..f57ca3ad600 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}v_sad_u32_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -203,8 +203,11 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i1 } ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { +; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) { + %a = load volatile i16, i16 addrspace(1)* undef + %b = load volatile i16, i16 addrspace(1)* undef + %c = load volatile i16, i16 addrspace(1)* undef %icmp0 = icmp ugt i16 %a, %b %sub0 = sub i16 %a, %b %sub1 = sub i16 %b, %a @@ -233,8 +236,31 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b } ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}} -define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { +; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) { + %a = load volatile i8, i8 addrspace(1)* undef + %b = load volatile i8, i8 addrspace(1)* undef + %c = load volatile i8, i8 addrspace(1)* undef + %icmp0 = icmp ugt i8 %a, %b + %sub0 = sub i8 %a, %b + %sub1 = sub i8 %b, %a + %ret0 = select i1 %icmp0, i8 %sub0, i8 %sub1 + + %ret = add i8 %ret0, %c + + store i8 %ret, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_sad_u32_i8_pat2: +; GCN: s_load_dword +; GCN: s_bfe_u32 +; GCN: s_sub_i32 +; GCN: s_and_b32 +; GCN: s_sub_i32 +; GCN: s_lshr_b32 +; GCN: v_add_i32_e32 +define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b %sub0 = sub i8 %a, %b %sub1 = sub i8 %b, %a diff --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll index 118bb9aa66e..0645bb474ca 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -2,14 +2,11 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s ; FUNC-LABEL: {{^}}cluster_arg_loads: -; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe -; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 +; SI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd + +; VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 @@ -42,7 +39,7 @@ define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out, i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { entry: - %value = add i64 %arg125, %arg126 + %value = add i64 %arg124, %arg126 store i64 %value, i64 addrspace(1)* %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index 2072cf5e887..ee32dce1a60 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -1,10 +1,13 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s +; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s +; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s +; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s -; SI: NumSgprs: {{[1-9]$}} -; SI: NumVgprs: {{[1-9]$}} +; SI-MINREG: NumSgprs: {{[1-9]$}} +; SI-MINREG: NumVgprs: {{[1-9]$}} + +; SI-MAXOCC: NumSgprs: {{[0-4][0-9]$}} +; SI-MAXOCC: NumVgprs: {{[0-4][0-9]$}} ; stores may alias loads ; VI: NumSgprs: {{[0-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll index f784e225ded..8e60aa1811f 100644 --- a/llvm/test/CodeGen/AMDGPU/select-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN @@ -18,12 +18,12 @@ define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 ; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8 ; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16 ; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]] -; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] -; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] ; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]] -define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { +define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b store i1 %sel, i1 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll index 540eb9ca93b..33028f17531 100644 --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -134,8 +134,9 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo } ; GCN-LABEL: {{^}}regression: -; GCN: v_cmp_neq_f32_e64 -; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0 +; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0 +; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}} +; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}} ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} define amdgpu_kernel void 
@regression(float addrspace(1)* %out, float %c0, float %c1) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 9030baa04c5..7d1f75aad51 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -24,10 +24,10 @@ define amdgpu_kernel void @select_f16( half addrspace(1)* %c, half addrspace(1)* %d) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c - %d.val = load half, half addrspace(1)* %d + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half %a.val, %b.val %r.val = select i1 %fcmp, half %c.val, half %d.val store half %r.val, half addrspace(1)* %r @@ -54,9 +54,9 @@ define amdgpu_kernel void @select_f16_imm_a( half addrspace(1)* %c, half addrspace(1)* %d) { entry: - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c - %d.val = load half, half addrspace(1)* %d + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half 0xH3800, %b.val %r.val = select i1 %fcmp, half %c.val, half %d.val store half %r.val, half addrspace(1)* %r @@ -84,9 +84,9 @@ define amdgpu_kernel void @select_f16_imm_b( half addrspace(1)* %c, half addrspace(1)* %d) { entry: - %a.val = load half, half addrspace(1)* %a - %c.val = load half, half addrspace(1)* %c - %d.val = load half, half addrspace(1)* %d + %a.val = load volatile half, half addrspace(1)* %a + %c.val = load volatile half, half addrspace(1)* %c + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half %a.val, 0xH3800 %r.val = select i1 %fcmp, half %c.val, half %d.val store half %r.val, half addrspace(1)* %r @@ -115,9 +115,9 @@ define amdgpu_kernel void @select_f16_imm_c( half addrspace(1)* %b, half addrspace(1)* %d) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %d.val = load half, half addrspace(1)* %d + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %d.val = load volatile half, half addrspace(1)* %d %fcmp = fcmp olt half %a.val, %b.val %r.val = select i1 %fcmp, half 0xH3800, half %d.val store half %r.val, half addrspace(1)* %r @@ -145,9 +145,9 @@ define amdgpu_kernel void @select_f16_imm_d( half addrspace(1)* %b, half addrspace(1)* %c) { entry: - %a.val = load half, half addrspace(1)* %a - %b.val = load half, half addrspace(1)* %b - %c.val = load half, half addrspace(1)* %c + %a.val = load volatile half, half addrspace(1)* %a + %b.val = load volatile half, half addrspace(1)* %b + %c.val = load volatile half, half addrspace(1)* %c %fcmp = fcmp olt half %a.val, %b.val %r.val = select i1 %fcmp, half %c.val, half 0xH3800 store half %r.val, half addrspace(1)* %r @@ -197,10 +197,10 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e32 vcc, 0.5 -; SI: v_cndmask_b32_e32 ; SI: v_cmp_gt_f32_e32 ; SI: v_cndmask_b32_e32 + ; SI: v_cmp_lt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 ; VI: v_cmp_lt_f16_e32 ; VI: v_cndmask_b32_e32 @@ -233,10 +233,10 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_gt_f32_e32 vcc, 0.5 -; SI: v_cndmask_b32_e32 ; SI: v_cmp_lt_f32_e32 ; SI: v_cndmask_b32_e32 +; SI: 
v_cmp_gt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 ; VI: v_cmp_gt_f16_e32 ; VI: v_cndmask_b32_e32 @@ -272,7 +272,7 @@ entry: ; SI: v_cmp_nlt_f32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cmp_nlt_f32_e32 -; SI: v_cndmask_b32_e32 +; SI-DAG: v_cndmask_b32_e32 ; VI: v_cmp_nlt_f16_e32 ; VI: v_cndmask_b32_e32 @@ -280,8 +280,8 @@ entry: ; VI: v_cmp_nlt_f16_e32 ; VI: v_cndmask_b32_e32 -; SI: v_cvt_f16_f32_e32 -; SI: v_cvt_f16_f32_e32 +; SI-DAG: v_cvt_f16_f32_e32 +; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm define amdgpu_kernel void @select_v2f16_imm_c( <2 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll index caddb6f6821..a0c5f3b6372 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -180,16 +180,14 @@ define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1 ret void } -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: v_cmp_ne_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { +; FUNC-LABEL: {{^}}v_cmp_sext_k_neg1_i8_sext_arg: +; GCN: v_cmp_ne_u32_e32 vcc, -1, v0 +; GCN-NEXT: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc +; GCN: buffer_store_byte [[SELECT]] +define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out + store i1 %icmp0, i1 addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 575938b5a5c..a1bd5b06407 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -38,37 +38,37 @@ endif: ; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]] ; SI: ; %bb.1: ; %else -; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe -; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf +; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 +; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2e ; SI-NOT: add ; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; SI: [[IF]]: ; %if -; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c ; SI-NOT: add ; SI: [[ENDIF]]: ; %endif ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]] ; SI: buffer_store_dword ; SI-NEXT: s_endpgm -define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else + %cmp0 = icmp eq i32 %a, 0 + br i1 %cmp0, label %if, label %else if: - %1 = add i32 %b, %c + %add0 = add i32 %b, %c br label %endif else: - %2 = add i32 %d, %e + %add1 = add i32 %d, %e br label %endif endif: - %3 = phi i32 [%1, %if], [%2, %else] - %4 = add i32 %3, %a - store i32 %4, i32 addrspace(1)* %out + %phi = phi i32 [%add0, %if], [%add1, %else] + %add2 = add i32 %phi, %a + store i32 %add2, i32 addrspace(1)* %out ret void } diff 
--git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 14f46464294..ade2803a80d 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -275,11 +275,11 @@ define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add ; Make sure load width gets reduced to i32 load. ; GCN-LABEL: {{^}}s_shl_32_i64: -; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} +; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %result = shl i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 083ec9edc06..ce6125d52cc 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -14,10 +14,12 @@ ; VI: s_lshr_b32 ; VI: s_and_b32 ; VI: s_and_b32 +; VI: s_lshl_b32 +; VI: s_lshl_b32 +; VI: s_lshl_b32 ; VI: s_and_b32 ; VI: s_or_b32 - ; CI: s_load_dword s ; CI: s_load_dword s ; CI: s_lshr_b32 diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll index 26cdae3cbb0..5270de2314d 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -54,14 +54,13 @@ define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspa } ; FUNC-LABEL: {{^}}test_add_shl_add_constant: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], [[Y]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3 +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], s[[Y]] ; SI: s_addk_i32 [[RESULT]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] -define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %shl, %y @@ -70,15 +69,14 @@ define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 } ; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], s[[Y]], [[SHL3]] ; SI: s_addk_i32 [[TMP]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] ; SI: buffer_store_dword [[VRESULT]] -define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 
= add i32 %y, %shl diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 429493c85fb..dfdd7753a4d 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -55,6 +55,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspa } ; GCN-LABEL: {{^}}s_sext_i16_to_i64: +; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll index 9e10f049c60..22f11da4648 100644 --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -370,7 +370,7 @@ bb: ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 { bb: %tmp0 = call i16 @smin16(i16 %x, i16 %y) %tmp1 = call i16 @smax16(i16 %x, i16 %y) @@ -385,7 +385,7 @@ bb: ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 { bb: %tmp0 = call i8 @smin8(i8 %x, i8 %y) %tmp1 = call i8 @smax8(i8 %x, i8 %y) diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll index 96a318fef02..167a179216c 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll @@ -193,7 +193,7 @@ define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] -define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { +define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 %val0, [8 x i32], i32 %val1) nounwind { %cond0 = icmp sgt i32 %val0, %val1 %sel0 = select i1 %cond0, i32 %val0, i32 %val1 %sel1 = select i1 %cond0, i32 %val1, i32 %val0 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 6529bf4afc6..598048dcff9 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -8,34 +8,15 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; VI: s_sub_i32 -; VI: s_sub_i32 -; VI: s_max_i32 -; VI: s_max_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_and_b32 -; SI: s_or_b32 - -; CI-NOT: {{buffer|flat}}_load -; CI: s_load_dword s -; CI-NOT: {{buffer|flat}}_load -; CI: s_lshr_b32 -; CI: s_ashr_i32 -; CI: s_sext_i32_i16 -; CI: s_sub_i32 -; CI: s_sub_i32 -; CI: s_sext_i32_i16 -; CI: s_sext_i32_i16 -; CI: s_max_i32 -; CI: s_max_i32 -; CI: s_lshl_b32 -; CI: s_add_i32 -; CI: s_add_i32 -; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff -; CI: s_or_b32 - +; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CIVI: s_sub_i32 +; CIVI: s_sub_i32 +; CIVI: s_max_i32 
+; CIVI: s_max_i32 +; CIVI: s_add_i32 +; CIVI: s_add_i32 +; CIVI: s_and_b32 +; CIVI: s_or_b32 define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg @@ -61,6 +42,17 @@ define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32 ; VI: v_or_b32_e32 + +; CI: buffer_load_dword v +; CI: v_lshrrev_b32_e32 +; CI: v_sub_i32_e32 +; CI: v_bfe_i32 +; CI: v_bfe_i32 +; CI: v_max_i32 +; CI: v_max_i32 +; CI: v_add_i32 +; CI: v_add_i32 +; CI: v_or_b32 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid @@ -115,10 +107,8 @@ define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9: s_load_dwordx2 s{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, s[0:1], 0x2c ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]] - ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]] - ; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0] ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 8020b9767cc..d16dcd9ccac 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -46,10 +46,10 @@ entry: ; GCN-LABEL: {{^}}smrd3: ; FIXME: There are too many copies here because we don't fold immediates ; through REG_SEQUENCE -; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b +; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0x13 ; encoding: [0x13 ; TODO: Add VI checks ; GCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(4)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 %tmp1 = load i32, i32 addrspace(4)* %tmp diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index ee8273779e5..36e332c5798 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -231,11 +231,11 @@ define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> ad } ; GCN-LABEL: {{^}}s_ashr_32_i64: -; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}} ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}} ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}} -define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %result = ashr i64 %a, 32 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -258,11 +258,11 @@ define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1 } ; GCN-LABEL: {{^}}s_ashr_63_i64: -; GCN: s_load_dword s[[HI:[0-9]+]], 
{{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}} ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} -define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %result = ashr i64 %a, 63 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 8878b453855..83940eadc3f 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -189,11 +189,11 @@ define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> ad ; Make sure load width gets reduced to i32 load. ; GCN-LABEL: {{^}}s_lshr_32_i64: -; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}} +; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %result = lshr i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 280ce628c18..5bcf78f11ac 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -18,9 +18,9 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { } ; GCN-LABEL: {{^}}local_store_i55: -; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6 -; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 -; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}} +; CIVI-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:6 +; CIVI-DAG: ds_write_b16 v{{[0-9]+}}, v{{[0-9]+}} offset:4 +; CIVI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+$}} ; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6 ; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 908d13eb017..fb1cbe7410f 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -6,9 +6,9 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; FUNC-LABEL: {{^}}s_sub_i32: -; GCN: s_load_dword [[A:s[0-9]+]] -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: s_sub_i32 s{{[0-9]+}}, [[A]], [[B]] +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}} +; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]] define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = sub i32 %a, %b store i32 %result, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 6a9d8ea0093..02c847e2c61 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -4,20 +4,23 @@ target triple="amdgcn--" ; CHECK-LABEL: foobar: -; CHECK: s_load_dword s2, s[0:1], 0x9 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK: v_mbcnt_lo_u32_b32_e64 +; CHECK: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 
+; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; BB0_1: -; CHECK: s_load_dword s0, s[0:1], 0xa ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; BB0_2: +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc + +; CHECK: BB0_1: +; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr2_sgpr3 killed $exec +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 + +; CHECK: BB0_2: ; CHECK: s_or_b64 exec, exec, s[2:3] -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s6, -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; CHECK-NEXT: s_endpgm define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll index 4ea2352f57f..112c4995356 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -1,37 +1,38 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; SI-LABEL: {{^}}global_truncstore_i32_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], +; GCN-LABEL: {{^}}global_truncstore_i32_to_i1: +; GCN: s_load_dword [[LOAD:s[0-9]+]], +; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; GCN: buffer_store_byte [[VREG]], define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { %trunc = trunc i32 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } -; SI-LABEL: {{^}}global_truncstore_i64_to_i1: -; SI: buffer_store_byte +; GCN-LABEL: {{^}}global_truncstore_i64_to_i1: +; GCN: buffer_store_byte define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { %trunc = trunc i64 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } -; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], +; FIXME: VGPR on VI +; GCN-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1: +; GCN: s_load_dword [[LOAD:s[0-9]+]], +; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; GCN: buffer_store_byte [[VREG]], define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { %trunc = trunc i16 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } -; SI-LABEL: {{^}}global_truncstore_i16_to_i1: +; GCN-LABEL: {{^}}global_truncstore_i16_to_i1: define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { %add = add i16 %val0, %val1 %trunc = trunc i16 %add to i1 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 
b390958ab4c..7c609d7ad9d 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -1,10 +1,10 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s declare i32 @llvm.r600.read.tidig.x() nounwind readnone -define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, [8 x i32], i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] @@ -28,7 +28,7 @@ define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %i ; SI: buffer_store_dword [[VSHL]] ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]] -define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32], i64 %a) { %b = shl i64 %a, 2 %result = trunc i64 %b to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -94,12 +94,12 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) } ; GCN-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}} ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] -define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { +define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 store i32 %sel, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index b85c6bffe7e..acdf60dd62d 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -51,7 +51,7 @@ ; SI-DAG: v_cndmask_b32_e64 ; SI-NOT: v_and_b32 ; SI: s_endpgm -define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { +define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) { %result0 = udiv i32 %x, %y store i32 %result0, i32 addrspace(1)* %out0 %result1 = urem i32 %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll index 
350be19d6e0..071fc07ea59 100644 --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -368,7 +368,7 @@ bb: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 { bb: %tmp0 = call i16 @umin16(i16 %x, i16 %y) %tmp1 = call i16 @umax16(i16 %x, i16 %y) @@ -383,7 +383,7 @@ bb: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 { bb: %tmp0 = call i8 @umin8(i8 %x, i8 %y) %tmp1 = call i8 @umax8(i8 %x, i8 %y) diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index e5fe0bd9a9d..d30aeb56c33 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -442,7 +442,7 @@ define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 ad ; ALIGNED: buffer_load_ushort ; ALIGNED: buffer_load_ushort -; UNALIGNED: s_load_dwordx2 +; UNALIGNED: s_load_dwordx4 ; UNALIGNED: buffer_store_dwordx2 define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(4)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index 33a42022705..b5435ef14d6 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -195,7 +195,7 @@ if.end: ; preds = %if.else, %if.then ; GCN-LABEL: {{^}}uniform_if_else: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] +; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] ; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2 ; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]] @@ -248,10 +248,10 @@ ENDIF: ; preds = %IF, %main_body } ; GCN-LABEL: {{^}}icmp_users_different_blocks: -; GCN: s_load_dword [[COND:s[0-9]+]] -; GCN: s_cmp_lt_i32 [[COND]], 1 +; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}} +; GCN: s_cmp_lt_i32 s[[COND0]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] -; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, [[COND]], 0{{$}} +; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}} ; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm @@ -432,8 +432,7 @@ bb9: ; preds = %bb8, %bb4 ; GCN-LABEL: {{^}}uniform_if_scc_i64_eq: ; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0 ; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 - -; SI: v_cmp_eq_u64_e64 +; SI-DAG: v_cmp_eq_u64_e64 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] @@ -465,7 +464,7 @@ done: ; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0 ; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; SI: v_cmp_ne_u64_e64 +; SI-DAG: v_cmp_ne_u64_e64 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] @@ -494,8 +493,8 @@ done: } ; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt: -; GCN: s_mov_b32 [[S_VAL:s[0-9]+]], 0 -; GCN: v_cmp_gt_i64_e64 +; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0 +; GCN-DAG: v_cmp_gt_i64_e64 ; GCN: s_cbranch_vccnz 
[[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index 3cce2c96e83..9e8c47b22d9 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -28,12 +28,10 @@ define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %o } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], s[[SGPR0]], [[VGPR1]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 @@ -42,8 +40,7 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace( } ; GCN-LABEL: {{^}}test_use_s_v_s: -; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] @@ -53,11 +50,11 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace( ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]] ; GCN-NOT: v_mov_b32 -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SA]], [[VA0]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SA]], [[VA1]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { @@ -71,12 +68,10 @@ define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, fl } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] +; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], [[VGPR1]], 
s[[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
%fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
@@ -85,12 +80,10 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[SGPR0]], s[[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
%fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
@@ -152,11 +145,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspa
}
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR0]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], [[SGPR1]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], s[[SGPR0]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], s[[SGPR1]]
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
; GCN: s_endpgm
@@ -180,11 +173,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspa
}
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
; GCN: s_endpgm
@@ -208,11 +201,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspa
}
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
; GCN: s_endpgm
@@ -225,14 +218,14 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr
}
; GCN-LABEL: {{^}}test_s0_s1_k_f32:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
-; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
+; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]]
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]]
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]]
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
@@ -246,8 +239,8 @@ define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a,
; FIXME: Immediate in SGPRs just copied to VGPRs
; GCN-LABEL: {{^}}test_s0_s1_k_f64:
-; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
+; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}}
; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000
; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}}
@@ -261,7 +254,7 @@ define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a,
; GCN: buffer_store_dwordx2 [[RESULT0]]
; GCN: buffer_store_dwordx2 [[RESULT1]]
-define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) #0 {
%fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
%fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
store volatile double %fma0, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index bd7738e5846..53ef9d536cb 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -42,12 +42,12 @@ define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f)
; (select (cmp (sgprX, constant)), constant, sgprZ)
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
-; GCN: s_load_dword [[X:s[0-9]+]]
-; GCN: s_load_dword [[Z:s[0-9]+]]
-; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}
+; GCN-DAG: v_cmp_nlg_f32_e64 vcc, s[[X]], 0
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -73,12 +73,11 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)*
}
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[Z:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; GCN-DAG: v_cmp_nlg_f32_e64 vcc, s[[X]], 0
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -315,10 +314,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %ou
; Different types compared vs. selected
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
-; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dwordx2
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
+; GCN-DAG: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index fe5e5a18ab5..bf9ae67e9b5 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -350,8 +350,8 @@ entry:
; GCN-LABEL: {{^}}mac_v2f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -390,8 +390,8 @@ entry:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -418,8 +418,8 @@ entry:
; GCN-LABEL: {{^}}mac_v2f16_neg_b
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOT: v_mac_f16
@@ -452,8 +452,8 @@ entry:
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]]
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]]
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index ab47cc99a21..237c1869df7 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -35,9 +35,9 @@ define amdgpu_kernel void @madak_f16_use_2(
half addrspace(1)* %b,
half addrspace(1)* %c) {
entry:
- %a.val = load half, half addrspace(1)* %a
- %b.val = load half, half addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ %a.val = load volatile half, half addrspace(1)* %a
+ %b.val = load volatile half, half addrspace(1)* %b
+ %c.val = load volatile half, half addrspace(1)* %c
%t0.val = fmul half %a.val, %b.val
%t1.val = fmul half %a.val, %c.val
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 8dab6faedf4..90b55196359 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
-; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock4
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
; SI: s_and_saveexec_b64
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 847a1d73932..df327066006 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}xor_v2i32:
@@ -40,9 +40,9 @@ define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
; FUNC-LABEL: {{^}}xor_i1:
; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}}
-; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
-; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
-; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]]
+; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}
+; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI: s_xor_b64 [[XOR:vcc]], [[CMP1]], [[CMP0]]
; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
@@ -173,26 +173,26 @@ endif:
}
; FUNC-LABEL: {{^}}scalar_xor_literal_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
-; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
+; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s{{[0-9]+}}, 0xf237b
+; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
%or = xor i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) {
%or = xor i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
@@ -202,25 +202,25 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %o
}
; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64:
-; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SI-NOT: xor_b32
-; SI: s_xor_b32 s[[VAL_LO]], s[[VAL_LO]], 63
+; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63
; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}}
; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
+; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}}
; SI-NOT: xor_b32
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
%or = xor i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; SI: s_xor_b64 [[VAL]], [[VAL]], -8
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -8
-define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
%or = xor i64 %a, -8
store i64 %or, i64 addrspace(1)* %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index f256d89f0cb..ee9bbb67c0e 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
; R600: {{^}}s_mad_zext_i32_to_i64:
; R600: MEM_RAT_CACHELESS STORE_RAW
; R600: MEM_RAT_CACHELESS STORE_RAW
-; SI: {{^}}s_mad_zext_i32_to_i64:
-; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
-; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
+; GCN: {{^}}s_mad_zext_i32_to_i64:
+; GCN: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
+; GCN: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
entry:
%tmp0 = mul i32 %a, %b
@@ -18,8 +18,8 @@ entry:
ret void
}
-; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32
-; SI: v_cndmask_b32
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i32
+; GCN: v_cndmask_b32
define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%tmp0 = icmp eq i32 %a, %b
@@ -28,17 +28,17 @@ entry:
ret void
}
-; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
+; GCN-LABEL: {{^}}s_arg_zext_i1_to_i64:
define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
%ext = zext i1 %arg to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64:
-; SI: s_mov_b32 s{{[0-9]+}}, 0
-; SI: v_cmp_eq_u32
-; SI: v_cndmask_b32
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
+; GCN: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: v_cmp_eq_u32
+; GCN: v_cndmask_b32
define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32 %a, %b
%ext = zext i1 %cmp to i64
@@ -46,10 +46,20 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a,
ret void
}
-; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI: buffer_store_short [[RESULT]]
-define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+; FIXME: Why different commute?
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i16
+; GCN: s_load_dword [[A:s[0-9]+]]
+; GCN: s_load_dword [[B:s[0-9]+]]
+
+; SI: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
+; SI: v_cmp_eq_u32_e32 vcc, [[B]], [[V_A]]
+
+; VI: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
+; VI: v_cmp_eq_u32_e32 vcc, [[A]], [[V_B]]
+
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: buffer_store_short [[RESULT]]
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
%tmp0 = icmp eq i16 %a, %b
%tmp1 = zext i1 %tmp0 to i16
store i16 %tmp1, i16 addrspace(1)* %out