author | Tim Renouf <tpr.llvm@botech.co.uk> | 2019-03-21 12:01:21 +0000 |
---|---|---|
committer | Tim Renouf <tpr.llvm@botech.co.uk> | 2019-03-21 12:01:21 +0000 |
commit | 361b5b2193421824925a72669f1d06cd63c3d9a7 | |
tree | 76cf94d30c3a4d9caf4150a96b93e988ac360445 /llvm/test | |
parent | 92cbcfc325e08c07d5b0d5157f95ec0c90124e70 | |
[AMDGPU] Support for v3i32/v3f32
Added dwordx3 support for most load/store types, but not yet for DS
instructions or intrinsics.
SI (gfx6) does not have dwordx3 instructions, so they are not enabled
there.
Some of this patch is from Matt Arsenault, also of AMD.
Differential Revision: https://reviews.llvm.org/D58902
Change-Id: I913ef54f1433a7149da8d72f4af54dbb13436bd9
llvm-svn: 356659
Diffstat (limited to 'llvm/test')
19 files changed, 384 insertions, 111 deletions
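To make the behavioral change concrete, here is a minimal standalone test in the spirit of the updated files below. This is an illustrative sketch, not part of the commit: the kernel name and check prefixes are invented, while the RUN lines and expected instructions mirror the updated global_load_v3i32 checks in load-global-i32.ll and the store_v3i32 checks in store-global.ll.

```llvm
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3 %s

; gfx6 (SI) has no dwordx3 memory instructions, so the <3 x i32> load is
; still widened to a dwordx4 access there; gfx7 and later can now select
; a single dwordx3 load and store.
; SI: buffer_load_dwordx4
; GCNX3: buffer_load_dwordx3
; GCNX3: buffer_store_dwordx3
define amdgpu_kernel void @load_store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
  %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
  store <3 x i32> %v, <3 x i32> addrspace(1)* %out
  ret void
}
```

The same <3 x i32> IR thus differs between gfx6 and gfx7+ only in the memory instructions selected, which is exactly what the test updates below check.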
diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll
index af87c0deb55..8751c61dcd4 100644
--- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll
@@ -174,10 +174,8 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
 ; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
 ; GCN: s_swappc
-; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
-; GFX7-DAG: flat_store_dword {{.*}}, v2
-; GFX89-DAG: buffer_store_dwordx2 v[0:1]
-; GFX89-DAG: buffer_store_dword v2
+; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2]
+; GFX89-DAG: buffer_store_dwordx3 v[0:2]
 define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
   %val = call <3 x i32> @external_v3i32_func_void()
   store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
@@ -254,10 +252,8 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 {
 ; GCN-LABEL: {{^}}test_call_external_v3f32_func_void:
 ; GCN: s_swappc
-; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
-; GFX7-DAG: flat_store_dword {{.*}}, v2
-; GFX89-DAG: buffer_store_dwordx2 v[0:1]
-; GFX89-DAG: buffer_store_dword v2
+; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2]
+; GFX89-DAG: buffer_store_dwordx3 v[0:2]
 define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 {
   %val = call <3 x float> @external_v3f32_func_void()
   store volatile <3 x float> %val, <3 x float> addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index 626a6e2c5b8..cd3aeb48faa 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -1,4 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCNX3 %s
 
 ; FIXME: Most of these cases that don't trigger because of broken cost
 ; heuristics. Should not need -stress-early-ifcvt
@@ -60,8 +61,9 @@ endif:
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-; GCN-DAG: buffer_store_dword v
-; GCN-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; GCNX3: buffer_store_dwordx3
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
index 6061f53e959..11826d124df 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -316,10 +316,10 @@ endif:
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
-; GCN: s_add_i32
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
-; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b32 s
+; GCN-NEXT: s_cselect_b32 s
+; GCN-NEXT: s_cselect_b32 s
 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index e4a18bf6fa1..442cdc92c30 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -161,8 +161,7 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 {
 }
 
 ; GCN-LABEL: {{^}}void_func_v3i32:
-; GCN-DAG: buffer_store_dword v2, off
-; GCN-DAG: buffer_store_dwordx2 v[0:1], off
+; GCN-DAG: buffer_store_dwordx3 v[0:2], off
 define void @void_func_v3i32(<3 x i32> %arg0) #0 {
   store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
   ret void
@@ -356,8 +355,7 @@ define void @void_func_v2f32(<2 x float> %arg0) #0 {
 }
 
 ; GCN-LABEL: {{^}}void_func_v3f32:
-; GCN-DAG: buffer_store_dword v2, off
-; GCN-DAG: buffer_store_dwordx2 v[0:1], off
+; GCN-DAG: buffer_store_dwordx3 v[0:2], off
 define void @void_func_v3f32(<3 x float> %arg0) #0 {
   store <3 x float> %arg0, <3 x float> addrspace(1)* undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index f0927dec271..f9631e615c9 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -140,7 +140,7 @@ define <2 x i32> @v2i32_func_void() #0 {
 }
 
 ; GCN-LABEL: {{^}}v3i32_func_void:
-; GCN: buffer_load_dwordx4 v[0:3], off
+; GCN: buffer_load_dwordx3 v[0:2], off
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <3 x i32> @v3i32_func_void() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 49b4d71bcf2..1908015f477 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -78,14 +78,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)*
 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
 ; GCN: s_load_dwordx2 s
-; GCN: s_load_dwordx2 s
-; GCN-NOT: _load
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
+; GCN: s_load_dwordx2 s
+; GCN-NOT: _load
 ; GCN: v_cvt_f32_f16_e32
 ; GCN-NOT: v_cvt_f32_f16
-; GCN-DAG: _store_dword
-; GCN-DAG: _store_dwordx2
+; GCN-DAG: _store_dwordx3
 ; GCN: s_endpgm
 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
   %ext = fpext <3 x half> %arg to <3 x float>
@@ -472,7 +471,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
-; GCN: flat_load_dwordx4
+; GCN: flat_load_dwordx3
 ; GCN-DAG: v_cvt_f16_f32_e32
 ; SI-DAG: v_cvt_f16_f32_e32
 ; VI-DAG: v_cvt_f16_f32_sdwa
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index bac1a9e3f41..3dea940ec3f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1778,29 +1778,29 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_movk_i32 s8, 0xff
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT: s_movk_i32 s0, 0xff
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s2, s0, 24
 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24
-; GFX8-NEXT: s_lshr_b32 s4, s2, 24
-; GFX8-NEXT: s_and_b32 s6, s1, s0
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT: s_and_b32 s0, s2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX8-NEXT: s_and_b32 s7, s1, s8
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
 ; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mul_u32_u24_e32 v4, s6, v4
-; GFX8-NEXT: v_mul_u32_u24_e32 v5, s5, v5
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: s_and_b32 s5, s0, s8
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5
 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index 77557a58409..88a9405f0ac 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC -check-prefix=SI-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC -check-prefix=GCNX3-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC -check-prefix=GCNX3-NOHSA %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
@@ -30,8 +30,9 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}global_load_v3f32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx3
+; GCNX3-HSA: flat_load_dwordx3
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 27081b8379b..fcbf8a1fc4a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,8 +1,8 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}global_load_i32:
@@ -30,8 +30,9 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}global_load_v3i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; GCNX3-NOHSA: buffer_load_dwordx3
+; GCNX3-HSA: {{flat|global}}_load_dwordx3
 ; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 03e872192d1..fbbb8f80ccb 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -146,12 +146,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; FIXME: Should be packed into 2 registers per argument?
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
 ; GCN: s_waitcnt
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX9-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp
+; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}}
 ; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
index 7fb0e354df2..a5d111b4977 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -275,8 +275,7 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
-; SI-DAG: buffer_load_dwordx2
-; SI-DAG: buffer_load_dword v
+; SI-DAG: buffer_load_dwordx4
 ; CI-DAG: buffer_load_dwordx3
 ; GCN: s_waitcnt
 ; SI-DAG: buffer_store_dwordx2
@@ -566,8 +565,8 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)*
 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 ; GCN: buffer_store_dwordx4
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dword v
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
 ; CI: buffer_store_dwordx3
 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
@@ -615,8 +614,7 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)*
 ; GCN-LABEL: {{^}}copy_v3i32_align4:
 ; GCN-NOT: SCRATCH_RSRC_DWORD
-; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-NOT: offen
 ; GCN: s_waitcnt vmcnt
@@ -650,8 +648,7 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou
 ; GCN-LABEL: {{^}}copy_v3f32_align4:
 ; GCN-NOT: SCRATCH_RSRC_DWORD
-; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-NOT: offen
 ; GCN: s_waitcnt vmcnt
diff --git a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
new file mode 100644
index 00000000000..3c55dc8ef91
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
@@ -0,0 +1,216 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
+
+; CHECK-LABEL: spill_v2i32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload
+
+define void @spill_v2i32() {
+entry:
+  %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v2f32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload
+
+define void @spill_v2f32() {
+entry:
+  %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v3i32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+
+define void @spill_v3i32() {
+entry:
+  %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v3f32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+
+define void @spill_v3f32() {
+entry:
+  %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v4i32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload
+
+define void @spill_v4i32() {
+entry:
+  %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v4f32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload
+
+define void @spill_v4f32() {
+entry:
+  %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v5i32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload
+
+define void @spill_v5i32() {
+entry:
+  %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+; CHECK-LABEL: spill_v5f32:
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill
+; CHECK: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload
+
+define void @spill_v5f32() {
+entry:
+  %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
+
+  %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
+  %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr
+
+  ; Force %a to spill.
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+  %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
+  store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr
+
+  ret void
+}
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index 6ead8cc409e..26923f8c3eb 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -112,6 +112,7 @@ define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; GFX89: v_cndmask_b32_e32
 ; GFX89: cndmask
+; VI: cndmask
 ; GFX89-NOT: cndmask
 define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
   %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
@@ -230,6 +231,21 @@ define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x f
   ret void
 }
 
+; GCN-LABEL: {{^}}s_select_v3f32:
+; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+
+; GCN: buffer_store_dwordx
+define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
+  %cmp = icmp eq i32 %c, 0
+  %select = select i1 %cmp, <3 x float> %a, <3 x float> %b
+  store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
 ; GCN-LABEL: {{^}}s_select_v4f32:
 ; GCN: s_load_dwordx4
 ; GCN: s_load_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index cb4601ac165..c95beaea5a7 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,SI %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,CI %s
 
 ; Check that an addrspace(1) (const) load with various combinations of
 ; uniform, nonuniform and constant address components all load with an
@@ -8,7 +9,8 @@
 ; GCN-LABEL: {{^}}nonuniform_uniform:
 ; GCN-NOT: readfirstlane
-; SICI: buffer_load_dwordx4 {{.*}} addr64
+; SI: buffer_load_dwordx4 {{.*}} addr64
+; CI: buffer_load_dwordx3 {{.*}} addr64
 
 define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 .entry:
@@ -21,7 +23,8 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 ; GCN-LABEL: {{^}}uniform_nonuniform:
 ; GCN-NOT: readfirstlane
-; SICI: buffer_load_dwordx4 {{.*}} addr64
+; SI: buffer_load_dwordx4 {{.*}} addr64
+; CI: buffer_load_dwordx3 {{.*}} addr64
 
 define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
 .entry:
@@ -35,7 +38,8 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
 ; GCN-LABEL: {{^}}const_nonuniform:
 ; GCN-NOT: readfirstlane
-; SICI: buffer_load_dwordx4 {{.*}} addr64
+; SI: buffer_load_dwordx4 {{.*}} addr64
+; CI: buffer_load_dwordx3 {{.*}} addr64
 
 define amdgpu_ps float @const_nonuniform(i32 %arg18) {
 .entry:
@@ -49,7 +53,8 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) {
 ; GCN-LABEL: {{^}}nonuniform_nonuniform:
 ; GCN-NOT: readfirstlane
-; SICI: buffer_load_dwordx4 {{.*}} addr64
+; SI: buffer_load_dwordx4 {{.*}} addr64
+; CI: buffer_load_dwordx3 {{.*}} addr64
 
 define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
 .entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 50bc5889669..d52a7ffcbe6 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -37,13 +37,12 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32
 define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
 ; SI-LABEL: test_s_sext_i32_to_i64:
 ; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
-; SI-NEXT: s_load_dword s2, s[0:1], 0xd
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mul_i32 s4, s4, s5
-; SI-NEXT: s_add_i32 s4, s4, s2
+; SI-NEXT: s_mul_i32 s2, s4, s5
+; SI-NEXT: s_add_i32 s4, s2, s6
 ; SI-NEXT: s_ashr_i32 s5, s4, 31
 ; SI-NEXT: s_mov_b32 s2, -1
 ; SI-NEXT: v_mov_b32_e32 v0, s4
@@ -54,16 +53,15 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a
 ; VI-LABEL: test_s_sext_i32_to_i64:
 ; VI: ; %bb.0: ; %entry
 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s0, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
 ; VI-NEXT: s_mov_b32 s7, 0xf000
 ; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s1, s2, s3
-; VI-NEXT: s_add_i32 s1, s1, s0
-; VI-NEXT: s_ashr_i32 s0, s1, 31
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: s_add_i32 s0, s0, s2
+; VI-NEXT: s_ashr_i32 s1, s0, 31
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT: s_endpgm
 entry:
@@ -292,14 +290,13 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
 ; SI-LABEL: v_sext_i1_to_i16_with_and:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
-; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0
 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -308,14 +305,13 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
 ; VI-LABEL: v_sext_i1_to_i16_with_and:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s0, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
 ; VI-NEXT: s_mov_b32 s7, 0xf000
 ; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index b8824be4725..5a55ce51440 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -43,6 +43,54 @@ ret:
   ret void
 }
 
+; ALL-LABEL: {{^}}spill_sgpr_x3:
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
+; SMEM: s_buffer_store_dword s
+; SMEM: s_buffer_store_dword s
+; SMEM: s_buffer_store_dword s
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
+; SMEM: s_buffer_load_dword s
+; SMEM: s_buffer_load_dword s
+; SMEM: s_buffer_load_dword s
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 16
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x3(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <3 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<3 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
 ; ALL-LABEL: {{^}}spill_sgpr_x4:
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index c0b9f6cdc33..a8843365421 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -273,13 +273,12 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_v3i32:
-; SI-DAG: buffer_store_dwordx2
 ; SI-DAG: buffer_store_dword v
+; SI-DAG: buffer_store_dwordx2
 
-; VI-DAG: buffer_store_dwordx3
+; VI: buffer_store_dwordx3
 
-; GFX9-DAG: global_store_dwordx2
-; GFX9-DAG: global_store_dword v
+; GFX9: global_store_dwordx3
 
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll
index da57155f33e..c96d0324260 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll
@@ -135,7 +135,7 @@ entry:
 ; GCN-LABEL: {{^}}safe_mad_sub0_src0:
 ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
-; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
+; GCN: v_ma{{[cd]}}_f32{{[_e32]*}} v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
 define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index bcebc9f2d8b..0c52daca047 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -22,25 +22,25 @@
 ; MESA-NOT: s_mov_b32 s3
 ; HSA-NOT: s_mov_b32 s7
 
-; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCNMESA-DAG: s_mov_b32 s14, -1
-; SIMESA-DAG: s_mov_b32 s15, 0xe8f000
-; VIMESA-DAG: s_mov_b32 s15, 0xe80000
-; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000
+; GCNMESA-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GCNMESA-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GCNMESA-DAG: s_mov_b32 s18, -1
+; SIMESA-DAG: s_mov_b32 s19, 0xe8f000
+; VIMESA-DAG: s_mov_b32 s19, 0xe80000
+; GFX9MESA-DAG: s_mov_b32 s19, 0xe00000
 
-; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill
 
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
 
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}