5 files changed, 250 insertions, 28 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index aba0b63a254..617204fdf33 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -1,16 +1,20 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
 
-; CHECK-LABEL: {{^}}max_14_sgprs:
+; If spilling to smem, additional registers are used for the resource
+; descriptor.
+
+; ALL-LABEL: {{^}}max_14_sgprs:
 
 ; FIXME: Should be ablo to skip this copying of the private segment
 ; buffer because all the SGPR spills are to VGPRs.
 
-; CHECK: s_mov_b64 s[6:7], s[2:3]
-; CHECK: s_mov_b64 s[4:5], s[0:1]
-
-; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 14
+; ALL: s_mov_b64 s[6:7], s[2:3]
+; ALL: s_mov_b64 s[4:5], s[0:1]
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 14
 define void @max_14_sgprs(i32 addrspace(1)* %out1,
+
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
                           i32 addrspace(1)* %out4,
@@ -31,7 +35,7 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1,
 ; ---------------------
 ; total: 14
 
-; + reserved vcc, flat_scratch = 18
+; + reserved vcc, xnack, flat_scratch = 20
 
 ; Because we can't handle re-using the last few input registers as the
 ; special vcc etc. registers (as well as decide to not use the unused
@@ -40,14 +44,14 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1,
 
 ; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
 ; TOSGPR: SGPRBlocks: 2
-; TOSGPR: NumSGPRsForWavesPerEU: 18
+; TOSGPR: NumSGPRsForWavesPerEU: 20
 
 ; TOSMEM: s_mov_b64 s[6:7], s[2:3]
-; TOSMEM: s_mov_b32 s9, s13
 ; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s13
 
 ; TOSMEM: SGPRBlocks: 2
-; TOSMEM: NumSGPRsForWavesPerEU: 18
+; TOSMEM: NumSGPRsForWavesPerEU: 20
 define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                         i32 addrspace(1)* %out2,
                                         i32 addrspace(1)* %out3,
@@ -79,12 +83,12 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
 ; ; swapping the order the registers are copied from what normally
 ; ; happens.
 
-; TOSMEM: s_mov_b64 s[6:7], s[2:3]
-; TOSMEM: s_mov_b64 s[4:5], s[0:1]
-; TOSMEM: s_mov_b32 s3, s11
+; TOSMEM: s_mov_b32 s5, s11
+; TOSMEM: s_add_u32 m0, s5,
+; TOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0
 
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 16
+; ALL: SGPRBlocks: 2
+; ALL: NumSGPRsForWavesPerEU: 18
 define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
                                         i32 addrspace(1)* %out2,
                                         i32 addrspace(1)* %out3,
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 7bc4d735feb..83313ed5327 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index c3d9ee7f13f..f267eb47559 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -1,14 +1,44 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
 
 ; Make sure this doesn't crash.
-; CHECK: {{^}}test:
+; ALL-LABEL: {{^}}test:
+; ALL: s_mov_b32 s92, SCRATCH_RSRC_DWORD0
+; ALL: s_mov_b32 s91, s3
+
 ; Make sure we are handling hazards correctly.
-; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
-; CHECK-NEXT: s_nop 4
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
-; CHECK: s_endpgm
+; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; SGPR-NEXT: s_waitcnt vmcnt(0)
+; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
+; SGPR-NEXT: s_nop 4
+; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
+
+
+; Make sure scratch wave offset register is correctly incremented and
+; then restored.
+; SMEM: s_mov_b32 m0, s91{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_add_u32 m0, s91, 0x100{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_add_u32 m0, s91, 0x200{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_add_u32 m0, s91, 0x300{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+
+
+; SMEM: s_mov_b32 m0, s91{{$}}
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_add_u32 m0, s91, 0x100{{$}}
+; SMEM: s_waitcnt lgkmcnt(0)
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_add_u32 m0, s91, 0x200{{$}}
+; SMEM: s_waitcnt lgkmcnt(0)
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_add_u32 m0, s91, 0x300{{$}}
+; SMEM: s_waitcnt lgkmcnt(0)
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+
+; ALL: s_endpgm
 define void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 74e33d11bed..c5ef75e5fb7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -1,12 +1,13 @@
 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling  -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling  -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
 ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
 
 ; XXX - Why does it like to use vcc?
 
 ; GCN-LABEL: {{^}}spill_m0:
-; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0
 
 ; GCN: s_cmp_lg_u32
 
@@ -16,6 +17,13 @@
 ; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0
 ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
 ; TOVMEM: s_waitcnt vmcnt(0)
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM-NOT: vcc_hi
+; TOSMEM: s_buffer_store_dword vcc_hi, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_waitcnt lgkmcnt(0)
+
 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ENDIF]]:
@@ -27,6 +35,11 @@
 ; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]]
 ; TOVMEM: s_mov_b32 m0, vcc_hi
 
+; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM: s_buffer_load_dword vcc_hi, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM-NOT: vcc_hi
+; TOSMEM: s_mov_b32 m0, vcc_hi
+
 ; GCN: s_add_i32 m0, m0, 1
 define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
@@ -48,6 +61,8 @@ endif:
 
 ; GCN-LABEL: {{^}}spill_m0_lds:
 ; GCN-NOT: v_readlane_b32 m0
+; GCN-NOT: s_buffer_store_dword m0
+; GCN-NOT: s_buffer_load_dword m0
 define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 {
 main_body:
   %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir b/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir
new file mode 100644
index 00000000000..af71086e542
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir
@@ -0,0 +1,173 @@
+# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
+
+--- |
+  define void @basic_insert_dcache_wb() {
+    ret void
+  }
+
+  define void @explicit_flush_after() {
+    ret void
+  }
+
+  define void @explicit_flush_before() {
+    ret void
+  }
+
+  define void @no_scalar_store() {
+    ret void
+  }
+
+  define void @multi_block_store() {
+  bb0:
+    br i1 undef, label %bb1, label %bb2
+
+   bb1:
+     ret void
+
+   bb2:
+    ret void
+  }
+
+  define void @one_block_store() {
+  bb0:
+    br i1 undef, label %bb1, label %bb2
+
+   bb1:
+     ret void
+
+   bb2:
+    ret void
+  }
+
+  define amdgpu_ps float @si_return() {
+    ret float undef
+  }
+
+...
+---
+# CHECK-LABEL: name: basic_insert_dcache_wb
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: basic_insert_dcache_wb
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+    S_ENDPGM
+...
+---
+# Already has an explicitly requested flush after the last store.
+# CHECK-LABEL: name: explicit_flush_after
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: explicit_flush_after
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+    S_DCACHE_WB
+    S_ENDPGM
+...
+---
+# Already has an explicitly requested flush before the last store.
+# CHECK-LABEL: name: explicit_flush_before
+# CHECK: bb.0:
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: explicit_flush_before
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_DCACHE_WB
+    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+    S_ENDPGM
+...
+---
+# CHECK-LABEL: name: no_scalar_store
+# CHECK: bb.0:
+# CHECK-NEXT: S_ENDPGM
+name: no_scalar_store
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_ENDPGM
+...
+---
+# CHECK-LABEL: name: multi_block_store
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.1:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: multi_block_store
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+    S_ENDPGM
+
+  bb.1:
+    S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
+    S_ENDPGM
+...
+---
+
+# Ideally the flush would be omitted in the block that has no scalar
+# store, but that is not handled yet.
+
+# CHECK-LABEL: name: one_block_store
+# CHECK: bb.0:
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.1:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: one_block_store
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_ENDPGM
+
+  bb.1:
+    S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
+    S_ENDPGM
+...
+---
+# CHECK-LABEL: name: si_return
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_WAITCNT
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: SI_RETURN
+
+name: si_return
+tracksRegLiveness: false
+
+body: |
+  bb.0:
+    S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+    SI_RETURN undef %vgpr0
+...