diff options
8 files changed, 82 insertions, 42 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 2d62abd2b88..cfff423de9b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -73,7 +73,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // Set defaults if needed. if (MaxPrivateElementSize == 0) - MaxPrivateElementSize = 16; + MaxPrivateElementSize = 4; return *this; } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll index 1f2afa4422f..82c4238e2cd 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll @@ -1,14 +1,20 @@ -; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s declare void @llvm.amdgcn.s.barrier() #1 ; SI-LABEL: {{^}}private_access_f64_alloca: -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 +; SI-ALLOCA16: buffer_store_dwordx2 +; SI-ALLOCA16: 
buffer_load_dwordx2 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 @@ -25,8 +31,17 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-LABEL: {{^}}private_access_v2f64_alloca: -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 +; SI-ALLOCA16: buffer_store_dwordx4 +; SI-ALLOCA16: buffer_load_dwordx4 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_write_b64 @@ -45,8 +60,14 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out ; SI-LABEL: {{^}}private_access_i64_alloca: -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 +; SI-ALLOCA16: buffer_store_dwordx2 +; SI-ALLOCA16: buffer_load_dwordx2 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v + ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 @@ -63,8 +84,18 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-LABEL: {{^}}private_access_v2i64_alloca: -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 +; SI-ALLOCA16: buffer_store_dwordx4 +; SI-ALLOCA16: buffer_load_dwordx4 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v + +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_write_b64 diff --git 
a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 5cab2679b4a..179bf771d6c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 2a120bdd57e..d88c5837672 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -7,11 +7,11 @@ ; ALL-LABEL: {{^}}large_alloca_compute_shader: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x98f000 -; VI: s_mov_b32 s11, 0x980000 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1 +; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0x88f000 +; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0x880000 ; GCNHSA: .amd_kernel_code_t diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll index eb704c3b5f7..2bb67ca0c7a 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -2,11 +2,11 @@ ; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s ; ALL-LABEL: {{^}}large_alloca_pixel_shader: -; 
GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x98f000 -; VI: s_mov_b32 s11, 0x980000 +; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s10, -1 +; CI-DAG: s_mov_b32 s11, 0x88f000 +; VI-DAG: s_mov_b32 s11, 0x880000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen @@ -23,11 +23,11 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { } ; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x98f000 -; VI: s_mov_b32 s11, 0x980000 +; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s10, -1 +; CI-DAG: s_mov_b32 s11, 0x88f000 +; VI-DAG: s_mov_b32 s11, 0x880000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll index 8a2fcb70cb6..b49078565b7 100644 --- a/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s -; CHECK: NumVgprs: 63 +; CHECK: NumVgprs: 64 define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 { main_body: %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 25dadf896a8..190ad62d0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -21,16 +21,23 @@ ; GCNMESA-DAG: s_mov_b32 s16, s3 ; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCNMESA-DAG: s_mov_b32 s14, -1 -; SIMESA-DAG: s_mov_b32 s15, 0x98f000 -; VIMESA-DAG: s_mov_b32 s15, 0x980000 +; SIMESA-DAG: s_mov_b32 s15, 0x88f000 +; VIMESA-DAG: s_mov_b32 s15, 0x880000 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} -; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} + +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 0d71e494c15..767aca22504 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ 
b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -8,18 +8,20 @@ ; intermediate register class copies. ; FIXME: The same register is initialized to 0 for every spill. +; FIXME: The unused arguments are removed ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s14, -1 -; SI-DAG: s_mov_b32 s15, 0x98f000 -; VI-DAG: s_mov_b32 s15, 0x980000 +; GCN-DAG: s_mov_b32 s6, s12 +; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s18, -1 +; SI-DAG: s_mov_b32 s19, 0x88f000 +; VI-DAG: s_mov_b32 s19, 0x880000 -; s12 is offset user SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload +; s6 is offset system SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 |

