AMDGPU: Switch backend default max workgroup size to 1024

Previously this would default to 256, not the maximum supported size of 1024. Using a maximum lower than the hardware maximum requires language runtimes to enforce this limit for correctness, which no language has correctly done. Switch the default to the conservatively correct maximum, and force frontends to opt-in to the more optimal 256 default maximum. I don't really understand why the changes in occupancy-levels.ll increased the computed occupancy, which I expected to decrease. I'm not sure if these tests should be forcing the old maximum.
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2019-08-27 12:34:40 -0400
committer: Matt Arsenault <arsenm2@gmail.com> 2019-11-13 07:11:02 +0530
commit: 4b472139513ba460595804f8113497844b41fbcc (patch)
tree: 3b9cdac709fc3a0239a04c863dafda580d440f4a /llvm/test/CodeGen/AMDGPU
parent: 25c5da5a426168b38fb3e9baa918faa75e4a92b4 (diff)
download: bcm5719-llvm-4b472139513ba460595804f8113497844b41fbcc.tar.gz
bcm5719-llvm-4b472139513ba460595804f8113497844b41fbcc.zip
11 files changed, 57 insertions, 17 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 199a96c6443..d26f51302a9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -412,7 +412,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; OPT-LABEL: @pointer_typed_alloca(
 ; OPT:  getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) #1 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
   store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
@@ -556,7 +556,8 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
 
 ; HSAOPT: !0 = !{}
 ; HSAOPT: !1 = !{i32 0, i32 257}
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 8e7f40abc79..b301384b715 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -43,7 +43,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind convergent }
 
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index 0eed325bcce..14e8e609e5f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -8,11 +8,11 @@
 ; CHECK: ---
 ; CHECK:  amdhsa.kernels:
 
-; CHECK:   - .args:           
+; CHECK:   - .args:
 ; CHECK:     .group_segment_fixed_size: 0
 ; CHECK:     .kernarg_segment_align: 8
 ; CHECK:     .kernarg_segment_size: 24
-; CHECK:     .max_flat_workgroup_size: 256
+; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
 ; WAVE64:    .sgpr_count:     8
@@ -33,6 +33,20 @@ entry:
   ret void
 }
 
+; CHECK:   - .args:
+; CHECK:     .max_flat_workgroup_size: 256
+define amdgpu_kernel void @test_max_flat_workgroup_size(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #2 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 40
 ; GFX803:   .sgpr_spill_count: 24
@@ -149,3 +163,4 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 1678df8bccb..11dc60a5e2a 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -18,7 +18,7 @@
 ; CHECK:     WavefrontSize:           64
 ; CHECK:     NumSGPRs:                8
 ; CHECK:     NumVGPRs:                6
-; CHECK:     MaxFlatWorkGroupSize:    256
+; CHECK:     MaxFlatWorkGroupSize:    1024
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
@@ -31,6 +31,29 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: - Name:       test_max_flat_workgroup_size
+; CHECK:   SymbolName: 'test_max_flat_workgroup_size@kd'
+; CHECK:   CodeProps:
+; CHECK:     KernargSegmentSize:      24
+; CHECK:     GroupSegmentFixedSize:   0
+; CHECK:     PrivateSegmentFixedSize: 0
+; CHECK:     KernargSegmentAlign:     8
+; CHECK:     WavefrontSize:           64
+; CHECK:     NumSGPRs:                8
+; CHECK:     NumVGPRs:                6
+; CHECK:     MaxFlatWorkGroupSize:    256
+define amdgpu_kernel void @test_max_flat_workgroup_size(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #2 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
 ; CHECK-LABEL: - Name:       num_spilled_sgprs
 ; CHECK:   SymbolName: 'num_spilled_sgprs@kd'
 ; CHECK:   CodeProps:
@@ -144,3 +167,4 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
index 6f5f4ca13b5..669988d3878 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -39,7 +39,7 @@ entry:
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-attributes #0 = { norecurse nounwind }
+attributes #0 = { norecurse nounwind "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
 
 !0 = !{i32 0, i32 1024}
diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 4f509c03ceb..eae3f11ba69 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -262,8 +262,8 @@ define amdgpu_kernel void @used_lds_6552() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_6556:
-; GFX9:       ; Occupancy: 9
-; GFX1010W64: ; Occupancy: 19
+; GFX9:       ; Occupancy: 10
+; GFX1010W64: ; Occupancy: 20
 ; GFX1010W32: ; Occupancy: 20
 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6556() {
@@ -273,9 +273,9 @@ define amdgpu_kernel void @used_lds_6556() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_13112:
-; GFX9:       ; Occupancy: 4
-; GFX1010W64: ; Occupancy: 9
-; GFX1010W32: ; Occupancy: 19
+; GFX9:       ; Occupancy: 10
+; GFX1010W64: ; Occupancy: 20
+; GFX1010W32: ; Occupancy: 20
 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_13112() {
   %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
index 5e03b8a6e41..ddd44fd17ce 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -300,4 +300,4 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; OPT: !0 = !{i32 0, i32 257}
 ; OPT: !1 = !{i32 0, i32 256}
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
index f00b3de857f..32395a1778a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
@@ -18,4 +18,4 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index 8d12a725594..2bf668d618e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -64,4 +64,4 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
index d196897d67d..2fecbcb9d7a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -201,4 +201,4 @@ for.body:                                         ; preds = %for.body, %for.body
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index 28e4925f950..007cc6f3c98 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -131,5 +131,5 @@ bb:
   ret void
 }
 
-attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { norecurse nounwind }
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2019-08-27 12:34:40 -0400
committer	Matt Arsenault <arsenm2@gmail.com>	2019-11-13 07:11:02 +0530
commit	4b472139513ba460595804f8113497844b41fbcc (patch)
tree	3b9cdac709fc3a0239a04c863dafda580d440f4a /llvm/test/CodeGen/AMDGPU
parent	25c5da5a426168b38fb3e9baa918faa75e4a92b4 (diff)
download	bcm5719-llvm-4b472139513ba460595804f8113497844b41fbcc.tar.gz bcm5719-llvm-4b472139513ba460595804f8113497844b41fbcc.zip