diff options
| author | Alexander Timofeev <Alexander.Timofeev@amd.com> | 2019-06-06 21:13:02 +0000 |
|---|---|---|
| committer | Alexander Timofeev <Alexander.Timofeev@amd.com> | 2019-06-06 21:13:02 +0000 |
| commit | 37bd9bd13750a368c0468f0768f29edc37dc540a (patch) | |
| tree | a1389c1689b8a5bd10f976aa66e541bb47ea255f /llvm/test | |
| parent | 169fc2b0209d5574fca0927a707706ea2d5f5a09 (diff) | |
| download | bcm5719-llvm-37bd9bd13750a368c0468f0768f29edc37dc540a.tar.gz bcm5719-llvm-37bd9bd13750a368c0468f0768f29edc37dc540a.zip | |
[AMDGPU] Partial revert for the ba447bae7448435c9986eece0811da1423972fdd
"Divergence driven ISel. Assign register class for cross block values
according to the divergence."
that discovered the design flaw leading to several issues that
required to be solved before.
This change reverts AMDGPU specific changes and keeps common part
unaffected.
llvm-svn: 362749
Diffstat (limited to 'llvm/test')
33 files changed, 165 insertions, 179 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll index 454c56cbca5..3d457fdd50e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -5,12 +5,11 @@ define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: atomic_nand_i32_lds: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: BB0_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_not_b32_e32 v1, v2 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18,6 +17,7 @@ define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind { ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz BB0_1 @@ -33,12 +33,11 @@ define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind { ; GCN-LABEL: atomic_nand_i32_global: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_dword v2, v[0:1], off +; GCN-NEXT: global_load_dword v3, v[0:1], off ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: BB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -46,6 +45,7 @@ define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz BB1_1 @@ -61,12 +61,11 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind { ; GCN-LABEL: atomic_nand_i32_flat: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_load_dword v2, v[0:1] +; GCN-NEXT: flat_load_dword v3, v[0:1] ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: BB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -75,6 +74,7 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind { ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz BB2_1 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index a2facaafb41..45ed056567c 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -99,7 +99,7 @@ bb3: ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch: ; GCN: s_load_dword [[CND:s[0-9]+]] - +; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0 ; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]] ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]] @@ -117,7 +117,6 @@ bb3: ; GCN: v_nop_e64 ; GCN: [[ENDBB]]: -; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll b/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll index c9c801fb191..e6f68417803 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-uniformity.ll @@ -8,8 +8,8 @@ ; ; CHECK-LABEL: {{^}}main: ; CHECK: ; %LOOP49 -; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; CHECK: s_cbranch_scc1 +; CHECK: v_cmp_ne_u32_e32 vcc, +; CHECK: s_cbranch_vccnz ; CHECK: ; %ENDIF53 define amdgpu_vs float @main(i32 %in) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll index 81ca354574d..db85233268f 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -21,7 +21,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { ; SI-NEXT: v_and_b32_e32 v0, v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s0, v0 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: main: @@ -42,7 +42,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { ; VI-NEXT: v_and_b32_e32 v0, v2, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, s0, v0 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v0 ; VI-NEXT: ; return to shader part epilog bb: %tmp = fptosi float %arg0 to i32 diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 15e807a3e02..41ecdd403d7 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -89,7 +89,7 @@ endif: } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 12{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: @@ -123,9 +123,10 @@ endif: ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] -; GCN: s_cmp_lg_u32 +; GCN: v_cmp_ne_u32_e32 vcc, +; GCN: s_and_b64 vcc, exec, vcc ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN-NEXT: s_cbranch_scc1 [[LOOP]] +; GCN-NEXT: s_cbranch_vccnz [[LOOP]] ; GCN: [[END]]: diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 08a95ecbf5a..8d21050ebee 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -13,50 +13,55 @@ define amdgpu_ps void @main(i32, float) { ; CHECK: ; %bb.0: ; %start ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 m0, s0 -; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x -; CHECK-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_mov_b64 s[2:3], 0 -; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5 +; CHECK-NEXT: v_cmp_nlt_f32_e64 s[0:1], 0, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3 +; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: BB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], exec -; CHECK-NEXT: s_cmp_lt_u32 s0, 32 -; CHECK-NEXT: s_mov_b64 s[6:7], -1 -; CHECK-NEXT: s_cbranch_scc0 BB0_5 +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 32, v1 +; CHECK-NEXT: s_and_b64 vcc, exec, vcc +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_cbranch_vccz BB0_5 ; CHECK-NEXT: ; %bb.2: ; %endif1 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_mov_b64 s[4:5], -1 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], -1 +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; CHECK-NEXT: ; mask branch BB0_4 ; CHECK-NEXT: BB0_3: ; %endif2 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_add_i32 s0, s0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], exec, -1 +; CHECK-NEXT: v_add_u32_e32 v1, 1, v1 +; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1 ; CHECK-NEXT: BB0_4: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_mov_b64 s[6:7], 0 -; CHECK-NEXT: BB0_5: ; %Flow +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_branch BB0_6 +; CHECK-NEXT: BB0_5: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: BB0_6: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_and_b64 s[8:9], exec, s[4:5] -; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[2:3] -; CHECK-NEXT: s_mov_b64 s[2:3], s[8:9] +; CHECK-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] +; CHECK-NEXT: s_mov_b64 s[4:5], s[8:9] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_cbranch_execnz BB0_1 -; CHECK-NEXT: ; %bb.6: ; %Flow2 +; CHECK-NEXT: ; %bb.7: ; %Flow2 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[6:7] -; CHECK-NEXT: ; mask branch BB0_8 -; CHECK-NEXT: BB0_7: ; %if1 +; this is the divergent branch with the condition not marked as divergent +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; CHECK-NEXT: ; mask branch BB0_9 +; CHECK-NEXT: BB0_8: ; %if1 ; CHECK-NEXT: v_sqrt_f32_e32 v1, v0 -; CHECK-NEXT: BB0_8: ; %endloop +; CHECK-NEXT: BB0_9: ; %endloop ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: exp mrt0 v1, v1, v1, v1 done vm ; CHECK-NEXT: s_endpgm -; this is the divergent branch with the condition not marked as divergent start: %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0) br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index fe8f31a0cd2..a39833455a1 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -13,9 +13,9 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 - ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 + ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY1]], %subreg.sub2 ; GCN: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]] ; GCN: [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index badaa16bbfc..f96019dba6d 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -48,8 +48,8 @@ define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) { ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out @@ -62,10 +62,10 @@ define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 01499e681ea..a3f176b3ef0 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -85,15 +85,15 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg ; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -121,15 +121,15 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) { } ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp: -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} @@ -156,15 +156,15 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* % } ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp: -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} @@ -194,15 +194,15 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* % ; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -231,6 +231,8 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace } ; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp: +; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 +; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} @@ -238,12 +240,9 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 - -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -274,6 +273,8 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) { } ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp: +; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 +; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} @@ -281,12 +282,9 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) { ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 - -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index 075115a2ee6..ca80c4edbfb 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -33,13 +33,9 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace( ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] -; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] - -; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] - -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[B]], [[VA]] +; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]] ; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] ; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index a621b04a346..0ff5d9652c1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs @@ -15,7 +15,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: ; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}| ; SI-NOT: and define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) @@ -85,8 +85,8 @@ define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrs ; FIXME: In this case two uses of the constant should be folded ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs @@ -96,10 +96,10 @@ define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fsub.ll b/llvm/test/CodeGen/AMDGPU/fsub.ll index 6e4635ec438..48647a2cdb8 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.ll @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { %sub = fsub <2 x float> %a, %b store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 @@ -55,10 +55,10 @@ define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x flo } ; FUNC-LABEL: {{^}}s_fsub_v4f32: -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: s_endpgm define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { %result = fsub <4 x float> %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index 87c9a565f08..ae78a1ecf32 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -4,11 +4,17 @@ ; SI-LABEL: {{^}}i1_copy_from_loop: ; ; SI: ; %for.body -; SI: v_cmp_lt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], s{{[0-9+]}}, 4 +; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4, +; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec +; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec +; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]] + +; SI: ; %Flow1 +; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec ; SI: ; %Flow ; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec -; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec +; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec ; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]] ; SI: ; %for.end diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll index c65683d4fab..0aacbbfda18 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll @@ -7,6 +7,7 @@ ; GCN: s_cbranch_scc1 [[PREEXIT:BB[0-9_]+]] ; GCN: ; %blocka +; GCN: s_xor_b64 s[{{[0-9:]+}}], exec, -1 ; GCN: s_cmp_eq_u32 s1, 0 ; GCN: s_cbranch_scc1 [[EXIT:BB[0-9_]+]] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 2584f30573f..47e080a94ba 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -11,12 +11,12 @@ ; GCN-LABEL: {{^}}insertelement_v4f32_0: ; GCN: s_load_dwordx4 -; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000 -; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]] - ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000 +; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]: define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll index 60ec52c229b..2a5e81a6dd6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -387,7 +387,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* ; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val: ; SI-NOT: v0 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s0, s0, v0 +; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, v0, v0, v0 define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll index 05b074bfe2d..c47d02f716b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -53,8 +53,8 @@ define amdgpu_kernel void @test_fabs_fmed3(float addrspace(1)* %out, float %src0 } ; GCN-LABEL: {{^}}test_fneg_fmed3_rr_0: -; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1 -; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]] +; GCN: s_brev_b32 [[NEG0:s[0-9]+]], 1 +; GCN: v_med3_f32 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]] define amdgpu_kernel void @test_fneg_fmed3_rr_0(float addrspace(1)* %out, float %src0, float %src1) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0) %neg.med3 = fsub float -0.0, %med3 @@ -88,8 +88,8 @@ define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0(float addrspace(1)* %out, ; GCN-LABEL: {{^}}test_fneg_fmed3_r_inv2pi_0_foldable_user: ; GCN-DAG: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1 -; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983 -; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]] +; GCN-DAG: s_mov_b32 [[NEG_INV:s[0-9]+]], 0xbe22f983 +; GCN: v_med3_f32 [[MED3:v[0-9]+]], -v{{[0-9]+}}, [[NEG_INV]], [[NEG0]] ; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]] define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(float addrspace(1)* %out, float %src0, float %mul.arg) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll index a7fb618c234..18ede50f40c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -42,8 +42,6 @@ define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { ; VI-OPT: s_mov_b32 ; VI-OPT: s_mov_b32 ; VI-NOOPT: s_waitcnt -; VI-NOOPT-NEXT: v_mov_b32_e32 -; VI-NOOPT-NEXT: s_nop 0 ; VI-NOOPT-NEXT: s_nop 0 ; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI-OPT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll index 83bc8b23472..bc04f6f28f6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: -; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll index 1f46613a8db..2cab9c28db3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: -; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 5c2ec5021f1..f37b3a3637a 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -26,9 +26,10 @@ ; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1 -; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec -; GCN: s_cmp_gt_i32 s4, -1 -; GCN: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]] +; GCN: v_cmp_lt_i32_e32 vcc, -1 +; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec +; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: ; %bb4 ; GCN: buffer_load_dword @@ -38,7 +39,6 @@ ; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]] ; GCN: [[FLOW]]: ; %Flow -; GCN: ; in Loop: Header=BB0_1 Depth=1 ; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]] ; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]] ; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]] diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 8e4b6806f98..eed02187664 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,MAD,GFX10-MAD %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -17,7 +17,6 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -80,7 +79,6 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -108,7 +106,6 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %o ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -237,12 +234,9 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; On GFX10+ we can use two scalar operands. ; GCN-LABEL: {{^}}madak_constant_bus_violation: ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} - +; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] -; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000 -; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5 -; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] -; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 361b8035f61..7b77b4430d4 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -155,9 +155,8 @@ entry: ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]] ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]] -; CHECK-O0: v_readlane_b32 s[[S1:[0-9]+]], v{{[0-9]+}}, 4 -; CHECK-O0: v_readlane_b32 s[[S2:[0-9]+]], v{{[0-9]+}}, 5 -; CHECK-O0: s_mov_b64 exec, s{{\[}}[[S1]]:[[S2]]{{\]}} + +; CHECK-O0: s_mov_b64 exec, [[SAVEEXEC]] ; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index ddda7baef74..4c1a769d599 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -96,6 +96,7 @@ ENDIF: ; preds = %LOOP ; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]] ; GCN: ; %LeafBlock1 +; GCN: s_mov_b64 ; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: ; %case1 @@ -108,6 +109,8 @@ ENDIF: ; preds = %LOOP ; GCN: s_mov_b64 [[BREAK]], -1{{$}} +; GCN: [[FLOW]]: ; %Flow + ; GCN: ; %case0 ; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], ; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec @@ -115,7 +118,7 @@ ENDIF: ; preds = %LOOP ; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec ; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]] -; GCN: [[FLOW]]: ; %Flow4 +; GCN: ; %Flow4 ; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]] ; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]] ; GCN: s_andn2_b64 exec, exec, [[LEFT]] diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll index 24df126e4ca..f773357976c 100644 --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -135,8 +135,8 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo ; GCN-LABEL: {{^}}regression: ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0 -; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0 -; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0 +; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}} +; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}} define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 3d5c3285cba..e0971b8456f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -104,8 +104,7 @@ endif: ; SI: ; %else ; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_gt_i32_e32 vcc, 0, [[AVAL]] -; SI: s_and_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], vcc, exec +; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]] ; SI: ; %if ; SI: buffer_load_dword [[AVAL:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir index 03e81a0431c..3ec7a6678a9 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -16,7 +16,7 @@ registers: body: | ; GCN-LABEL: name: phi_visit_order - ; GCN: S_ADD_I32 + ; GCN: V_ADD_I32 bb.0: liveins: $vgpr0 %7 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 904de8111fa..c83eb378a1e 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -571,6 +571,7 @@ main_body: ; ; TODO: we should keep the loop counter in an SGPR ; +; GCN: v_readfirstlane_b32 ; GCN: s_buffer_load_dword define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index e7555a67033..80071e3407e 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -1,43 +1,28 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-dce-in-ra=0 -o - %s | FileCheck %s ; Don't crash when the use of an undefined value is only detected by the ; register coalescer because it is hidden with subregister insert/extract. target triple="amdgcn--" -define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { ; CHECK-LABEL: foobar: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) - -; FIXME: The change related to the fact that -; DetectDeadLanes pass hit "Copy across incompatible class" SGPR -> VGPR in analysis -; and hence it cannot derive the fact that the vector element is unused. -; Such a copies appear because the float4 vectors and their elements in the test are uniform -; but the PHI node in "ife" block is divergent because of the CF dependency (divergent branch in bb0) - -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 - -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc -; CHECK-NEXT: ; mask branch BB0_2 -; CHECK-NEXT: BB0_1: ; %ift -; CHECK-NEXT: s_mov_b32 s4, s5 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: BB0_2: ; %ife -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc + +; CHECK: BB0_1: +; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr4_sgpr5 killed $exec +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 + +; CHECK: BB0_2: +; CHECK: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll index a1cf6cf6300..82283f39792 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -7,9 +7,10 @@ ; CHECK: s_and_saveexec_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}} +; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader -; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: ; %loop_body -; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]] +; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: +; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]] ; CHECK: s_endpgm define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) { diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index fbf7364bfc4..50cf85e28ae 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -226,12 +226,13 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr ; GCN-LABEL: {{^}}test_s0_s1_k_f32: ; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 +; GCN-DAG: s_mov_b32 [[SK0:s[0-9]+]], 0x44800000 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]] +; GCN-DAG: v_mov_b32_e32 [[VS0:v[0-9]+]], s[[SGPR0]] -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]] -; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS0]], [[VS1]], [[SK0]] +; GCN-DAG: s_mov_b32 [[SK1:s[0-9]+]], 0x45800000 +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VS0]], [[VS1]], [[SK1]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 79a753cc046..3a9970e78e3 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -165,8 +165,8 @@ exit: ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword -; SI-DAG: s_cmpk_eq_i32 s{{[0-9+]}}, 0x100 -; SI: s_cbranch_scc0 [[LABEL_LOOP]] +; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100 +; SI: s_cbranch_vccz [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm @@ -214,7 +214,7 @@ exit: ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] -; SI: ; mask branch [[LABEL_FLOW:BB[0-9]+_[0-9]+]] +; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]] ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20 ; SI: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index b0e9171cbb0..0c52daca047 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s |

