author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2017-07-26 21:29:15 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2017-07-26 21:29:15 +0000
commit	3197eb69812f9ff1c0ef4b2a7b894397dec3de24 (patch)
tree	9eb5e70f569cbb3569bcf3c65be5a9bc02fa5265 /llvm/test
parent	b3ed4bcb8f1b0a9343d47628ae98127ca33575d1 (diff)
[AMDGPU] Optimize SI_IF lowering for simple if regions
Currently SI_IF results in an s_and_saveexec_b64 followed by an s_xor_b64. The xor is used to extract only the changed bits. In the case of a simple if region, where the only use of that value is in the SI_END_CF to restore the old exec mask, we can omit the xor and perform an or of the exec mask with the original exec value saved by the s_and_saveexec_b64.

Differential Revision: https://reviews.llvm.org/D35861

llvm-svn: 309185
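As a minimal sketch of the transformation (not part of the commit; the SGPR pair s[2:3] and the block layout are illustrative), a simple divergent if currently lowers to:

    s_and_saveexec_b64 s[2:3], vcc   ; s[2:3] = exec, then exec &= vcc
    s_xor_b64 s[2:3], exec, s[2:3]   ; s[2:3] = lanes the and masked off
    ; ... then-block ...
    s_or_b64 exec, exec, s[2:3]      ; SI_END_CF: turn the masked lanes back on

With this patch, when the saved value is only consumed by SI_END_CF, the xor is dropped; s[2:3] still holds the full original exec, so or'ing it back in restores the mask directly:

    s_and_saveexec_b64 s[2:3], vcc   ; s[2:3] = original exec
    ; ... then-block ...
    s_or_b64 exec, exec, s[2:3]      ; or with the saved exec restores the old mask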
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/AMDGPU/branch-condition-and.ll                      1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/branch-relaxation.ll                         3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll                 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll                               2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll                      1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll               1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ret_jump.ll                                  2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll   2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/skip-if-dead.ll                              3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll                1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-cfg.ll                               3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll            2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/valu-i1.ll                                  15
13 files changed, 6 insertions, 32 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
index 662ea37a2b9..51b91ee286f 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -14,7 +14,6 @@
; GCN-DAG: v_cmp_lt_f32_e32 vcc,
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 15f579eb06d..233d5a5822f 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -141,7 +141,6 @@ bb3:
; GCN: buffer_load_dword
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]
; GCN: v_nop_e64
; GCN: v_nop_e64
@@ -385,7 +384,6 @@ bb3:
; GCN-LABEL: {{^}}uniform_inside_divergent:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
@@ -436,7 +434,6 @@ endif:
; GCN-LABEL: {{^}}analyze_mask_branch:
; GCN: v_cmp_lt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 79d9b169187..08a31ac1847 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -20,7 +20,6 @@
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -101,7 +100,6 @@ endif:
; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 4a0213dd1de..63a9f1feb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -4,11 +4,9 @@
; SI-LABEL: {{^}}br_i1_phi:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; SI: s_and_saveexec_b64
-; SI: s_xor_b64
; SI: v_mov_b32_e32 [[REG]], -1{{$}}
; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
; SI: s_and_saveexec_b64
-; SI: s_xor_b64
; SI: s_endpgm
define amdgpu_kernel void @br_i1_phi(i32 %arg) {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 1b3e09a81e5..25b0ad54421 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]]
; SI: buffer_load_dword [[LOAD:v[0-9]+]]
; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index ba3ff0b08bc..943beb6bb73 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -81,7 +81,6 @@
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN-NEXT: s_and_saveexec_b64
-; GCN-NEXT: s_xor_b64
; GCN: ; %exit0
; GCN: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
index 1acae60f305..7c2e28108df 100644
--- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
@@ -11,7 +11,6 @@
; GCN-NEXT: ; %else
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
@@ -60,7 +59,6 @@ ret.bb: ; preds = %else, %main_body
; GCN: ; BB#{{[0-9]+}}: ; %else
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
; GCN-NEXT: ; %unreachable.bb
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 5b0d5274d5b..f82f7731227 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -3,7 +3,6 @@
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
@@ -31,7 +30,6 @@ ret:
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
; GCN: v_cmp_ne_u32
; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index ea8b87f1dee..0c052ae0a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -202,7 +202,6 @@ exit:
; CHECK-LABEL: {{^}}test_kill_divergent_loop:
; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: s_cbranch_execz [[EXIT]]
@@ -337,7 +336,6 @@ bb7: ; preds = %bb4
; CHECK-LABEL: {{^}}if_after_kill_block:
; CHECK: ; BB#0:
; CHECK: s_and_saveexec_b64
-; CHECK: s_xor_b64
; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
; CHECK: v_cmpx_le_f32_e32 vcc, 0,
@@ -347,7 +345,6 @@ bb7: ; preds = %bb4
; CHECK: v_cmp_neq_f32_e32 vcc, 0,
; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
-; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
; CHECK-NOT: branch
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 35615c40d49..fb9628cf7b9 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -10,7 +10,6 @@ target triple="amdgcn--"
; CHECK: v_mbcnt_lo_u32_b32_e64
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; BB0_1:
; CHECK: s_load_dword s0, s[0:1], 0xa
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index a9d45d71fa2..9416336a44e 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -303,7 +303,6 @@ done:
; GCN-LABEL: {{^}}uniform_inside_divergent:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
; GCN: s_endpgm
@@ -335,7 +334,6 @@ endif:
; GCN: [[IF_LABEL]]:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: buffer_store_dword [[ONE]]
define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
@@ -360,7 +358,6 @@ endif:
; GCN-LABEL: {{^}}divergent_if_uniform_if:
; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: buffer_store_dword [[ONE]]
; GCN: s_or_b64 exec, exec, [[MASK]]
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 8a08f9d8bb0..82283f39792 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -5,7 +5,6 @@
; CHECK-LABEL: {{^}}test1:
; CHECK: v_cmp_ne_u32_e32 vcc, 0
; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
; CHECK-NEXT: ; mask branch
; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
@@ -35,7 +34,6 @@ out:
; CHECK-LABEL: {{^}}test2:
; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
; CHECK-NEXT: ; mask branch
; CHECK-NEXT: s_cbranch_execz
define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index a67f36d0a7e..a5d810de769 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -18,7 +18,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
; SI: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
; SI-NEXT: ; mask branch
; v_mov should be after exec modification
@@ -66,8 +65,7 @@ end:
; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
@@ -94,8 +92,7 @@ exit:
; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
@@ -160,8 +157,8 @@ exit:
; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
@@ -202,8 +199,8 @@ exit:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
; Initialize inner condition to false
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader