diff options
| author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2018-06-21 16:02:05 +0000 | 
|---|---|---|
| committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2018-06-21 16:02:05 +0000 | 
| commit | 22ee191c3ea92dd7054975c9e9074fbf3c3825f5 (patch) | |
| tree | a6ced5032526de4ef2d8e59d379fe05ec0a49acc /llvm/test/CodeGen/AMDGPU | |
| parent | 21a2973cc4c2d803ff680190a2ee17c15fb587dc (diff) | |
| download | bcm5719-llvm-22ee191c3ea92dd7054975c9e9074fbf3c3825f5.tar.gz bcm5719-llvm-22ee191c3ea92dd7054975c9e9074fbf3c3825f5.zip  | |
DAG combine "and|or (select c, -1, 0), x" -> "select c, x, 0|-1"
Allow folding of "and"/"or" binops with a non-constant operand when the
select's two arguments are the constants 0 and -1.
Normally this pattern with the "and" opcode never reaches the DAG combiner,
because it is already simplified earlier by InstCombine. However, AMDGPU
produces it during lowering, where InstCombine has no chance to optimize it
away. The same pattern with the "or" opcode, in turn, can reach the DAG.
Differential Revision: https://reviews.llvm.org/D48301
llvm-svn: 335250
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/dagcombine-select.ll | 102 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/udivrem.ll | 26 | 
2 files changed, 111 insertions, 17 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll index 645e6e1e2dd..0e568a65681 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -1,5 +1,107 @@  ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; GCN-LABEL: {{^}}select_and1: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN:     store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, i32 0, i32 -1 +  %a = and i32 %y, %s +  store i32 %a, i32 addrspace(1)* %p, align 4 +  ret void +} + +; GCN-LABEL: {{^}}select_and2: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN:     store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, i32 0, i32 -1 +  %a = and i32 %s, %y +  store i32 %a, i32 addrspace(1)* %p, align 4 +  ret void +} + +; GCN-LABEL: {{^}}select_and3: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN:     store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, i32 -1, i32 0 +  %a = and i32 %y, %s +  store i32 %a, i32 addrspace(1)* %p, align 4 +  ret void +} + +; GCN-LABEL: {{^}}select_and_v4: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN:     store_dword +define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) 
{ +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> +  %a = and <4 x i32> %s, %y +  store <4 x i32> %a, <4 x i32> addrspace(1)* %p, align 32 +  ret void +} + +; GCN-LABEL: {{^}}select_or1: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN:     store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, i32 0, i32 -1 +  %a = or i32 %y, %s +  store i32 %a, i32 addrspace(1)* %p, align 4 +  ret void +} + +; GCN-LABEL: {{^}}select_or2: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN:     store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, i32 0, i32 -1 +  %a = or i32 %s, %y +  store i32 %a, i32 addrspace(1)* %p, align 4 +  ret void +} + +; GCN-LABEL: {{^}}select_or3: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN:     store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, i32 -1, i32 0 +  %a = or i32 %y, %s +  store i32 %a, i32 addrspace(1)* %p, align 4 +  ret void +} + +; GCN-LABEL: {{^}}select_or_v4: +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN:     store_dword +define amdgpu_kernel void @select_or_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) { +  %c = icmp slt i32 %x, 11 +  %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> +  %a = or <4 x i32> %s, %y +  store <4 x 
i32> %a, <4 x i32> addrspace(1)* %p, align 32 +  ret void +} +  ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants:  ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9,  define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) { diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 08b764824ca..b85c6bffe7e 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -31,25 +31,25 @@  ; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]]  ; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]]  ; SI-DAG: v_sub_{{[iu]}}32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] +; SI: v_cmp_eq_u32_e64 [[CC1:s\[[0-9:]+\]]], 0, [[RCP_HI]] +; SI: v_cndmask_b32_e64 [[CND1:v[0-9]+]], [[RCP_LO]], [[NEG_RCP_LO]], [[CC1]] +; SI: v_mul_hi_u32 [[E:v[0-9]+]], [[CND1]], [[RCP]]  ; SI-DAG: v_add_{{[iu]}}32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]]  ; SI-DAG: v_subrev_{{[iu]}}32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] -; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] +; SI: v_cndmask_b32_e64 [[CND2:v[0-9]+]], [[RCP_S_E]], [[RCP_A_E]], [[CC1]] +; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]], [[CND2]], +; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]], [[CND2]]  ; SI-DAG: v_add_{{[iu]}}32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]  ; SI-DAG: v_sub_{{[iu]}}32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]]  ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64  ; SI-DAG: v_subrev_{{[iu]}}32_e32 [[Quotient_S_One:v[0-9]+]],  ; SI-DAG: v_subrev_{{[iu]}}32_e32 [[Remainder_S_Den:v[0-9]+]], -; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]  ; SI-DAG: v_cndmask_b32_e64  ; SI-DAG: v_cndmask_b32_e64  ; SI-DAG: v_add_{{[iu]}}32_e32 [[Remainder_A_Den:v[0-9]+]],  ; SI-DAG: v_cndmask_b32_e64  ; SI-DAG: v_cndmask_b32_e64 +; SI-NOT: 
v_and_b32  ; SI: s_endpgm  define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {    %result0 = udiv i32 %x, %y @@ -124,8 +124,6 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1  ; SI-DAG: v_mul_lo_i32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32  ; SI-DAG: v_add_{{[iu]}}32_e32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 @@ -147,8 +145,6 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1  ; SI-DAG: v_mul_lo_i32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32  ; SI-DAG: v_add_{{[iu]}}32_e32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 @@ -157,6 +153,7 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64  ; SI-DAG: v_cndmask_b32_e64 +; SI-NOT: v_and_b32  ; SI: s_endpgm  define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {    %result0 = udiv <2 x i32> %x, %y @@ -274,8 +271,6 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3  ; SI-DAG: v_mul_lo_i32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32  ; SI-DAG: v_add_{{[iu]}}32_e32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 @@ -297,8 +292,6 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3  ; SI-DAG: v_mul_lo_i32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32  ; SI-DAG: v_add_{{[iu]}}32_e32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 @@ -320,8 +313,6 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3  ; 
SI-DAG: v_mul_lo_i32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32  ; SI-DAG: v_add_{{[iu]}}32_e32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 @@ -339,6 +330,7 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3  ; SI-DAG: v_add_{{[iu]}}32_e32  ; SI-DAG: v_subrev_{{[iu]}}32_e32  ; SI-DAG: v_cndmask_b32_e64 +; SI-NOT: v_and_b32  ; SI: s_endpgm  define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {    %result0 = udiv <4 x i32> %x, %y  | 

