| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-10-24 19:11:28 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-10-24 19:11:28 +0000 |
| commit | c5bb362b1328dbce6cba55452bce12eed8f72663 (patch) | |
| tree | 07981dae1d753db03f2ab5a133251b9458f21735 /llvm | |
| parent | 06570954e2cd955f3c7c246131ca350ade78d754 (diff) | |
[X86][SSE] Add SimplifyDemandedBitsForTargetNode PMULDQ/PMULUDQ handling
Add X86 SimplifyDemandedBitsForTargetNode and use it to simplify PMULDQ/PMULUDQ target nodes.
This enables us to repeatedly simplify the node's arguments; the previous approach to this had to be reverted due to PR39398.
Differential Revision: https://reviews.llvm.org/D53643
llvm-svn: 345182
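
The core fact behind the fold: PMULDQ/PMULUDQ read only bits [31:0] of each 64-bit source element, so demanded-bits analysis can prune anything that only feeds the upper halves. A minimal scalar sketch of one PMULUDQ lane (illustrative helper name, not LLVM code):

```cpp
#include <cstdint>

// One PMULUDQ lane: the 64-bit product of the low 32 bits of each 64-bit
// source element. Bits [63:32] of In0/In1 never influence the result,
// which is why the hook can narrow each operand's demanded mask via
// APInt::getLowBitsSet(64, 32).
uint64_t pmuludqLane(uint64_t In0, uint64_t In1) {
  return (In0 & 0xFFFFFFFFull) * (In1 & 0xFFFFFFFFull);
}
```

(For PMULDQ the low halves are sign-extended instead, but the upper input bits are equally dead.)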
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 32 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 6 |
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-pmuldq.ll | 30 |
| -rw-r--r-- | llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll | 68 |
4 files changed, 70 insertions, 66 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 44d0d711dd1..d86f9d5a220 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31870,6 +31870,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   return false;
 }
 
+bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  switch(Opc) {
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ: {
+    // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
+    KnownBits KnownOp;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    APInt DemandedMask = OriginalDemandedBits & APInt::getLowBitsSet(64, 32);
+    if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    break;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, Known, TLO, Depth);
+}
+
 /// Check if a vector extract from a target-specific shuffle of a load can be
 /// folded into a single element load.
 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
@@ -40362,13 +40386,9 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;
 
+  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  APInt DemandedMask(APInt::getLowBitsSet(64, 32));
-
-  // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
-  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI))
-    return SDValue(N, 0);
-  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI))
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
 
   return SDValue();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index eeef7579714..fea7ecbdbb4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -874,6 +874,12 @@ namespace llvm {
                                               TargetLoweringOpt &TLO,
                                               unsigned Depth) const override;
 
+    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                           const APInt &DemandedBits,
+                                           KnownBits &Known,
+                                           TargetLoweringOpt &TLO,
+                                           unsigned Depth) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index edc6cb01d97..cd58947b186 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -47,26 +47,10 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: combine_shuffle_zero_pmuludq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    retq
+; AVX-LABEL: combine_shuffle_zero_pmuludq:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   %3 = bitcast <4 x i32> %1 to <2 x i64>
@@ -84,22 +68,16 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
 ;
 ; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX512DQVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    retq
   %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 9f339a8a555..82385386c88 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -143,31 +143,31 @@ define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_even_div(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_even_div:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,3435973837,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    psrld $3, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $3, %xmm2
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [6,10,12,14]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm5
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -377,30 +377,30 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind readnone {
 define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone {
 ; CHECK-SSE2-LABEL: test_urem_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,0,2863311531,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm3
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
-; CHECK-SSE2-NEXT:    psrld $3, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,1,12,14]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
+; CHECK-SSE2-NEXT:    psrld $3, %xmm2
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [6,1,12,14]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
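
The combine-pmuldq.ll changes show the payoff: the vpxor/vpblendd sequences that zeroed the odd 32-bit lanes ahead of the multiply are now eliminated as dead. A rough scalar model of the 128-bit PMULUDQ (hypothetical helper, with std::array values standing in for XMM registers) makes the reason plain, since the odd lanes are never read:

```cpp
#include <array>
#include <cstdint>

// Rough model of 128-bit PMULUDQ: only the even 32-bit lanes (0 and 2) of
// each source are read, so zeroing lanes 1 and 3 beforehand (the removed
// vpxor + vpblendd) cannot change either 64-bit product.
std::array<uint64_t, 2> pmuludq128(const std::array<uint32_t, 4> &A,
                                   const std::array<uint32_t, 4> &B) {
  return {uint64_t(A[0]) * uint64_t(B[0]),
          uint64_t(A[2]) * uint64_t(B[2])};
}
```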

