Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 34
-rw-r--r--  llvm/test/CodeGen/X86/combine-pmuldq.ll        | 68
-rw-r--r--  llvm/test/CodeGen/X86/masked_gather_scatter.ll |  8
3 files changed, 54 insertions, 56 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 363cdee48f3..10dcf2168ba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32423,6 +32423,37 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVMUL(SDNode *N, SelectionDAG &DAG,
+                           const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (VT.getScalarType() != MVT::i64)
+    return SDValue();
+
+  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
+  // 32-bits. We can lower with this if the sign bits stretch that far.
+  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(LHS) > 32 &&
+      DAG.ComputeNumSignBits(RHS) > 32) {
+    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, LHS),
+                       DAG.getBitcast(MulVT, RHS));
+  }
+
+  // If the upper bits are zero we can use a single pmuludq.
+  APInt Mask = APInt::getHighBitsSet(64, 32);
+  if (DAG.MaskedValueIsZero(LHS, Mask) && DAG.MaskedValueIsZero(RHS, Mask)) {
+    return DAG.getNode(X86ISD::PMULUDQ, dl, VT, DAG.getBitcast(MulVT, LHS),
+                       DAG.getBitcast(MulVT, RHS));
+  }
+
+  return SDValue();
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -32432,6 +32463,9 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
 
+  if (!DCI.isBeforeLegalize() && VT.isVector())
+    return combineVMUL(N, DAG, Subtarget);
+
   if (!MulConstantOptimization)
     return SDValue();
 
   // An imul is usually smaller than the alternative sequence.
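For reference, a minimal IR sketch of the patterns the new combine targets (the function names are hypothetical and not part of this patch): a 64-bit vector multiply whose operands are zero-extended from 32 bits should now select a single pmuludq, and a sign-extended one should select pmuldq on SSE4.1+.

; Illustrative only: zero extension leaves the upper 32 bits of each element
; clear, so MaskedValueIsZero holds and the combine can emit X86ISD::PMULUDQ.
define <2 x i64> @mul_zext_sketch(<2 x i32> %a, <2 x i32> %b) {
  %az = zext <2 x i32> %a to <2 x i64>
  %bz = zext <2 x i32> %b to <2 x i64>
  %m = mul <2 x i64> %az, %bz
  ret <2 x i64> %m
}

; Illustrative only: sign extension gives more than 32 sign bits per element,
; so with SSE4.1 the combine can emit X86ISD::PMULDQ instead of a general
; 64-bit multiply.
define <2 x i64> @mul_sext_sketch(<2 x i32> %a, <2 x i32> %b) {
  %as = sext <2 x i32> %a to <2 x i64>
  %bs = sext <2 x i32> %b to <2 x i64>
  %m = mul <2 x i64> %as, %bs
  ret <2 x i64> %m
}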
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index aa318c68a88..421c948efab 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -15,32 +15,14 @@ define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-NEXT:    pmuldq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: combine_shuffle_sext_pmuldq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: combine_shuffle_sext_pmuldq:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX512VL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_sext_pmuldq:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512DQVL-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512DQVL-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    retq
+; AVX-LABEL: combine_shuffle_sext_pmuldq:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %3 = sext <2 x i32> %1 to <2 x i64>
@@ -60,32 +42,14 @@ define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: combine_shuffle_zext_pmuludq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: combine_shuffle_zext_pmuludq:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQVL-LABEL: combine_shuffle_zext_pmuludq:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    retq
+; AVX-LABEL: combine_shuffle_zext_pmuludq:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %3 = zext <2 x i32> %1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index d318dde3443..d3521ca9f1e 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -497,7 +497,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
-; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
@@ -510,7 +510,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
-; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
 ; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
@@ -582,7 +582,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
-; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
@@ -595,7 +595,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
-; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
 ; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0