diff options
| author | Craig Topper <craig.topper@intel.com> | 2018-04-13 06:07:18 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-04-13 06:07:18 +0000 |
| commit | 254ed028a4bd4fa81d0049d90e6ab23d704dd366 (patch) | |
| tree | d271a92f4eeb695afa59afb66b153125a49b7d36 /llvm/lib | |
| parent | 7fc737a2472996609a855c2b8951b4a327d13d69 (diff) | |
| download | bcm5719-llvm-254ed028a4bd4fa81d0049d90e6ab23d704dd366.tar.gz bcm5719-llvm-254ed028a4bd4fa81d0049d90e6ab23d704dd366.zip | |
[X86] Remove the pmuldq/pmuldq intrinsics and replace with native IR.
This completes the work started in r329604 and r329605 when we changed clang to no longer use the intrinsics.
We lost some InstCombine SimplifyDemandedBit optimizations through this change as we aren't able to fold 'and', bitcast, shuffle very well.
llvm-svn: 329990
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 63 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 18 | ||||
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 69 | ||||
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 29 |
4 files changed, 45 insertions, 134 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 41309f2ef74..78385ac82d3 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -168,6 +168,12 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("avx512.mask.pmull.") || // Added in 4.0 Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0 Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0 + Name == "sse2.pmulu.dq" || // Added in 7.0 + Name == "sse41.pmuldq" || // Added in 7.0 + Name == "avx2.pmulu.dq" || // Added in 7.0 + Name == "avx2.pmul.dq" || // Added in 7.0 + Name == "avx512.pmulu.dq.512" || // Added in 7.0 + Name == "avx512.pmul.dq.512" || // Added in 7.0 Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0 Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0 Name.startswith("avx512.mask.pmul.hr.sw.") || // Added in 7.0 @@ -906,6 +912,35 @@ static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI, return Res; } +static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) { + Type *Ty = CI.getType(); + + // Arguments have a vXi32 type so cast to vXi64. + Value *LHS = Builder.CreateBitCast(CI.getArgOperand(0), Ty); + Value *RHS = Builder.CreateBitCast(CI.getArgOperand(1), Ty); + + if (IsSigned) { + // Shift left then arithmetic shift right. + Constant *ShiftAmt = ConstantInt::get(Ty, 32); + LHS = Builder.CreateShl(LHS, ShiftAmt); + LHS = Builder.CreateAShr(LHS, ShiftAmt); + RHS = Builder.CreateShl(RHS, ShiftAmt); + RHS = Builder.CreateAShr(RHS, ShiftAmt); + } else { + // Clear the upper bits. + Constant *Mask = ConstantInt::get(Ty, 0xffffffff); + LHS = Builder.CreateAnd(LHS, Mask); + RHS = Builder.CreateAnd(RHS, Mask); + } + + Value *Res = Builder.CreateMul(LHS, RHS); + + if (CI.getNumArgOperands() == 4) + Res = EmitX86Select(Builder, CI.getArgOperand(3), Res, CI.getArgOperand(2)); + + return Res; +} + // Applying mask on vector of i1's and make sure result is at least 8 bits wide. static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder,Value *Vec, Value *Mask, unsigned NumElts) { @@ -1028,24 +1063,6 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_avx512_pshuf_b_512; else llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmul.dq.")) { - if (VecWidth == 128) - IID = Intrinsic::x86_sse41_pmuldq; - else if (VecWidth == 256) - IID = Intrinsic::x86_avx2_pmul_dq; - else if (VecWidth == 512) - IID = Intrinsic::x86_avx512_pmul_dq_512; - else - llvm_unreachable("Unexpected intrinsic"); - } else if (Name.startswith("pmulu.dq.")) { - if (VecWidth == 128) - IID = Intrinsic::x86_sse2_pmulu_dq; - else if (VecWidth == 256) - IID = Intrinsic::x86_avx2_pmulu_dq; - else if (VecWidth == 512) - IID = Intrinsic::x86_avx512_pmulu_dq_512; - else - llvm_unreachable("Unexpected intrinsic"); } else if (Name.startswith("pmul.hr.sw.")) { if (VecWidth == 128) IID = Intrinsic::x86_ssse3_pmul_hr_sw_128; @@ -1455,6 +1472,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name.startswith("avx2.pminu") || Name.startswith("avx512.mask.pminu"))) { Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT); + } else if (IsX86 && (Name == "sse2.pmulu.dq" || + Name == "avx2.pmulu.dq" || + Name == "avx512.pmulu.dq.512" || + Name.startswith("avx512.mask.pmulu.dq."))) { + Rep = upgradePMULDQ(Builder, *CI, /*Signed*/false); + } else if (IsX86 && (Name == "sse41.pmuldq" || + Name == "avx2.pmul.dq" || + Name == "avx512.pmul.dq.512" || + Name.startswith("avx512.mask.pmul.dq."))) { + Rep = upgradePMULDQ(Builder, *CI, /*Signed*/true); } else if (IsX86 && (Name == "sse2.cvtdq2pd" || Name == "sse2.cvtps2pd" || Name == "avx.cvtdq2.pd.256" || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1870a668c98..08e4f6387b6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20855,24 +20855,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::x86_sse41_pmuldq: - case Intrinsic::x86_avx2_pmul_dq: - case Intrinsic::x86_avx512_pmul_dq_512: { - MVT OpVT = Op.getSimpleValueType(); - return DAG.getNode(X86ISD::PMULDQ, dl, OpVT, - DAG.getBitcast(OpVT, Op.getOperand(1)), - DAG.getBitcast(OpVT, Op.getOperand(2))); - } - - case Intrinsic::x86_sse2_pmulu_dq: - case Intrinsic::x86_avx2_pmulu_dq: - case Intrinsic::x86_avx512_pmulu_dq_512: { - MVT OpVT = Op.getSimpleValueType(); - return DAG.getNode(X86ISD::PMULUDQ, dl, OpVT, - DAG.getBitcast(OpVT, Op.getOperand(1)), - DAG.getBitcast(OpVT, Op.getOperand(2))); - } - case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index e545cc27a70..c62b4d6cdd9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -566,55 +566,6 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, return Builder.CreateAShr(Vec, ShiftVec); } -static Value *simplifyX86muldq(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Type *ResTy = II.getType(); - assert(Arg0->getType()->getScalarSizeInBits() == 32 && - Arg1->getType()->getScalarSizeInBits() == 32 && - ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types"); - - // muldq/muludq(undef, undef) -> zero (matches generic mul behavior) - if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) - return ConstantAggregateZero::get(ResTy); - - // Constant folding. - // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)), - // vXi64 sext(shuffle<0,2,..>(Arg1)))) - // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)), - // vXi64 zext(shuffle<0,2,..>(Arg1)))) - if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) - return nullptr; - - unsigned NumElts = ResTy->getVectorNumElements(); - assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) && - Arg1->getType()->getVectorNumElements() == (2 * NumElts) && - "Unexpected muldq/muludq types"); - - unsigned IntrinsicID = II.getIntrinsicID(); - bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID || - Intrinsic::x86_avx2_pmul_dq == IntrinsicID || - Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID); - - SmallVector<unsigned, 16> ShuffleMask; - for (unsigned i = 0; i != NumElts; ++i) - ShuffleMask.push_back(i * 2); - - auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask); - auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask); - - if (IsSigned) { - LHS = Builder.CreateSExt(LHS, ResTy); - RHS = Builder.CreateSExt(RHS, ResTy); - } else { - LHS = Builder.CreateZExt(LHS, ResTy); - RHS = Builder.CreateZExt(RHS, ResTy); - } - - return Builder.CreateMul(LHS, RHS); -} - static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) { Value *Arg0 = II.getArgOperand(0); Value *Arg1 = II.getArgOperand(1); @@ -2642,26 +2593,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, V); break; - case Intrinsic::x86_sse2_pmulu_dq: - case Intrinsic::x86_sse41_pmuldq: - case Intrinsic::x86_avx2_pmul_dq: - case Intrinsic::x86_avx2_pmulu_dq: - case Intrinsic::x86_avx512_pmul_dq_512: - case Intrinsic::x86_avx512_pmulu_dq_512: { - if (Value *V = simplifyX86muldq(*II, Builder)) - return replaceInstUsesWith(*II, V); - - unsigned VWidth = II->getType()->getVectorNumElements(); - APInt UndefElts(VWidth, 0); - APInt DemandedElts = APInt::getAllOnesValue(VWidth); - if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) { - if (V != II) - return replaceInstUsesWith(*II, V); - return II; - } - break; - } - case Intrinsic::x86_sse2_packssdw_128: case Intrinsic::x86_sse2_packsswb_128: case Intrinsic::x86_avx2_packssdw: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 35ddd7a3eb2..0c03cc31228 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1436,35 +1436,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; - case Intrinsic::x86_sse2_pmulu_dq: - case Intrinsic::x86_sse41_pmuldq: - case Intrinsic::x86_avx2_pmul_dq: - case Intrinsic::x86_avx2_pmulu_dq: - case Intrinsic::x86_avx512_pmul_dq_512: - case Intrinsic::x86_avx512_pmulu_dq_512: { - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned InnerVWidth = Op0->getType()->getVectorNumElements(); - assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); - - APInt InnerDemandedElts(InnerVWidth, 0); - for (unsigned i = 0; i != VWidth; ++i) - if (DemandedElts[i]) - InnerDemandedElts.setBit(i * 2); - - UndefElts2 = APInt(InnerVWidth, 0); - TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2, - Depth + 1); - if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } - - UndefElts3 = APInt(InnerVWidth, 0); - TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3, - Depth + 1); - if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } - - break; - } - case Intrinsic::x86_sse2_packssdw_128: case Intrinsic::x86_sse2_packsswb_128: case Intrinsic::x86_sse2_packuswb_128: |

