diff options
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r-- | llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp | 23 | ||||
-rw-r--r-- | llvm/lib/Target/X86/Utils/X86ShuffleDecode.h | 4 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 27 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 1 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 3 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 18 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86MCInstLower.cpp | 27 |
7 files changed, 78 insertions, 25 deletions
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 6d42a101b0e..9aca2da4902 100644 --- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -287,4 +287,27 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } +void DecodeVPERMILPMask(const ConstantDataSequential *C, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); + assert(MaskTy->getVectorElementType()->isIntegerTy() && + "Expected integer constant mask elements!"); + int ElementBits = MaskTy->getScalarSizeInBits(); + int NumElements = MaskTy->getVectorNumElements(); + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + assert((unsigned)NumElements == C->getNumElements() && + "Constant mask has a different number of elements!"); + + ShuffleMask.reserve(NumElements); + for (int i = 0; i < NumElements; ++i) { + int Base = (i * ElementBits / 128) * (128 / ElementBits); + uint64_t Element = C->getElementAsInteger(i); + // Only the least significant 2 bits of the integer are used. + int Index = Base + (Element & 0x3); + ShuffleMask.push_back(Index); + } +} + } // llvm namespace diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 03a843e7b8d..8034d209ac3 100644 --- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -84,6 +84,10 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const ConstantDataSequential *C, + SmallVectorImpl<int> &ShuffleMask); + } // llvm namespace #endif diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2e195080f8b..40ab77aaaa0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9395,26 +9395,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // If we have a single input shuffle with different shuffle patterns in the - // two 128-bit lanes, just do two shuffles and blend them together. This will - // be faster than extracting the high 128-bit lane, shuffling it, and - // re-inserting it. Especially on newer processors where blending is *the* - // fastest operation. + // two 128-bit lanes use the variable mask to VPERMILPS. if (isSingleInputShuffleMask(Mask)) { - int LoMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; - int HiMask[4] = {Mask[4], Mask[5], Mask[6], Mask[7]}; - for (int &M : HiMask) - if (M >= 0) - M -= 4; - SDValue Lo = V1, Hi = V1; - if (!isNoopShuffleMask(LoMask)) - Lo = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Lo, - getV4X86ShuffleImm8ForMask(LoMask, DAG)); - if (!isNoopShuffleMask(HiMask)) - Hi = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Hi, - getV4X86ShuffleImm8ForMask(HiMask, DAG)); - unsigned BlendMask = 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7; - return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, Lo, Hi, - DAG.getConstant(BlendMask, MVT::i8)); + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); } // Shuffle the input elements into the desired positions in V1 and V2 and diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index a624fa25dab..a16cf4a0b64 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -343,6 +343,7 @@ namespace llvm { MOVSS, UNPCKL, UNPCKH, + VPERMILPV, VPERMILPI, VPERMV, VPERMV3, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 455991e4681..2badbb7d76b 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -232,6 +234,7 @@ def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 7a7ca8548a1..a186899d231 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8418,6 +8418,15 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), @@ -8428,6 +8437,15 @@ def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index ded84fc28f1..5665a012606 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1022,15 +1022,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::PSHUFBrm: case X86::VPSHUFBrm: - // Lower PSHUFB normally but add a comment if we can find a constant - // shuffle mask. We won't be able to do this at the MC layer because the - // mask isn't an immediate. + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. std::string Comment; raw_string_ostream CS(Comment); SmallVector<int, 16> Mask; - assert(MI->getNumOperands() >= 6 && - "Wrong number of operands for PSHUFBrm or VPSHUFBrm"); + // All of these instructions accept a constant pool operand as their fifth. + assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!"); const MachineOperand &DstOp = MI->getOperand(0); const MachineOperand &SrcOp = MI->getOperand(1); const MachineOperand &MaskOp = MI->getOperand(5); @@ -1061,7 +1065,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { assert(MaskTy == C->getType() && "Expected a constant of the same type!"); - DecodePSHUFBMask(C, Mask); + switch (MI->getOpcode()) { + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + DecodePSHUFBMask(C, Mask); + break; + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: + DecodeVPERMILPMask(C, Mask); + } + assert(Mask.size() == MaskTy->getVectorNumElements() && "Shuffle mask has a different size than its type!"); } |