summary | refs | log | tree | commit | diff | stats
path: root/llvm/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r-- llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp 23
-rw-r--r-- llvm/lib/Target/X86/Utils/X86ShuffleDecode.h 4
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 27
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.h 1
-rw-r--r-- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td 3
-rw-r--r-- llvm/lib/Target/X86/X86InstrSSE.td 18
-rw-r--r-- llvm/lib/Target/X86/X86MCInstLower.cpp 27
7 files changed, 78 insertions, 25 deletions
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 6d42a101b0e..9aca2da4902 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -287,4 +287,27 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
+/// \brief Decode a variable VPERMILPS/VPERMILPD mask held in a vector constant.
+///
+/// Appends one shuffle index per mask element to \p ShuffleMask. Each index is
+/// the first element index of the 128-bit lane containing that element plus
+/// the in-lane selector read from the element's value.
+///
+/// \param C            Integer vector constant holding the mask; it must have
+///                     2, 4 or 8 elements.
+/// \param ShuffleMask  Output vector; the decoded indices are appended to it.
+void DecodeVPERMILPMask(const ConstantDataSequential *C,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
+         "Expected integer constant mask elements!");
+  int ElementBits = MaskTy->getScalarSizeInBits();
+  int NumElements = MaskTy->getVectorNumElements();
+  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+         "Unexpected number of vector elements.");
+  assert((unsigned)NumElements == C->getNumElements() &&
+         "Constant mask has a different number of elements!");
+
+  ShuffleMask.reserve(NumElements);
+  for (int i = 0; i < NumElements; ++i) {
+    // First element index of the 128-bit lane that contains element i.
+    int Base = (i * ElementBits / 128) * (128 / ElementBits);
+    uint64_t Element = C->getElementAsInteger(i);
+    // VPERMILPD (64-bit elements) takes its 1-bit selector from bit 1 of each
+    // element, whereas VPERMILPS (32-bit elements) uses the low two bits.
+    // Masking a 64-bit element with 0x3 would produce offsets outside the
+    // two-element lane and would read the wrong bit.
+    int Selector = ElementBits == 64 ? (Element >> 1) & 0x1 : Element & 0x3;
+    ShuffleMask.push_back(Base + Selector);
+  }
+}
+
} // llvm namespace
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index 03a843e7b8d..8034d209ac3 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -84,6 +84,10 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
/// No VT provided since it only works on 256-bit, 4 element vectors.
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const ConstantDataSequential *C,
+ SmallVectorImpl<int> &ShuffleMask);
+
} // llvm namespace
#endif
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2e195080f8b..40ab77aaaa0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9395,26 +9395,15 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
// If we have a single input shuffle with different shuffle patterns in the
- // two 128-bit lanes, just do two shuffles and blend them together. This will
- // be faster than extracting the high 128-bit lane, shuffling it, and
- // re-inserting it. Especially on newer processors where blending is *the*
- // fastest operation.
+ // two 128-bit lanes use the variable mask to VPERMILPS.
if (isSingleInputShuffleMask(Mask)) {
- int LoMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
- int HiMask[4] = {Mask[4], Mask[5], Mask[6], Mask[7]};
- for (int &M : HiMask)
- if (M >= 0)
- M -= 4;
- SDValue Lo = V1, Hi = V1;
- if (!isNoopShuffleMask(LoMask))
- Lo = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Lo,
- getV4X86ShuffleImm8ForMask(LoMask, DAG));
- if (!isNoopShuffleMask(HiMask))
- Hi = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Hi,
- getV4X86ShuffleImm8ForMask(HiMask, DAG));
- unsigned BlendMask = 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7;
- return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, Lo, Hi,
- DAG.getConstant(BlendMask, MVT::i8));
+ SDValue VPermMask[8];
+ for (int i = 0; i < 8; ++i)
+ VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+ : DAG.getConstant(Mask[i], MVT::i32);
+ return DAG.getNode(
+ X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
}
// Shuffle the input elements into the desired positions in V1 and V2 and
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index a624fa25dab..a16cf4a0b64 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -343,6 +343,7 @@ namespace llvm {
MOVSS,
UNPCKL,
UNPCKH,
+ VPERMILPV,
VPERMILPI,
VPERMV,
VPERMV3,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 455991e4681..2badbb7d76b 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -232,6 +234,7 @@ def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 7a7ca8548a1..a186899d231 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8418,6 +8418,15 @@ let ExeDomain = SSEPackedDouble in {
}
let Predicates = [HasAVX] in {
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
+ (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (VPERMILPSYrm VR256:$src1, addr:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
+ (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
+ (VPERMILPDYrm VR256:$src1, addr:$src2)>;
+
def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
@@ -8428,6 +8437,15 @@ def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
(VPERMILPDYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
+ (VPERMILPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
+ (VPERMILPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
+ (VPERMILPDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
+ (VPERMILPDrm VR128:$src1, addr:$src2)>;
+
def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
(VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index ded84fc28f1..5665a012606 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1022,15 +1022,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
- // Lower PSHUFB normally but add a comment if we can find a constant
- // shuffle mask. We won't be able to do this at the MC layer because the
- // mask isn't an immediate.
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPDYrm:
+ // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+ // a constant shuffle mask. We won't be able to do this at the MC layer
+ // because the mask isn't an immediate.
std::string Comment;
raw_string_ostream CS(Comment);
SmallVector<int, 16> Mask;
- assert(MI->getNumOperands() >= 6 &&
- "Wrong number of operands for PSHUFBrm or VPSHUFBrm");
+ // All of these instructions carry their constant pool reference in operand 5.
+ assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!");
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &SrcOp = MI->getOperand(1);
const MachineOperand &MaskOp = MI->getOperand(5);
@@ -1061,7 +1065,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(MaskTy == C->getType() &&
"Expected a constant of the same type!");
- DecodePSHUFBMask(C, Mask);
+ switch (MI->getOpcode()) {
+ case X86::PSHUFBrm:
+ case X86::VPSHUFBrm:
+ DecodePSHUFBMask(C, Mask);
+ break;
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPDYrm:
+ DecodeVPERMILPMask(C, Mask);
+ }
+
assert(Mask.size() == MaskTy->getVectorNumElements() &&
"Shuffle mask has a different size than its type!");
}
OpenPOWER on IntegriCloud