summary refs log tree commit diff stats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp120
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll7
2 files changed, 58 insertions, 69 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b89914f8893..c72f195ab65 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27120,29 +27120,44 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
ContainsZeros |= (M == SM_SentinelZero);
}
- // Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskScalarSizeInBits, Mask,
- 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
- PermuteImm = (unsigned)ShiftAmt;
+ // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
+ if (!ContainsZeros && MaskScalarSizeInBits == 64) {
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ } else if (AllowFloatDomain && Subtarget.hasAVX()) {
+ // VPERMILPD can permute with a non-repeating shuffle.
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
return true;
}
}
- // Ensure we don't contain any zero elements.
- if (ContainsZeros)
- return false;
-
- assert(llvm::all_of(Mask, [&](int M) {
- return SM_SentinelUndef <= M && M < (int)NumMaskElts;
- }) && "Expected unary shuffle");
-
- // Handle PSHUFLW/PSHUFHW repeated patterns.
- if (MaskScalarSizeInBits == 16) {
+ // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
@@ -27170,12 +27185,30 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
+ }
+ }
- return false;
+ // Attempt to match against byte/bit shifts.
+ // FIXME: Add 512-bit support.
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+ MaskScalarSizeInBits, Mask,
+ 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt) {
+ PermuteImm = (unsigned)ShiftAmt;
+ return true;
}
- return false;
}
+ // Ensure we don't contain any zero elements.
+ if (ContainsZeros)
+ return false;
+
+ assert(llvm::all_of(Mask, [&](int M) {
+ return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+ }) && "Expected unary shuffle");
+
// We only support permutation of 32/64 bit elements after this.
if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
return false;
@@ -27185,48 +27218,6 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
return false;
- // Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
- AllowFloatDomain = true;
- AllowIntDomain = false;
- }
-
- // Check for lane crossing permutes.
- if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
- // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
- if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
- PermuteImm = getV4X86ShuffleImm(Mask);
- return true;
- }
- if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
- SmallVector<int, 4> RepeatedMask;
- if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
- PermuteImm = getV4X86ShuffleImm(RepeatedMask);
- return true;
- }
- }
- return false;
- }
-
- // VPERMILPD can permute with a non-repeating shuffle.
- if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
- Shuffle = X86ISD::VPERMILPI;
- ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
- PermuteImm = 0;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
- PermuteImm |= (M & 1) << i;
- }
- return true;
- }
-
// We need a repeating shuffle mask for VPERMILPS/PSHUFD.
SmallVector<int, 4> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
@@ -27578,7 +27569,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
bool AllowFloatDomain = FloatDomain || (Depth > 3);
- bool AllowIntDomain = !FloatDomain || (Depth > 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
+ (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 242872329a3..02314857c6d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -445,18 +445,15 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
ret <16 x i8> %res1
}
-; TODO - we could fold the load if we lowered to pshuflw instead.
define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE: # BB#0:
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = load <16 x i8>, <16 x i8> *%a0, align 16
%res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
OpenPOWER on IntegriCloud