summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Sanjay Patel <spatel@rotateright.com> 2019-01-27 21:53:33 +0000
committer Sanjay Patel <spatel@rotateright.com> 2019-01-27 21:53:33 +0000
commit ebe6b43aecc7d784a66afc63d746b106052ba7e3 (patch)
tree 7995493ee9be9daa1ccdb7ca79c00c1d559f6005
parent 816c9b3e25463d26e4bb3e0cc8889987c7af2704 (diff)
download bcm5719-llvm-ebe6b43aecc7d784a66afc63d746b106052ba7e3.tar.gz
bcm5719-llvm-ebe6b43aecc7d784a66afc63d746b106052ba7e3.zip
[x86] add restriction for lowering to vpermps
This transform was added with rL351346, and we had an escape for shufps, but we also want one for unpckps vs. vpermps, because vpermps doesn't take an immediate shuffle index operand (its index vector has to be loaded as a constant from memory).

llvm-svn: 352333
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 21
-rw-r--r-- llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll 84
2 files changed, 45 insertions, 60 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ffe64f129bb..bca3e74b7d4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9719,6 +9719,21 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
return IsUnpackwdMask;
}
+/// Return true if \p Mask is equivalent to one of the four unpack shuffle
+/// masks that createUnpackShuffleMask produces for a 128-bit vector with
+/// Mask.size() elements.
+///
+/// The element type is derived purely from the mask length (128 / size bits,
+/// as an integer type), so the caller does not pass a value type.
+/// NOTE(review): assumes Mask is non-empty and that 128 / Mask.size() is a
+/// valid integer element width; the only call visible in this change guards
+/// it with NumElts == 4 - confirm before adding other callers.
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // Match any of unary/binary or low/high.
+ // The two low bits of i enumerate all four combinations of the two boolean
+ // arguments: bit 1 feeds the first flag and bit 0 feeds the second.
+ // NOTE(review): presumably these flags are (Lo, Unary) in that order -
+ // verify against createUnpackShuffleMask's declaration.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ // Stop at the first reference mask that Mask is equivalent to.
+ if (isTargetShuffleEquivalent(Mask, UnpackMask))
+ return true;
+ }
+ // None of the four unpack patterns matched.
+ return false;
+}
+
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -11709,8 +11724,10 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
- // and a simple narrow shuffle.
- if (NumElts == 4 && isSingleSHUFPSMask(NewMask))
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
index 4bd4a481069..47d9c41e019 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
@@ -45,23 +45,15 @@ define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
ret <2 x double> %r
}
-; FIXME: vpermps requires a constant load for the index op. It's unlikely to be profitable.
+; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
-; AVX1-LABEL: unpckh_unary_extracted_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8i32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_unary_extracted_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -69,20 +61,12 @@ define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
}
define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
-; AVX1-LABEL: unpckh_unary_extracted_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_unary_extracted_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -169,23 +153,15 @@ define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
ret <2 x double> %r
}
-; FIXME: vpermps requires a constant load for the index op. It's unlikely to be profitable.
+; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
-; AVX1-LABEL: unpckl_unary_extracted_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8i32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,1,5,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckl_unary_extracted_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -193,20 +169,12 @@ define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
}
define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
-; AVX1-LABEL: unpckl_unary_extracted_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,1,5,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckl_unary_extracted_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
OpenPOWER on IntegriCloud