summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp38
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll14
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll20
3 files changed, 53 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da3ec8b35eb..b5744e093d2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7258,8 +7258,27 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
if (Mask[i] >= 0 && Mask[i] != i)
return SDValue(); // Shuffled V1 input!
}
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ if (VT == MVT::v4f32 || VT == MVT::v2f64)
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+ assert(!VT.isFloatingPoint() && "Only v4f32 and v2f64 are supported!");
+
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ assert((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) &&
+ "Not a supported integer vector type!");
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
@@ -7343,6 +7362,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Blend;
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
@@ -7631,6 +7655,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Blend;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would encur if we
@@ -8289,6 +8318,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Blend;
+
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index f6382a98559..d23b0794305 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -170,7 +170,7 @@ define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_03
-; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41: pblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
@@ -187,8 +187,8 @@ define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_03_copy
-; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41: pblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
@@ -251,8 +251,8 @@ define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_21
-; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41: pblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
@@ -269,8 +269,8 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_21_copy
-; SSE41: blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41: pblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index affa933768b..757ef8bf176 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -40,7 +40,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -282,7 +282,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@@ -293,7 +293,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@@ -305,7 +305,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@@ -317,7 +317,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
@@ -335,9 +335,9 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0451
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@@ -355,9 +355,9 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_4015
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
@@ -369,7 +369,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
OpenPOWER on IntegriCloud