author     Chandler Carruth <chandlerc@gmail.com>  2014-09-14 23:43:33 +0000
committer  Chandler Carruth <chandlerc@gmail.com>  2014-09-14 23:43:33 +0000
commit     0a98790b3202a5888c824fa9c3d18c3e484f8d69 (patch)
tree       82a676bdb06312d4a49e40b6331bda6bce92bc95
parent     87ebb6859c071b3b5ba9d432e6c0bc471266fe96 (diff)
[x86] Teach the new vector shuffle lowering to use BLENDPS and BLENDPD.
These are super simple. They even take precedence over crazy instructions like
INSERTPS because they have very high throughput on modern x86 chips.

I still have to teach the integer shuffle variants about this to avoid so many
domain crossings. However, due to the particular instructions available, that's
a touch more complex and so a separate patch.

Also, the backend doesn't seem to realize it can commute blend instructions by
negating the mask. That would help remove a number of copies here. Suggestions
on how to do this are welcome; it's an area I'm less familiar with.

llvm-svn: 217744
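For context on the commutation question in the message: the BLENDPS/BLENDPD
immediate selects, per lane, from the second source when the bit is set and
from the first source when it is clear, so if "negating the mask" is read as
inverting the lane bits, swapping the two sources plus inverting the low lane
bits of the immediate yields the same result. A minimal standalone sketch of
that identity (not LLVM code; commuteBlendImm is a hypothetical helper name):

#include <cassert>
#include <cstdint>

// Hypothetical helper: commute blend(V1, V2, Imm) into blend(V2, V1, Imm')
// by inverting the low NumLanes bits of the immediate.
static uint8_t commuteBlendImm(uint8_t Imm, unsigned NumLanes) {
  return static_cast<uint8_t>(~Imm & ((1u << NumLanes) - 1));
}

int main() {
  // v2f64: immediate 0b10 takes lane 0 from V1 and lane 1 from V2; after
  // swapping the operands, the same result needs immediate 0b01.
  assert(commuteBlendImm(0b10, 2) == 0b01);
  // v4f32: 0b0110 becomes 0b1001.
  assert(commuteBlendImm(0b0110, 4) == 0b1001);
  return 0;
}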
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp          35
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll  100
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll   14
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll   22
4 files changed, 134 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f3774321a07..8c9d8711d1f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7233,6 +7233,31 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
   return DAG.getConstant(Imm, MVT::i8);
 }
 
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is in fact a blend.
+static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Mask,
+                                         SelectionDAG &DAG) {
+
+  unsigned BlendMask = 0;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] >= Size) {
+      if (Mask[i] != i + Size)
+        return SDValue(); // Shuffled V2 input!
+      BlendMask |= 1u << i;
+      continue;
+    }
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return SDValue(); // Shuffled V1 input!
+  }
+  return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                     DAG.getConstant(BlendMask, MVT::i8));
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -7267,6 +7292,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 1, 3))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
+      return Blend;
+
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                      DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -7353,6 +7383,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+      return Blend;
+
   if (NumV2Elements == 1) {
     int V2Index =
         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
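To make the mask check above concrete outside of SelectionDAG: LLVM shuffle
masks index the concatenation of both inputs, so entries below Size refer to
V1, entries at or above Size refer to V2, and negative entries are undef. A
mask is a blend exactly when every defined entry stays in its own lane. A
minimal standalone sketch of the same per-lane test (an illustration, not the
LLVM function itself; maskIsBlend is a made-up name):

#include <cstdio>
#include <vector>

// Returns true and sets BlendImm if Mask is a lane-preserving blend; this
// mirrors the loop in lowerVectorShuffleAsBlend above.
static bool maskIsBlend(const std::vector<int> &Mask, unsigned &BlendImm) {
  BlendImm = 0;
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] >= Size) {
      if (Mask[i] != i + Size)
        return false;          // V2 element pulled from another lane.
      BlendImm |= 1u << i;     // Lane i comes from V2.
    } else if (Mask[i] >= 0 && Mask[i] != i) {
      return false;            // V1 element pulled from another lane.
    }
  }
  return true;
}

int main() {
  unsigned Imm = 0;
  std::vector<int> Blendable = {0, 3}; // <0,3> -> blendpd with immediate 0b10.
  std::vector<int> NotBlend = {1, 3};  // Lane 0 wants V1[1]: not a blend.
  printf("{0,3}: %d imm=%u\n", maskIsBlend(Blendable, Imm), Imm);
  printf("{1,3}: %d\n", maskIsBlend(NotBlend, Imm));
  return 0;
}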
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 619105f5026..f6382a98559 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -111,17 +111,35 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: @shuffle_v2f64_03
-; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2f64_03
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2f64_03
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2f64_03
+; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: @shuffle_v2f64_21
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2f64_21
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2f64_21
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2f64_21
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
}
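The CHECK annotations above encode per-lane provenance: in shuffle_v2f64_03,
"xmm0 = xmm0[0],xmm1[1]" says lane 0 comes from %a and lane 1 from %b, which
is precisely a blend, so the SSE4.1 path can use blendpd where older subtargets
keep shufpd. A small scalar model of blendpd's selection semantics (an
illustration only, not generated code):

#include <array>
#include <cstdio>

int main() {
  std::array<double, 2> a = {1.0, 2.0}, b = {3.0, 4.0}, r{};
  // blendpd with immediate 0b10: bit i set -> lane i from b, clear -> from a.
  unsigned imm = 0b10;
  for (int i = 0; i < 2; ++i)
    r[i] = ((imm >> i) & 1) ? b[i] : a[i];
  printf("%g %g\n", r[0], r[1]); // 1 4: shufflevector mask <0,3> of a, b.
  return 0;
}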
@@ -143,17 +161,35 @@ define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_03
-; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_03
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_03
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_03
+; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_03_copy
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_03_copy
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_03_copy
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_03_copy
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
@@ -204,18 +240,38 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_21
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_21
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_21
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_21
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_21_copy
-; ALL: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; ALL-NEXT: movapd %xmm2, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_21_copy
+; SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_21_copy
+; SSE3: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE3-NEXT: movapd %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_21_copy
+; SSE41: blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
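Note that the v2i64 cases above still go through blendpd, a floating-point
domain instruction; the commit message's remark about domain crossings is that
a blend never alters lane contents, so the bits are identical either way and
only the bypass-delay domain differs. A sketch of that bit-preservation claim
(my reading of the message, modeled as plain lane selection):

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Selecting 64-bit lanes with an FP-domain blendpd yields the same bits as
  // an integer-domain blend would; only execution-port bypass latency differs.
  std::array<uint64_t, 2> a = {0x1111111111111111ULL, 0x2222222222222222ULL};
  std::array<uint64_t, 2> b = {0x3333333333333333ULL, 0x4444444444444444ULL};
  std::array<uint64_t, 2> r = {a[0], b[1]}; // mask <0,3>, immediate 0b10
  printf("%016llx %016llx\n", (unsigned long long)r[0],
         (unsigned long long)r[1]);
  return 0;
}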
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 9105197f67c..d5bb55a2caa 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -216,11 +216,14 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_4zzz
-; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: blendps {{.*}} # [[X]] = xmm0[0],[[X]][1,2,3]
+; SSE41-NEXT: movaps %[[X]], %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_4zzz
-; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
+; AVX1-NEXT: vblendps {{.*}} # xmm0 = xmm0[0],[[X]][1,2,3]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle
@@ -290,11 +293,14 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_zzz7
-; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: blendps {{.*}} # [[X]] = [[X]][0,1,2],xmm0[3]
+; SSE41-NEXT: movaps %[[X]], %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_zzz7
-; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
+; AVX1-NEXT: vblendps {{.*}} # xmm0 = [[X]][0,1,2],xmm0[3]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
ret <4 x float> %shuffle
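The new SSE4.1 sequence above first materializes an all-zero register with
xorps (a dependency-breaking idiom) and then blends the single live lane from
the input, where the old code used insertps with a zeroing immediate. A scalar
model of shuffle_v4f32_4zzz under that reading (illustration only):

#include <array>
#include <cstdio>

int main() {
  std::array<float, 4> a = {5.f, 6.f, 7.f, 8.f};
  std::array<float, 4> zero = {0.f, 0.f, 0.f, 0.f}; // xorps %xmmN, %xmmN
  // blendps immediate 0b0001: lane 0 from a, lanes 1-3 from the zero register.
  std::array<float, 4> r{};
  unsigned imm = 0b0001;
  for (int i = 0; i < 4; ++i)
    r[i] = ((imm >> i) & 1) ? a[i] : zero[i];
  printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 5 0 0 0
  return 0;
}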
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index a21b78985d7..cd79a38ca4a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -40,7 +40,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
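Since AVX1 has no general cross-lane 64-bit shuffle, the v4i64 lowerings here
assemble the result from 128-bit halves: extract the high half, blend, form
the other half, and reinsert. A scalar trace of shuffle_v4i64_0300 (mask
<0,3,0,0>) following the instruction sequence above (illustration only):

#include <array>
#include <cstdint>
#include <cstdio>

using Half = std::array<uint64_t, 2>;

int main() {
  std::array<uint64_t, 4> a = {10, 11, 12, 13};
  Half lo = {a[0], a[1]};      // low 128 bits of %ymm0
  Half hi = {a[2], a[3]};      // vextractf128 $1, %ymm0, %xmm1
  Half blend = {lo[0], hi[1]}; // vblendpd: xmm1 = xmm0[0],xmm1[1]
  Half dup = {lo[0], lo[0]};   // vpunpcklqdq: xmm0 = xmm0[0,0]
  // vinsertf128 $1: result = blend in the low half, dup in the high half.
  std::array<uint64_t, 4> r = {blend[0], blend[1], dup[0], dup[1]};
  printf("%llu %llu %llu %llu\n", (unsigned long long)r[0],
         (unsigned long long)r[1], (unsigned long long)r[2],
         (unsigned long long)r[3]); // 10 13 10 10 == mask <0,3,0,0>
  return 0;
}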
@@ -119,7 +119,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -282,7 +282,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@@ -293,7 +293,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@@ -305,7 +305,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@@ -317,7 +317,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
@@ -335,9 +335,9 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_0451
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@@ -355,9 +355,9 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: @shuffle_v4i64_4015
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
@@ -370,7 +370,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq