summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2016-08-19 10:31:53 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2016-08-19 10:31:53 +0000
commitf1b8fdc074d2a9667ee62d36f8bfdaeaa21980c2 (patch)
tree6d48b49e41817ba577f1647776207499ab5ab521 /llvm
parent11a1936b70be22a9b99fc661fd46a837507df65c (diff)
downloadbcm5719-llvm-f1b8fdc074d2a9667ee62d36f8bfdaeaa21980c2.tar.gz
bcm5719-llvm-f1b8fdc074d2a9667ee62d36f8bfdaeaa21980c2.zip
[X86][SSE] Add support for matching commuted insertps patterns
INSERTPS doesn't fit well with our shuffle mask canonicalization, so we need to attempt both the original mask and the commuted mask to make a match more likely.

llvm-svn: 279230
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp112
-rw-r--r--llvm/test/CodeGen/X86/insertps-combine.ll9
2 files changed, 70 insertions, 51 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0815f830ccc..7745e6c4ce9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8762,63 +8762,85 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- unsigned ZMask = 0;
- int V1DstIndex = -1;
- int V2DstIndex = -1;
- bool V1UsedInPlace = false;
- for (int i = 0; i < 4; ++i) {
- // Synthesize a zero mask from the zeroable elements (includes undefs).
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- continue;
- }
+ // Attempt to match INSERTPS with one element from VA or VB being
+ // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
+ // are updated.
+ auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
+ ArrayRef<int> CandidateMask) {
+ unsigned ZMask = 0;
+ int VADstIndex = -1;
+ int VBDstIndex = -1;
+ bool VAUsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
- // Flag if we use any V1 inputs in place.
- if (i == Mask[i]) {
- V1UsedInPlace = true;
- continue;
+ // Flag if we use any VA inputs in place.
+ if (i == CandidateMask[i]) {
+ VAUsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (VADstIndex >= 0 || VBDstIndex >= 0)
+ return false;
+
+ if (CandidateMask[i] < 4) {
+ // VA input out of place for insertion.
+ VADstIndex = i;
+ } else {
+ // VB input for insertion.
+ VBDstIndex = i;
+ }
}
- // We can only insert a single non-zeroable element.
- if (V1DstIndex >= 0 || V2DstIndex >= 0)
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (VADstIndex < 0 && VBDstIndex < 0)
return false;
- if (Mask[i] < 4) {
- // V1 input out of place for insertion.
- V1DstIndex = i;
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned VBSrcIndex = 0;
+ if (VADstIndex >= 0) {
+ // If we have a VA input out of place, we use VA as the V2 element
+ // insertion and don't use the original V2 at all.
+ VBSrcIndex = CandidateMask[VADstIndex];
+ VBDstIndex = VADstIndex;
+ VB = VA;
} else {
- // V2 input for insertion.
- V2DstIndex = i;
+ VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
- }
- // Don't bother if we have no (non-zeroable) element for insertion.
- if (V1DstIndex < 0 && V2DstIndex < 0)
- return false;
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!VAUsedInPlace)
+ VA = DAG.getUNDEF(MVT::v4f32);
- // Determine element insertion src/dst indices. The src index is from the
- // start of the inserted vector, not the start of the concatenated vector.
- unsigned V2SrcIndex = 0;
- if (V1DstIndex >= 0) {
- // If we have a V1 input out of place, we use V1 as the V2 element insertion
- // and don't use the original V2 at all.
- V2SrcIndex = Mask[V1DstIndex];
- V2DstIndex = V1DstIndex;
- V2 = V1;
- } else {
- V2SrcIndex = Mask[V2DstIndex] - 4;
- }
+ // Update V1, V2 and InsertPSMask accordingly.
+ V1 = VA;
+ V2 = VB;
- // If no V1 inputs are used in place, then the result is created only from
- // the zero mask and the V2 insertion - so remove V1 dependency.
- if (!V1UsedInPlace)
- V1 = DAG.getUNDEF(MVT::v4f32);
+ // Insert the V2 element into the desired position.
+ InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+ };
- // Insert the V2 element into the desired position.
- InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
- return true;
+ if (matchAsInsertPS(V1, V2, Mask))
+ return true;
+
+ // Commute and try again.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ if (matchAsInsertPS(V2, V1, CommutedMask))
+ return true;
+
+ return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index c422b6dbf2c..b55a029ea65 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -96,16 +96,13 @@ define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_z06z:
; SSE: # BB#0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,3]
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; SSE-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_z06z:
; AVX: # BB#0:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,3]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 undef>
%shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00>, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
OpenPOWER on IntegriCloud