diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 112 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/insertps-combine.ll | 9 |
2 files changed, 70 insertions, 51 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0815f830ccc..7745e6c4ce9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8762,63 +8762,85 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - unsigned ZMask = 0; - int V1DstIndex = -1; - int V2DstIndex = -1; - bool V1UsedInPlace = false; - for (int i = 0; i < 4; ++i) { - // Synthesize a zero mask from the zeroable elements (includes undefs). - if (Zeroable[i]) { - ZMask |= 1 << i; - continue; - } + // Attempt to match INSERTPS with one element from VA or VB being + // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask + // are updated. + auto matchAsInsertPS = [&](SDValue VA, SDValue VB, + ArrayRef<int> CandidateMask) { + unsigned ZMask = 0; + int VADstIndex = -1; + int VBDstIndex = -1; + bool VAUsedInPlace = false; + + for (int i = 0; i < 4; ++i) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } - // Flag if we use any V1 inputs in place. - if (i == Mask[i]) { - V1UsedInPlace = true; - continue; + // Flag if we use any VA inputs in place. + if (i == CandidateMask[i]) { + VAUsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (VADstIndex >= 0 || VBDstIndex >= 0) + return false; + + if (CandidateMask[i] < 4) { + // VA input out of place for insertion. + VADstIndex = i; + } else { + // VB input for insertion. + VBDstIndex = i; + } } - // We can only insert a single non-zeroable element. - if (V1DstIndex >= 0 || V2DstIndex >= 0) + // Don't bother if we have no (non-zeroable) element for insertion. + if (VADstIndex < 0 && VBDstIndex < 0) return false; - if (Mask[i] < 4) { - // V1 input out of place for insertion. - V1DstIndex = i; + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned VBSrcIndex = 0; + if (VADstIndex >= 0) { + // If we have a VA input out of place, we use VA as the V2 element + // insertion and don't use the original V2 at all. + VBSrcIndex = CandidateMask[VADstIndex]; + VBDstIndex = VADstIndex; + VB = VA; } else { - // V2 input for insertion. - V2DstIndex = i; + VBSrcIndex = CandidateMask[VBDstIndex] - 4; } - } - // Don't bother if we have no (non-zeroable) element for insertion. - if (V1DstIndex < 0 && V2DstIndex < 0) - return false; + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!VAUsedInPlace) + VA = DAG.getUNDEF(MVT::v4f32); - // Determine element insertion src/dst indices. The src index is from the - // start of the inserted vector, not the start of the concatenated vector. - unsigned V2SrcIndex = 0; - if (V1DstIndex >= 0) { - // If we have a V1 input out of place, we use V1 as the V2 element insertion - // and don't use the original V2 at all. - V2SrcIndex = Mask[V1DstIndex]; - V2DstIndex = V1DstIndex; - V2 = V1; - } else { - V2SrcIndex = Mask[V2DstIndex] - 4; - } + // Update V1, V2 and InsertPSMask accordingly. + V1 = VA; + V2 = VB; - // If no V1 inputs are used in place, then the result is created only from - // the zero mask and the V2 insertion - so remove V1 dependency. - if (!V1UsedInPlace) - V1 = DAG.getUNDEF(MVT::v4f32); + // Insert the V2 element into the desired position. + InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + return true; + }; - // Insert the V2 element into the desired position. - InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); - return true; + if (matchAsInsertPS(V1, V2, Mask)) + return true; + + // Commute and try again. + SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end()); + ShuffleVectorSDNode::commuteMask(CommutedMask); + if (matchAsInsertPS(V2, V1, CommutedMask)) + return true; + + return false; } static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll index c422b6dbf2c..b55a029ea65 100644 --- a/llvm/test/CodeGen/X86/insertps-combine.ll +++ b/llvm/test/CodeGen/X86/insertps-combine.ll @@ -96,16 +96,13 @@ define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) { define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: shuffle_v4f32_z06z: ; SSE: # BB#0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,3] -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; SSE-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v4f32_z06z: ; AVX: # BB#0: -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,3] -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],xmm1[2],zero ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 undef> %shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00>, <4 x i32> <i32 4, i32 1, i32 2, i32 7> |

