| field | value |
|---|---|
| author | Simon Pilgrim <llvm-dev@redking.me.uk>, 2016-07-09 21:47:55 +0000 |
| committer | Simon Pilgrim <llvm-dev@redking.me.uk>, 2016-07-09 21:47:55 +0000 |
| commit | 606126e848b256fb9d0a91371b15e5ec024a63f8 |
| tree | 1bba188a65a70bf44414ac7003b856d89d2759e6 |
| parent | bcb8190f9947c0fb573dce679447bdb05b4e4ae3 |
[X86][SSE] Add support for target shuffle combining to INSERTPS
llvm-svn: 274990
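
The immediate byte this patch now builds directly (`V2SrcIndex << 6 | V2DstIndex << 4 | ZMask` in the diff below) packs three fields: bits [7:6] select the source lane of the second operand, bits [5:4] select the destination lane, and bits [3:0] zero out destination lanes. A minimal standalone sketch of that packing, outside any LLVM API — the helper name and the `main` driver are purely illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the mask arithmetic in the patch: pack an
// INSERTPS control byte from a source lane, a destination lane and a 4-bit
// zero mask.
static uint8_t makeInsertPSImm(unsigned SrcLane, unsigned DstLane,
                               unsigned ZeroMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZeroMask < 16 && "field out of range");
  return static_cast<uint8_t>((SrcLane << 6) | (DstLane << 4) | ZeroMask);
}

int main() {
  // The updated shuffle_v4f32_0zz0 test expects
  //   insertps xmm0 = xmm0[0],zero,zero,xmm0[0]
  // i.e. lane 0 of the source goes into destination lane 3 while lanes 1 and 2
  // are zeroed, which this formula encodes as 0x36.
  std::printf("0x%02x\n", makeInsertPSImm(/*SrcLane=*/0, /*DstLane=*/3,
                                          /*ZeroMask=*/0x6));
  return 0;
}
```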
| file | lines changed |
|---|---|
| llvm/lib/Target/X86/X86ISelLowering.cpp | 58 |
| llvm/test/CodeGen/X86/insertps-combine.ll | 9 |
| llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll | 2 |
3 files changed, 51 insertions, 18 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4e3a85a70c6..83e910d4f14 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8677,16 +8677,14 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
 // perform INSERTPS if a single V1 element is out of place and all V2
 // elements are zeroable.
-static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
-                                            SDValue V1, SDValue V2,
-                                            ArrayRef<int> Mask,
-                                            SelectionDAG &DAG) {
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+                                         unsigned &InsertPSMask,
+                                         const SmallBitVector &Zeroable,
+                                         ArrayRef<int> Mask,
+                                         SelectionDAG &DAG) {
   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
-  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
   unsigned ZMask = 0;
   int V1DstIndex = -1;
   int V2DstIndex = -1;
@@ -8707,7 +8705,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
 
     // We can only insert a single non-zeroable element.
     if (V1DstIndex >= 0 || V2DstIndex >= 0)
-      return SDValue();
+      return false;
 
     if (Mask[i] < 4) {
       // V1 input out of place for insertion.
@@ -8720,7 +8718,7 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
 
   // Don't bother if we have no (non-zeroable) element for insertion.
   if (V1DstIndex < 0 && V2DstIndex < 0)
-    return SDValue();
+    return false;
 
   // Determine element insertion src/dst indices. The src index is from the
   // start of the inserted vector, not the start of the concatenated vector.
@@ -8740,8 +8738,21 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL,
   if (!V1UsedInPlace)
     V1 = DAG.getUNDEF(MVT::v4f32);
 
-  unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+  // Insert the V2 element into the desired position.
+  InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+  return true;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  // Attempt to match the insertps pattern.
+  unsigned InsertPSMask;
+  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+    return SDValue();
 
   // Insert the V2 element into the desired position.
   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
@@ -25081,6 +25092,33 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
     }
   }
 
+  // Attempt to combine to INSERTPS.
+  if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
+      (VT == MVT::v2f64 || VT == MVT::v4f32)) {
+    SmallBitVector Zeroable(4, false);
+    for (unsigned i = 0; i != NumMaskElts; ++i)
+      if (Mask[i] < 0)
+        Zeroable[i] = true;
+
+    unsigned InsertPSMask;
+    SDValue V1 = Input, V2 = Input;
+    if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
+                                                       Zeroable, Mask, DAG)) {
+      if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
+        return false; // Nothing to do!
+      V1 = DAG.getBitcast(MVT::v4f32, V1);
+      DCI.AddToWorklist(V1.getNode());
+      V2 = DAG.getBitcast(MVT::v4f32, V2);
+      DCI.AddToWorklist(V2.getNode());
+      Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                        DAG.getConstant(InsertPSMask, DL, MVT::i8));
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+  }
+
   // Don't try to re-form single instruction chains under any circumstances now
   // that we've done encoding canonicalization for them.
   if (Depth < 2)
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index 01d859e8d33..b21fdec624b 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -60,17 +60,12 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
 define <4 x float> @shuffle_v4f32_0zz0(float %a) {
 ; SSE-LABEL: shuffle_v4f32_0zz0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0zz0:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
 ; AVX-NEXT:    retq
   %vecinit = insertelement <4 x float> undef, float %a, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index ffae771da46..ea6535d390e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -190,7 +190,7 @@ define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
 define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
 ; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
 ; ALL-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
   %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
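
The refactor above splits the old lowering routine into a match step (`matchVectorShuffleAsInsertPS`, which only computes the immediate) and its callers, and the new combine in `combineX86ShuffleChain` only fires when the shuffle mask actually zeroes at least one lane. A rough, non-LLVM sketch of that combine-side guard, with the helper name and the sentinel convention assumed purely for illustration:

```cpp
#include <array>
#include <bitset>
#include <cstdio>

// Simplified stand-in for the guard added to combineX86ShuffleChain: collect
// the lanes marked by sentinel (negative) mask entries into a zeroable set and
// only consider forming INSERTPS when at least one lane is known zero --
// otherwise a plain shuffle already covers the mask.
static bool worthTryingInsertPS(const std::array<int, 4> &Mask) {
  std::bitset<4> Zeroable;
  for (unsigned i = 0; i != Mask.size(); ++i)
    if (Mask[i] < 0) // assumed sentinel-zero/undef style entry
      Zeroable.set(i);
  return Zeroable.any();
}

int main() {
  // Mask shape from combine_vpermilvar_4f32_as_insertps after folding the
  // vpermilvar stage: lane 1, zero, lane 2, zero.
  std::printf("%d\n", worthTryingInsertPS({1, -1, 2, -1})); // prints 1
  return 0;
}
```

The real code additionally requires `Subtarget.hasSSE41()` and a 128-bit float type before attempting the match, as the diff shows.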

