diff options
author | Benjamin Kramer <benny.kra@googlemail.com> | 2018-07-03 11:15:17 +0000 |
---|---|---|
committer | Benjamin Kramer <benny.kra@googlemail.com> | 2018-07-03 11:15:17 +0000 |
commit | fd171f2f893d0bf1e7d2af0200c31a7df32b2553 (patch) | |
tree | 3983613a5f2bfe7c440d7d3f449b1129a688d0ed | |
parent | b371ccc6615ff217d7cf7f6ab8b640ee343c5e88 (diff) | |
download | bcm5719-llvm-fd171f2f893d0bf1e7d2af0200c31a7df32b2553.tar.gz bcm5719-llvm-fd171f2f893d0bf1e7d2af0200c31a7df32b2553.zip |
Revert "[X86][SSE] Blend any v8i16/v4i32 shift with 2 shift unique values"
This reverts commit r336113. It causes crashes.
llvm-svn: 336189
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 69 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/lower-vec-shift.ll | 34 |
2 files changed, 69 insertions, 34 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b96794876d2..e59da306dae 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23441,7 +23441,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
 
   // If possible, lower this shift as a sequence of two shifts by
-  // constant plus a BLENDing shuffle instead of scalarizing it.
+  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
   // Example:
   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
   //
@@ -23449,39 +23449,64 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
   //
   // The advantage is that the two shifts from the example would be
-  // lowered as X86ISD::VSRLI nodes in parallel before blending.
+  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+  // the vector shift into four scalar shifts plus four pairs of vector
+  // insert/extract.
   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
-    SDValue Amt1, Amt2;
-    unsigned NumElts = VT.getVectorNumElements();
-    SmallVector<int, 8> ShuffleMask;
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue A = Amt->getOperand(i);
-      if (A.isUndef()) {
-        ShuffleMask.push_back(SM_SentinelUndef);
-        continue;
-      }
-      if (!Amt1 || Amt1 == A) {
-        ShuffleMask.push_back(i);
-        Amt1 = A;
-        continue;
+    bool UseMOVSD = false;
+    bool CanBeSimplified;
+    // The splat value for the first packed shift (the 'X' from the example).
+    SDValue Amt1 = Amt->getOperand(0);
+    // The splat value for the second packed shift (the 'Y' from the example).
+    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
+
+    // See if it is possible to replace this node with a sequence of
+    // two shifts followed by a MOVSS/MOVSD/PBLEND.
+    if (VT == MVT::v4i32) {
+      // Check if it is legal to use a MOVSS.
+      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+                        Amt2 == Amt->getOperand(3);
+      if (!CanBeSimplified) {
+        // Otherwise, check if we can still simplify this node using a MOVSD.
+        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+                          Amt->getOperand(2) == Amt->getOperand(3);
+        UseMOVSD = true;
+        Amt2 = Amt->getOperand(2);
       }
-      if (!Amt2 || Amt2 == A) {
-        ShuffleMask.push_back(i + NumElts);
-        Amt2 = A;
-        continue;
+    } else {
+      // Do similar checks for the case where the machine value type
+      // is MVT::v8i16.
+      CanBeSimplified = Amt1 == Amt->getOperand(1);
+      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+        CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+      if (!CanBeSimplified) {
+        UseMOVSD = true;
+        CanBeSimplified = true;
+        Amt2 = Amt->getOperand(4);
+        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+          CanBeSimplified = Amt1 == Amt->getOperand(i);
+        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+          CanBeSimplified = Amt2 == Amt->getOperand(j);
       }
-      break;
     }
 
-    if (ShuffleMask.size() == NumElts && isa<ConstantSDNode>(Amt1) &&
+    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
         isa<ConstantSDNode>(Amt2)) {
+      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
       SDValue Splat1 =
           DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
       SDValue Splat2 =
           DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
-      return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
+      SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
+      SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
+      if (UseMOVSD)
+        return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
+                                                       BitCast2, {0, 1, 6, 7}));
+      return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
+                                                     BitCast2, {0, 5, 6, 7}));
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/lower-vec-shift.ll b/llvm/test/CodeGen/X86/lower-vec-shift.ll
index cca165e29ff..1dfb8ca65bc 100644
--- a/llvm/test/CodeGen/X86/lower-vec-shift.ll
+++ b/llvm/test/CodeGen/X86/lower-vec-shift.ll
@@ -211,21 +211,31 @@ define <4 x i32> @test8(<4 x i32> %a) {
 define <8 x i16> @test9(<8 x i16> %a) {
 ; SSE-LABEL: test9:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psraw $3, %xmm1
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
-; SSE-NEXT:    psraw $1, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    psraw $2, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm2
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test9:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
-; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test9:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test9:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
   ret <8 x i16> %lshr
 }
```