 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h         |   2
 llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 100
 llvm/lib/Target/X86/X86ISelLowering.cpp               |   4
 llvm/test/CodeGen/X86/vector-zext.ll                  |  99
 llvm/test/CodeGen/X86/widen_conv-3.ll                 |  40
 llvm/test/CodeGen/X86/widen_conv-4.ll                 |  42
 6 files changed, 210 insertions(+), 77 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 8ba19f76797..5751fbfd5fd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -653,6 +653,7 @@ private:
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);

   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -713,6 +714,7 @@ private:
   SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
   SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
   SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
+  SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ffff3e97d79..6fde14390ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -621,6 +621,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
     break;

+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+    break;
+
   case ISD::BITREVERSE:
   case ISD::BSWAP:
   case ISD::CONVERT_RNDSAT:
@@ -917,6 +923,39 @@ void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
                    DAG.getValueType(HiVT));
 }

+void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+
+  SDLoc dl(N);
+  SDValue InLo, InHi;
+  GetSplitVector(N0, InLo, InHi);
+  EVT InLoVT = InLo.getValueType();
+  unsigned InNumElements = InLoVT.getVectorNumElements();
+
+  EVT OutLoVT, OutHiVT;
+  std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  unsigned OutNumElements = OutLoVT.getVectorNumElements();
+  assert((2 * OutNumElements) <= InNumElements &&
+         "Illegal extend vector in reg split");
+
+  // *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
+  // input vector (i.e. we only use InLo):
+  // OutLo will extend the first OutNumElements from InLo.
+  // OutHi will extend the next OutNumElements from InLo.
+
+  // Shuffle the elements from InLo for OutHi into the bottom elements to
+  // create a 'fake' InHi.
+  SmallVector<int, 8> SplitHi(InNumElements, -1);
+  for (unsigned i = 0; i != OutNumElements; ++i)
+    SplitHi[i] = i + OutNumElements;
+  InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
+
+  Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
+  Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
+}
+
 void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
                                                      SDValue &Hi) {
   SDValue Vec = N->getOperand(0);
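To make the 'fake' InHi shuffle concrete, here is a small self-contained C++ sketch. Plain arrays stand in for SDValues, and the shapes are an assumed example (a ZERO_EXTEND_VECTOR_INREG from v16i8 to v4i64 being split into two v2i64 halves, so GetSplitVector hands us a v8i8 InLo), not something taken verbatim from the patch:

// Standalone model of the SplitVecRes_ExtVecInRegOp mask logic.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical low input half after GetSplitVector: eight i8 lanes.
  std::vector<uint8_t> InLo = {1, 2, 3, 4, 5, 6, 7, 8};
  const unsigned InNumElements = 8;  // lanes in InLo
  const unsigned OutNumElements = 2; // lanes per v2i64 output half

  // Build the shuffle mask exactly as SplitVecRes_ExtVecInRegOp does:
  // move lanes [OutNumElements, 2*OutNumElements) to the bottom and
  // leave everything else undefined (-1).
  std::vector<int> SplitHi(InNumElements, -1);
  for (unsigned i = 0; i != OutNumElements; ++i)
    SplitHi[i] = i + OutNumElements; // SplitHi = {2, 3, -1, ..., -1}

  // Apply the mask to form the 'fake' InHi (undef lanes modeled as 0).
  std::vector<uint8_t> InHi(InNumElements, 0);
  for (unsigned i = 0; i != InNumElements; ++i)
    if (SplitHi[i] >= 0)
      InHi[i] = InLo[SplitHi[i]];

  // Each output half then zero-extends its own low OutNumElements lanes.
  for (unsigned i = 0; i != OutNumElements; ++i)
    std::printf("Lo[%u] = %u, Hi[%u] = %u\n", i, (unsigned)InLo[i], i,
                (unsigned)InHi[i]); // Lo: 1, 2   Hi: 3, 4
}

The design point: both halves reuse the same opcode on the same input type, so only one extra shuffle is needed rather than a rewrite into generic extends.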
@@ -2069,6 +2108,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     Res = WidenVecRes_Shift(N);
     break;

+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
+    break;
+
   case ISD::ANY_EXTEND:
   case ISD::FP_EXTEND:
   case ISD::FP_ROUND:
@@ -2355,6 +2400,61 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
 }

+SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue InOp = N->getOperand(0);
+  SDLoc DL(N);
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT WidenSVT = WidenVT.getVectorElementType();
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  EVT InVT = InOp.getValueType();
+  EVT InSVT = InVT.getVectorElementType();
+  unsigned InVTNumElts = InVT.getVectorNumElements();
+
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+    InOp = GetWidenedVector(InOp);
+    InVT = InOp.getValueType();
+    if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
+      switch (Opcode) {
+      case ISD::ANY_EXTEND_VECTOR_INREG:
+        return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
+      case ISD::SIGN_EXTEND_VECTOR_INREG:
+        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+      case ISD::ZERO_EXTEND_VECTOR_INREG:
+        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+      }
+    }
+  }
+
+  // Unroll, extend the scalars and rebuild the vector.
+  SmallVector<SDValue, 16> Ops;
+  for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
+        DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    switch (Opcode) {
+    case ISD::ANY_EXTEND_VECTOR_INREG:
+      Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
+      break;
+    case ISD::SIGN_EXTEND_VECTOR_INREG:
+      Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
+      break;
+    case ISD::ZERO_EXTEND_VECTOR_INREG:
+      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
+      break;
+    default:
+      llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
+    }
+    Ops.push_back(Val);
+  }
+
+  while (Ops.size() != WidenNumElts)
+    Ops.push_back(DAG.getUNDEF(WidenSVT));
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
   // If this is an FCOPYSIGN with same input types, we can treat it as a
   // normal (can trap) binary op.
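The widening path has a fast case (when the widened input already matches the result width, the node is simply rebuilt with the wider types) and an unroll fallback. The fallback is easy to model outside the DAG; the following plain C++ sketch uses assumed shapes (a v3i8 sign-extend-in-reg whose result widens to four i32 lanes) and mirrors the loop structure above:

// Scalar model of the WidenVecRes_EXTEND_VECTOR_INREG unroll path.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical input lanes being sign-extended in-reg.
  std::vector<int8_t> In = {-1, 2, -3};
  unsigned InVTNumElts = In.size();
  unsigned WidenNumElts = 4; // widened result lane count

  std::vector<int32_t> Ops;
  // Extract and sign-extend min(InVTNumElts, WidenNumElts) scalars,
  // mirroring the EXTRACT_VECTOR_ELT + SIGN_EXTEND loop.
  for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i)
    Ops.push_back(static_cast<int32_t>(In[i]));

  // Pad the tail, standing in for DAG.getUNDEF(WidenSVT); 0 models undef.
  while (Ops.size() != WidenNumElts)
    Ops.push_back(0);

  for (unsigned i = 0; i != WidenNumElts; ++i)
    std::printf("Ops[%u] = %d\n", i, Ops[i]); // -1, 2, -3, 0
}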
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c5b8723478..9aaf8205b96 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28634,7 +28634,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,

   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
-  if (VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256())) {
+  // Also use this if we don't have SSE41 to allow the legalizer to do its job.
+  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
+      (VT.is256BitVector() && Subtarget.hasInt256())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
     return Opcode == ISD::SIGN_EXTEND
                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
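The SSE2/SSSE3 test updates that follow all share one pattern: the old shuffle-plus-pand masking is replaced by interleaves with a zeroed register (pxor followed by punpcklbw/punpcklwd/punpckldq), which is how the legalized ZERO_EXTEND_VECTOR_INREG nodes lower. A minimal self-contained C++ sketch of why interleaving with zeros is a zero-extension (the byte values are made up for illustration):

// Model of the pxor + punpcklbw zero-extension idiom.
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical source bytes (the low half of xmm1 after the load).
  const uint8_t Src[8] = {0x80, 1, 2, 3, 4, 5, 6, 7};
  const uint8_t Zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; // pxor %xmm2, %xmm2

  // punpcklbw xmm1, xmm2: interleave each source byte with a zero byte.
  uint8_t Out[16];
  for (int i = 0; i != 8; ++i) {
    Out[2 * i] = Src[i];
    Out[2 * i + 1] = Zero[i];
  }

  // On a little-endian target, each byte pair now reads back as the
  // zero-extended 16-bit value of the corresponding source byte.
  for (int i = 0; i != 8; ++i) {
    unsigned Word = Out[2 * i] | (unsigned(Out[2 * i + 1]) << 8);
    std::printf("0x%04x\n", Word); // 0x0080, 0x0001, 0x0002, ...
  }
}

Repeating the trick at word and doubleword granularity gives the punpcklwd/punpckldq steps seen in the new SSE2 sequences below.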
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 803244c7848..4497e9f4b9b 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -544,23 +544,20 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
 ; SSE2-LABEL: load_zext_4i8_to_4i64:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_zext_4i8_to_4i64:
 ; SSSE3:       # BB#0: # %entry
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_zext_4i8_to_4i64:
@@ -625,22 +622,21 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
 ; SSE2-LABEL: load_zext_8i8_to_8i32:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_zext_8i8_to_8i32:
 ; SSSE3:       # BB#0: # %entry
 ; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_zext_8i8_to_8i32:
@@ -674,34 +670,33 @@ entry:
 define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
 ; SSE2-LABEL: load_zext_8i8_to_8i64:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_zext_8i8_to_8i64:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    pshufb %xmm5, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[8],zero,zero,zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[12],zero,zero,zero,zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    pshufb %xmm4, %xmm2
+; SSSE3-NEXT:    pshufb %xmm5, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_zext_8i8_to_8i64:
@@ -851,21 +846,21 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
 ; SSE2-LABEL: load_zext_4i16_to_4i64:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_zext_4i16_to_4i64:
 ; SSSE3:       # BB#0: # %entry
 ; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_zext_4i16_to_4i64:
diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll
index f9a03051896..e8fa1043e9f 100644
--- a/llvm/test/CodeGen/X86/widen_conv-3.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -49,18 +49,27 @@ entry:
 define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
 ; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
 ; X86-SSE2:       # BB#0: # %entry
-; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $32, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    movl 12(%ebp), %ecx
 ; X86-SSE2-NEXT:    movzwl (%ecx), %edx
 ; X86-SSE2-NEXT:    movd %edx, %xmm0
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    shll $8, %edx
+; X86-SSE2-NEXT:    movzbl (%esp), %esi
+; X86-SSE2-NEXT:    orl %edx, %esi
+; X86-SSE2-NEXT:    pinsrw $0, %esi, %xmm0
 ; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT:    movd %ecx, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; X86-SSE2-NEXT:    pslld $24, %xmm0
+; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    psrad $24, %xmm0
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
@@ -69,7 +78,9 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    movss %xmm1, 8(%eax)
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT:    popl %eax
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
@@ -99,11 +110,16 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE2-NEXT:    shll $8, %eax
+; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT:    orl %eax, %ecx
+; X64-SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
 ; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT:    movd %eax, %xmm1
-; X64-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; X64-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; X64-SSE2-NEXT:    pslld $24, %xmm0
+; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
+; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT:    psrad $24, %xmm0
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll
index 289fb1ed6a7..71b7976ab8b 100644
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-4.ll
@@ -75,18 +75,28 @@ entry:
 define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
 ; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
 ; X86-SSE2:       # BB#0: # %entry
-; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $32, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    movl 12(%ebp), %ecx
 ; X86-SSE2-NEXT:    movzwl (%ecx), %edx
 ; X86-SSE2-NEXT:    movd %edx, %xmm0
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    shll $8, %edx
+; X86-SSE2-NEXT:    movzbl (%esp), %esi
+; X86-SSE2-NEXT:    orl %edx, %esi
+; X86-SSE2-NEXT:    pinsrw $0, %esi, %xmm0
 ; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT:    movd %ecx, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; X86-SSE2-NEXT:    andps .LCPI1_0, %xmm0
+; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
@@ -94,7 +104,9 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    movss %xmm1, 8(%eax)
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT:    popl %eax
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
@@ -123,11 +135,17 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE2-NEXT:    shll $8, %eax
+; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT:    orl %eax, %ecx
+; X64-SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
 ; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT:    movd %eax, %xmm1
-; X64-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; X64-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; X64-SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, (%rdi)
 ; X64-SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]