author     Craig Topper <craig.topper@intel.com>    2018-08-25 18:01:24 +0000
committer  Craig Topper <craig.topper@intel.com>    2018-08-25 18:01:24 +0000
commit     ebec2793d1768d655a4afab19420ab8efb9d8d89 (patch)
tree       4eacd21a69944e51886dbe51e2c4e39f9d105605
parent     a11a3b381806e2bd4e54ed1dfa32b57723e30714 (diff)
[X86] Replace support for vXi32 SMUL_LOHI/UMUL_LOHI with MULHS/MULHU support instead.
Summary:
The only time vector SMUL_LOHI/UMUL_LOHI nodes are created is during division/remainder lowering. If such a node is created before op legalization, generic DAGCombine immediately turns it into MULHS/MULHU, since only the upper half is used. That node sticks around through vector op legalization, gets turned back into UMUL_LOHI/SMUL_LOHI during op legalization, and is then custom lowered by the X86 backend. Due to this two-step lowering, the vector shuffles created by the custom lowering get legalized after their inputs rather than before, which prevents the shuffles from being combined with any build_vector of constants.

This patch changes vXi32 to use MULHS/MULHU instead, which is what the later DAG combine did anyway. But by skipping the change back to UMUL_LOHI/SMUL_LOHI, we lower it before any constant BUILD_VECTORS. This allows the vector_shuffle creation to constant fold with the build_vectors, which accounts for the test changes here.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D51254

llvm-svn: 340690
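[Editor's note] As background for the summary above: these MULHU/MULHS nodes arise because division by a constant is strength-reduced to a multiply that keeps only the high half of the widened product, which is exactly what ISD::MULHU/MULHS compute per lane. Below is a minimal scalar sketch of the udiv-by-7 pattern the test files exercise; the magic constant 613566757 (0x24924925) comes from those tests, while the helper names are illustrative, not LLVM APIs.

    #include <cassert>
    #include <cstdint>

    // High 32 bits of the full 32x32->64 unsigned product -- the scalar
    // analogue of one ISD::MULHU lane (one vpmuludq result lane below).
    static uint32_t mulhu(uint32_t a, uint32_t b) {
      return (uint32_t)(((uint64_t)a * b) >> 32);
    }

    // n / 7 without a divide: multiply-high by the magic constant, then a
    // subtract/shift/add fix-up because the magic is rounded down. This is
    // the vpmuludq / vpsubd / vpsrld $1 / vpaddd / vpsrld $2 sequence in
    // the udiv tests.
    static uint32_t udiv7(uint32_t n) {
      uint32_t q = mulhu(n, 613566757u); // 0x24924925
      uint32_t t = (n - q) >> 1;
      return (t + q) >> 2;
    }

    int main() {
      assert(udiv7(6) == 0 && udiv7(7) == 1 && udiv7(70) == 10);
      assert(udiv7(4294967295u) == 613566756u);
      return 0;
    }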
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        183
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll    96
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll   116
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll     2
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll    26
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll   110
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll     2
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv.ll             40
-rw-r--r--  llvm/test/CodeGen/X86/vselect-avx.ll             13
9 files changed, 259 insertions, 329 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 345ff72729f..fe4f157fa86 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -782,8 +782,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
- setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
@@ -1087,9 +1087,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
- setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
-
+ setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
@@ -1331,8 +1330,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
- setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
@@ -22901,6 +22900,75 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
+ assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
+ (VT == MVT::v16i32 && Subtarget.hasAVX512()));
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+
+ int NumElts = VT.getVectorNumElements();
+
+ // PMULxD operations multiply each even value (starting at 0) of LHS with
+ // the related value of RHS and produce a widened result.
+ // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ //
+ // In other words, to have all the results, we need to perform two PMULxD:
+ // 1. one with the even values.
+ // 2. one with the odd values.
+ // To achieve #2, we need to place the odd values at an even position.
+ //
+ // Place the odd values at an even position (basically, shift all values 1
+ // step to the left):
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
+ 9, -1, 11, -1, 13, -1, 15, -1};
+ // <a|b|c|d> => <b|undef|d|undef>
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+ makeArrayRef(&Mask[0], NumElts));
+ // <e|f|g|h> => <f|undef|h|undef>
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+ makeArrayRef(&Mask[0], NumElts));
+
+ // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+ // ints.
+ MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
+ bool IsSigned = Op->getOpcode() == ISD::MULHS;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Op0),
+ DAG.getBitcast(MulVT, Op1)));
+ // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+ // => <2 x i64> <bf|dh>
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Odd0),
+ DAG.getBitcast(MulVT, Odd1)));
+
+ // Shuffle it back into the right order.
+ SmallVector<int, 16> ShufMask(NumElts);
+ for (int i = 0; i != NumElts; ++i)
+ ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
+
+ SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
+
+ // If we have a signed multiply but no PMULDQ, fix up the result of an
+ // unsigned multiply.
+ if (IsSigned && !Subtarget.hasSSE41()) {
+ SDValue ShAmt = DAG.getConstant(31, dl, VT);
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
+ }
+
+ return Res;
+ }
+
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
@@ -23084,105 +23152,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const
return DAG.getBitcast(VT, CallInfo.first);
}
-static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
- MVT VT = Op0.getSimpleValueType();
- SDLoc dl(Op);
-
- // Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- unsigned Opcode = Op.getOpcode();
- unsigned NumElems = VT.getVectorNumElements();
- MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
- SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
- SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
- SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
- SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
- SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
- SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
- SDValue Ops[] = {
- DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
- DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
- };
- return DAG.getMergeValues(Ops, dl);
- }
-
- assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
- (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
- (VT == MVT::v16i32 && Subtarget.hasAVX512()));
-
- int NumElts = VT.getVectorNumElements();
-
- // PMULxD operations multiply each even value (starting at 0) of LHS with
- // the related value of RHS and produce a widen result.
- // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
- // => <2 x i64> <ae|cg>
- //
- // In other word, to have all the results, we need to perform two PMULxD:
- // 1. one with the even values.
- // 2. one with the odd values.
- // To achieve #2, with need to place the odd values at an even position.
- //
- // Place the odd value at an even position (basically, shift all values 1
- // step to the left):
- const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
- // <a|b|c|d> => <b|undef|d|undef>
- SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
- makeArrayRef(&Mask[0], NumElts));
- // <e|f|g|h> => <f|undef|h|undef>
- SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
- makeArrayRef(&Mask[0], NumElts));
-
- // Emit two multiplies, one for the lower 2 ints and one for the higher 2
- // ints.
- MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
- bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
- unsigned Opcode =
- (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
- // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
- // => <2 x i64> <ae|cg>
- SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
- DAG.getBitcast(MulVT, Op0),
- DAG.getBitcast(MulVT, Op1)));
- // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
- // => <2 x i64> <bf|dh>
- SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
- DAG.getBitcast(MulVT, Odd0),
- DAG.getBitcast(MulVT, Odd1)));
-
- // Shuffle it back into the right order.
- SmallVector<int, 16> HighMask(NumElts);
- SmallVector<int, 16> LowMask(NumElts);
- for (int i = 0; i != NumElts; ++i) {
- HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
- LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
- }
-
- SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
-
- // If we have a signed multiply but no PMULDQ fix up the high parts of a
- // unsigned multiply.
- if (IsSigned && !Subtarget.hasSSE41()) {
- SDValue ShAmt = DAG.getConstant(
- 31, dl,
- DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
- SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
- SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
-
- SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
- Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
- }
-
- // The first result of MUL_LOHI is actually the low value, followed by the
- // high value.
- SDValue Ops[] = {Lows, Highs};
- return DAG.getMergeValues(Ops, dl);
-}
-
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
@@ -25579,8 +25548,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
- case ISD::UMUL_LOHI:
- case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
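[Editor's note] For reference, here is a self-contained scalar model of the v4i32 lowering added above: the two PMULUDQ-style widening multiplies on the even and odd lanes, the interleaving shuffle ShufMask[i] = (i/2)*2 + (i%2)*NumElts + 1, and the SRA/AND/ADD/SUB fix-up used when PMULDQ (SSE4.1) is unavailable. This is a sketch of the algorithm, not the LLVM implementation itself.

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Scalar model of the v4i32 MULHU lowering (NumElts = 4).
    static std::array<uint32_t, 4> mulhu_v4i32(const std::array<uint32_t, 4> &a,
                                               const std::array<uint32_t, 4> &b) {
      // PMULUDQ-style widening multiplies: one on the even lanes, one on the
      // odd lanes after they have been shuffled into even positions.
      uint64_t ae = (uint64_t)a[0] * b[0], cg = (uint64_t)a[2] * b[2];
      uint64_t bf = (uint64_t)a[1] * b[1], dh = (uint64_t)a[3] * b[3];
      // Mul1/Mul2 bitcast from v2i64 back to v4i32: lanes are {lo,hi,lo,hi}.
      uint32_t Mul1[4] = {(uint32_t)ae, (uint32_t)(ae >> 32),
                          (uint32_t)cg, (uint32_t)(cg >> 32)};
      uint32_t Mul2[4] = {(uint32_t)bf, (uint32_t)(bf >> 32),
                          (uint32_t)dh, (uint32_t)(dh >> 32)};
      // ShufMask[i] = (i/2)*2 + (i%2)*NumElts + 1 selects the high halves
      // {1, 5, 3, 7} from the concatenation Mul1 ++ Mul2.
      std::array<uint32_t, 4> Res;
      for (int i = 0; i != 4; ++i) {
        int M = (i / 2) * 2 + (i % 2) * 4 + 1;
        Res[i] = M < 4 ? Mul1[M] : Mul2[M - 4];
      }
      return Res;
    }

    // Signed high half without PMULDQ: mulhs(a,b) = mulhu(a,b)
    //   - ((a >> 31) & b) - ((b >> 31) & a)   (all arithmetic mod 2^32),
    // which is the SRA/AND/ADD/SUB fix-up emitted when
    // IsSigned && !Subtarget.hasSSE41().
    static std::array<int32_t, 4> mulhs_v4i32(const std::array<int32_t, 4> &a,
                                              const std::array<int32_t, 4> &b) {
      std::array<uint32_t, 4> au, bu;
      for (int i = 0; i != 4; ++i) {
        au[i] = (uint32_t)a[i];
        bu[i] = (uint32_t)b[i];
      }
      std::array<uint32_t, 4> Hi = mulhu_v4i32(au, bu);
      std::array<int32_t, 4> Res;
      for (int i = 0; i != 4; ++i) {
        // a >> 31 is 0 or all-ones, so the AND keeps b exactly when a < 0.
        uint32_t T1 = (uint32_t)(a[i] >> 31) & bu[i];
        uint32_t T2 = (uint32_t)(b[i] >> 31) & au[i];
        Res[i] = (int32_t)(Hi[i] - (T1 + T2));
      }
      return Res;
    }

    int main() {
      std::array<int32_t, 4> a = {-7, 100000, INT32_MIN, 3};
      std::array<int32_t, 4> b = {3, -50000, -1, 7};
      std::array<int32_t, 4> h = mulhs_v4i32(a, b);
      for (int i = 0; i != 4; ++i)
        assert(h[i] == (int32_t)(((int64_t)a[i] * b[i]) >> 32));
      return 0;
    }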
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 3f251dd8d62..159bb762a87 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -78,22 +78,19 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
@@ -134,13 +131,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
@@ -384,33 +380,30 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrld $31, %xmm2
-; SSE2-NEXT: psrad $2, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrld $31, %xmm1
+; SSE2-NEXT: psrad $2, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
@@ -448,13 +441,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
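[Editor's note] The sdiv tests above follow the signed magic-number recipe. A scalar sketch (the magic constant 2454267027 = 0x92492493 is taken from the CHECK lines; the helper names are illustrative) shows how the multiply-high result becomes a signed quotient and remainder:

    #include <cassert>
    #include <cstdint>

    // High 32 bits of the signed 32x32->64 product: one ISD::MULHS lane.
    static int32_t mulhs(int32_t a, int32_t b) {
      return (int32_t)(((int64_t)a * b) >> 32);
    }

    // n / 7 for signed n, matching the test sequences above: the magic
    // constant is negative as an i32, so the dividend is added back
    // (vpaddd), then the quotient is formed from an arithmetic shift plus
    // the sign bit (vpsrad $2 and vpsrld $31).
    static int32_t sdiv7(int32_t n) {
      int32_t q = mulhs(n, (int32_t)2454267027u); // 0x92492493
      q += n;
      return (q >> 2) + (int32_t)((uint32_t)q >> 31);
    }

    // n % 7 as in test_rem7_4i32: multiply the quotient back by 7
    // (vpmulld) and subtract (vpsubd).
    static int32_t srem7(int32_t n) { return n - 7 * sdiv7(n); }

    int main() {
      for (int32_t n : {-2147483647, -100, -7, -6, -1, 0, 1, 6, 7, 100}) {
        assert(sdiv7(n) == n / 7);
        assert(srem7(n) == n % 7);
      }
      return 0;
    }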
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 5df4d09e971..3a4b2f3be82 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -88,41 +88,37 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
-; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
@@ -363,46 +359,42 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm4
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
-; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
-; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 893c7d1bbd7..c6fabd54d3d 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -86,7 +86,6 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
@@ -313,7 +312,6 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 598782ddd63..329a9295b29 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -128,13 +128,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -422,13 +421,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 377ff5ea77a..668c6af8015 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -96,41 +96,37 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -371,46 +367,42 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
-; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 22c359cb7e9..2152d3f593b 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -94,7 +94,6 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
@@ -324,7 +323,6 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
diff --git a/llvm/test/CodeGen/X86/vector-idiv.ll b/llvm/test/CodeGen/X86/vector-idiv.ll
index 205cb2d5e9c..50090b4f819 100644
--- a/llvm/test/CodeGen/X86/vector-idiv.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv.ll
@@ -24,24 +24,19 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
; SSE2-LABEL: PR20355:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: psubd %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: PR20355:
@@ -71,13 +66,12 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: PR20355:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index be23d4b4195..26214fd5377 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -106,13 +106,12 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
;
; AVX2-LABEL: test3:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
+; AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmuldq %xmm4, %xmm0, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4
; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]