-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp |  2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp     | 67
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td         | 54
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp             |  2
-rw-r--r--  llvm/test/CodeGen/X86/combine-sdiv.ll               |  3
-rw-r--r--  llvm/test/CodeGen/X86/combine-udiv.ll               | 48
-rw-r--r--  llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll    | 12
7 files changed, 67 insertions, 121 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index b4664f49201..ec0a0ace36f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -330,6 +330,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
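For reference, ISD::MULHS and ISD::MULHU compute the high half of the widening signed/unsigned multiply of each vector element. A minimal scalar sketch of the per-element semantics (illustrative helpers, not LLVM API):

#include <cstdint>

// ISD::MULHS / ISD::MULHU on i16 elements: the top 16 bits of the
// 32-bit widening product.
int16_t mulhs_i16(int16_t a, int16_t b) {
  return static_cast<int16_t>((static_cast<int32_t>(a) * b) >> 16);
}
uint16_t mulhu_i16(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
}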
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f3e69d4d551..6149d82d38f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -714,8 +714,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
- setOperationAction(ISD::MULHS, VT, Custom);
- setOperationAction(ISD::MULHU, VT, Custom);
+ setOperationAction(ISD::MULHS, VT, Legal);
+ setOperationAction(ISD::MULHU, VT, Legal);
} else {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
@@ -2670,66 +2670,6 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
-// Lower vector multiply high (ISD::MULHS and ISD::MULHU).
-static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
- // Multiplications are only custom-lowered for 128-bit vectors so that
- // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not
- // legal.
- EVT VT = Op.getValueType();
- assert(VT.is128BitVector() && VT.isInteger() &&
- "unexpected type for custom-lowering ISD::MULH{U,S}");
-
- SDValue V0 = Op.getOperand(0);
- SDValue V1 = Op.getOperand(1);
-
- SDLoc DL(Op);
-
- EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
-
- // We turn (V0 mulhs/mulhu V1) into:
- //
- // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
- //              (extract_subvector (ExtractVT V128:V1, (i64 0))))),
- //       (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
- //              (extract_subvector (ExtractVT V128:V1, (i64 VMull2Idx))))))
- //
- // where ExtractVT is a subvector type with half the number of elements,
- // and VMull2Idx is the index of the middle element (the start of the
- // high half).
- //
- // The high-part extract and multiply will be matched against
- // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64}, which in turn will
- // issue a {s,u}mull2 instruction.
- //
- // In effect this multiplies the lower subvector with '{s,u}mull', the
- // higher subvector with '{s,u}mull2', and shuffles the high parts of
- // both results into the resulting vector.
- unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2;
- SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
- SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
-
- SDValue VMullV0 =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
- SDValue VMullV1 =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
-
- SDValue VMull2V0 =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
- SDValue VMull2V1 =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
-
- unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
- : AArch64ISD::UMULL;
-
- EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
- SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
- SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
-
- Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
- Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
-
- return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
-}
-
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
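The removed lowering maps directly onto a scalar reference. Here is a sketch for the v4i32 mulhs case (a hypothetical helper modeling the comment in the deleted code, not LLVM code):

#include <cstdint>

// Model of the deleted LowerMULH for v4i32 mulhs: widen-multiply the low
// half-vector (smull) and the high half-vector (smull2), then keep the
// high i32 lane of every i64 product -- the uzp2 step on the bitcast view.
void mulhs_v4i32_ref(const int32_t v0[4], const int32_t v1[4], int32_t out[4]) {
  int64_t mull[2], mull2[2];
  for (int i = 0; i < 2; ++i) {
    mull[i]  = static_cast<int64_t>(v0[i])     * v1[i];      // smull
    mull2[i] = static_cast<int64_t>(v0[i + 2]) * v1[i + 2];  // smull2
  }
  for (int i = 0; i < 2; ++i) {
    out[i]     = static_cast<int32_t>(mull[i]  >> 32);  // uzp2: lanes of Mull
    out[i + 2] = static_cast<int32_t>(mull2[i] >> 32);  // uzp2: lanes of Mull2
  }
}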
@@ -2932,9 +2872,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
- case ISD::MULHS:
- case ISD::MULHU:
- return LowerMULH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 84aedaafffe..cfe05822fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4108,25 +4108,6 @@ defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
-// Patterns for smull2/umull2.
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
- Instruction INST8B, Instruction INST4H, Instruction INST2S> {
- def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
- (extract_high_v16i8 V128:$Rm))),
- (INST8B V128:$Rn, V128:$Rm)>;
- def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
- (extract_high_v8i16 V128:$Rm))),
- (INST4H V128:$Rn, V128:$Rm)>;
- def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
- (extract_high_v4i32 V128:$Rm))),
- (INST2S V128:$Rn, V128:$Rm)>;
-}
-
-defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
- SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
-defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
- UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
-
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
@@ -5990,6 +5971,41 @@ def : Pat<(i32 (trunc GPR64sp:$src)),
// __builtin_trap() uses the BRK instruction on AArch64.
def : Pat<(trap), (BRK 1)>;
+// Multiply-high patterns which multiply the lower subvector using smull/umull
+// and the upper subvector with smull2/umull2, then shuffle the high parts of
+// both results together.
+def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)),
+ (UZP2v16i8
+ (SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)),
+ (UZP2v8i16
+ (SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
+ (UZP2v4i32
+ (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
+def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
+ (UZP2v16i8
+ (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)),
+ (UZP2v8i16
+ (UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
+ (UZP2v4i32
+ (UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
// Conversions within AdvSIMD types in the same register size are free.
// But because we need a consistent lane ordering, in big endian many
// conversions require one or more REV instructions.
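For the v8i16 case, the sequence these patterns select corresponds to the following ACLE intrinsics (a sketch using standard arm_neon.h calls; the wrapper name is illustrative):

#include <arm_neon.h>

// smull on the low half, smull2 on the high half, then uzp2 on the i16
// view to keep the top half of every 32-bit product.
int16x8_t mulhs_v8i16(int16x8_t rn, int16x8_t rm) {
  int32x4_t lo = vmull_s16(vget_low_s16(rn), vget_low_s16(rm)); // smull
  int32x4_t hi = vmull_high_s16(rn, rm);                        // smull2
  return vuzp2q_s16(vreinterpretq_s16_s32(lo),
                    vreinterpretq_s16_s32(hi));                 // uzp2
}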
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bebbb90ae81..da816c41981 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11529,7 +11529,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
SDValue BC = peekThroughBitcasts(V);
// Also check the simpler case, where we can directly reuse the scalar.
- if (V.getOpcode() == ISD::BUILD_VECTOR ||
+ if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index a10d9c57e41..d77dab8a9fb 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -3146,8 +3146,9 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; XOP-NEXT: vpmovsxbw %xmm1, %xmm1
; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
+; XOP-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 304bd7329ed..4dea599532e 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -647,18 +647,17 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movl $255, %eax
-; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movl $171, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pmullw %xmm2, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movl $255, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform4:
@@ -670,12 +669,9 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT: psllw $1, %xmm0
+; SSE41-NEXT: psllw $8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -690,12 +686,9 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -725,13 +718,10 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; XOP-NEXT: movl $171, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; XOP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; XOP-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
+; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; XOP-NEXT: movl $249, %eax
; XOP-NEXT: vmovd %eax, %xmm2
; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
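The constant 171 (0xAB) in these checks is a classic unsigned magic multiplier: 3 * 171 = 513 = 2^9 + 1, so for any 8-bit x, x / 3 == (x * 171) >> 9. A sketch of that identity (an illustrative helper; the divisors of the test itself, and the exact shift placement on widened lanes, aren't shown in these hunks):

#include <cassert>
#include <cstdint>

// For any 8-bit x: x / 3 == (x * 171) >> 9, since 3 * 171 == 2^9 + 1.
uint8_t udiv3_u8(uint8_t x) {
  return static_cast<uint8_t>((static_cast<uint32_t>(x) * 171u) >> 9);
}

int main() {
  for (unsigned x = 0; x < 256; ++x)
    assert(udiv3_u8(static_cast<uint8_t>(x)) == x / 3);
}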
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 82385386c88..4364d3d9363 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -629,12 +629,12 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
;
; CHECK-AVX1-LABEL: test_urem_both:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61]
-; CHECK-AVX1-NEXT: # xmm1 = mem[0,0]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
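Likewise, 2863311531 = 0xAAAAAAAB = ceil(2^33 / 3) and 3435973837 = 0xCCCCCCCD = ceil(2^34 / 5) are the standard 32-bit magic multipliers: the vpmuludq/vpshufd pairs compute the high 32 bits of each 64-bit product, and vpsrld applies the remaining shift. A sketch of the /3 form (illustrative helper; the test's own divisors follow from its IR, which this hunk doesn't show):

#include <cassert>
#include <cstdint>

// 0xAAAAAAAB: 3 * 2863311531 == 2^33 + 1, so x / 3 == (x * m) >> 33.
uint32_t udiv3_u32(uint32_t x) {
  uint32_t hi = static_cast<uint32_t>(
      (static_cast<uint64_t>(x) * 2863311531u) >> 32); // high half (vpmuludq)
  return hi >> 1;                                      // remaining shift
}

int main() {
  assert(udiv3_u32(9u) == 3u);
  assert(udiv3_u32(4294967295u) == 1431655765u);
}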
@@ -645,7 +645,7 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
;
; CHECK-AVX2-LABEL: test_urem_both:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
@@ -661,7 +661,7 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
;
; CHECK-AVX512VL-LABEL: test_urem_both:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2