summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRoman Lebedev <lebedev.ri@gmail.com>2019-11-22 15:22:42 +0300
committerRoman Lebedev <lebedev.ri@gmail.com>2019-11-22 15:22:42 +0300
commit96cf5c8d4784cd8763977608e2890c0683ebf7b4 (patch)
tree86f0f171aeb85247f9766d3e3f6e409313f705d6
parent3f46022e33bd33b3d8f816be3c3adbe7de806119 (diff)
downloadbcm5719-llvm-96cf5c8d4784cd8763977608e2890c0683ebf7b4.tar.gz
bcm5719-llvm-96cf5c8d4784cd8763977608e2890c0683ebf7b4.zip
[Codegen] TargetLowering::prepareUREMEqFold(): `x u% C1 ==/!= C2` (PR35479)
Summary: The current lowering is: ``` Name: (X % C1) == C2 -> X * C3 <= C4 || false Pre: (C2 == 0 || C1 u<= C2) && (C1 u>> countTrailingZeros(C1)) * C3 == 1 %zz = and i8 C3, 0 ; trick alive into making C3 available in precondition %o0 = urem i8 %x, C1 %r = icmp eq i8 %o0, C2 => %zz = and i8 C3, 0 ; and silence it from complaining about said reg %C4 = -1 /u C1 %n0 = mul i8 %x, C3 %n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right %n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right %n3 = or i8 %n1, %n2 ; rotate right %is_tautologically_false = icmp ule i8 C1, C2 %C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4 %res = icmp ule i8 %n3, %C4_fixed %r = xor i1 %res, %is_tautologically_false ``` https://rise4fun.com/Alive/2xC https://rise4fun.com/Alive/jpb5 However, we can support non-tautological cases `C1 u> C2` too. Said handling consists of two parts: * `C2 u<= (-1 %u C1)`. It just works. We only have to change `(X % C1) == C2` into `((X - C2) % C1) == 0` ``` Name: (X % C1) == C2 -> (X - C2) * C3 <= C4 iff C2 u<= (-1 %u C1) Pre: (C1 u>> countTrailingZeros(C1)) * C3 == 1 && C2 u<= (-1 %u C1) %zz = and i8 C3, 0 ; trick alive into making C3 available in precondition %o0 = urem i8 %x, C1 %r = icmp eq i8 %o0, C2 => %zz = and i8 C3, 0 ; and silence it from complaining about said reg %C4 = (-1 /u C1) %n0 = sub i8 %x, C2 %n1 = mul i8 %n0, C3 %n2 = lshr i8 %n1, countTrailingZeros(C1) ; rotate right %n3 = shl i8 %n1, ((8-countTrailingZeros(C1)) %u 8) ; rotate right %n4 = or i8 %n2, %n3 ; rotate right %is_tautologically_false = icmp ule i8 C1, C2 %C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4 %res = icmp ule i8 %n4, %C4_fixed %r = xor i1 %res, %is_tautologically_false ``` https://rise4fun.com/Alive/m4P https://rise4fun.com/Alive/SKrx * `C2 u> (-1 %u C1)`. 
We also have to change `(X % C1) == C2` into `((X - C2) % C1) == 0`, and we have to decrement C4: ``` Name: (X % C1) == C2 -> (X - C2) * C3 <= C4 iff C2 u> (-1 %u C1) Pre: (C1 u>> countTrailingZeros(C1)) * C3 == 1 && C2 u> (-1 %u C1) %zz = and i8 C3, 0 ; trick alive into making C3 available in precondition %o0 = urem i8 %x, C1 %r = icmp eq i8 %o0, C2 => %zz = and i8 C3, 0 ; and silence it from complaining about said reg %C4 = (-1 /u C1)-1 %n0 = sub i8 %x, C2 %n1 = mul i8 %n0, C3 %n2 = lshr i8 %n1, countTrailingZeros(C1) ; rotate right %n3 = shl i8 %n1, ((8-countTrailingZeros(C1)) %u 8) ; rotate right %n4 = or i8 %n2, %n3 ; rotate right %is_tautologically_false = icmp ule i8 C1, C2 %C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4 %res = icmp ule i8 %n4, %C4_fixed %r = xor i1 %res, %is_tautologically_false ``` https://rise4fun.com/Alive/d40 https://rise4fun.com/Alive/8cF I believe this concludes `x u% C1 ==/!= C2` lowering. In fact, clang may now be better in this regard than gcc: as can be seen from the `@t32_6_4` test, we do lower `x % 6 == 4` via this pattern, while gcc does not: https://godbolt.org/z/XNU2z9 And all the general alive proofs say this is legal. And manual checking agrees: https://rise4fun.com/Alive/WA2 Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=35479 | PR35479 ]]. Reviewers: RKSimon, craig.topper, spatel Reviewed By: RKSimon Subscribers: nick, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70053
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp37
-rw-r--r--llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll164
-rw-r--r--llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll35
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-nonzero.ll356
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll211
5 files changed, 320 insertions, 483 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9a9ac690aaa..6f563c2a0ee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4943,7 +4943,7 @@ SDValue TargetLowering::buildUREMEqFold(EVT SETCCVT, SDValue REMNode,
ISD::CondCode Cond,
DAGCombinerInfo &DCI,
const SDLoc &DL) const {
- SmallVector<SDNode *, 4> Built;
+ SmallVector<SDNode *, 5> Built;
if (SDValue Folded = prepareUREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond,
DCI, DL, Built)) {
for (SDNode *N : Built)
@@ -4978,6 +4978,8 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
if (!isOperationLegalOrCustom(ISD::MUL, VT))
return SDValue();
+ bool ComparingWithAllZeros = true;
+ bool AllComparisonsWithNonZerosAreTautological = true;
bool HadTautologicalLanes = false;
bool AllLanesAreTautological = true;
bool HadEvenDivisor = false;
@@ -4993,6 +4995,8 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
const APInt &D = CDiv->getAPIntValue();
const APInt &Cmp = CCmp->getAPIntValue();
+ ComparingWithAllZeros &= Cmp.isNullValue();
+
// x u% C1` is *always* less than C1. So given `x u% C1 == C2`,
// if C2 is not less than C1, the comparison is always false.
// But we will only be able to produce the comparison that will give the
@@ -5000,12 +5004,6 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
bool TautologicalInvertedLane = D.ule(Cmp);
HadTautologicalInvertedLanes |= TautologicalInvertedLane;
- // If we are checking that remainder is something smaller than the divisor,
- // then this comparison isn't tautological. For now this is not handled,
- // other than the comparison that remainder is zero.
- if (!Cmp.isNullValue() && !TautologicalInvertedLane)
- return false;
-
// If all lanes are tautological (either all divisors are ones, or divisor
// is not greater than the constant we are comparing with),
// we will prefer to avoid the fold.
@@ -5013,6 +5011,12 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
HadTautologicalLanes |= TautologicalLane;
AllLanesAreTautological &= TautologicalLane;
+ // If we are comparing with non-zero, we'll need to subtract said
+ // comparison value from the LHS. But there is no point in doing that if
+ // every lane where we are comparing with non-zero is tautological.
+ if (!Cmp.isNullValue())
+ AllComparisonsWithNonZerosAreTautological &= TautologicalLane;
+
// Decompose D into D0 * 2^K
unsigned K = D.countTrailingZeros();
assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
@@ -5033,8 +5037,15 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
- // Q = floor((2^W - 1) / D)
- APInt Q = APInt::getAllOnesValue(W).udiv(D);
+ // Q = floor((2^W - 1) u/ D)
+ // R = ((2^W - 1) u% D)
+ APInt Q, R;
+ APInt::udivrem(APInt::getAllOnesValue(W), D, Q, R);
+
+ // If we are comparing with zero, then that comparison constant is okay,
+ // else it may need to be one less than that.
+ if (Cmp.ugt(R))
+ Q -= 1;
assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
"We are expecting that K is always less than all-ones for ShSVT");
@@ -5093,6 +5104,14 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
QVal = QAmts[0];
}
+ if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) {
+ if (!isOperationLegalOrCustom(ISD::SUB, VT))
+ return SDValue(); // FIXME: Could/should use `ISD::ADD`?
+ assert(CompTargetNode.getValueType() == N.getValueType() &&
+ "Expecting that the types on LHS and RHS of comparisons match.");
+ N = DAG.getNode(ISD::SUB, DL, VT, N, CompTargetNode);
+ }
+
// (mul N, P)
SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
Created.push_back(Op0.getNode());
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
index 6fe10ce5d5b..f29add83513 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
@@ -6,12 +6,10 @@ define i1 @t32_3_1(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #33
-; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #1 // =1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 3
%cmp = icmp eq i32 %urem, 1
@@ -23,12 +21,11 @@ define i1 @t32_3_2(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #33
-; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-1431655766
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 3
%cmp = icmp eq i32 %urem, 2
@@ -41,12 +38,10 @@ define i1 @t32_5_1(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #1 // =1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 1
@@ -58,12 +53,11 @@ define i1 @t32_5_2(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #1717986918
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 2
@@ -75,12 +69,11 @@ define i1 @t32_5_3(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #3 // =3
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-1717986919
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 3
@@ -92,12 +85,11 @@ define i1 @t32_5_4(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: cmp w8, #4 // =4
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-858993460
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 4
@@ -110,12 +102,13 @@ define i1 @t32_6_1(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #1 // =1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 1
@@ -127,12 +120,13 @@ define i1 @t32_6_2(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w9, #-1431655766
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 2
@@ -144,12 +138,13 @@ define i1 @t32_6_3(i32 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #3 // =3
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: sub w8, w8, #1 // =1
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 3
@@ -160,13 +155,15 @@ define i1 @t32_6_4(i32 %X) nounwind {
; CHECK-LABEL: t32_6_4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: mov w9, #21844
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #4 // =4
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: movk w9, #21845, lsl #16
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43690
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 4
@@ -177,13 +174,15 @@ define i1 @t32_6_5(i32 %X) nounwind {
; CHECK-LABEL: t32_6_5:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: mov w9, #43689
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #34
-; CHECK-NEXT: mov w9, #6
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: cmp w8, #5 // =5
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: movk w9, #43690, lsl #16
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #43690
+; CHECK-NEXT: ror w8, w8, #1
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 5
@@ -199,12 +198,11 @@ define i1 @t16_3_2(i16 %X) nounwind {
; CHECK-NEXT: mov w9, #43691
; CHECK-NEXT: and w8, w0, #0xffff
; CHECK-NEXT: movk w9, #43690, lsl #16
-; CHECK-NEXT: umull x9, w8, w9
-; CHECK-NEXT: lsr x9, x9, #33
-; CHECK-NEXT: add w9, w9, w9, lsl #1
-; CHECK-NEXT: sub w8, w8, w9
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w10, #-1431655766
+; CHECK-NEXT: madd w8, w8, w9, w10
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i16 %X, 3
%cmp = icmp eq i16 %urem, 2
@@ -217,12 +215,11 @@ define i1 @t8_3_2(i8 %X) nounwind {
; CHECK-NEXT: mov w9, #43691
; CHECK-NEXT: and w8, w0, #0xff
; CHECK-NEXT: movk w9, #43690, lsl #16
-; CHECK-NEXT: umull x9, w8, w9
-; CHECK-NEXT: lsr x9, x9, #33
-; CHECK-NEXT: add w9, w9, w9, lsl #1
-; CHECK-NEXT: sub w8, w8, w9
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w10, #-1431655766
+; CHECK-NEXT: madd w8, w8, w9, w10
+; CHECK-NEXT: mov w9, #1431655765
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i8 %X, 3
%cmp = icmp eq i8 %urem, 2
@@ -234,12 +231,11 @@ define i1 @t64_3_2(i64 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-6148914691236517206
; CHECK-NEXT: movk x8, #43691
-; CHECK-NEXT: umulh x8, x0, x8
-; CHECK-NEXT: lsr x8, x8, #1
-; CHECK-NEXT: add x8, x8, x8, lsl #1
-; CHECK-NEXT: sub x8, x0, x8
-; CHECK-NEXT: cmp x8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov x9, #-6148914691236517206
+; CHECK-NEXT: madd x8, x0, x8, x9
+; CHECK-NEXT: mov x9, #6148914691236517205
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i64 %X, 3
%cmp = icmp eq i64 %urem, 2
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
index f45b5598eae..5ee7c2a9aee 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -4,18 +4,16 @@
define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_3:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: adrp x9, .LCPI0_1
; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI0_0
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #3
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 3, i32 3, i32 3, i32 3>
@@ -26,18 +24,17 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_5:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI1_0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: movi v3.4s, #5
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: mov w9, #13106
+; CHECK-NEXT: movk w9, #13107, lsl #16
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: dup v1.4s, w9
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 5, i32 5>
diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
index f8a7d7ba519..d52d47e46f0 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
@@ -5,27 +5,18 @@
define i1 @t32_3_1(i32 %X) nounwind {
; X86-LABEL: t32_3_1:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_3_1:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $33, %rcx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $1, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 3
%cmp = icmp eq i32 %urem, 1
@@ -35,27 +26,18 @@ define i1 @t32_3_1(i32 %X) nounwind {
define i1 @t32_3_2(i32 %X) nounwind {
; X86-LABEL: t32_3_2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_3_2:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $33, %rcx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $2, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 3
%cmp = icmp eq i32 %urem, 2
@@ -66,27 +48,18 @@ define i1 @t32_3_2(i32 %X) nounwind {
define i1 @t32_5_1(i32 %X) nounwind {
; X86-LABEL: t32_5_1:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_5_1:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $1, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 1
@@ -96,27 +69,18 @@ define i1 @t32_5_1(i32 %X) nounwind {
define i1 @t32_5_2(i32 %X) nounwind {
; X86-LABEL: t32_5_2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $1717986918, %eax # imm = 0x66666666
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_5_2:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $2, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $1717986918, %eax # imm = 0x66666666
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 2
@@ -126,27 +90,18 @@ define i1 @t32_5_2(i32 %X) nounwind {
define i1 @t32_5_3(i32 %X) nounwind {
; X86-LABEL: t32_5_3:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $3, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $-1717986919, %eax # imm = 0x99999999
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_5_3:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $3, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $-1717986919, %eax # imm = 0x99999999
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 3
@@ -156,27 +111,18 @@ define i1 @t32_5_3(i32 %X) nounwind {
define i1 @t32_5_4(i32 %X) nounwind {
; X86-LABEL: t32_5_4:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
-; X86-NEXT: leal (%edx,%edx,4), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $4, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
+; X86-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_5_4:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: leal (%rcx,%rcx,4), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $4, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD
+; X64-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC
+; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 4
@@ -187,29 +133,20 @@ define i1 @t32_5_4(i32 %X) nounwind {
define i1 @t32_6_1(i32 %X) nounwind {
; X86-LABEL: t32_6_1:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_6_1:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $1, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 1
@@ -219,29 +156,20 @@ define i1 @t32_6_1(i32 %X) nounwind {
define i1 @t32_6_2(i32 %X) nounwind {
; X86-LABEL: t32_6_2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_6_2:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $2, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 2
@@ -251,29 +179,20 @@ define i1 @t32_6_2(i32 %X) nounwind {
define i1 @t32_6_3(i32 %X) nounwind {
; X86-LABEL: t32_6_3:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $3, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: decl %eax
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_6_3:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $3, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: decl %eax
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 3
@@ -283,29 +202,20 @@ define i1 @t32_6_3(i32 %X) nounwind {
define i1 @t32_6_4(i32 %X) nounwind {
; X86-LABEL: t32_6_4:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $4, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $1431655764, %eax # imm = 0x55555554
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_6_4:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $4, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $1431655764, %eax # imm = 0x55555554
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 4
@@ -315,29 +225,20 @@ define i1 @t32_6_4(i32 %X) nounwind {
define i1 @t32_6_5(i32 %X) nounwind {
; X86-LABEL: t32_6_5:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl $-2, %edx
-; X86-NEXT: leal (%edx,%edx,2), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: cmpl $5, %ecx
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB
+; X86-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9
+; X86-NEXT: rorl %eax
+; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t32_6_5:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: shrq $34, %rcx
-; X64-NEXT: addl %ecx, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpl $5, %edi
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9
+; X64-NEXT: rorl %eax
+; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i32 %X, 6
%cmp = icmp eq i32 %urem, 5
@@ -350,24 +251,20 @@ define i1 @t32_6_5(i32 %X) nounwind {
define i1 @t16_3_2(i16 %X) nounwind {
; X86-LABEL: t16_3_2:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $43691, %eax, %ecx # imm = 0xAAAB
-; X86-NEXT: shrl $17, %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: cmpw $2, %ax
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-21845, {{[0-9]+}}(%esp), %eax # imm = 0xAAAB
+; X86-NEXT: addl $-21846, %eax # imm = 0xAAAA
+; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: cmpl $21845, %eax # imm = 0x5555
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t16_3_2:
; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
-; X64-NEXT: shrl $17, %eax
-; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: subl %eax, %edi
-; X64-NEXT: cmpw $2, %di
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-21845, %edi, %eax # imm = 0xAAAB
+; X64-NEXT: addl $-21846, %eax # imm = 0xAAAA
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: cmpl $21845, %eax # imm = 0x5555
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i16 %X, 3
%cmp = icmp eq i16 %urem, 2
@@ -377,24 +274,18 @@ define i1 @t16_3_2(i16 %X) nounwind {
define i1 @t8_3_2(i8 %X) nounwind {
; X86-LABEL: t8_3_2:
; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %ecx
-; X86-NEXT: shrl $9, %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: subb %cl, %al
-; X86-NEXT: cmpb $2, %al
-; X86-NEXT: sete %al
+; X86-NEXT: imull $-85, {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addb $-86, %al
+; X86-NEXT: cmpb $85, %al
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t8_3_2:
; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %ecx
-; X64-NEXT: shrl $9, %ecx
-; X64-NEXT: leal (%rcx,%rcx,2), %ecx
-; X64-NEXT: subb %cl, %al
-; X64-NEXT: cmpb $2, %al
-; X64-NEXT: sete %al
+; X64-NEXT: imull $-85, %edi, %eax
+; X64-NEXT: addb $-86, %al
+; X64-NEXT: cmpb $85, %al
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i8 %X, 3
%cmp = icmp eq i8 %urem, 2
@@ -419,14 +310,13 @@ define i1 @t64_3_2(i64 %X) nounwind {
;
; X64-LABEL: t64_3_2:
; X64: # %bb.0:
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
-; X64-NEXT: leaq (%rdx,%rdx,2), %rax
-; X64-NEXT: subq %rax, %rdi
-; X64-NEXT: cmpq $2, %rdi
-; X64-NEXT: sete %al
+; X64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: imulq %rdi, %rax
+; X64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; X64-NEXT: cmpq %rax, %rcx
+; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i64 %X, 3
%cmp = icmp eq i64 %urem, 2
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index b580e39c728..b3814a1b3f6 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -8,77 +8,52 @@
define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t32_3:
; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $1, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: t32_3:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655764,1431655764,1431655764]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_3:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: t32_3:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: t32_3:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
%urem = urem <4 x i32> %X, <i32 3, i32 3, i32 3, i32 3>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 2>
@@ -88,77 +63,53 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t32_5:
; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pslld $2, %xmm1
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: t32_5:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_5:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: t32_5:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: t32_5:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 5, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 1, i32 2, i32 3, i32 4>
@@ -233,16 +184,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: t32_6_part0:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 3>
@@ -317,16 +263,11 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: t32_6_part1:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
%cmp = icmp eq <4 x i32> %urem, <i32 4, i32 5, i32 0, i32 0>
@@ -415,18 +356,12 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: t32_tautological:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; CHECK-AVX512VL-NEXT: retq
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 3>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 2>
OpenPOWER on IntegriCloud