| author | Nikita Popov <nikita.ppv@gmail.com> | 2019-02-07 21:02:22 +0000 |
|---|---|---|
| committer | Nikita Popov <nikita.ppv@gmail.com> | 2019-02-07 21:02:22 +0000 |
| commit | 9d7e86a9782a10170efb73483e7ffb8bd077386c | |
| tree | 1bb13711671f079da53450eadc72213d5e31a4d8 | |
| parent | be9b65d89d3905d164196dfab4e917ce47873da3 | |
[CodeGen] Handle vector UADDO, SADDO, USUBO, SSUBO
This is part of https://bugs.llvm.org/show_bug.cgi?id=40442.
Vector type legalization is implemented for the add/sub overflow opcodes.
UMULO/SMULO are also handled as far as type legalization is concerned, but
vector expansion is not supported for them yet (so there are no tests for them).
The vector result widening implementation is suboptimal, because it
could result in a legalization loop.
Differential Revision: https://reviews.llvm.org/D57639
llvm-svn: 353464
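
To illustrate what this enables, here is a minimal IR sketch in the style of the new tests (the function and value names are made up for illustration): the overflow result of the `*.with.overflow` intrinsics is now a vector of i1, one overflow bit per lane, and can be legalized like any other vector value.

```llvm
; Vector form of the overflow intrinsic: the second struct member is a
; <4 x i1> mask holding one overflow bit per lane.
declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)

define <4 x i32> @example_uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p) {
  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0   ; per-lane sums
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1  ; per-lane overflow bits
  %res = sext <4 x i1> %obit to <4 x i32>           ; expand the mask to full lanes
  store <4 x i32> %val, <4 x i32>* %p
  ret <4 x i32> %res
}
```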
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 2 |
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 |
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 155 |
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 8 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/vec_uaddo.ll | 319 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/saddo.ll | 24 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/ssubo.ll | 22 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/uaddo.ll | 23 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/usubo.ll | 20 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_saddo.ll | 2028 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_ssubo.ll | 2078 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_uaddo.ll | 1381 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_usubo.ll | 1422 |
13 files changed, 7480 insertions, 6 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 9873b1e3e84..aee4194b86e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -881,7 +881,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {

   // Calculate the overflow flag: zero extend the arithmetic result from
   // the original type.
-  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT.getScalarType());

   // Overflowed if and only if this is not equal to Res.
   Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a2a8d164f28..ace46e91358 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -674,6 +674,7 @@ private:
   SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
   SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
   SDValue ScalarizeVecRes_StrictFPOp(SDNode *N);
+  SDValue ScalarizeVecRes_OverflowOp(SDNode *N, unsigned ResNo);
   SDValue ScalarizeVecRes_InregOp(SDNode *N);
   SDValue ScalarizeVecRes_VecInregOp(SDNode *N);

@@ -728,6 +729,8 @@ private:
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+                              SDValue &Lo, SDValue &Hi);

   void SplitVecRes_MULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);

@@ -809,6 +812,7 @@ private:
   SDValue WidenVecRes_Binary(SDNode *N);
   SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
   SDValue WidenVecRes_StrictFP(SDNode *N);
+  SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo);
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
   SDValue WidenVecRes_POWI(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 32876bf59f7..f5ebb56ef2d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -171,6 +171,14 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FTRUNC:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    R = ScalarizeVecRes_OverflowOp(N, ResNo);
+    break;
   case ISD::SMULFIX:
   case ISD::UMULFIX:
     R = ScalarizeVecRes_MULFIX(N);
@@ -235,6 +243,43 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
   return Result;
 }

+SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
+                                                     unsigned ResNo) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+
+  SDValue ScalarLHS, ScalarRHS;
+  if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
+    ScalarLHS = GetScalarizedVector(N->getOperand(0));
+    ScalarRHS = GetScalarizedVector(N->getOperand(1));
+  } else {
+    SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
+    DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
+    DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
+    ScalarLHS = ElemsLHS[0];
+    ScalarRHS = ElemsRHS[0];
+  }
+
+  SDVTList ScalarVTs = DAG.getVTList(
+      ResVT.getVectorElementType(), OvVT.getVectorElementType());
+  SDNode *ScalarNode = DAG.getNode(
+      N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
+
+  // Replace the other vector result not being explicitly scalarized here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
+    SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
+  } else {
+    SDValue OtherVal = DAG.getNode(
+        ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+
+  return SDValue(ScalarNode, ResNo);
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
                                                        unsigned ResNo) {
   SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -859,6 +904,14 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FTRUNC:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
+    break;
   case ISD::SMULFIX:
   case ISD::UMULFIX:
     SplitVecRes_MULFIX(N, Lo, Hi);
@@ -1205,6 +1258,47 @@ void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
   ReplaceValueWith(SDValue(N, 1), Chain);
 }

+void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+                                              SDValue &Lo, SDValue &Hi) {
+  SDLoc dl(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+  EVT LoResVT, HiResVT, LoOvVT, HiOvVT;
+  std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
+  std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
+
+  SDValue LoLHS, HiLHS, LoRHS, HiRHS;
+  if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
+    GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
+    GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
+  } else {
+    std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
+    std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
+  }
+
+  unsigned Opcode = N->getOpcode();
+  SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
+  SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
+  SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
+  SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
+
+  Lo = SDValue(LoNode, ResNo);
+  Hi = SDValue(HiNode, ResNo);
+
+  // Replace the other vector result not being explicitly split here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
+    SetSplitVector(SDValue(N, OtherNo),
+                   SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+  } else {
+    SDValue OtherVal = DAG.getNode(
+        ISD::CONCAT_VECTORS, dl, OtherVT,
+        SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+}
+
 void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
                                                      SDValue &Hi) {
   SDValue Vec = N->getOperand(0);
@@ -2471,6 +2565,15 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     Res = WidenVecRes_StrictFP(N);
     break;

+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    Res = WidenVecRes_OverflowOp(N, ResNo);
+    break;
+
   case ISD::FCOPYSIGN:
     Res = WidenVecRes_FCOPYSIGN(N);
     break;
@@ -2845,6 +2948,58 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
   return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
 }

+SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+  EVT WideResVT, WideOvVT;
+  SDValue WideLHS, WideRHS;
+
+  // TODO: This might result in a widen/split loop.
+  if (ResNo == 0) {
+    WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
+    WideOvVT = EVT::getVectorVT(
+        *DAG.getContext(), OvVT.getVectorElementType(),
+        WideResVT.getVectorNumElements());
+
+    WideLHS = GetWidenedVector(N->getOperand(0));
+    WideRHS = GetWidenedVector(N->getOperand(1));
+  } else {
+    WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT);
+    WideResVT = EVT::getVectorVT(
+        *DAG.getContext(), ResVT.getVectorElementType(),
+        WideOvVT.getVectorNumElements());
+
+    SDValue Zero = DAG.getConstant(
+        0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    WideLHS = DAG.getNode(
+        ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+        N->getOperand(0), Zero);
+    WideRHS = DAG.getNode(
+        ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+        N->getOperand(1), Zero);
+  }
+
+  SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT);
+  SDNode *WideNode = DAG.getNode(
+      N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode();
+
+  // Replace the other vector result not being explicitly widened here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
+    SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
+  } else {
+    SDValue Zero = DAG.getConstant(
+        0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    SDValue OtherVal = DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+
+  return SDValue(WideNode, ResNo);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   SDValue InOp = N->getOperand(0);
   SDLoc DL(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c7c32b42b8e..b205e97fa39 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6113,7 +6113,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
-    SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+    EVT ResultVT = Op1.getValueType();
+    EVT OverflowVT = MVT::i1;
+    if (ResultVT.isVector())
+      OverflowVT = EVT::getVectorVT(
+          *Context, OverflowVT, ResultVT.getVectorNumElements());
+
+    SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT);
     setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
     return nullptr;
   }
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
new file mode 100644
index 00000000000..6c2f07f8dc9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+
+declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.2s, v0.2s, v1.2s
+; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    str s1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.2s, v0.2s, v1.2s
+; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
+define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    add x8, x0, #8 // =8
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    st1 { v1.s }[2], [x8]
+; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
+define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v6i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w6
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov v0.s[1], w7
+; CHECK-NEXT:    ldr s2, [sp, #16]
+; CHECK-NEXT:    ld1 { v0.s }[2], [x8]
+; CHECK-NEXT:    add x9, sp, #8 // =8
+; CHECK-NEXT:    add x10, sp, #24 // =24
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    ld1 { v2.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v0.s }[3], [x9]
+; CHECK-NEXT:    mov v1.s[1], w1
+; CHECK-NEXT:    fmov s3, w4
+; CHECK-NEXT:    ldr x11, [sp, #32]
+; CHECK-NEXT:    mov v1.s[2], w2
+; CHECK-NEXT:    mov v3.s[1], w5
+; CHECK-NEXT:    mov v1.s[3], w3
+; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    cmhi v3.4s, v3.4s, v2.4s
+; CHECK-NEXT:    cmhi v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str d2, [x11, #16]
+; CHECK-NEXT:    xtn v2.4h, v3.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-NEXT:    mov w5, v2.s[1]
+; CHECK-NEXT:    mov w1, v1.s[1]
+; CHECK-NEXT:    mov w2, v1.s[2]
+; CHECK-NEXT:    mov w3, v1.s[3]
+; CHECK-NEXT:    fmov w4, s2
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    str q0, [x11]
+; CHECK-NEXT:    ret
+  %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    cmhi v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-NEXT:    stp q2, q3, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
+  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %p2
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v4.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmhi v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v2.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    zip1 v3.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-NEXT:    sshr v0.4s, v1.4s, #31
+; CHECK-NEXT:    sshr v1.4s, v2.4s, #31
+; CHECK-NEXT:    shl v2.4s, v3.4s, #31
+; CHECK-NEXT:    shl v3.4s, v5.4s, #31
+; CHECK-NEXT:    sshr v2.4s, v2.4s, #31
+; CHECK-NEXT:    sshr v3.4s, v3.4s, #31
+; CHECK-NEXT:    str q4, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
+  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
+  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
+  %res = sext <16 x i1> %obit to <16 x i32>
+  store <16 x i8> %val, <16 x i8>* %p2
+  ret <16 x i32> %res
+}
+
+define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
+; CHECK-NEXT:    cmhi v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    shl v3.4s, v0.4s, #31
+; CHECK-NEXT:    sshr v0.4s, v1.4s, #31
+; CHECK-NEXT:    sshr v1.4s, v3.4s, #31
+; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
+; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i24:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v1.4s, #255, lsl #24
+; CHECK-NEXT:    bic v0.4s, #255, lsl #24
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    bic v1.4s, #255, lsl #24
+; CHECK-NEXT:    mov w9, v0.s[2]
+; CHECK-NEXT:    mov w10, v0.s[1]
+; CHECK-NEXT:    sturh w8, [x0, #9]
+; CHECK-NEXT:    lsr w8, w8, #16
+; CHECK-NEXT:    cmeq v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    strh w9, [x0, #6]
+; CHECK-NEXT:    sturh w10, [x0, #3]
+; CHECK-NEXT:    lsr w9, w9, #16
+; CHECK-NEXT:    lsr w10, w10, #16
+; CHECK-NEXT:    strb w8, [x0, #11]
+; CHECK-NEXT:    mvn v0.16b, v1.16b
+; CHECK-NEXT:    lsr w8, w11, #16
+; CHECK-NEXT:    strh w11, [x0]
+; CHECK-NEXT:    strb w9, [x0, #8]
+; CHECK-NEXT:    strb w10, [x0, #5]
+; CHECK-NEXT:    strb w8, [x0, #2]
+; CHECK-NEXT:    ret
+  %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.4h, #1
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    add v1.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umov w9, v1.h[1]
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    and w9, w9, #0x1
+; CHECK-NEXT:    bfi w8, w9, #1, #1
+; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    and v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    and w9, w9, #0x1
+; CHECK-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    bfi w8, w9, #2, #1
+; CHECK-NEXT:    umov w9, v1.h[3]
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    bfi w8, w9, #3, #29
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    and w8, w8, #0xf
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+  %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; CHECK-LABEL: uaddo_v2i128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adds x9, x2, x6
+; CHECK-NEXT:    adcs x10, x3, x7
+; CHECK-NEXT:    cmp x9, x2
+; CHECK-NEXT:    cset w11, lo
+; CHECK-NEXT:    cmp x10, x3
+; CHECK-NEXT:    cset w12, lo
+; CHECK-NEXT:    csel w11, w11, w12, eq
+; CHECK-NEXT:    adds x12, x0, x4
+; CHECK-NEXT:    adcs x13, x1, x5
+; CHECK-NEXT:    cmp x12, x0
+; CHECK-NEXT:    cset w14, lo
+; CHECK-NEXT:    cmp x13, x1
+; CHECK-NEXT:    cset w15, lo
+; CHECK-NEXT:    csel w14, w14, w15, eq
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    fmov s0, w14
+; CHECK-NEXT:    mov v0.s[1], w11
+; CHECK-NEXT:    shl v0.2s, v0.2s, #31
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #31
+; CHECK-NEXT:    stp x9, x10, [x8, #16]
+; CHECK-NEXT:    stp x12, x13, [x8]
+; CHECK-NEXT:    ret
+  %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 0b52821f72c..a5d7592faaf 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -1,11 +1,14 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+
+declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
 ; FUNC-LABEL: {{^}}saddo_i64_zext:
 define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -65,3 +68,22 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   store i1 %carry, i1 addrspace(1)* %carryout
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_saddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index fee14b48b44..4f4bab84150 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,10 +1,11 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+
 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

 ; FUNC-LABEL: {{^}}ssubo_i64_zext:
 define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
@@ -70,3 +71,22 @@ define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   store i1 %carry, i1 addrspace(1)* %carryout
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_ssubo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_sub_{{[iu]}}32
+define amdgpu_kernel void @v_ssubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 0cb2487dd4a..cd9ea4d3437 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,6 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s

 ; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
 ; GCN: s_add_u32
@@ -152,10 +151,32 @@ define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }

+; FUNC-LABEL: {{^}}v_uaddo_v2i32:
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_cmp_lt_i32
+; SICIVI: v_add_{{[iu]}}32
+define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index eeb19f86f38..f4e1a9afd6f 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+
 ; FUNC-LABEL: {{^}}s_usubo_i64_zext:
 ; GCN: s_sub_u32
@@ -159,10 +159,28 @@ define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }

+; FUNC-LABEL: {{^}}v_usubo_v2i32:
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+; SICIVI: v_sub_{{[iu]}}32
+; SICIVI: v_cndmask_b32
+define amdgpu_kernel void @v_usubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
+  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
+  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
+  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  %carry.ext = zext <2 x i1> %carry to <2 x i32>
+  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
+declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
new file mode 100644
index 00000000000..dbdc412f81d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -0,0 +1,2028 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v1i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    addl %esi, %edi
+; SSE-NEXT:    seto %al
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movl %edi, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: saddo_v1i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    addl %esi, %edi
+; AVX-NEXT:    seto %al
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    movl %edi, (%rdx)
+; AVX-NEXT:    retq
+  %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    paddq %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    paddq %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
+
+define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v3i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v3i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT:    pandn %xmm3, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v3i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v3i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
+  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
+  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
+  %res = sext <3 x i1> %obit to <3 x i32>
+  store <3 x i32> %val, <3 x i32>* %p2
+  ret <3 x i32> %res
+}
+
+define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE-NEXT:    pxor %xmm4, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE-NEXT:    pxor %xmm4, %xmm5
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE-NEXT:    pandn %xmm3, %xmm2
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
+  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i32> %val, <4 x i32>* %p2
+  ret <4 x i32> %res
+}
+
+define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
+; SSE2-LABEL: saddo_v6i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movd %r8d, %xmm0
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %esi, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd %r9d, %xmm3
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT:    paddd %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm2
+; SSE2-NEXT:    movq %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    movq %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: saddo_v6i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    movd %r8d, %xmm0
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %esi, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd %r9d, %xmm3
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm7, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT:    paddd %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm4, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSSE3-NEXT:    pandn %xmm6, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
+; SSSE3-NEXT:    pxor %xmm7, %xmm7
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT:    paddd %xmm3, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSSE3-NEXT:    pandn %xmm6, %xmm2
+; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
+; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: saddo_v6i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    movd %esi, %xmm4
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm4
+; SSE41-NEXT:    pinsrd $2, %ecx, %xmm4
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm4
+; SSE41-NEXT:    movd %r9d, %xmm2
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm6, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    paddd %xmm4, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm4
+; SSE41-NEXT:    pandn %xmm6, %xmm4
+; SSE41-NEXT:    pxor %xmm6, %xmm6
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    pxor %xmm7, %xmm7
+; SSE41-NEXT:    pcmpgtd %xmm2, %xmm7
+; SSE41-NEXT:    pxor %xmm5, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm3
+; SSE41-NEXT:    pandn %xmm6, %xmm3
+; SSE41-NEXT:    movq %xmm0, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm1, (%rcx)
+; SSE41-NEXT:    movq %xmm3, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v6i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: saddo_v6i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: saddo_v6i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1
+; AVX512-NEXT:    kxorw %k0, %k1, %k0
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2
+; AVX512-NEXT:    kxorw %k2, %k1, %k1
+; AVX512-NEXT:    kandnw %k1, %k0, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
+  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
+  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
+  %res = sext <6 x i1> %obit to <6 x i32>
+  store <6 x i32> %val, <6 x i32>* %p2
+  ret <6 x i32> %res
+}
+
+define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
+; SSE-LABEL: saddo_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE-NEXT:    pxor %xmm5, %xmm6
+; SSE-NEXT:    pxor %xmm7, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm7
+; SSE-NEXT:    pxor %xmm5, %xmm7
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm5, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm2
+; SSE-NEXT:    pandn %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm6, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE-NEXT:    pxor %xmm5, %xmm6
+; SSE-NEXT:    pxor %xmm7, %xmm7
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm7
+; SSE-NEXT:    pxor %xmm5, %xmm7
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm6
+; SSE-NEXT:    paddd %xmm3, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm1
+; SSE-NEXT:    pandn %xmm6, %xmm1
+; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: saddo_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm8
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm9
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm4, %xmm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm1, %ymm8, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm1,
%xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT:    vmovaps %ymm2, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v8i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT:    vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v8i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandnw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) +  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 +  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i32> %val, <8 x i32>* %p2 +  ret <8 x i32> %res +} + +define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE-LABEL: saddo_v16i32: +; SSE:       # %bb.0: +; SSE-NEXT:    movdqa %xmm3, %xmm8 +; SSE-NEXT:    pxor %xmm3, %xmm3 +; SSE-NEXT:    pxor %xmm11, %xmm11 +; SSE-NEXT:    pcmpgtd %xmm4, %xmm11 +; SSE-NEXT:    pcmpeqd %xmm10, %xmm10 +; SSE-NEXT:    pxor %xmm10, %xmm11 +; SSE-NEXT:    pxor %xmm12, %xmm12 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm12 +; SSE-NEXT:    pxor %xmm10, %xmm12 +; SSE-NEXT:    pcmpeqd %xmm12, %xmm11 +; SSE-NEXT:    paddd %xmm4, %xmm0 +; SSE-NEXT:    pxor %xmm9, %xmm9 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm9 +; SSE-NEXT:    pxor %xmm10, %xmm9 +; SSE-NEXT:    pcmpeqd %xmm12, %xmm9 +; SSE-NEXT:    pandn %xmm11, %xmm9 +; SSE-NEXT:    pxor %xmm12, %xmm12 +; SSE-NEXT:    pcmpgtd %xmm5, %xmm12 +; SSE-NEXT:    pxor %xmm10, %xmm12 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE-NEXT:    pxor %xmm10, %xmm4 +; SSE-NEXT:    pcmpeqd %xmm4, %xmm12 +; SSE-NEXT:    paddd %xmm5, %xmm1 +; SSE-NEXT:    pxor %xmm11, %xmm11 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm11 +; SSE-NEXT:    pxor %xmm10, %xmm11 +; SSE-NEXT:    pcmpeqd %xmm4, %xmm11 +; SSE-NEXT:    pandn %xmm12, %xmm11 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm6, %xmm4 +; SSE-NEXT:    pxor %xmm10, %xmm4 +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pcmpgtd %xmm2, %xmm5 +; SSE-NEXT:    pxor %xmm10, %xmm5 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm4 +; SSE-NEXT:    paddd %xmm6, %xmm2 +; SSE-NEXT:    pxor %xmm6, %xmm6 +; SSE-NEXT:    pcmpgtd %xmm2, %xmm6 +; SSE-NEXT:    pxor %xmm10, %xmm6 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm6 +; SSE-NEXT:    pandn %xmm4, %xmm6 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm7, %xmm4 +; SSE-NEXT:    pxor %xmm10, 
%xmm4 +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pcmpgtd %xmm8, %xmm5 +; SSE-NEXT:    pxor %xmm10, %xmm5 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm4 +; SSE-NEXT:    paddd %xmm7, %xmm8 +; SSE-NEXT:    pcmpgtd %xmm8, %xmm3 +; SSE-NEXT:    pxor %xmm10, %xmm3 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm3 +; SSE-NEXT:    pandn %xmm4, %xmm3 +; SSE-NEXT:    movdqa %xmm8, 48(%rdi) +; SSE-NEXT:    movdqa %xmm2, 32(%rdi) +; SSE-NEXT:    movdqa %xmm1, 16(%rdi) +; SSE-NEXT:    movdqa %xmm0, (%rdi) +; SSE-NEXT:    movdqa %xmm9, %xmm0 +; SSE-NEXT:    movdqa %xmm11, %xmm1 +; SSE-NEXT:    movdqa %xmm6, %xmm2 +; SSE-NEXT:    retq +; +; AVX1-LABEL: saddo_v16i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9 +; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm5, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm10 +; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm10, %xmm8 +; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm11 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm6, %xmm11 +; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm9 +; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm10, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm10 +; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm6, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-NEXT:    vandps %ymm3, %ymm8, %ymm3 +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT:    vpackssdw %xmm6, %xmm3, %xmm8 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm3, %xmm11 +; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm12 +; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-NEXT:    vpaddd %xmm6, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm3, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm2 +; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT:    vandps %ymm2, %ymm11, %ymm2 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm4 +; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm0 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT:    
vpmovsxwd %xmm8, %xmm1 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT:    vmovaps %ymm4, 32(%rdi) +; AVX1-NEXT:    vmovaps %ymm3, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v16i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5 +; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm7 +; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm1 +; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1 +; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm7, %ymm1 +; AVX2-NEXT:    vpandn %ymm5, %ymm1, %ymm1 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5 +; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7 +; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm0 +; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm0 +; AVX2-NEXT:    vpandn %ymm5, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT:    vmovdqa %ymm2, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v16i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0 +; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandnw %k1, %k0, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) +  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i32> %val, <16 x i32>* %p2 +  ret <16 x i32> %res +} + +define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: saddo_v16i8: +; SSE2:       # %bb.0: +; SSE2-NEXT:    pxor %xmm3, %xmm3 +; SSE2-NEXT:    pxor %xmm2, %xmm2 +; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT:    pxor %xmm4, %xmm2 +; SSE2-NEXT:    pxor %xmm5, %xmm5 +; SSE2-NEXT:    pcmpgtb %xmm0, %xmm5 +; SSE2-NEXT:    pxor %xmm4, %xmm5 +; SSE2-NEXT:    pcmpeqb %xmm5, %xmm2 +; SSE2-NEXT:    paddb %xmm1, %xmm0 +; SSE2-NEXT:    pcmpgtb %xmm0, %xmm3 +; SSE2-NEXT:    pxor %xmm4, %xmm3 +; SSE2-NEXT:    pcmpeqb %xmm5, %xmm3 +; SSE2-NEXT:    pandn %xmm2, %xmm3 +; SSE2-NEXT:    movdqa %xmm3, %xmm1 +; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    movdqa %xmm1, %xmm4 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; 
SSE2-NEXT:    pslld $31, %xmm4 +; SSE2-NEXT:    psrad $31, %xmm4 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm1 +; SSE2-NEXT:    psrad $31, %xmm1 +; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT:    movdqa %xmm3, %xmm2 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm3 +; SSE2-NEXT:    psrad $31, %xmm3 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm4, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: saddo_v16i8: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    pxor %xmm3, %xmm3 +; SSSE3-NEXT:    pxor %xmm2, %xmm2 +; SSSE3-NEXT:    pcmpgtb %xmm1, %xmm2 +; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT:    pxor %xmm4, %xmm2 +; SSSE3-NEXT:    pxor %xmm5, %xmm5 +; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm5 +; SSSE3-NEXT:    pxor %xmm4, %xmm5 +; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm2 +; SSSE3-NEXT:    paddb %xmm1, %xmm0 +; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm3 +; SSSE3-NEXT:    pxor %xmm4, %xmm3 +; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm3 +; SSSE3-NEXT:    pandn %xmm2, %xmm3 +; SSSE3-NEXT:    movdqa %xmm3, %xmm1 +; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    movdqa %xmm1, %xmm4 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm4 +; SSSE3-NEXT:    psrad $31, %xmm4 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm1 +; SSSE3-NEXT:    psrad $31, %xmm1 +; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT:    movdqa %xmm3, %xmm2 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm3 +; SSSE3-NEXT:    psrad $31, %xmm3 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm4, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: saddo_v16i8: +; SSE41:       # %bb.0: +; SSE41-NEXT:    pxor %xmm3, %xmm3 +; SSE41-NEXT:    pxor %xmm2, %xmm2 +; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm2 +; SSE41-NEXT:    pxor %xmm5, %xmm5 +; SSE41-NEXT:    pcmpgtb %xmm0, %xmm5 +; SSE41-NEXT:    pxor %xmm4, %xmm5 +; SSE41-NEXT:    pcmpeqb %xmm5, %xmm2 +; SSE41-NEXT:    paddb %xmm1, %xmm0 +; SSE41-NEXT:    pcmpgtb %xmm0, %xmm3 +; SSE41-NEXT:    pxor %xmm4, %xmm3 +; SSE41-NEXT:    pcmpeqb %xmm5, %xmm3 +; SSE41-NEXT:    pandn %xmm2, %xmm3 +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm4 +; SSE41-NEXT:    psrad $31, 
%xmm4 +; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm1 +; SSE41-NEXT:    psrad $31, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm3 +; SSE41-NEXT:    psrad $31, %xmm3 +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    movdqa %xmm4, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: saddo_v16i8: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm6 +; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm1 +; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT:    vmovdqa %xmm6, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v16i8: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm6 +; AVX2-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm1 +; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT:    vmovdqa %xmm6, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v16i8: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandnw %k1, %k0, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) +  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i8> %val, <16 x i8>* %p2 +  ret <16 x i32> %res +} + +define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: saddo_v8i16: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm1, %xmm2 +; SSE2-NEXT:    pxor %xmm1, %xmm1 +; SSE2-NEXT:    pxor %xmm3, %xmm3 +; SSE2-NEXT:    pcmpgtw %xmm2, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT:    pxor %xmm4, %xmm3 +; SSE2-NEXT:    pxor %xmm5, %xmm5 +; SSE2-NEXT:    pcmpgtw %xmm0, %xmm5 +; SSE2-NEXT:    pxor %xmm4, %xmm5 +; SSE2-NEXT:    pcmpeqw %xmm5, %xmm3 +; SSE2-NEXT:    paddw %xmm2, %xmm0 +; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1 +; SSE2-NEXT:    pxor %xmm4, %xmm1 +; SSE2-NEXT:    pcmpeqw %xmm5, %xmm1 +; SSE2-NEXT:    pandn %xmm3, %xmm1 +; SSE2-NEXT:    movdqa %xmm1, %xmm2 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm1 +; SSE2-NEXT:    psrad $31, %xmm1 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: saddo_v8i16: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm1, %xmm2 +; SSSE3-NEXT:    pxor %xmm1, %xmm1 +; SSSE3-NEXT:    pxor %xmm3, %xmm3 +; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT:    pxor %xmm4, %xmm3 +; SSSE3-NEXT:    pxor %xmm5, %xmm5 +; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm5 +; SSSE3-NEXT:    pxor %xmm4, %xmm5 +; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm3 +; SSSE3-NEXT:    paddw %xmm2, %xmm0 +; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1 +; SSSE3-NEXT:    pxor %xmm4, %xmm1 +; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm1 +; SSSE3-NEXT:    pandn %xmm3, %xmm1 +; SSSE3-NEXT:    movdqa %xmm1, %xmm2 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    punpckhwd 
{{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm1 +; SSSE3-NEXT:    psrad $31, %xmm1 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: saddo_v8i16: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm1, %xmm2 +; SSE41-NEXT:    pxor %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm3, %xmm3 +; SSE41-NEXT:    pcmpgtw %xmm2, %xmm3 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm3 +; SSE41-NEXT:    pxor %xmm5, %xmm5 +; SSE41-NEXT:    pcmpgtw %xmm0, %xmm5 +; SSE41-NEXT:    pxor %xmm4, %xmm5 +; SSE41-NEXT:    pcmpeqw %xmm5, %xmm3 +; SSE41-NEXT:    paddw %xmm2, %xmm0 +; SSE41-NEXT:    pcmpgtw %xmm0, %xmm1 +; SSE41-NEXT:    pxor %xmm4, %xmm1 +; SSE41-NEXT:    pcmpeqw %xmm5, %xmm1 +; SSE41-NEXT:    pandn %xmm3, %xmm1 +; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT:    pslld $31, %xmm1 +; SSE41-NEXT:    psrad $31, %xmm1 +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    movdqa %xmm2, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: saddo_v8i16: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v8i16: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v8i16: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandnw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) +  %val = extractvalue {<8 x i16>, <8 x 
i1>} %t, 0 +  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i16> %val, <8 x i16>* %p2 +  ret <8 x i32> %res +} + +define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE2-LABEL: saddo_v2i64: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, %xmm3 +; SSE2-NEXT:    paddq %xmm1, %xmm0 +; SSE2-NEXT:    pxor %xmm2, %xmm1 +; SSE2-NEXT:    movdqa %xmm2, %xmm4 +; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT:    pand %xmm5, %xmm1 +; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT:    por %xmm1, %xmm4 +; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT:    pxor %xmm1, %xmm4 +; SSE2-NEXT:    pxor %xmm2, %xmm3 +; SSE2-NEXT:    movdqa %xmm2, %xmm5 +; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT:    pand %xmm6, %xmm3 +; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT:    por %xmm3, %xmm5 +; SSE2-NEXT:    pxor %xmm1, %xmm5 +; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] +; SSE2-NEXT:    pand %xmm4, %xmm3 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    pxor %xmm2, %xmm0 +; SSE2-NEXT:    movdqa %xmm2, %xmm4 +; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT:    pand %xmm6, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT:    por %xmm0, %xmm2 +; SSE2-NEXT:    pxor %xmm1, %xmm2 +; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT:    pand %xmm2, %xmm0 +; SSE2-NEXT:    pandn %xmm3, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: saddo_v2i64: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm0, %xmm3 +; SSSE3-NEXT:    paddq %xmm1, %xmm0 +; SSSE3-NEXT:    pxor %xmm2, %xmm1 +; SSSE3-NEXT:    movdqa %xmm2, %xmm4 +; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT:    pand %xmm5, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT:    por %xmm1, %xmm4 +; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT:    pxor %xmm1, %xmm4 +; SSSE3-NEXT:    pxor %xmm2, %xmm3 +; SSSE3-NEXT:    movdqa %xmm2, %xmm5 +; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT:    pand %xmm6, %xmm3 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT:    por %xmm3, %xmm5 +; SSSE3-NEXT:    pxor %xmm1, %xmm5 +; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] +; SSSE3-NEXT:    pand %xmm4, %xmm3 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    pxor %xmm2, %xmm0 +; SSSE3-NEXT:    movdqa %xmm2, %xmm4 +; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: 
   pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT:    pand %xmm6, %xmm0 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT:    por %xmm0, %xmm2 +; SSSE3-NEXT:    pxor %xmm1, %xmm2 +; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT:    pand %xmm2, %xmm0 +; SSSE3-NEXT:    pandn %xmm3, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: saddo_v2i64: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT:    movdqa %xmm0, %xmm3 +; SSE41-NEXT:    paddq %xmm1, %xmm0 +; SSE41-NEXT:    pxor %xmm2, %xmm1 +; SSE41-NEXT:    movdqa %xmm2, %xmm4 +; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT:    pand %xmm5, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT:    por %xmm1, %xmm4 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm1, %xmm4 +; SSE41-NEXT:    pxor %xmm2, %xmm3 +; SSE41-NEXT:    movdqa %xmm2, %xmm5 +; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT:    pand %xmm6, %xmm3 +; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT:    por %xmm3, %xmm5 +; SSE41-NEXT:    pxor %xmm1, %xmm5 +; SSE41-NEXT:    pcmpeqq %xmm5, %xmm4 +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    pxor %xmm2, %xmm0 +; SSE41-NEXT:    movdqa %xmm2, %xmm3 +; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT:    pand %xmm6, %xmm2 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT:    por %xmm2, %xmm0 +; SSE41-NEXT:    pxor %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqq %xmm5, %xmm0 +; SSE41-NEXT:    pandn %xmm4, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: saddo_v2i64: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v2i64: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v2i64: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0 +; AVX512-NEXT:    
vpcmpnltq %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandnw %k1, %k0, %k1 +; AVX512-NEXT:    vmovdqa %xmm0, (%rdi) +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    retq +  %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) +  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i64> %val, <2 x i64>* %p2 +  ret <2 x i32> %res +} + +define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; SSE2-LABEL: saddo_v4i24: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm0, %xmm2 +; SSE2-NEXT:    pslld $8, %xmm1 +; SSE2-NEXT:    psrad $8, %xmm1 +; SSE2-NEXT:    pslld $8, %xmm2 +; SSE2-NEXT:    psrad $8, %xmm2 +; SSE2-NEXT:    paddd %xmm1, %xmm2 +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    pslld $8, %xmm0 +; SSE2-NEXT:    psrad $8, %xmm0 +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT:    pxor %xmm1, %xmm0 +; SSE2-NEXT:    movd %xmm2, %eax +; SSE2-NEXT:    movw %ax, (%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT:    movd %xmm1, %ecx +; SSE2-NEXT:    movw %cx, 9(%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT:    movd %xmm1, %edx +; SSE2-NEXT:    movw %dx, 6(%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT:    movd %xmm1, %esi +; SSE2-NEXT:    movw %si, 3(%rdi) +; SSE2-NEXT:    shrl $16, %eax +; SSE2-NEXT:    movb %al, 2(%rdi) +; SSE2-NEXT:    shrl $16, %ecx +; SSE2-NEXT:    movb %cl, 11(%rdi) +; SSE2-NEXT:    shrl $16, %edx +; SSE2-NEXT:    movb %dl, 8(%rdi) +; SSE2-NEXT:    shrl $16, %esi +; SSE2-NEXT:    movb %sil, 5(%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: saddo_v4i24: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm0, %xmm2 +; SSSE3-NEXT:    pslld $8, %xmm1 +; SSSE3-NEXT:    psrad $8, %xmm1 +; SSSE3-NEXT:    pslld $8, %xmm2 +; SSSE3-NEXT:    psrad $8, %xmm2 +; SSSE3-NEXT:    paddd %xmm1, %xmm2 +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    pslld $8, %xmm0 +; SSSE3-NEXT:    psrad $8, %xmm0 +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT:    pxor %xmm1, %xmm0 +; SSSE3-NEXT:    movd %xmm2, %eax +; SSSE3-NEXT:    movw %ax, (%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT:    movd %xmm1, %ecx +; SSSE3-NEXT:    movw %cx, 9(%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT:    movd %xmm1, %edx +; SSSE3-NEXT:    movw %dx, 6(%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT:    movd %xmm1, %esi +; SSSE3-NEXT:    movw %si, 3(%rdi) +; SSSE3-NEXT:    shrl $16, %eax +; SSSE3-NEXT:    movb %al, 2(%rdi) +; SSSE3-NEXT:    shrl $16, %ecx +; SSSE3-NEXT:    movb %cl, 11(%rdi) +; SSSE3-NEXT:    shrl $16, %edx +; SSSE3-NEXT:    movb %dl, 8(%rdi) +; SSSE3-NEXT:    shrl $16, %esi +; SSSE3-NEXT:    movb %sil, 5(%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: saddo_v4i24: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm2 +; SSE41-NEXT:    pslld $8, %xmm1 +; SSE41-NEXT:    psrad $8, %xmm1 +; SSE41-NEXT:    pslld $8, %xmm2 +; SSE41-NEXT:    psrad $8, %xmm2 +; SSE41-NEXT:    paddd %xmm1, %xmm2 +; SSE41-NEXT:    movdqa %xmm2, %xmm0 +; 
SSE41-NEXT:    pslld $8, %xmm0 +; SSE41-NEXT:    psrad $8, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm1, %xmm0 +; SSE41-NEXT:    pextrd $3, %xmm2, %eax +; SSE41-NEXT:    movw %ax, 9(%rdi) +; SSE41-NEXT:    pextrd $2, %xmm2, %ecx +; SSE41-NEXT:    movw %cx, 6(%rdi) +; SSE41-NEXT:    pextrd $1, %xmm2, %edx +; SSE41-NEXT:    movw %dx, 3(%rdi) +; SSE41-NEXT:    movd %xmm2, %esi +; SSE41-NEXT:    movw %si, (%rdi) +; SSE41-NEXT:    shrl $16, %eax +; SSE41-NEXT:    movb %al, 11(%rdi) +; SSE41-NEXT:    shrl $16, %ecx +; SSE41-NEXT:    movb %cl, 8(%rdi) +; SSE41-NEXT:    shrl $16, %edx +; SSE41-NEXT:    movb %dl, 5(%rdi) +; SSE41-NEXT:    shrl $16, %esi +; SSE41-NEXT:    movb %sil, 2(%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: saddo_v4i24: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1 +; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0 +; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpextrd $3, %xmm1, %eax +; AVX1-NEXT:    movw %ax, 9(%rdi) +; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX1-NEXT:    movw %cx, 6(%rdi) +; AVX1-NEXT:    vpextrd $1, %xmm1, %edx +; AVX1-NEXT:    movw %dx, 3(%rdi) +; AVX1-NEXT:    vmovd %xmm1, %esi +; AVX1-NEXT:    movw %si, (%rdi) +; AVX1-NEXT:    shrl $16, %eax +; AVX1-NEXT:    movb %al, 11(%rdi) +; AVX1-NEXT:    shrl $16, %ecx +; AVX1-NEXT:    movb %cl, 8(%rdi) +; AVX1-NEXT:    shrl $16, %edx +; AVX1-NEXT:    movb %dl, 5(%rdi) +; AVX1-NEXT:    shrl $16, %esi +; AVX1-NEXT:    movb %sil, 2(%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v4i24: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpslld $8, %xmm1, %xmm1 +; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1 +; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0 +; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0 +; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpextrd $3, %xmm1, %eax +; AVX2-NEXT:    movw %ax, 9(%rdi) +; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX2-NEXT:    movw %cx, 6(%rdi) +; AVX2-NEXT:    vpextrd $1, %xmm1, %edx +; AVX2-NEXT:    movw %dx, 3(%rdi) +; AVX2-NEXT:    vmovd %xmm1, %esi +; AVX2-NEXT:    movw %si, (%rdi) +; AVX2-NEXT:    shrl $16, %eax +; AVX2-NEXT:    movb %al, 11(%rdi) +; AVX2-NEXT:    shrl $16, %ecx +; AVX2-NEXT:    movb %cl, 8(%rdi) +; AVX2-NEXT:    shrl $16, %edx +; AVX2-NEXT:    movb %dl, 5(%rdi) +; AVX2-NEXT:    shrl $16, %esi +; AVX2-NEXT:    movb %sil, 2(%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v4i24: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1 +; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1 +; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0 +; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0 +; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vpextrd $3, %xmm1, %eax +; AVX512-NEXT:    movw %ax, 9(%rdi) +; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX512-NEXT:    movw %cx, 6(%rdi) +; 
AVX512-NEXT:    vpextrd $1, %xmm1, %edx +; AVX512-NEXT:    movw %dx, 3(%rdi) +; AVX512-NEXT:    vmovd %xmm1, %esi +; AVX512-NEXT:    movw %si, (%rdi) +; AVX512-NEXT:    shrl $16, %eax +; AVX512-NEXT:    movb %al, 11(%rdi) +; AVX512-NEXT:    shrl $16, %ecx +; AVX512-NEXT:    movb %cl, 8(%rdi) +; AVX512-NEXT:    shrl $16, %edx +; AVX512-NEXT:    movb %dl, 5(%rdi) +; AVX512-NEXT:    shrl $16, %esi +; AVX512-NEXT:    movb %sil, 2(%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) +  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i24> %val, <4 x i24>* %p2 +  ret <4 x i32> %res +} + +define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: saddo_v4i1: +; SSE:       # %bb.0: +; SSE-NEXT:    pslld $31, %xmm1 +; SSE-NEXT:    psrad $31, %xmm1 +; SSE-NEXT:    pslld $31, %xmm0 +; SSE-NEXT:    psrad $31, %xmm0 +; SSE-NEXT:    paddd %xmm1, %xmm0 +; SSE-NEXT:    movdqa %xmm0, %xmm1 +; SSE-NEXT:    pslld $31, %xmm1 +; SSE-NEXT:    psrad $31, %xmm1 +; SSE-NEXT:    pcmpeqd %xmm1, %xmm0 +; SSE-NEXT:    pcmpeqd %xmm2, %xmm2 +; SSE-NEXT:    pxor %xmm2, %xmm0 +; SSE-NEXT:    movmskps %xmm1, %eax +; SSE-NEXT:    movb %al, (%rdi) +; SSE-NEXT:    retq +; +; AVX1-LABEL: saddo_v4i1: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vmovmskps %xmm1, %eax +; AVX1-NEXT:    movb %al, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v4i1: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1 +; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vmovmskps %xmm1, %eax +; AVX2-NEXT:    movb %al, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v4i1: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1 +; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k3 +; AVX512-NEXT:    kxorw %k2, %k0, %k0 +; AVX512-NEXT:    kxorw %k0, %k1, %k1 +; AVX512-NEXT:    kandnw %k3, %k1, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    kmovd %k0, %eax +; AVX512-NEXT:    movb %al, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) +  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i1> %val, <4 x i1>* %p2 +  ret <4 x i32> %res +} + +define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) 
nounwind { +; SSE2-LABEL: saddo_v2i128: +; SSE2:       # %bb.0: +; SSE2-NEXT:    pushq %rbp +; SSE2-NEXT:    pushq %rbx +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT:    movq %rcx, %rax +; SSE2-NEXT:    adcq %r11, %rax +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    testq %rcx, %rcx +; SSE2-NEXT:    setns %cl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    setne %bpl +; SSE2-NEXT:    testq %r11, %r11 +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    sete %cl +; SSE2-NEXT:    andb %bpl, %cl +; SSE2-NEXT:    movzbl %cl, %ebp +; SSE2-NEXT:    testq %r9, %r9 +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    testq %rsi, %rsi +; SSE2-NEXT:    setns %cl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    sete %r11b +; SSE2-NEXT:    addq %r8, %rdi +; SSE2-NEXT:    adcq %r9, %rsi +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    setne %cl +; SSE2-NEXT:    andb %r11b, %cl +; SSE2-NEXT:    movzbl %cl, %ecx +; SSE2-NEXT:    movd %ecx, %xmm0 +; SSE2-NEXT:    pinsrw $4, %ebp, %xmm0 +; SSE2-NEXT:    movq %rdx, 16(%r10) +; SSE2-NEXT:    movq %rdi, (%r10) +; SSE2-NEXT:    movq %rax, 24(%r10) +; SSE2-NEXT:    movq %rsi, 8(%r10) +; SSE2-NEXT:    psllq $63, %xmm0 +; SSE2-NEXT:    psrad $31, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT:    popq %rbx +; SSE2-NEXT:    popq %rbp +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: saddo_v2i128: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    pushq %rbp +; SSSE3-NEXT:    pushq %rbx +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT:    movq %rcx, %rax +; SSSE3-NEXT:    adcq %r11, %rax +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    testq %rcx, %rcx +; SSSE3-NEXT:    setns %cl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    setne %bpl +; SSSE3-NEXT:    testq %r11, %r11 +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    sete %cl +; SSSE3-NEXT:    andb %bpl, %cl +; SSSE3-NEXT:    movzbl %cl, %ebp +; SSSE3-NEXT:    testq %r9, %r9 +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    testq %rsi, %rsi +; SSSE3-NEXT:    setns %cl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    sete %r11b +; SSSE3-NEXT:    addq %r8, %rdi +; SSSE3-NEXT:    adcq %r9, %rsi +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    setne %cl +; SSSE3-NEXT:    andb %r11b, %cl +; SSSE3-NEXT:    movzbl %cl, %ecx +; SSSE3-NEXT:    movd %ecx, %xmm0 +; SSSE3-NEXT:    pinsrw $4, %ebp, %xmm0 +; SSSE3-NEXT:    movq %rdx, 16(%r10) +; SSSE3-NEXT:    movq %rdi, (%r10) +; SSSE3-NEXT:    movq %rax, 24(%r10) +; SSSE3-NEXT:    movq %rsi, 8(%r10) +; SSSE3-NEXT:    psllq $63, %xmm0 +; SSSE3-NEXT:    psrad $31, %xmm0 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT:    popq %rbx +; SSSE3-NEXT:    popq %rbp +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: saddo_v2i128: +; SSE41:       # %bb.0: +; SSE41-NEXT:    pushq %rbp +; SSE41-NEXT:    pushq %rbx +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT:    movq %rcx, %rax +; SSE41-NEXT:    adcq %r11, %rax +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    testq %rcx, %rcx +; SSE41-NEXT:    setns %cl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    setne %bpl +; SSE41-NEXT:    testq %r11, %r11 +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    
sete %cl +; SSE41-NEXT:    andb %bpl, %cl +; SSE41-NEXT:    movzbl %cl, %ebp +; SSE41-NEXT:    testq %r9, %r9 +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    testq %rsi, %rsi +; SSE41-NEXT:    setns %cl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    sete %r11b +; SSE41-NEXT:    addq %r8, %rdi +; SSE41-NEXT:    adcq %r9, %rsi +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    setne %cl +; SSE41-NEXT:    andb %r11b, %cl +; SSE41-NEXT:    movzbl %cl, %ecx +; SSE41-NEXT:    movd %ecx, %xmm0 +; SSE41-NEXT:    pinsrb $8, %ebp, %xmm0 +; SSE41-NEXT:    movq %rdx, 16(%r10) +; SSE41-NEXT:    movq %rdi, (%r10) +; SSE41-NEXT:    movq %rax, 24(%r10) +; SSE41-NEXT:    movq %rsi, 8(%r10) +; SSE41-NEXT:    psllq $63, %xmm0 +; SSE41-NEXT:    psrad $31, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT:    popq %rbx +; SSE41-NEXT:    popq %rbp +; SSE41-NEXT:    retq +; +; AVX1-LABEL: saddo_v2i128: +; AVX1:       # %bb.0: +; AVX1-NEXT:    pushq %rbp +; AVX1-NEXT:    pushq %rbx +; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT:    movq %rcx, %rax +; AVX1-NEXT:    adcq %r11, %rax +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    testq %rcx, %rcx +; AVX1-NEXT:    setns %cl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    setne %bpl +; AVX1-NEXT:    testq %r11, %r11 +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    sete %cl +; AVX1-NEXT:    andb %bpl, %cl +; AVX1-NEXT:    movzbl %cl, %ebp +; AVX1-NEXT:    testq %r9, %r9 +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    testq %rsi, %rsi +; AVX1-NEXT:    setns %cl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    sete %r11b +; AVX1-NEXT:    addq %r8, %rdi +; AVX1-NEXT:    adcq %r9, %rsi +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    setne %cl +; AVX1-NEXT:    andb %r11b, %cl +; AVX1-NEXT:    movzbl %cl, %ecx +; AVX1-NEXT:    vmovd %ecx, %xmm0 +; AVX1-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX1-NEXT:    movq %rdx, 16(%r10) +; AVX1-NEXT:    movq %rdi, (%r10) +; AVX1-NEXT:    movq %rax, 24(%r10) +; AVX1-NEXT:    movq %rsi, 8(%r10) +; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    popq %rbx +; AVX1-NEXT:    popq %rbp +; AVX1-NEXT:    retq +; +; AVX2-LABEL: saddo_v2i128: +; AVX2:       # %bb.0: +; AVX2-NEXT:    pushq %rbp +; AVX2-NEXT:    pushq %rbx +; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT:    movq %rcx, %rax +; AVX2-NEXT:    adcq %r11, %rax +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    testq %rcx, %rcx +; AVX2-NEXT:    setns %cl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    setne %bpl +; AVX2-NEXT:    testq %r11, %r11 +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    sete %cl +; AVX2-NEXT:    andb %bpl, %cl +; AVX2-NEXT:    movzbl %cl, %ebp +; AVX2-NEXT:    testq %r9, %r9 +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    testq %rsi, %rsi +; AVX2-NEXT:    setns %cl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    sete %r11b +; AVX2-NEXT:    addq %r8, %rdi +; AVX2-NEXT:    adcq %r9, %rsi +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    setne %cl +; AVX2-NEXT:    andb %r11b, %cl +; AVX2-NEXT:    movzbl %cl, %ecx +; AVX2-NEXT:    vmovd %ecx, %xmm0 +; AVX2-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX2-NEXT:    movq %rdx, 16(%r10) +; 
AVX2-NEXT:    movq %rdi, (%r10) +; AVX2-NEXT:    movq %rax, 24(%r10) +; AVX2-NEXT:    movq %rsi, 8(%r10) +; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    popq %rbx +; AVX2-NEXT:    popq %rbp +; AVX2-NEXT:    retq +; +; AVX512-LABEL: saddo_v2i128: +; AVX512:       # %bb.0: +; AVX512-NEXT:    pushq %r14 +; AVX512-NEXT:    pushq %rbx +; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT:    movq %rcx, %r14 +; AVX512-NEXT:    adcq %r11, %r14 +; AVX512-NEXT:    setns %bl +; AVX512-NEXT:    testq %rcx, %rcx +; AVX512-NEXT:    setns %cl +; AVX512-NEXT:    cmpb %bl, %cl +; AVX512-NEXT:    setne %bl +; AVX512-NEXT:    testq %r11, %r11 +; AVX512-NEXT:    setns %al +; AVX512-NEXT:    cmpb %al, %cl +; AVX512-NEXT:    sete %al +; AVX512-NEXT:    andb %bl, %al +; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    testq %r9, %r9 +; AVX512-NEXT:    setns %al +; AVX512-NEXT:    testq %rsi, %rsi +; AVX512-NEXT:    setns %cl +; AVX512-NEXT:    cmpb %al, %cl +; AVX512-NEXT:    sete %al +; AVX512-NEXT:    addq %r8, %rdi +; AVX512-NEXT:    adcq %r9, %rsi +; AVX512-NEXT:    setns %bl +; AVX512-NEXT:    cmpb %bl, %cl +; AVX512-NEXT:    setne %cl +; AVX512-NEXT:    andb %al, %cl +; AVX512-NEXT:    movb %cl, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT:    movq %rdx, 16(%r10) +; AVX512-NEXT:    movq %rdi, (%r10) +; AVX512-NEXT:    movq %r14, 24(%r10) +; AVX512-NEXT:    movq %rsi, 8(%r10) +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    popq %rbx +; AVX512-NEXT:    popq %r14 +; AVX512-NEXT:    retq +  %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) +  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i128> %val, <2 x i128>* %p2 +  ret <2 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll new file mode 100644 index 00000000000..0e1354b939e --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -0,0 +1,2078 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} 
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
new file mode 100644
index 00000000000..0e1354b939e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -0,0 +1,2078 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
+declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
+declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
+declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
+declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
+declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)
+
+define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
+; SSE-LABEL: ssubo_v1i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    subl %esi, %edi
+; SSE-NEXT:    seto %al
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    movl %edi, (%rdx)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: ssubo_v1i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    subl %esi, %edi
+; AVX-NEXT:    seto %al
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    movl %edi, (%rdx)
+; AVX-NEXT:    retq
+  %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
+  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
+  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
+  %res = sext <1 x i1> %obit to <1 x i32>
+  store <1 x i32> %val, <1 x i32>* %p2
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v2i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psllq $32, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v2i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    psllq $32, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    psubq %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psllq $32, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v2i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    psubq %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ssubo_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
+  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i32> %val, <2 x i32>* %p2
+  ret <2 x i32> %res
+}
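In the ssubo_v2i32 output just above, the illegal <2 x i32> operands live in 64-bit lanes: both inputs are sign-extended in-register (the psllq $32 / psrad $31 shuffles, or vpsraq $32 on AVX512), the subtraction runs at 64 bits, and a lane overflows when re-sign-extending the low 32 bits of the result no longer reproduces the full 64-bit value. A scalar C++ sketch of that check, under the same assumptions (hypothetical helper, not code from this patch):

#include <cstdint>

// Subtract exactly in the wide type, then test whether the result
// still fits the narrow type via a sext(trunc(x)) == x round-trip.
bool ssubOverflowInWideLane(int32_t L, int32_t R) {
  int64_t Wide = (int64_t)L - (int64_t)R;   // exact, cannot wrap
  return (int64_t)(int32_t)Wide != Wide;    // mismatch => lane overflowed
}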
+
+define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
+; SSE2-LABEL: ssubo_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: ssubo_v3i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pandn %xmm3, %xmm2
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: ssubo_v3i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    pandn %xmm3, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: ssubo_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
+; AVX1-NEXT:    vmovq %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ssubo_v3i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, 
%xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT:    vmovq %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v3i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT:    vmovq %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) +  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 +  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 +  %res = sext <3 x i1> %obit to <3 x i32> +  store <3 x i32> %val, <3 x i32>* %p2 +  ret <3 x i32> %res +} + +define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v4i32: +; SSE:       # %bb.0: +; SSE-NEXT:    pxor %xmm3, %xmm3 +; SSE-NEXT:    pxor %xmm2, %xmm2 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm2 +; SSE-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE-NEXT:    pxor %xmm4, %xmm2 +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm5 +; SSE-NEXT:    pxor %xmm4, %xmm5 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm2 +; SSE-NEXT:    psubd %xmm1, %xmm0 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm3 +; SSE-NEXT:    pxor %xmm4, %xmm3 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm3 +; SSE-NEXT:    pxor %xmm4, %xmm3 +; SSE-NEXT:    pandn %xmm3, %xmm2 +; SSE-NEXT:    movdqa %xmm0, (%rdi) +; SSE-NEXT:    movdqa %xmm2, %xmm0 +; SSE-NEXT:    retq +; +; AVX1-LABEL: ssubo_v4i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v4i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    
vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v4i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) +  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i32> %val, <4 x i32>* %p2 +  ret <4 x i32> %res +} + +define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: ssubo_v6i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movq %rdi, %rax +; SSE2-NEXT:    movd %r8d, %xmm0 +; SSE2-NEXT:    movd %ecx, %xmm1 +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT:    movd %edx, %xmm2 +; SSE2-NEXT:    movd %esi, %xmm0 +; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE2-NEXT:    movd %r9d, %xmm1 +; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT:    pxor %xmm3, %xmm3 +; SSE2-NEXT:    pxor %xmm2, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT:    pxor %xmm5, %xmm2 +; SSE2-NEXT:    pxor %xmm7, %xmm7 +; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT:    pxor %xmm5, %xmm7 +; SSE2-NEXT:    pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT:    psubd %xmm6, %xmm0 +; SSE2-NEXT:    pxor %xmm6, %xmm6 +; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT:    pxor %xmm5, %xmm6 +; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT:    pxor %xmm5, %xmm6 +; SSE2-NEXT:    pandn %xmm6, %xmm2 +; SSE2-NEXT:    pxor %xmm6, %xmm6 +; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT:    pxor %xmm5, %xmm6 +; SSE2-NEXT:    pxor %xmm7, %xmm7 +; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT:    pxor %xmm5, %xmm7 +; SSE2-NEXT:    pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT:    psubd %xmm4, %xmm1 +; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT:    pxor %xmm5, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm7, %xmm3 +; SSE2-NEXT:    pxor %xmm5, %xmm3 +; SSE2-NEXT:    pandn %xmm3, %xmm6 +; SSE2-NEXT:    movq %xmm1, 16(%rcx) +; SSE2-NEXT:    movdqa 
%xmm0, (%rcx) +; SSE2-NEXT:    movq %xmm6, 16(%rdi) +; SSE2-NEXT:    movdqa %xmm2, (%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: ssubo_v6i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movq %rdi, %rax +; SSSE3-NEXT:    movd %r8d, %xmm0 +; SSSE3-NEXT:    movd %ecx, %xmm1 +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT:    movd %edx, %xmm2 +; SSSE3-NEXT:    movd %esi, %xmm0 +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSSE3-NEXT:    movd %r9d, %xmm1 +; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT:    pxor %xmm3, %xmm3 +; SSSE3-NEXT:    pxor %xmm2, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT:    pxor %xmm5, %xmm2 +; SSSE3-NEXT:    pxor %xmm7, %xmm7 +; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT:    pxor %xmm5, %xmm7 +; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm2 +; SSSE3-NEXT:    psubd %xmm6, %xmm0 +; SSSE3-NEXT:    pxor %xmm6, %xmm6 +; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT:    pxor %xmm5, %xmm6 +; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT:    pxor %xmm5, %xmm6 +; SSSE3-NEXT:    pandn %xmm6, %xmm2 +; SSSE3-NEXT:    pxor %xmm6, %xmm6 +; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT:    pxor %xmm5, %xmm6 +; SSSE3-NEXT:    pxor %xmm7, %xmm7 +; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT:    pxor %xmm5, %xmm7 +; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT:    psubd %xmm4, %xmm1 +; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT:    pxor %xmm5, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm3 +; SSSE3-NEXT:    pxor %xmm5, %xmm3 +; SSSE3-NEXT:    pandn %xmm3, %xmm6 +; SSSE3-NEXT:    movq %xmm1, 16(%rcx) +; SSSE3-NEXT:    movdqa %xmm0, (%rcx) +; SSSE3-NEXT:    movq %xmm6, 16(%rdi) +; SSSE3-NEXT:    movdqa %xmm2, (%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: ssubo_v6i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movq %rdi, %rax +; SSE41-NEXT:    movd %esi, %xmm0 +; SSE41-NEXT:    pinsrd $1, %edx, %xmm0 +; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT:    movd %r9d, %xmm1 +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT:    pxor %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm2, %xmm2 +; SSE41-NEXT:    pcmpgtd %xmm6, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm5, 
%xmm5 +; SSE41-NEXT:    pxor %xmm5, %xmm2 +; SSE41-NEXT:    pxor %xmm7, %xmm7 +; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT:    pxor %xmm5, %xmm7 +; SSE41-NEXT:    pcmpeqd %xmm7, %xmm2 +; SSE41-NEXT:    psubd %xmm6, %xmm0 +; SSE41-NEXT:    pxor %xmm6, %xmm6 +; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT:    pxor %xmm5, %xmm6 +; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6 +; SSE41-NEXT:    pxor %xmm5, %xmm6 +; SSE41-NEXT:    pandn %xmm6, %xmm2 +; SSE41-NEXT:    pxor %xmm6, %xmm6 +; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT:    pxor %xmm5, %xmm6 +; SSE41-NEXT:    pxor %xmm7, %xmm7 +; SSE41-NEXT:    pcmpgtd %xmm1, %xmm7 +; SSE41-NEXT:    pxor %xmm5, %xmm7 +; SSE41-NEXT:    pcmpeqd %xmm7, %xmm6 +; SSE41-NEXT:    psubd %xmm3, %xmm1 +; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT:    pxor %xmm5, %xmm4 +; SSE41-NEXT:    pcmpeqd %xmm7, %xmm4 +; SSE41-NEXT:    pxor %xmm5, %xmm4 +; SSE41-NEXT:    pandn %xmm4, %xmm6 +; SSE41-NEXT:    movq %xmm1, 16(%rcx) +; SSE41-NEXT:    movdqa %xmm0, (%rcx) +; SSE41-NEXT:    movq %xmm6, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm2, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: ssubo_v6i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm3, %xmm4 +; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7 +; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm8 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm2 +; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm8 +; AVX1-NEXT:    vpsubd %xmm9, %xmm6, %xmm6 +; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm0 +; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT:    vandps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vmovq %xmm6, 16(%rdi) +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v6i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vpandn %ymm0, %ymm3, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; 
AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v6i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) +  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 +  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 +  %res = sext <6 x i1> %obit to <6 x i32> +  store <6 x i32> %val, <6 x i32>* %p2 +  ret <6 x i32> %res +} + +define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v8i32: +; SSE:       # %bb.0: +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm2, %xmm4 +; SSE-NEXT:    pcmpeqd %xmm6, %xmm6 +; SSE-NEXT:    pxor %xmm6, %xmm4 +; SSE-NEXT:    pxor %xmm7, %xmm7 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm7 +; SSE-NEXT:    pxor %xmm6, %xmm7 +; SSE-NEXT:    pcmpeqd %xmm7, %xmm4 +; SSE-NEXT:    psubd %xmm2, %xmm0 +; SSE-NEXT:    pxor %xmm2, %xmm2 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm2 +; SSE-NEXT:    pxor %xmm6, %xmm2 +; SSE-NEXT:    pcmpeqd %xmm7, %xmm2 +; SSE-NEXT:    pxor %xmm6, %xmm2 +; SSE-NEXT:    pandn %xmm2, %xmm4 +; SSE-NEXT:    pxor %xmm2, %xmm2 +; SSE-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSE-NEXT:    pxor %xmm6, %xmm2 +; SSE-NEXT:    pxor %xmm7, %xmm7 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm7 +; SSE-NEXT:    pxor %xmm6, %xmm7 +; SSE-NEXT:    pcmpeqd %xmm7, %xmm2 +; SSE-NEXT:    psubd %xmm3, %xmm1 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm5 +; SSE-NEXT:    pxor %xmm6, %xmm5 +; SSE-NEXT:    pcmpeqd %xmm7, %xmm5 +; SSE-NEXT:    pxor %xmm6, %xmm5 +; SSE-NEXT:    pandn %xmm5, %xmm2 +; SSE-NEXT:    movdqa %xmm1, 16(%rdi) +; SSE-NEXT:    movdqa %xmm0, (%rdi) +; SSE-NEXT:    movdqa %xmm4, %xmm0 +; SSE-NEXT:    movdqa %xmm2, %xmm1 +; SSE-NEXT:    retq +; +; AVX1-LABEL: ssubo_v8i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtd %xmm9, %xmm3, %xmm4 +; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm7 +; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm8 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm2 +; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm8 +; AVX1-NEXT:    vpsubd %xmm9, %xmm6, %xmm6 +; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm4 +; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT:    
vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm1 +; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT:    vandps %ymm1, %ymm8, %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm0 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT:    vmovaps %ymm2, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v8i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT:    vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vpandn %ymm0, %ymm3, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v8i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k0 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) +  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 +  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i32> %val, <8 x i32>* %p2 +  ret <8 x i32> %res +} + +define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v16i32: +; SSE:       # %bb.0: +; SSE-NEXT:    pxor %xmm10, %xmm10 +; SSE-NEXT:    pxor %xmm8, %xmm8 +; SSE-NEXT:    pcmpgtd %xmm4, %xmm8 +; SSE-NEXT:    pcmpeqd %xmm11, %xmm11 +; SSE-NEXT:    pxor %xmm11, %xmm8 +; SSE-NEXT:    pxor %xmm9, %xmm9 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm9 +; SSE-NEXT:    pxor %xmm11, %xmm9 +; SSE-NEXT:    pcmpeqd %xmm9, %xmm8 +; SSE-NEXT:    psubd %xmm4, %xmm0 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm0, %xmm4 +; SSE-NEXT:    pxor %xmm11, %xmm4 +; SSE-NEXT:    pcmpeqd %xmm9, %xmm4 +; SSE-NEXT:    pxor %xmm11, %xmm4 +; SSE-NEXT:    pandn %xmm4, %xmm8 +; SSE-NEXT:    pxor %xmm9, %xmm9 +; SSE-NEXT:    pcmpgtd %xmm5, %xmm9 +; SSE-NEXT:    pxor %xmm11, %xmm9 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE-NEXT:    pxor %xmm11, %xmm4 +; SSE-NEXT:    pcmpeqd %xmm4, %xmm9 +; SSE-NEXT:    psubd %xmm5, %xmm1 +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pcmpgtd %xmm1, %xmm5 +; SSE-NEXT:    pxor %xmm11, 
%xmm5 +; SSE-NEXT:    pcmpeqd %xmm4, %xmm5 +; SSE-NEXT:    pxor %xmm11, %xmm5 +; SSE-NEXT:    pandn %xmm5, %xmm9 +; SSE-NEXT:    pxor %xmm4, %xmm4 +; SSE-NEXT:    pcmpgtd %xmm6, %xmm4 +; SSE-NEXT:    pxor %xmm11, %xmm4 +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pcmpgtd %xmm2, %xmm5 +; SSE-NEXT:    pxor %xmm11, %xmm5 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm4 +; SSE-NEXT:    psubd %xmm6, %xmm2 +; SSE-NEXT:    pxor %xmm6, %xmm6 +; SSE-NEXT:    pcmpgtd %xmm2, %xmm6 +; SSE-NEXT:    pxor %xmm11, %xmm6 +; SSE-NEXT:    pcmpeqd %xmm5, %xmm6 +; SSE-NEXT:    pxor %xmm11, %xmm6 +; SSE-NEXT:    pandn %xmm6, %xmm4 +; SSE-NEXT:    pxor %xmm5, %xmm5 +; SSE-NEXT:    pcmpgtd %xmm7, %xmm5 +; SSE-NEXT:    pxor %xmm11, %xmm5 +; SSE-NEXT:    pxor %xmm6, %xmm6 +; SSE-NEXT:    pcmpgtd %xmm3, %xmm6 +; SSE-NEXT:    pxor %xmm11, %xmm6 +; SSE-NEXT:    pcmpeqd %xmm6, %xmm5 +; SSE-NEXT:    psubd %xmm7, %xmm3 +; SSE-NEXT:    pcmpgtd %xmm3, %xmm10 +; SSE-NEXT:    pxor %xmm11, %xmm10 +; SSE-NEXT:    pcmpeqd %xmm6, %xmm10 +; SSE-NEXT:    pxor %xmm11, %xmm10 +; SSE-NEXT:    pandn %xmm10, %xmm5 +; SSE-NEXT:    movdqa %xmm3, 48(%rdi) +; SSE-NEXT:    movdqa %xmm2, 32(%rdi) +; SSE-NEXT:    movdqa %xmm1, 16(%rdi) +; SSE-NEXT:    movdqa %xmm0, (%rdi) +; SSE-NEXT:    movdqa %xmm8, %xmm0 +; SSE-NEXT:    movdqa %xmm9, %xmm1 +; SSE-NEXT:    movdqa %xmm4, %xmm2 +; SSE-NEXT:    movdqa %xmm5, %xmm3 +; SSE-NEXT:    retq +; +; AVX1-LABEL: ssubo_v16i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm5, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm9 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT:    vpcmpgtd %xmm12, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm10 +; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm10, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm9 +; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm11, %xmm7, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm6, %ymm9 +; AVX1-NEXT:    vpsubd %xmm8, %xmm12, %xmm8 +; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm5, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm10, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm10 +; AVX1-NEXT:    vpcmpgtd %xmm10, %xmm5, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT:    vandps %ymm3, %ymm9, %ymm3 +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT:    vpackssdw %xmm6, %xmm3, %xmm9 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm13 +; AVX1-NEXT:    vpcmpgtd %xmm13, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm3, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm11 +; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm12 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm12, %xmm7, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm6, 
%ymm11 +; AVX1-NEXT:    vpsubd %xmm13, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm5, %xmm6 +; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm3, %xmm3 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm5, %xmm2 +; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT:    vandps %ymm2, %ymm11, %ymm2 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm4 +; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm0 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT:    vpmovsxwd %xmm9, %xmm1 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT:    vmovaps %ymm4, 32(%rdi) +; AVX1-NEXT:    vmovaps %ymm3, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v16i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5 +; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm4, %ymm7 +; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm1 +; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1 +; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm7, %ymm1 +; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1 +; AVX2-NEXT:    vpandn %ymm1, %ymm5, %ymm1 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm5 +; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm4, %ymm7 +; AVX2-NEXT:    vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm0 +; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm0 +; AVX2-NEXT:    vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT:    vpandn %ymm0, %ymm5, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT:    vmovdqa %ymm2, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v16i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k0 +; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT:    vpcmpnltd %zmm2, %zmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) +  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i32> %val, <16 x 
i32>* %p2 +  ret <16 x i32> %res +} + +define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: ssubo_v16i8: +; SSE2:       # %bb.0: +; SSE2-NEXT:    pxor %xmm2, %xmm2 +; SSE2-NEXT:    pxor %xmm3, %xmm3 +; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT:    pxor %xmm4, %xmm3 +; SSE2-NEXT:    pxor %xmm5, %xmm5 +; SSE2-NEXT:    pcmpgtb %xmm0, %xmm5 +; SSE2-NEXT:    pxor %xmm4, %xmm5 +; SSE2-NEXT:    pcmpeqb %xmm5, %xmm3 +; SSE2-NEXT:    psubb %xmm1, %xmm0 +; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT:    pxor %xmm4, %xmm2 +; SSE2-NEXT:    pcmpeqb %xmm5, %xmm2 +; SSE2-NEXT:    pxor %xmm4, %xmm2 +; SSE2-NEXT:    pandn %xmm2, %xmm3 +; SSE2-NEXT:    movdqa %xmm3, %xmm1 +; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    movdqa %xmm1, %xmm4 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm4 +; SSE2-NEXT:    psrad $31, %xmm4 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm1 +; SSE2-NEXT:    psrad $31, %xmm1 +; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT:    movdqa %xmm3, %xmm2 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm3 +; SSE2-NEXT:    psrad $31, %xmm3 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm4, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: ssubo_v16i8: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    pxor %xmm2, %xmm2 +; SSSE3-NEXT:    pxor %xmm3, %xmm3 +; SSSE3-NEXT:    pcmpgtb %xmm1, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT:    pxor %xmm4, %xmm3 +; SSSE3-NEXT:    pxor %xmm5, %xmm5 +; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm5 +; SSSE3-NEXT:    pxor %xmm4, %xmm5 +; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm3 +; SSSE3-NEXT:    psubb %xmm1, %xmm0 +; SSSE3-NEXT:    pcmpgtb %xmm0, %xmm2 +; SSSE3-NEXT:    pxor %xmm4, %xmm2 +; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm2 +; SSSE3-NEXT:    pxor %xmm4, %xmm2 +; SSSE3-NEXT:    pandn %xmm2, %xmm3 +; SSSE3-NEXT:    movdqa %xmm3, %xmm1 +; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    movdqa %xmm1, %xmm4 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm4 +; SSSE3-NEXT:    psrad $31, %xmm4 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm1 +; SSSE3-NEXT:    psrad $31, %xmm1 +; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT:    movdqa %xmm3, %xmm2 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT:    
pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm3 +; SSSE3-NEXT:    psrad $31, %xmm3 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm4, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: ssubo_v16i8: +; SSE41:       # %bb.0: +; SSE41-NEXT:    pxor %xmm2, %xmm2 +; SSE41-NEXT:    pxor %xmm3, %xmm3 +; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm3 +; SSE41-NEXT:    pxor %xmm5, %xmm5 +; SSE41-NEXT:    pcmpgtb %xmm0, %xmm5 +; SSE41-NEXT:    pxor %xmm4, %xmm5 +; SSE41-NEXT:    pcmpeqb %xmm5, %xmm3 +; SSE41-NEXT:    psubb %xmm1, %xmm0 +; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2 +; SSE41-NEXT:    pxor %xmm4, %xmm2 +; SSE41-NEXT:    pcmpeqb %xmm5, %xmm2 +; SSE41-NEXT:    pxor %xmm4, %xmm2 +; SSE41-NEXT:    pandn %xmm2, %xmm3 +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm4 +; SSE41-NEXT:    psrad $31, %xmm4 +; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm1 +; SSE41-NEXT:    psrad $31, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm3 +; SSE41-NEXT:    psrad $31, %xmm3 +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    movdqa %xmm4, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: ssubo_v16i8: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm6 +; AVX1-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm1 +; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT:    vpsrad $31, %xmm2, 
%xmm2 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT:    vmovdqa %xmm6, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v16i8: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm6 +; AVX2-NEXT:    vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm1 +; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT:    vmovdqa %xmm6, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v16i8: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnltb %xmm2, %xmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) +  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i8> %val, <16 x i8>* %p2 +  ret <16 x i32> %res +} + +define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: ssubo_v8i16: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm1, %xmm2 +; SSE2-NEXT:    pxor %xmm3, %xmm3 +; SSE2-NEXT:    pxor %xmm1, %xmm1 +; SSE2-NEXT:    pcmpgtw %xmm2, %xmm1 +; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT:    pxor %xmm4, %xmm1 +; SSE2-NEXT:    pxor %xmm5, %xmm5 +; SSE2-NEXT:    pcmpgtw %xmm0, %xmm5 +; SSE2-NEXT:    pxor %xmm4, %xmm5 +; SSE2-NEXT:    pcmpeqw %xmm5, %xmm1 +; SSE2-NEXT:    psubw %xmm2, %xmm0 +; SSE2-NEXT:    pcmpgtw %xmm0, %xmm3 +; SSE2-NEXT:    pxor %xmm4, %xmm3 +; SSE2-NEXT:    pcmpeqw %xmm5, %xmm3 +; SSE2-NEXT:    pxor %xmm4, %xmm3 +; SSE2-NEXT:    pandn %xmm3, %xmm1 +; SSE2-NEXT:    movdqa %xmm1, %xmm2 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm2 
+; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm1 +; SSE2-NEXT:    psrad $31, %xmm1 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: ssubo_v8i16: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm1, %xmm2 +; SSSE3-NEXT:    pxor %xmm3, %xmm3 +; SSSE3-NEXT:    pxor %xmm1, %xmm1 +; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm1 +; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT:    pxor %xmm4, %xmm1 +; SSSE3-NEXT:    pxor %xmm5, %xmm5 +; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm5 +; SSSE3-NEXT:    pxor %xmm4, %xmm5 +; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm1 +; SSSE3-NEXT:    psubw %xmm2, %xmm0 +; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm3 +; SSSE3-NEXT:    pxor %xmm4, %xmm3 +; SSSE3-NEXT:    pcmpeqw %xmm5, %xmm3 +; SSSE3-NEXT:    pxor %xmm4, %xmm3 +; SSSE3-NEXT:    pandn %xmm3, %xmm1 +; SSSE3-NEXT:    movdqa %xmm1, %xmm2 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm1 +; SSSE3-NEXT:    psrad $31, %xmm1 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: ssubo_v8i16: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm1, %xmm2 +; SSE41-NEXT:    pxor %xmm3, %xmm3 +; SSE41-NEXT:    pxor %xmm1, %xmm1 +; SSE41-NEXT:    pcmpgtw %xmm2, %xmm1 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm1 +; SSE41-NEXT:    pxor %xmm5, %xmm5 +; SSE41-NEXT:    pcmpgtw %xmm0, %xmm5 +; SSE41-NEXT:    pxor %xmm4, %xmm5 +; SSE41-NEXT:    pcmpeqw %xmm5, %xmm1 +; SSE41-NEXT:    psubw %xmm2, %xmm0 +; SSE41-NEXT:    pcmpgtw %xmm0, %xmm3 +; SSE41-NEXT:    pxor %xmm4, %xmm3 +; SSE41-NEXT:    pcmpeqw %xmm5, %xmm3 +; SSE41-NEXT:    pxor %xmm4, %xmm3 +; SSE41-NEXT:    pandn %xmm3, %xmm1 +; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT:    pslld $31, %xmm1 +; SSE41-NEXT:    psrad $31, %xmm1 +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    movdqa %xmm2, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: ssubo_v8i16: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v8i16: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    
vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v8i16: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnltw %xmm2, %xmm1, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) +  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 +  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i16> %val, <8 x i16>* %p2 +  ret <8 x i32> %res +} + +define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE2-LABEL: ssubo_v2i64: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, %xmm3 +; SSE2-NEXT:    psubq %xmm1, %xmm0 +; SSE2-NEXT:    pxor %xmm2, %xmm1 +; SSE2-NEXT:    movdqa %xmm2, %xmm4 +; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT:    pand %xmm5, %xmm1 +; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT:    por %xmm1, %xmm4 +; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT:    pxor %xmm5, %xmm4 +; SSE2-NEXT:    pxor %xmm2, %xmm3 +; SSE2-NEXT:    movdqa %xmm2, %xmm1 +; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT:    pand %xmm6, %xmm3 +; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE2-NEXT:    por %xmm3, %xmm6 +; SSE2-NEXT:    pxor %xmm5, %xmm6 +; SSE2-NEXT:    pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; SSE2-NEXT:    pand %xmm4, %xmm1 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    pxor %xmm2, %xmm0 +; SSE2-NEXT:    movdqa %xmm2, %xmm3 +; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT:    pand %xmm4, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT:    por %xmm0, %xmm2 +; SSE2-NEXT:    pxor %xmm5, %xmm2 +; SSE2-NEXT:    pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT:    pand %xmm2, %xmm0 +; SSE2-NEXT:    pxor %xmm5, %xmm0 +; SSE2-NEXT:    pandn %xmm0, %xmm1 +; SSE2-NEXT:    movdqa %xmm1, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: ssubo_v2i64: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:   
 movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm0, %xmm3 +; SSSE3-NEXT:    psubq %xmm1, %xmm0 +; SSSE3-NEXT:    pxor %xmm2, %xmm1 +; SSSE3-NEXT:    movdqa %xmm2, %xmm4 +; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT:    pand %xmm5, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT:    por %xmm1, %xmm4 +; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT:    pxor %xmm5, %xmm4 +; SSSE3-NEXT:    pxor %xmm2, %xmm3 +; SSSE3-NEXT:    movdqa %xmm2, %xmm1 +; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT:    pand %xmm6, %xmm3 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSSE3-NEXT:    por %xmm3, %xmm6 +; SSSE3-NEXT:    pxor %xmm5, %xmm6 +; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; SSSE3-NEXT:    pand %xmm4, %xmm1 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    pxor %xmm2, %xmm0 +; SSSE3-NEXT:    movdqa %xmm2, %xmm3 +; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT:    pand %xmm4, %xmm0 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT:    por %xmm0, %xmm2 +; SSSE3-NEXT:    pxor %xmm5, %xmm2 +; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT:    pand %xmm2, %xmm0 +; SSSE3-NEXT:    pxor %xmm5, %xmm0 +; SSSE3-NEXT:    pandn %xmm0, %xmm1 +; SSSE3-NEXT:    movdqa %xmm1, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: ssubo_v2i64: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT:    movdqa %xmm0, %xmm3 +; SSE41-NEXT:    psubq %xmm1, %xmm0 +; SSE41-NEXT:    pxor %xmm2, %xmm1 +; SSE41-NEXT:    movdqa %xmm2, %xmm4 +; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE41-NEXT:    pand %xmm5, %xmm6 +; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE41-NEXT:    por %xmm6, %xmm1 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm1 +; SSE41-NEXT:    pxor %xmm2, %xmm3 +; SSE41-NEXT:    movdqa %xmm2, %xmm5 +; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT:    pand %xmm6, %xmm3 +; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT:    por %xmm3, %xmm5 +; SSE41-NEXT:    pxor %xmm4, %xmm5 +; SSE41-NEXT:    pcmpeqq %xmm5, %xmm1 +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    pxor %xmm2, %xmm0 +; SSE41-NEXT:    movdqa %xmm2, %xmm3 +; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT:    pand %xmm6, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT:    por %xmm0, %xmm2 +; SSE41-NEXT:    pxor %xmm4, %xmm2 +; SSE41-NEXT:    pcmpeqq %xmm5, %xmm2 +; SSE41-NEXT:    pxor %xmm4, %xmm2 +; SSE41-NEXT:    pandn 
%xmm2, %xmm1 +; SSE41-NEXT:    movdqa %xmm1, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: ssubo_v2i64: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v2i64: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX2-NEXT:    vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v2i64: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm1, %k0 +; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k1 +; AVX512-NEXT:    kxorw %k0, %k1, %k0 +; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k1 +; AVX512-NEXT:    kandw %k1, %k0, %k1 +; AVX512-NEXT:    vmovdqa %xmm0, (%rdi) +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    retq +  %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) +  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i64> %val, <2 x i64>* %p2 +  ret <2 x i32> %res +} + +define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; SSE2-LABEL: ssubo_v4i24: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm0, %xmm2 +; SSE2-NEXT:    pslld $8, %xmm1 +; SSE2-NEXT:    psrad $8, %xmm1 +; SSE2-NEXT:    pslld $8, %xmm2 +; SSE2-NEXT:    psrad $8, %xmm2 +; SSE2-NEXT:    psubd %xmm1, %xmm2 +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    pslld $8, %xmm0 +; SSE2-NEXT:    psrad $8, %xmm0 +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT:    pxor %xmm1, %xmm0 +; SSE2-NEXT:    movd %xmm2, %eax +; SSE2-NEXT:    movw %ax, (%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT:    movd %xmm1, %ecx +; SSE2-NEXT:    movw %cx, 9(%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT:    movd %xmm1, %edx +; SSE2-NEXT:    movw %dx, 6(%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT:    movd %xmm1, %esi +; SSE2-NEXT:    movw %si, 3(%rdi) +; SSE2-NEXT:    shrl $16, %eax +; SSE2-NEXT:    movb %al, 2(%rdi) +; SSE2-NEXT:    shrl $16, %ecx +; SSE2-NEXT:    movb %cl, 11(%rdi) +; SSE2-NEXT:    shrl $16, %edx +; SSE2-NEXT:    movb %dl, 8(%rdi) +; 
SSE2-NEXT:    shrl $16, %esi +; SSE2-NEXT:    movb %sil, 5(%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: ssubo_v4i24: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm0, %xmm2 +; SSSE3-NEXT:    pslld $8, %xmm1 +; SSSE3-NEXT:    psrad $8, %xmm1 +; SSSE3-NEXT:    pslld $8, %xmm2 +; SSSE3-NEXT:    psrad $8, %xmm2 +; SSSE3-NEXT:    psubd %xmm1, %xmm2 +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    pslld $8, %xmm0 +; SSSE3-NEXT:    psrad $8, %xmm0 +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT:    pxor %xmm1, %xmm0 +; SSSE3-NEXT:    movd %xmm2, %eax +; SSSE3-NEXT:    movw %ax, (%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT:    movd %xmm1, %ecx +; SSSE3-NEXT:    movw %cx, 9(%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT:    movd %xmm1, %edx +; SSSE3-NEXT:    movw %dx, 6(%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT:    movd %xmm1, %esi +; SSSE3-NEXT:    movw %si, 3(%rdi) +; SSSE3-NEXT:    shrl $16, %eax +; SSSE3-NEXT:    movb %al, 2(%rdi) +; SSSE3-NEXT:    shrl $16, %ecx +; SSSE3-NEXT:    movb %cl, 11(%rdi) +; SSSE3-NEXT:    shrl $16, %edx +; SSSE3-NEXT:    movb %dl, 8(%rdi) +; SSSE3-NEXT:    shrl $16, %esi +; SSSE3-NEXT:    movb %sil, 5(%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: ssubo_v4i24: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm2 +; SSE41-NEXT:    pslld $8, %xmm1 +; SSE41-NEXT:    psrad $8, %xmm1 +; SSE41-NEXT:    pslld $8, %xmm2 +; SSE41-NEXT:    psrad $8, %xmm2 +; SSE41-NEXT:    psubd %xmm1, %xmm2 +; SSE41-NEXT:    movdqa %xmm2, %xmm0 +; SSE41-NEXT:    pslld $8, %xmm0 +; SSE41-NEXT:    psrad $8, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm1, %xmm0 +; SSE41-NEXT:    pextrd $3, %xmm2, %eax +; SSE41-NEXT:    movw %ax, 9(%rdi) +; SSE41-NEXT:    pextrd $2, %xmm2, %ecx +; SSE41-NEXT:    movw %cx, 6(%rdi) +; SSE41-NEXT:    pextrd $1, %xmm2, %edx +; SSE41-NEXT:    movw %dx, 3(%rdi) +; SSE41-NEXT:    movd %xmm2, %esi +; SSE41-NEXT:    movw %si, (%rdi) +; SSE41-NEXT:    shrl $16, %eax +; SSE41-NEXT:    movb %al, 11(%rdi) +; SSE41-NEXT:    shrl $16, %ecx +; SSE41-NEXT:    movb %cl, 8(%rdi) +; SSE41-NEXT:    shrl $16, %edx +; SSE41-NEXT:    movb %dl, 5(%rdi) +; SSE41-NEXT:    shrl $16, %esi +; SSE41-NEXT:    movb %sil, 2(%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: ssubo_v4i24: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1 +; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0 +; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpextrd $3, %xmm1, %eax +; AVX1-NEXT:    movw %ax, 9(%rdi) +; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX1-NEXT:    movw %cx, 6(%rdi) +; AVX1-NEXT:    vpextrd $1, %xmm1, %edx +; AVX1-NEXT:    movw %dx, 3(%rdi) +; AVX1-NEXT:    vmovd %xmm1, %esi +; AVX1-NEXT:    movw %si, (%rdi) +; AVX1-NEXT:    shrl $16, %eax +; AVX1-NEXT:    movb %al, 11(%rdi) +; AVX1-NEXT:    shrl $16, %ecx +; AVX1-NEXT:    movb %cl, 8(%rdi) +; AVX1-NEXT:    shrl $16, %edx +; AVX1-NEXT:    movb %dl, 5(%rdi) +; AVX1-NEXT:    shrl $16, %esi +; AVX1-NEXT:    movb %sil, 2(%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v4i24: +; AVX2:       # %bb.0: +; AVX2-NEXT:    
vpslld $8, %xmm1, %xmm1 +; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1 +; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0 +; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0 +; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpextrd $3, %xmm1, %eax +; AVX2-NEXT:    movw %ax, 9(%rdi) +; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX2-NEXT:    movw %cx, 6(%rdi) +; AVX2-NEXT:    vpextrd $1, %xmm1, %edx +; AVX2-NEXT:    movw %dx, 3(%rdi) +; AVX2-NEXT:    vmovd %xmm1, %esi +; AVX2-NEXT:    movw %si, (%rdi) +; AVX2-NEXT:    shrl $16, %eax +; AVX2-NEXT:    movb %al, 11(%rdi) +; AVX2-NEXT:    shrl $16, %ecx +; AVX2-NEXT:    movb %cl, 8(%rdi) +; AVX2-NEXT:    shrl $16, %edx +; AVX2-NEXT:    movb %dl, 5(%rdi) +; AVX2-NEXT:    shrl $16, %esi +; AVX2-NEXT:    movb %sil, 2(%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v4i24: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1 +; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1 +; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0 +; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0 +; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vpextrd $3, %xmm1, %eax +; AVX512-NEXT:    movw %ax, 9(%rdi) +; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX512-NEXT:    movw %cx, 6(%rdi) +; AVX512-NEXT:    vpextrd $1, %xmm1, %edx +; AVX512-NEXT:    movw %dx, 3(%rdi) +; AVX512-NEXT:    vmovd %xmm1, %esi +; AVX512-NEXT:    movw %si, (%rdi) +; AVX512-NEXT:    shrl $16, %eax +; AVX512-NEXT:    movb %al, 11(%rdi) +; AVX512-NEXT:    shrl $16, %ecx +; AVX512-NEXT:    movb %cl, 8(%rdi) +; AVX512-NEXT:    shrl $16, %edx +; AVX512-NEXT:    movb %dl, 5(%rdi) +; AVX512-NEXT:    shrl $16, %esi +; AVX512-NEXT:    movb %sil, 2(%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) +  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i24> %val, <4 x i24>* %p2 +  ret <4 x i32> %res +} + +define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: ssubo_v4i1: +; SSE:       # %bb.0: +; SSE-NEXT:    pslld $31, %xmm1 +; SSE-NEXT:    psrad $31, %xmm1 +; SSE-NEXT:    pslld $31, %xmm0 +; SSE-NEXT:    psrad $31, %xmm0 +; SSE-NEXT:    psubd %xmm1, %xmm0 +; SSE-NEXT:    movdqa %xmm0, %xmm1 +; SSE-NEXT:    pslld $31, %xmm1 +; SSE-NEXT:    psrad $31, %xmm1 +; SSE-NEXT:    pcmpeqd %xmm1, %xmm0 +; SSE-NEXT:    pcmpeqd %xmm2, %xmm2 +; SSE-NEXT:    pxor %xmm2, %xmm0 +; SSE-NEXT:    movmskps %xmm1, %eax +; SSE-NEXT:    movb %al, (%rdi) +; SSE-NEXT:    retq +; +; AVX1-LABEL: ssubo_v4i1: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vmovmskps %xmm1, %eax +; AVX1-NEXT:    movb %al, (%rdi) 
+; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v4i1: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1 +; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vmovmskps %xmm1, %eax +; AVX2-NEXT:    movb %al, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v4i1: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1 +; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k2 +; AVX512-NEXT:    kxorw %k2, %k1, %k3 +; AVX512-NEXT:    kxorw %k2, %k0, %k0 +; AVX512-NEXT:    kxnorw %k0, %k1, %k1 +; AVX512-NEXT:    kandnw %k1, %k3, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    kmovd %k0, %eax +; AVX512-NEXT:    movb %al, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) +  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i1> %val, <4 x i1>* %p2 +  ret <4 x i32> %res +} + +define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; SSE2-LABEL: ssubo_v2i128: +; SSE2:       # %bb.0: +; SSE2-NEXT:    pushq %rbp +; SSE2-NEXT:    pushq %rbx +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT:    movq %rcx, %rax +; SSE2-NEXT:    sbbq %r11, %rax +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    testq %rcx, %rcx +; SSE2-NEXT:    setns %cl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    setne %bpl +; SSE2-NEXT:    testq %r11, %r11 +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    setne %cl +; SSE2-NEXT:    andb %bpl, %cl +; SSE2-NEXT:    movzbl %cl, %ebp +; SSE2-NEXT:    testq %r9, %r9 +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    testq %rsi, %rsi +; SSE2-NEXT:    setns %cl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    setne %r11b +; SSE2-NEXT:    subq %r8, %rdi +; SSE2-NEXT:    sbbq %r9, %rsi +; SSE2-NEXT:    setns %bl +; SSE2-NEXT:    cmpb %bl, %cl +; SSE2-NEXT:    setne %cl +; SSE2-NEXT:    andb %r11b, %cl +; SSE2-NEXT:    movzbl %cl, %ecx +; SSE2-NEXT:    movd %ecx, %xmm0 +; SSE2-NEXT:    pinsrw $4, %ebp, %xmm0 +; SSE2-NEXT:    movq %rdx, 16(%r10) +; SSE2-NEXT:    movq %rdi, (%r10) +; SSE2-NEXT:    movq %rax, 24(%r10) +; SSE2-NEXT:    movq %rsi, 8(%r10) +; SSE2-NEXT:    psllq $63, %xmm0 +; SSE2-NEXT:    psrad $31, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT:    popq %rbx +; SSE2-NEXT:    popq %rbp +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: ssubo_v2i128: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    pushq %rbp +; SSSE3-NEXT:    pushq %rbx +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT:    movq %rcx, %rax +; SSSE3-NEXT:    sbbq %r11, %rax +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    testq %rcx, %rcx +; SSSE3-NEXT:    setns %cl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    
setne %bpl +; SSSE3-NEXT:    testq %r11, %r11 +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    setne %cl +; SSSE3-NEXT:    andb %bpl, %cl +; SSSE3-NEXT:    movzbl %cl, %ebp +; SSSE3-NEXT:    testq %r9, %r9 +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    testq %rsi, %rsi +; SSSE3-NEXT:    setns %cl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    setne %r11b +; SSSE3-NEXT:    subq %r8, %rdi +; SSSE3-NEXT:    sbbq %r9, %rsi +; SSSE3-NEXT:    setns %bl +; SSSE3-NEXT:    cmpb %bl, %cl +; SSSE3-NEXT:    setne %cl +; SSSE3-NEXT:    andb %r11b, %cl +; SSSE3-NEXT:    movzbl %cl, %ecx +; SSSE3-NEXT:    movd %ecx, %xmm0 +; SSSE3-NEXT:    pinsrw $4, %ebp, %xmm0 +; SSSE3-NEXT:    movq %rdx, 16(%r10) +; SSSE3-NEXT:    movq %rdi, (%r10) +; SSSE3-NEXT:    movq %rax, 24(%r10) +; SSSE3-NEXT:    movq %rsi, 8(%r10) +; SSSE3-NEXT:    psllq $63, %xmm0 +; SSSE3-NEXT:    psrad $31, %xmm0 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT:    popq %rbx +; SSSE3-NEXT:    popq %rbp +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: ssubo_v2i128: +; SSE41:       # %bb.0: +; SSE41-NEXT:    pushq %rbp +; SSE41-NEXT:    pushq %rbx +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT:    movq %rcx, %rax +; SSE41-NEXT:    sbbq %r11, %rax +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    testq %rcx, %rcx +; SSE41-NEXT:    setns %cl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    setne %bpl +; SSE41-NEXT:    testq %r11, %r11 +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    setne %cl +; SSE41-NEXT:    andb %bpl, %cl +; SSE41-NEXT:    movzbl %cl, %ebp +; SSE41-NEXT:    testq %r9, %r9 +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    testq %rsi, %rsi +; SSE41-NEXT:    setns %cl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    setne %r11b +; SSE41-NEXT:    subq %r8, %rdi +; SSE41-NEXT:    sbbq %r9, %rsi +; SSE41-NEXT:    setns %bl +; SSE41-NEXT:    cmpb %bl, %cl +; SSE41-NEXT:    setne %cl +; SSE41-NEXT:    andb %r11b, %cl +; SSE41-NEXT:    movzbl %cl, %ecx +; SSE41-NEXT:    movd %ecx, %xmm0 +; SSE41-NEXT:    pinsrb $8, %ebp, %xmm0 +; SSE41-NEXT:    movq %rdx, 16(%r10) +; SSE41-NEXT:    movq %rdi, (%r10) +; SSE41-NEXT:    movq %rax, 24(%r10) +; SSE41-NEXT:    movq %rsi, 8(%r10) +; SSE41-NEXT:    psllq $63, %xmm0 +; SSE41-NEXT:    psrad $31, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT:    popq %rbx +; SSE41-NEXT:    popq %rbp +; SSE41-NEXT:    retq +; +; AVX1-LABEL: ssubo_v2i128: +; AVX1:       # %bb.0: +; AVX1-NEXT:    pushq %rbp +; AVX1-NEXT:    pushq %rbx +; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT:    subq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT:    movq %rcx, %rax +; AVX1-NEXT:    sbbq %r11, %rax +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    testq %rcx, %rcx +; AVX1-NEXT:    setns %cl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    setne %bpl +; AVX1-NEXT:    testq %r11, %r11 +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    setne %cl +; AVX1-NEXT:    andb %bpl, %cl +; AVX1-NEXT:    movzbl %cl, %ebp +; AVX1-NEXT:    testq %r9, %r9 +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    testq %rsi, %rsi +; AVX1-NEXT:    setns %cl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    setne %r11b +; AVX1-NEXT:    subq %r8, %rdi +; AVX1-NEXT:    sbbq %r9, %rsi +; AVX1-NEXT:    setns %bl +; AVX1-NEXT:    cmpb %bl, %cl +; AVX1-NEXT:    setne %cl +; AVX1-NEXT:    andb %r11b, %cl +; 
AVX1-NEXT:    movzbl %cl, %ecx +; AVX1-NEXT:    vmovd %ecx, %xmm0 +; AVX1-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX1-NEXT:    movq %rdx, 16(%r10) +; AVX1-NEXT:    movq %rdi, (%r10) +; AVX1-NEXT:    movq %rax, 24(%r10) +; AVX1-NEXT:    movq %rsi, 8(%r10) +; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    popq %rbx +; AVX1-NEXT:    popq %rbp +; AVX1-NEXT:    retq +; +; AVX2-LABEL: ssubo_v2i128: +; AVX2:       # %bb.0: +; AVX2-NEXT:    pushq %rbp +; AVX2-NEXT:    pushq %rbx +; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT:    movq %rcx, %rax +; AVX2-NEXT:    sbbq %r11, %rax +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    testq %rcx, %rcx +; AVX2-NEXT:    setns %cl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    setne %bpl +; AVX2-NEXT:    testq %r11, %r11 +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    setne %cl +; AVX2-NEXT:    andb %bpl, %cl +; AVX2-NEXT:    movzbl %cl, %ebp +; AVX2-NEXT:    testq %r9, %r9 +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    testq %rsi, %rsi +; AVX2-NEXT:    setns %cl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    setne %r11b +; AVX2-NEXT:    subq %r8, %rdi +; AVX2-NEXT:    sbbq %r9, %rsi +; AVX2-NEXT:    setns %bl +; AVX2-NEXT:    cmpb %bl, %cl +; AVX2-NEXT:    setne %cl +; AVX2-NEXT:    andb %r11b, %cl +; AVX2-NEXT:    movzbl %cl, %ecx +; AVX2-NEXT:    vmovd %ecx, %xmm0 +; AVX2-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX2-NEXT:    movq %rdx, 16(%r10) +; AVX2-NEXT:    movq %rdi, (%r10) +; AVX2-NEXT:    movq %rax, 24(%r10) +; AVX2-NEXT:    movq %rsi, 8(%r10) +; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    popq %rbx +; AVX2-NEXT:    popq %rbp +; AVX2-NEXT:    retq +; +; AVX512-LABEL: ssubo_v2i128: +; AVX512:       # %bb.0: +; AVX512-NEXT:    pushq %r14 +; AVX512-NEXT:    pushq %rbx +; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT:    movq %rcx, %r14 +; AVX512-NEXT:    sbbq %r11, %r14 +; AVX512-NEXT:    setns %bl +; AVX512-NEXT:    testq %rcx, %rcx +; AVX512-NEXT:    setns %cl +; AVX512-NEXT:    cmpb %bl, %cl +; AVX512-NEXT:    setne %bl +; AVX512-NEXT:    testq %r11, %r11 +; AVX512-NEXT:    setns %al +; AVX512-NEXT:    cmpb %al, %cl +; AVX512-NEXT:    setne %al +; AVX512-NEXT:    andb %bl, %al +; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    testq %r9, %r9 +; AVX512-NEXT:    setns %al +; AVX512-NEXT:    testq %rsi, %rsi +; AVX512-NEXT:    setns %cl +; AVX512-NEXT:    cmpb %al, %cl +; AVX512-NEXT:    setne %al +; AVX512-NEXT:    subq %r8, %rdi +; AVX512-NEXT:    sbbq %r9, %rsi +; AVX512-NEXT:    setns %bl +; AVX512-NEXT:    cmpb %bl, %cl +; AVX512-NEXT:    setne %cl +; AVX512-NEXT:    andb %al, %cl +; AVX512-NEXT:    movb %cl, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT:    movq %rdx, 16(%r10) +; AVX512-NEXT:    movq %rdi, (%r10) +; AVX512-NEXT:    movq %r14, 24(%r10) +; AVX512-NEXT:    movq %rsi, 8(%r10) +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    popq %rbx +; AVX512-NEXT:    popq %r14 +; AVX512-NEXT:    retq +  %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) +  
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i128> %val, <2 x i128>* %p2 +  ret <2 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll new file mode 100644 index 00000000000..b040cd9916f --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -0,0 +1,1381 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; SSE-LABEL: uaddo_v1i32: +; SSE:       # %bb.0: +; SSE-NEXT:    addl %esi, %edi +; SSE-NEXT:    sbbl %eax, %eax +; SSE-NEXT:    movl %edi, (%rdx) +; SSE-NEXT:    retq +; +; AVX-LABEL: uaddo_v1i32: +; AVX:       # %bb.0: +; AVX-NEXT:    addl %esi, %edi +; AVX-NEXT:    sbbl %eax, %eax +; AVX-NEXT:    movl %edi, (%rdx) +; AVX-NEXT:    retq +  %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) +  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 +  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 +  %res = sext <1 x i1> %obit to <1 x i32> +  store <1 x i32> %val, <1 x i32>* %p2 +  ret <1 x i32> %res +} + +define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v2i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSE2-NEXT:    pand %xmm2, %xmm1 +; SSE2-NEXT:    pand %xmm2, %xmm0 +; SSE2-NEXT:    paddq %xmm1, %xmm0 +; SSE2-NEXT:    pand %xmm0, %xmm2 +; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT:    pand %xmm2, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT:    pxor %xmm3, 
%xmm1 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT:    movq %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm1, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v2i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSSE3-NEXT:    pand %xmm2, %xmm1 +; SSSE3-NEXT:    pand %xmm2, %xmm0 +; SSSE3-NEXT:    paddq %xmm1, %xmm0 +; SSSE3-NEXT:    pand %xmm0, %xmm2 +; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT:    pand %xmm2, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT:    pxor %xmm3, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT:    movq %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm1, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v2i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    pxor %xmm2, %xmm2 +; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT:    paddq %xmm1, %xmm0 +; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT:    pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm2, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT:    movq %xmm0, (%rdi) +; SSE41-NEXT:    movdqa %xmm1, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v2i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT:    vmovq %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v2i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT:    vmovq %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v2i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT:    vpmovqd %xmm0, (%rdi) +; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    retq +  %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) +  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i32> %val, <2 x i32>* %p2 +  ret <2 x i32> %res +} + +define <3 x i32> 
@uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v3i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    paddd %xmm0, %xmm1 +; SSE2-NEXT:    pxor %xmm2, %xmm0 +; SSE2-NEXT:    pxor %xmm1, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT:    movq %xmm1, (%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT:    movd %xmm1, 8(%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v3i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    paddd %xmm0, %xmm1 +; SSSE3-NEXT:    pxor %xmm2, %xmm0 +; SSSE3-NEXT:    pxor %xmm1, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT:    movq %xmm1, (%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT:    movd %xmm1, 8(%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v3i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    paddd %xmm0, %xmm1 +; SSE41-NEXT:    pmaxud %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT:    pxor %xmm2, %xmm0 +; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi) +; SSE41-NEXT:    movq %xmm1, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v3i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT:    vmovq %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v3i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT:    vmovq %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v3i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT:    vmovq %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) +  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 +  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 +  %res = sext <3 x i1> %obit to <3 x i32> +  store <3 x i32> %val, <3 x i32>* %p2 +  ret <3 x i32> %res +} + +define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v4i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    paddd %xmm0, %xmm1 +; SSE2-NEXT:    pxor %xmm2, %xmm0 +; SSE2-NEXT:    pxor %xmm1, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT:    movdqa %xmm1, (%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v4i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    paddd %xmm0, %xmm1 +; SSSE3-NEXT:    pxor %xmm2, %xmm0 +; SSSE3-NEXT:    pxor %xmm1, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT:    movdqa %xmm1, (%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v4i32: +; 
SSE41:       # %bb.0: +; SSE41-NEXT:    paddd %xmm0, %xmm1 +; SSE41-NEXT:    pmaxud %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT:    pxor %xmm2, %xmm0 +; SSE41-NEXT:    movdqa %xmm1, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v4i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v4i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v4i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) +  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i32> %val, <4 x i32>* %p2 +  ret <4 x i32> %res +} + +define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v6i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movq %rdi, %rax +; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT:    movd %r8d, %xmm1 +; SSE2-NEXT:    movd %ecx, %xmm2 +; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT:    movd %edx, %xmm3 +; SSE2-NEXT:    movd %esi, %xmm1 +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT:    movd %r9d, %xmm2 +; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT:    paddd %xmm1, %xmm0 +; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, (%rcx) +; SSE2-NEXT:    pxor %xmm4, %xmm0 +; SSE2-NEXT:    pxor %xmm4, %xmm1 +; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT:    paddd %xmm2, %xmm3 +; SSE2-NEXT:    movq %xmm3, 16(%rcx) +; SSE2-NEXT:    pxor %xmm4, %xmm3 +; SSE2-NEXT:    pxor %xmm4, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT:    movq %xmm2, 16(%rdi) +; SSE2-NEXT:    movdqa %xmm1, (%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v6i32: +; SSSE3:       # 
%bb.0: +; SSSE3-NEXT:    movq %rdi, %rax +; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT:    movd %r8d, %xmm1 +; SSSE3-NEXT:    movd %ecx, %xmm2 +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT:    movd %edx, %xmm3 +; SSSE3-NEXT:    movd %esi, %xmm1 +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT:    movd %r9d, %xmm2 +; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT:    paddd %xmm1, %xmm0 +; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm0, (%rcx) +; SSSE3-NEXT:    pxor %xmm4, %xmm0 +; SSSE3-NEXT:    pxor %xmm4, %xmm1 +; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT:    paddd %xmm2, %xmm3 +; SSSE3-NEXT:    movq %xmm3, 16(%rcx) +; SSSE3-NEXT:    pxor %xmm4, %xmm3 +; SSSE3-NEXT:    pxor %xmm4, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT:    movq %xmm2, 16(%rdi) +; SSSE3-NEXT:    movdqa %xmm1, (%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v6i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movq %rdi, %rax +; SSE41-NEXT:    movd %esi, %xmm0 +; SSE41-NEXT:    pinsrd $1, %edx, %xmm0 +; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT:    movd %r9d, %xmm2 +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 +; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT:    paddd %xmm0, %xmm3 +; SSE41-NEXT:    pmaxud %xmm3, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm0 +; SSE41-NEXT:    paddd %xmm2, %xmm1 +; SSE41-NEXT:    pmaxud %xmm1, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2 +; SSE41-NEXT:    pxor %xmm4, %xmm2 +; SSE41-NEXT:    movq %xmm1, 16(%rcx) +; SSE41-NEXT:    movdqa %xmm3, (%rcx) +; SSE41-NEXT:    movq %xmm2, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v6i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd 
%xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v6i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v6i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) +  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 +  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 +  %res = sext <6 x i1> %obit to <6 x i32> +  store <6 x i32> %val, <6 x i32>* %p2 +  ret <6 x i32> %res +} + +define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v8i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    paddd %xmm0, %xmm2 +; SSE2-NEXT:    pxor %xmm4, %xmm0 +; SSE2-NEXT:    movdqa %xmm2, (%rdi) +; SSE2-NEXT:    pxor %xmm4, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT:    paddd %xmm1, %xmm3 +; SSE2-NEXT:    pxor %xmm4, %xmm1 +; SSE2-NEXT:    pxor %xmm3, %xmm4 +; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT:    movdqa %xmm3, 16(%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v8i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    paddd %xmm0, %xmm2 +; SSSE3-NEXT:    pxor %xmm4, %xmm0 +; SSSE3-NEXT:    movdqa %xmm2, (%rdi) +; SSSE3-NEXT:    pxor %xmm4, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT:    paddd %xmm1, %xmm3 +; SSSE3-NEXT:    pxor %xmm4, %xmm1 +; SSSE3-NEXT:    pxor %xmm3, %xmm4 +; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v8i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    paddd %xmm0, %xmm2 +; SSE41-NEXT:    pmaxud %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT:    pxor %xmm4, %xmm0 +; SSE41-NEXT:    paddd %xmm1, %xmm3 +; SSE41-NEXT:    pmaxud %xmm3, %xmm1 +; SSE41-NEXT:    pcmpeqd %xmm3, %xmm1 +; SSE41-NEXT:    pxor %xmm4, %xmm1 +; SSE41-NEXT:    movdqa %xmm3, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm2, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v8i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpmaxud %xmm3, 
%xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vmovaps %ymm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v8i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v8i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) +  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 +  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i32> %val, <8 x i32>* %p2 +  ret <8 x i32> %res +} + +define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v16i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    paddd %xmm0, %xmm4 +; SSE2-NEXT:    pxor %xmm8, %xmm0 +; SSE2-NEXT:    movdqa %xmm4, (%rdi) +; SSE2-NEXT:    pxor %xmm8, %xmm4 +; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT:    paddd %xmm1, %xmm5 +; SSE2-NEXT:    pxor %xmm8, %xmm1 +; SSE2-NEXT:    movdqa %xmm5, 16(%rdi) +; SSE2-NEXT:    pxor %xmm8, %xmm5 +; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT:    paddd %xmm2, %xmm6 +; SSE2-NEXT:    pxor %xmm8, %xmm2 +; SSE2-NEXT:    movdqa %xmm6, 32(%rdi) +; SSE2-NEXT:    pxor %xmm8, %xmm6 +; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT:    paddd %xmm3, %xmm7 +; SSE2-NEXT:    pxor %xmm8, %xmm3 +; SSE2-NEXT:    pxor %xmm7, %xmm8 +; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT:    movdqa %xmm7, 48(%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v16i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    paddd %xmm0, %xmm4 +; SSSE3-NEXT:    pxor %xmm8, %xmm0 +; SSSE3-NEXT:    movdqa %xmm4, (%rdi) +; SSSE3-NEXT:    pxor %xmm8, %xmm4 +; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0 +; SSSE3-NEXT:    paddd %xmm1, %xmm5 +; SSSE3-NEXT:    pxor %xmm8, %xmm1 +; SSSE3-NEXT:    movdqa %xmm5, 16(%rdi) +; SSSE3-NEXT:    pxor %xmm8, %xmm5 +; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT:    paddd %xmm2, %xmm6 +; SSSE3-NEXT:    pxor %xmm8, %xmm2 +; SSSE3-NEXT:    movdqa %xmm6, 32(%rdi) +; SSSE3-NEXT:    pxor %xmm8, %xmm6 +; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT:    paddd %xmm3, %xmm7 +; SSSE3-NEXT:    pxor %xmm8, %xmm3 +; 
SSSE3-NEXT:    pxor %xmm7, %xmm8 +; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT:    movdqa %xmm7, 48(%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v16i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    paddd %xmm0, %xmm4 +; SSE41-NEXT:    pmaxud %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm8, %xmm8 +; SSE41-NEXT:    pxor %xmm8, %xmm0 +; SSE41-NEXT:    paddd %xmm1, %xmm5 +; SSE41-NEXT:    pmaxud %xmm5, %xmm1 +; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1 +; SSE41-NEXT:    pxor %xmm8, %xmm1 +; SSE41-NEXT:    paddd %xmm2, %xmm6 +; SSE41-NEXT:    pmaxud %xmm6, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT:    pxor %xmm8, %xmm2 +; SSE41-NEXT:    paddd %xmm3, %xmm7 +; SSE41-NEXT:    pmaxud %xmm7, %xmm3 +; SSE41-NEXT:    pcmpeqd %xmm7, %xmm3 +; SSE41-NEXT:    pxor %xmm8, %xmm3 +; SSE41-NEXT:    movdqa %xmm7, 48(%rdi) +; SSE41-NEXT:    movdqa %xmm6, 32(%rdi) +; SSE41-NEXT:    movdqa %xmm5, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm4, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v16i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT:    vpmaxud %xmm5, %xmm4, %xmm5 +; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5 +; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1 +; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT:    vpmaxud %xmm7, %xmm5, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7 +; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT:    vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm4 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm4 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT:    vmovaps %ymm3, 32(%rdi) +; AVX1-NEXT:    vmovaps %ymm2, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v16i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT:    vpmaxud %ymm1, %ymm3, %ymm1 +; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT:    vpmaxud %ymm0, %ymm2, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT:    vmovdqa %ymm2, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: 
uaddo_v16i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT:    vpcmpltud %zmm0, %zmm1, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) +  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i32> %val, <16 x i32>* %p2 +  ret <16 x i32> %res +} + +define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: uaddo_v16i8: +; SSE2:       # %bb.0: +; SSE2-NEXT:    paddb %xmm0, %xmm1 +; SSE2-NEXT:    pmaxub %xmm1, %xmm0 +; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT:    pxor %xmm0, %xmm3 +; SSE2-NEXT:    movdqa %xmm3, %xmm4 +; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT:    movdqa %xmm4, %xmm0 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT:    pslld $31, %xmm0 +; SSE2-NEXT:    psrad $31, %xmm0 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm4 +; SSE2-NEXT:    psrad $31, %xmm4 +; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT:    movdqa %xmm3, %xmm2 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm3 +; SSE2-NEXT:    psrad $31, %xmm3 +; SSE2-NEXT:    movdqa %xmm1, (%rdi) +; SSE2-NEXT:    movdqa %xmm4, %xmm1 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v16i8: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    paddb %xmm0, %xmm1 +; SSSE3-NEXT:    pmaxub %xmm1, %xmm0 +; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0 +; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT:    pxor %xmm0, %xmm3 +; SSSE3-NEXT:    movdqa %xmm3, %xmm4 +; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT:    movdqa %xmm4, %xmm0 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT:    pslld $31, %xmm0 +; SSSE3-NEXT:    psrad $31, %xmm0 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm4 +; SSSE3-NEXT:    psrad $31, %xmm4 +; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT:    movdqa %xmm3, %xmm2 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm3 +; SSSE3-NEXT:    psrad $31, %xmm3 +; SSSE3-NEXT:    movdqa %xmm1, 
(%rdi) +; SSSE3-NEXT:    movdqa %xmm4, %xmm1 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v16i8: +; SSE41:       # %bb.0: +; SSE41-NEXT:    paddb %xmm0, %xmm1 +; SSE41-NEXT:    pmaxub %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqb %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT:    pxor %xmm0, %xmm3 +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm0 +; SSE41-NEXT:    psrad $31, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm4 +; SSE41-NEXT:    psrad $31, %xmm4 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm3 +; SSE41-NEXT:    psrad $31, %xmm3 +; SSE41-NEXT:    movdqa %xmm1, (%rdi) +; SSE41-NEXT:    movdqa %xmm4, %xmm1 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v16i8: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT:    vpmaxub %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT:    vmovdqa %xmm2, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v16i8: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT:    vpmaxub %xmm0, %xmm2, %xmm0 +; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT:    
vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT:    vmovdqa %xmm2, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v16i8: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpltub %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) +  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i8> %val, <16 x i8>* %p2 +  ret <16 x i32> %res +} + +define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: uaddo_v8i16: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm0, %xmm2 +; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT:    paddw %xmm0, %xmm1 +; SSE2-NEXT:    pxor %xmm3, %xmm2 +; SSE2-NEXT:    pxor %xmm1, %xmm3 +; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT:    pslld $31, %xmm0 +; SSE2-NEXT:    psrad $31, %xmm0 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    movdqa %xmm1, (%rdi) +; SSE2-NEXT:    movdqa %xmm2, %xmm1 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v8i16: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm0, %xmm2 +; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT:    paddw %xmm0, %xmm1 +; SSSE3-NEXT:    pxor %xmm3, %xmm2 +; SSSE3-NEXT:    pxor %xmm1, %xmm3 +; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT:    pslld $31, %xmm0 +; SSSE3-NEXT:    psrad $31, %xmm0 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    movdqa %xmm1, (%rdi) +; SSSE3-NEXT:    movdqa %xmm2, %xmm1 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v8i16: +; SSE41:       # %bb.0: +; SSE41-NEXT:    paddw %xmm0, %xmm1 +; SSE41-NEXT:    pmaxuw %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT:    pxor %xmm0, %xmm2 +; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE41-NEXT:    pslld $31, %xmm0 +; SSE41-NEXT:    psrad $31, %xmm0 +; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    movdqa %xmm1, (%rdi) +; SSE41-NEXT:    movdqa %xmm2, %xmm1 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v8i16: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqw 
%xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v8i16: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v8i16: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpltuw %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) +  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 +  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i16> %val, <8 x i16>* %p2 +  ret <8 x i32> %res +} + +define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE-LABEL: uaddo_v2i64: +; SSE:       # %bb.0: +; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE-NEXT:    paddq %xmm0, %xmm1 +; SSE-NEXT:    pxor %xmm2, %xmm0 +; SSE-NEXT:    pxor %xmm1, %xmm2 +; SSE-NEXT:    movdqa %xmm0, %xmm3 +; SSE-NEXT:    pcmpgtd %xmm2, %xmm3 +; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT:    pand %xmm4, %xmm2 +; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE-NEXT:    por %xmm2, %xmm0 +; SSE-NEXT:    movdqa %xmm1, (%rdi) +; SSE-NEXT:    retq +; +; AVX1-LABEL: uaddo_v2i64: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v2i64: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v2i64: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpltuq %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    retq +  %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) +  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i64> %val, <2 x i64>* %p2 +  ret <2 x i32> %res +} + +define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; 
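+; v4i24 is promoted to v4i32: both inputs are masked down to 24 bits, added
+; as i32, and overflow is flagged when (res & 0xFFFFFF) != res. The <4 x i24>
+; store is then scattered as a movw + movb pair per lane. Illustrative scalar
+; form of the check (names are placeholders, not taken from the checks below):
+;   %m  = and i32 %sum, 16777215
+;   %ov = icmp ne i32 %m, %sum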
SSE2-LABEL: uaddo_v4i24: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm0, %xmm2 +; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE2-NEXT:    pand %xmm3, %xmm1 +; SSE2-NEXT:    pand %xmm3, %xmm2 +; SSE2-NEXT:    paddd %xmm1, %xmm2 +; SSE2-NEXT:    pand %xmm2, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT:    pxor %xmm3, %xmm0 +; SSE2-NEXT:    movd %xmm2, %eax +; SSE2-NEXT:    movw %ax, (%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT:    movd %xmm1, %ecx +; SSE2-NEXT:    movw %cx, 9(%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT:    movd %xmm1, %edx +; SSE2-NEXT:    movw %dx, 6(%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT:    movd %xmm1, %esi +; SSE2-NEXT:    movw %si, 3(%rdi) +; SSE2-NEXT:    shrl $16, %eax +; SSE2-NEXT:    movb %al, 2(%rdi) +; SSE2-NEXT:    shrl $16, %ecx +; SSE2-NEXT:    movb %cl, 11(%rdi) +; SSE2-NEXT:    shrl $16, %edx +; SSE2-NEXT:    movb %dl, 8(%rdi) +; SSE2-NEXT:    shrl $16, %esi +; SSE2-NEXT:    movb %sil, 5(%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v4i24: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm0, %xmm2 +; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSSE3-NEXT:    pand %xmm3, %xmm1 +; SSSE3-NEXT:    pand %xmm3, %xmm2 +; SSSE3-NEXT:    paddd %xmm1, %xmm2 +; SSSE3-NEXT:    pand %xmm2, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT:    pxor %xmm3, %xmm0 +; SSSE3-NEXT:    movd %xmm2, %eax +; SSSE3-NEXT:    movw %ax, (%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT:    movd %xmm1, %ecx +; SSSE3-NEXT:    movw %cx, 9(%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT:    movd %xmm1, %edx +; SSSE3-NEXT:    movw %dx, 6(%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT:    movd %xmm1, %esi +; SSSE3-NEXT:    movw %si, 3(%rdi) +; SSSE3-NEXT:    shrl $16, %eax +; SSSE3-NEXT:    movb %al, 2(%rdi) +; SSSE3-NEXT:    shrl $16, %ecx +; SSSE3-NEXT:    movb %cl, 11(%rdi) +; SSSE3-NEXT:    shrl $16, %edx +; SSSE3-NEXT:    movb %dl, 8(%rdi) +; SSSE3-NEXT:    shrl $16, %esi +; SSSE3-NEXT:    movb %sil, 5(%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v4i24: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE41-NEXT:    pand %xmm2, %xmm1 +; SSE41-NEXT:    pand %xmm2, %xmm0 +; SSE41-NEXT:    paddd %xmm1, %xmm0 +; SSE41-NEXT:    pand %xmm0, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm2, %xmm1 +; SSE41-NEXT:    pextrd $3, %xmm0, %eax +; SSE41-NEXT:    movw %ax, 9(%rdi) +; SSE41-NEXT:    pextrd $2, %xmm0, %ecx +; SSE41-NEXT:    movw %cx, 6(%rdi) +; SSE41-NEXT:    pextrd $1, %xmm0, %edx +; SSE41-NEXT:    movw %dx, 3(%rdi) +; SSE41-NEXT:    movd %xmm0, %esi +; SSE41-NEXT:    movw %si, (%rdi) +; SSE41-NEXT:    shrl $16, %eax +; SSE41-NEXT:    movb %al, 11(%rdi) +; SSE41-NEXT:    shrl $16, %ecx +; SSE41-NEXT:    movb %cl, 8(%rdi) +; SSE41-NEXT:    shrl $16, %edx +; SSE41-NEXT:    movb %dl, 5(%rdi) +; SSE41-NEXT:    shrl $16, %esi +; SSE41-NEXT:    movb %sil, 2(%rdi) +; SSE41-NEXT:    movdqa %xmm1, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v4i24: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = 
[2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38] +; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1 +; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpextrd $3, %xmm1, %eax +; AVX1-NEXT:    movw %ax, 9(%rdi) +; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX1-NEXT:    movw %cx, 6(%rdi) +; AVX1-NEXT:    vpextrd $1, %xmm1, %edx +; AVX1-NEXT:    movw %dx, 3(%rdi) +; AVX1-NEXT:    vmovd %xmm1, %esi +; AVX1-NEXT:    movw %si, (%rdi) +; AVX1-NEXT:    shrl $16, %eax +; AVX1-NEXT:    movb %al, 11(%rdi) +; AVX1-NEXT:    shrl $16, %ecx +; AVX1-NEXT:    movb %cl, 8(%rdi) +; AVX1-NEXT:    shrl $16, %edx +; AVX1-NEXT:    movb %dl, 5(%rdi) +; AVX1-NEXT:    shrl $16, %esi +; AVX1-NEXT:    movb %sil, 2(%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v4i24: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpextrd $3, %xmm1, %eax +; AVX2-NEXT:    movw %ax, 9(%rdi) +; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX2-NEXT:    movw %cx, 6(%rdi) +; AVX2-NEXT:    vpextrd $1, %xmm1, %edx +; AVX2-NEXT:    movw %dx, 3(%rdi) +; AVX2-NEXT:    vmovd %xmm1, %esi +; AVX2-NEXT:    movw %si, (%rdi) +; AVX2-NEXT:    shrl $16, %eax +; AVX2-NEXT:    movb %al, 11(%rdi) +; AVX2-NEXT:    shrl $16, %ecx +; AVX2-NEXT:    movb %cl, 8(%rdi) +; AVX2-NEXT:    shrl $16, %edx +; AVX2-NEXT:    movb %dl, 5(%rdi) +; AVX2-NEXT:    shrl $16, %esi +; AVX2-NEXT:    movb %sil, 2(%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v4i24: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0 +; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vpextrd $3, %xmm1, %eax +; AVX512-NEXT:    movw %ax, 9(%rdi) +; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx +; AVX512-NEXT:    movw %cx, 6(%rdi) +; AVX512-NEXT:    vpextrd $1, %xmm1, %edx +; AVX512-NEXT:    movw %dx, 3(%rdi) +; AVX512-NEXT:    vmovd %xmm1, %esi +; AVX512-NEXT:    movw %si, (%rdi) +; AVX512-NEXT:    shrl $16, %eax +; AVX512-NEXT:    movb %al, 11(%rdi) +; AVX512-NEXT:    shrl $16, %ecx +; AVX512-NEXT:    movb %cl, 8(%rdi) +; AVX512-NEXT:    shrl $16, %edx +; AVX512-NEXT:    movb %dl, 5(%rdi) +; AVX512-NEXT:    shrl $16, %esi +; AVX512-NEXT:    movb %sil, 2(%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) +  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i24> %val, <4 x i24>* %p2 +  ret <4 x i32> %res +} + +define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: uaddo_v4i1: +; SSE:       # %bb.0: +; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE-NEXT:   
 pand %xmm2, %xmm1 +; SSE-NEXT:    pand %xmm2, %xmm0 +; SSE-NEXT:    paddd %xmm1, %xmm0 +; SSE-NEXT:    pand %xmm0, %xmm2 +; SSE-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSE-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE-NEXT:    pxor %xmm2, %xmm1 +; SSE-NEXT:    pslld $31, %xmm0 +; SSE-NEXT:    movmskps %xmm0, %eax +; SSE-NEXT:    movb %al, (%rdi) +; SSE-NEXT:    movdqa %xmm1, %xmm0 +; SSE-NEXT:    retq +; +; AVX1-LABEL: uaddo_v4i1: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1] +; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vmovmskps %xmm1, %eax +; AVX1-NEXT:    movb %al, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v4i1: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT:    vmovmskps %xmm1, %eax +; AVX2-NEXT:    movb %al, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v4i1: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1 +; AVX512-NEXT:    kxorw %k1, %k0, %k2 +; AVX512-NEXT:    kxnorw %k1, %k0, %k1 +; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1 {%k1} +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    kmovd %k2, %eax +; AVX512-NEXT:    movb %al, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) +  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i1> %val, <4 x i1>* %p2 +  ret <4 x i32> %res +} + +define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; SSE2-LABEL: uaddo_v2i128: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT:    setb %al +; SSE2-NEXT:    movzbl %al, %r11d +; SSE2-NEXT:    addq %r8, %rdi +; SSE2-NEXT:    adcq %r9, %rsi +; SSE2-NEXT:    setb %al +; SSE2-NEXT:    movzbl %al, %eax +; SSE2-NEXT:    movd %eax, %xmm0 +; SSE2-NEXT:    pinsrw $4, %r11d, %xmm0 +; SSE2-NEXT:    movq %rdx, 16(%r10) +; SSE2-NEXT:    movq %rdi, (%r10) +; SSE2-NEXT:    movq %rcx, 24(%r10) +; SSE2-NEXT:    movq %rsi, 8(%r10) +; SSE2-NEXT:    psllq $63, %xmm0 +; SSE2-NEXT:    psrad $31, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: uaddo_v2i128: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT:    setb %al +; SSSE3-NEXT:    movzbl %al, %r11d +; SSSE3-NEXT:    addq %r8, %rdi +; SSSE3-NEXT:    adcq %r9, %rsi +; SSSE3-NEXT:    setb %al +; 
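+; <2 x i128> has no vector support and is expanded to scalar arithmetic:
+; each lane becomes an addq/adcq pair, and the overflow bit is simply the
+; final carry flag of the 128-bit add, captured with setb.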
SSSE3-NEXT:    movzbl %al, %eax +; SSSE3-NEXT:    movd %eax, %xmm0 +; SSSE3-NEXT:    pinsrw $4, %r11d, %xmm0 +; SSSE3-NEXT:    movq %rdx, 16(%r10) +; SSSE3-NEXT:    movq %rdi, (%r10) +; SSSE3-NEXT:    movq %rcx, 24(%r10) +; SSSE3-NEXT:    movq %rsi, 8(%r10) +; SSSE3-NEXT:    psllq $63, %xmm0 +; SSSE3-NEXT:    psrad $31, %xmm0 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: uaddo_v2i128: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT:    setb %al +; SSE41-NEXT:    movzbl %al, %r11d +; SSE41-NEXT:    addq %r8, %rdi +; SSE41-NEXT:    adcq %r9, %rsi +; SSE41-NEXT:    setb %al +; SSE41-NEXT:    movzbl %al, %eax +; SSE41-NEXT:    movd %eax, %xmm0 +; SSE41-NEXT:    pinsrb $8, %r11d, %xmm0 +; SSE41-NEXT:    movq %rdx, 16(%r10) +; SSE41-NEXT:    movq %rdi, (%r10) +; SSE41-NEXT:    movq %rcx, 24(%r10) +; SSE41-NEXT:    movq %rsi, 8(%r10) +; SSE41-NEXT:    psllq $63, %xmm0 +; SSE41-NEXT:    psrad $31, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT:    retq +; +; AVX1-LABEL: uaddo_v2i128: +; AVX1:       # %bb.0: +; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT:    setb %al +; AVX1-NEXT:    movzbl %al, %r11d +; AVX1-NEXT:    addq %r8, %rdi +; AVX1-NEXT:    adcq %r9, %rsi +; AVX1-NEXT:    setb %al +; AVX1-NEXT:    movzbl %al, %eax +; AVX1-NEXT:    vmovd %eax, %xmm0 +; AVX1-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT:    movq %rdx, 16(%r10) +; AVX1-NEXT:    movq %rdi, (%r10) +; AVX1-NEXT:    movq %rcx, 24(%r10) +; AVX1-NEXT:    movq %rsi, 8(%r10) +; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    retq +; +; AVX2-LABEL: uaddo_v2i128: +; AVX2:       # %bb.0: +; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT:    setb %al +; AVX2-NEXT:    movzbl %al, %r11d +; AVX2-NEXT:    addq %r8, %rdi +; AVX2-NEXT:    adcq %r9, %rsi +; AVX2-NEXT:    setb %al +; AVX2-NEXT:    movzbl %al, %eax +; AVX2-NEXT:    vmovd %eax, %xmm0 +; AVX2-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT:    movq %rdx, 16(%r10) +; AVX2-NEXT:    movq %rdi, (%r10) +; AVX2-NEXT:    movq %rcx, 24(%r10) +; AVX2-NEXT:    movq %rsi, 8(%r10) +; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    retq +; +; AVX512-LABEL: uaddo_v2i128: +; AVX512:       # %bb.0: +; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT:    setb %al +; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    addq %r8, %rdi +; AVX512-NEXT:    adcq %r9, %rsi +; AVX512-NEXT:    setb %al +; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT:    movq %rdx, 16(%r10) +; AVX512-NEXT:    movq %rdi, (%r10) +; AVX512-NEXT:    movq %rcx, 24(%r10) +; AVX512-NEXT:    movq %rsi, 8(%r10) +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    retq +  %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) +  %val = extractvalue {<2 
x i128>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i128> %val, <2 x i128>* %p2 +  ret <2 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll new file mode 100644 index 00000000000..10de326c356 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -0,0 +1,1422 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @usubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; SSE-LABEL: usubo_v1i32: +; SSE:       # %bb.0: +; SSE-NEXT:    subl %esi, %edi +; SSE-NEXT:    sbbl %eax, %eax +; SSE-NEXT:    movl %edi, (%rdx) +; SSE-NEXT:    retq +; +; AVX-LABEL: usubo_v1i32: +; AVX:       # %bb.0: +; AVX-NEXT:    subl %esi, %edi +; AVX-NEXT:    sbbl %eax, %eax +; AVX-NEXT:    movl %edi, (%rdx) +; AVX-NEXT:    retq +  %t = call {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) +  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 +  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 +  %res = sext <1 x i1> %obit to <1 x i32> +  store <1 x i32> %val, <1 x i32>* %p2 +  ret <1 x i32> %res +} + +define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v2i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSE2-NEXT:    pand %xmm2, %xmm1 +; SSE2-NEXT:    pand %xmm2, %xmm0 +; SSE2-NEXT:    psubq %xmm1, %xmm0 +; SSE2-NEXT:    pand %xmm0, %xmm2 +; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT:    pand %xmm2, %xmm3 +; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT:    pxor %xmm3, %xmm1 +; SSE2-NEXT:    
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT:    movq %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm1, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v2i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSSE3-NEXT:    pand %xmm2, %xmm1 +; SSSE3-NEXT:    pand %xmm2, %xmm0 +; SSSE3-NEXT:    psubq %xmm1, %xmm0 +; SSSE3-NEXT:    pand %xmm0, %xmm2 +; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT:    pand %xmm2, %xmm3 +; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT:    pxor %xmm3, %xmm1 +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT:    movq %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm1, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v2i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    pxor %xmm2, %xmm2 +; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT:    psubq %xmm1, %xmm0 +; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT:    pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm2, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT:    movq %xmm0, (%rdi) +; SSE41-NEXT:    movdqa %xmm1, %xmm0 +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v2i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT:    vmovq %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v2i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT:    vmovq %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v2i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT:    vpmovqd %xmm0, (%rdi) +; AVX512-NEXT:    vpcmpeqq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    retq +  %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) +  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 +  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 +  %res = sext <2 x i1> %obit to <2 x i32> +  store <2 x i32> %val, <2 x i32>* %p2 +  ret <2 x i32> %res +} + +define <3 x i32> @usubo_v3i32(<3 x i32> %a0, 
<3 x i32> %a1, <3 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v3i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, %xmm3 +; SSE2-NEXT:    pxor %xmm2, %xmm3 +; SSE2-NEXT:    psubd %xmm1, %xmm0 +; SSE2-NEXT:    pxor %xmm0, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT:    movq %xmm0, (%rdi) +; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT:    movd %xmm0, 8(%rdi) +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v3i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm0, %xmm3 +; SSSE3-NEXT:    pxor %xmm2, %xmm3 +; SSSE3-NEXT:    psubd %xmm1, %xmm0 +; SSSE3-NEXT:    pxor %xmm0, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT:    movq %xmm0, (%rdi) +; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT:    movd %xmm0, 8(%rdi) +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v3i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm2 +; SSE41-NEXT:    psubd %xmm1, %xmm2 +; SSE41-NEXT:    pminud %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm1, %xmm0 +; SSE41-NEXT:    pextrd $2, %xmm2, 8(%rdi) +; SSE41-NEXT:    movq %xmm2, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v3i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT:    vmovq %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v3i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT:    vmovq %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v3i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT:    vmovq %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) +  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 +  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 +  %res = sext <3 x i1> %obit to <3 x i32> +  store <3 x i32> %val, <3 x i32>* %p2 +  ret <3 x i32> %res +} + +define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v4i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, %xmm3 +; SSE2-NEXT:    pxor %xmm2, %xmm3 +; SSE2-NEXT:    psubd %xmm1, %xmm0 +; SSE2-NEXT:    pxor %xmm0, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm2, %xmm0 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v4i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:  
  movdqa %xmm0, %xmm3 +; SSSE3-NEXT:    pxor %xmm2, %xmm3 +; SSSE3-NEXT:    psubd %xmm1, %xmm0 +; SSSE3-NEXT:    pxor %xmm0, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    movdqa %xmm2, %xmm0 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v4i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm2 +; SSE41-NEXT:    psubd %xmm1, %xmm2 +; SSE41-NEXT:    pminud %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT:    pxor %xmm1, %xmm0 +; SSE41-NEXT:    movdqa %xmm2, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v4i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v4i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v4i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) +  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 +  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 +  %res = sext <4 x i1> %obit to <4 x i32> +  store <4 x i32> %val, <4 x i32>* %p2 +  ret <4 x i32> %res +} + +define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v6i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movq %rdi, %rax +; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT:    movd %r8d, %xmm0 +; SSE2-NEXT:    movd %ecx, %xmm1 +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT:    movd %edx, %xmm3 +; SSE2-NEXT:    movd %esi, %xmm0 +; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT:    movd %r9d, %xmm1 +; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT:    movdqa %xmm0, %xmm4 +; SSE2-NEXT:    psubd %xmm2, %xmm4 +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm4, (%rcx) +; SSE2-NEXT:    pxor %xmm2, %xmm4 +; SSE2-NEXT:    pxor 
%xmm2, %xmm0 +; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT:    movdqa %xmm1, %xmm0 +; SSE2-NEXT:    psubd %xmm3, %xmm0 +; SSE2-NEXT:    movq %xmm0, 16(%rcx) +; SSE2-NEXT:    pxor %xmm2, %xmm0 +; SSE2-NEXT:    pxor %xmm2, %xmm1 +; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT:    movq %xmm0, 16(%rdi) +; SSE2-NEXT:    movdqa %xmm4, (%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v6i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movq %rdi, %rax +; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT:    movd %r8d, %xmm0 +; SSSE3-NEXT:    movd %ecx, %xmm1 +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT:    movd %edx, %xmm3 +; SSSE3-NEXT:    movd %esi, %xmm0 +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT:    movd %r9d, %xmm1 +; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT:    movdqa %xmm0, %xmm4 +; SSSE3-NEXT:    psubd %xmm2, %xmm4 +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm4, (%rcx) +; SSSE3-NEXT:    pxor %xmm2, %xmm4 +; SSSE3-NEXT:    pxor %xmm2, %xmm0 +; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT:    movdqa %xmm1, %xmm0 +; SSSE3-NEXT:    psubd %xmm3, %xmm0 +; SSSE3-NEXT:    movq %xmm0, 16(%rcx) +; SSSE3-NEXT:    pxor %xmm2, %xmm0 +; SSSE3-NEXT:    pxor %xmm2, %xmm1 +; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0 +; SSSE3-NEXT:    movq %xmm0, 16(%rdi) +; SSSE3-NEXT:    movdqa %xmm4, (%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v6i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movq %rdi, %rax +; SSE41-NEXT:    movd %esi, %xmm0 +; SSE41-NEXT:    pinsrd $1, %edx, %xmm0 +; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT:    pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT:    movd %r9d, %xmm2 +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 +; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT:    movdqa %xmm0, %xmm4 +; SSE41-NEXT:    psubd %xmm3, %xmm4 +; SSE41-NEXT:    pminud %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT:    pxor %xmm3, %xmm0 +; SSE41-NEXT:    movdqa %xmm2, %xmm5 +; SSE41-NEXT:    psubd %xmm1, %xmm5 +; SSE41-NEXT:    pminud %xmm5, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT:    pxor %xmm3, %xmm2 +; SSE41-NEXT:    movq %xmm5, 16(%rcx) +; SSE41-NEXT:    movdqa %xmm4, (%rcx) +; SSE41-NEXT:    
movq %xmm2, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm0, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v6i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX1-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v6i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX2-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v6i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT:    vmovq %xmm2, 16(%rdi) +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) +  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 +  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 +  %res = sext <6 x i1> %obit to <6 x i32> +  store <6 x i32> %val, <6 x i32>* %p2 +  ret <6 x i32> %res +} + +define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v8i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, %xmm5 +; SSE2-NEXT:    pxor %xmm4, %xmm5 +; SSE2-NEXT:    psubd %xmm2, %xmm0 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    pxor %xmm4, %xmm0 +; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT:    movdqa %xmm1, %xmm2 +; SSE2-NEXT:    pxor %xmm4, %xmm2 +; SSE2-NEXT:    psubd %xmm3, %xmm1 +; SSE2-NEXT:    pxor %xmm1, %xmm4 +; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT:    movdqa %xmm1, 16(%rdi) +; SSE2-NEXT:    movdqa %xmm4, %xmm1 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v8i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm0, %xmm5 +; SSSE3-NEXT:    pxor %xmm4, %xmm5 +; SSSE3-NEXT:    psubd %xmm2, %xmm0 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    pxor %xmm4, %xmm0 +; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT:    movdqa %xmm1, %xmm2 +; SSSE3-NEXT:    pxor %xmm4, %xmm2 +; SSSE3-NEXT:    psubd %xmm3, %xmm1 +; SSSE3-NEXT:    pxor %xmm1, %xmm4 +; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT:    movdqa 
%xmm1, 16(%rdi) +; SSSE3-NEXT:    movdqa %xmm4, %xmm1 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v8i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm4 +; SSE41-NEXT:    psubd %xmm2, %xmm4 +; SSE41-NEXT:    pminud %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT:    pxor %xmm2, %xmm0 +; SSE41-NEXT:    movdqa %xmm1, %xmm5 +; SSE41-NEXT:    psubd %xmm3, %xmm5 +; SSE41-NEXT:    pminud %xmm5, %xmm1 +; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1 +; SSE41-NEXT:    pxor %xmm2, %xmm1 +; SSE41-NEXT:    movdqa %xmm5, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm4, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v8i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vmovaps %ymm1, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v8i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v8i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1 +; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %ymm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) +  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 +  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 +  %res = sext <8 x i1> %obit to <8 x i32> +  store <8 x i32> %val, <8 x i32>* %p2 +  ret <8 x i32> %res +} + +define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v16i32: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT:    movdqa %xmm0, %xmm9 +; SSE2-NEXT:    pxor %xmm8, %xmm9 +; SSE2-NEXT:    psubd %xmm4, %xmm0 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    pxor %xmm8, %xmm0 +; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT:    movdqa %xmm1, %xmm4 +; SSE2-NEXT:    pxor %xmm8, %xmm4 +; SSE2-NEXT:    psubd %xmm5, %xmm1 +; SSE2-NEXT:    movdqa %xmm1, 16(%rdi) +; SSE2-NEXT:    pxor %xmm8, %xmm1 +; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT:    movdqa %xmm2, %xmm4 +; SSE2-NEXT:    pxor %xmm8, %xmm4 +; SSE2-NEXT:    psubd %xmm6, %xmm2 +; SSE2-NEXT:    movdqa %xmm2, 32(%rdi) +; SSE2-NEXT:    pxor %xmm8, %xmm2 +; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2 +; 
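+; Pre-SSE4.1 targets lack unsigned vector compares, so x <u y is computed by
+; flipping the sign bits and comparing signed:
+;   x <u y  <=>  (x ^ 0x80000000) <s (y ^ 0x80000000)
+; Unsigned subtraction overflows exactly when the result exceeds the original
+; minuend, hence the pxor / pcmpgtd pairs in the SSE2/SSSE3 checks.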
SSE2-NEXT:    movdqa %xmm3, %xmm4 +; SSE2-NEXT:    pxor %xmm8, %xmm4 +; SSE2-NEXT:    psubd %xmm7, %xmm3 +; SSE2-NEXT:    pxor %xmm3, %xmm8 +; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT:    movdqa %xmm3, 48(%rdi) +; SSE2-NEXT:    movdqa %xmm8, %xmm3 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v16i32: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT:    movdqa %xmm0, %xmm9 +; SSSE3-NEXT:    pxor %xmm8, %xmm9 +; SSSE3-NEXT:    psubd %xmm4, %xmm0 +; SSSE3-NEXT:    movdqa %xmm0, (%rdi) +; SSSE3-NEXT:    pxor %xmm8, %xmm0 +; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT:    movdqa %xmm1, %xmm4 +; SSSE3-NEXT:    pxor %xmm8, %xmm4 +; SSSE3-NEXT:    psubd %xmm5, %xmm1 +; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi) +; SSSE3-NEXT:    pxor %xmm8, %xmm1 +; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT:    movdqa %xmm2, %xmm4 +; SSSE3-NEXT:    pxor %xmm8, %xmm4 +; SSSE3-NEXT:    psubd %xmm6, %xmm2 +; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi) +; SSSE3-NEXT:    pxor %xmm8, %xmm2 +; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT:    movdqa %xmm3, %xmm4 +; SSSE3-NEXT:    pxor %xmm8, %xmm4 +; SSSE3-NEXT:    psubd %xmm7, %xmm3 +; SSSE3-NEXT:    pxor %xmm3, %xmm8 +; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi) +; SSSE3-NEXT:    movdqa %xmm8, %xmm3 +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v16i32: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm8 +; SSE41-NEXT:    psubd %xmm4, %xmm8 +; SSE41-NEXT:    pminud %xmm8, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm9, %xmm9 +; SSE41-NEXT:    pxor %xmm9, %xmm0 +; SSE41-NEXT:    movdqa %xmm1, %xmm4 +; SSE41-NEXT:    psubd %xmm5, %xmm4 +; SSE41-NEXT:    pminud %xmm4, %xmm1 +; SSE41-NEXT:    pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT:    pxor %xmm9, %xmm1 +; SSE41-NEXT:    movdqa %xmm2, %xmm5 +; SSE41-NEXT:    psubd %xmm6, %xmm5 +; SSE41-NEXT:    pminud %xmm5, %xmm2 +; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT:    pxor %xmm9, %xmm2 +; SSE41-NEXT:    movdqa %xmm3, %xmm6 +; SSE41-NEXT:    psubd %xmm7, %xmm6 +; SSE41-NEXT:    pminud %xmm6, %xmm3 +; SSE41-NEXT:    pcmpeqd %xmm6, %xmm3 +; SSE41-NEXT:    pxor %xmm9, %xmm3 +; SSE41-NEXT:    movdqa %xmm6, 48(%rdi) +; SSE41-NEXT:    movdqa %xmm5, 32(%rdi) +; SSE41-NEXT:    movdqa %xmm4, 16(%rdi) +; SSE41-NEXT:    movdqa %xmm8, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v16i32: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT:    vpminud %xmm5, %xmm4, %xmm5 +; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5 +; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1 +; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT:    vpsubd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT:    vpminud %xmm7, %xmm5, %xmm7 +; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm5, %xmm7 +; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7 +; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT:    vpminud %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT:    vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, 
%xmm5, %ymm2, %ymm2 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm4 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm4 +; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT:    vmovaps %ymm3, 32(%rdi) +; AVX1-NEXT:    vmovaps %ymm2, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v16i32: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT:    vpminud %ymm1, %ymm3, %ymm1 +; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1 +; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT:    vpminud %ymm0, %ymm2, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT:    vmovdqa %ymm2, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v16i32: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT:    vpcmpnleud %zmm0, %zmm1, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) +  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i32> %val, <16 x i32>* %p2 +  ret <16 x i32> %res +} + +define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: usubo_v16i8: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa %xmm0, %xmm4 +; SSE2-NEXT:    psubb %xmm1, %xmm4 +; SSE2-NEXT:    pminub %xmm4, %xmm0 +; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT:    pxor %xmm0, %xmm3 +; SSE2-NEXT:    movdqa %xmm3, %xmm1 +; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    movdqa %xmm1, %xmm0 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT:    pslld $31, %xmm0 +; SSE2-NEXT:    psrad $31, %xmm0 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm1 +; SSE2-NEXT:    psrad $31, %xmm1 +; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT:    movdqa %xmm3, %xmm2 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm3 +; SSE2-NEXT:    psrad $31, %xmm3 +; SSE2-NEXT:    movdqa %xmm4, (%rdi) +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: 
usubo_v16i8: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa %xmm0, %xmm4 +; SSSE3-NEXT:    psubb %xmm1, %xmm4 +; SSSE3-NEXT:    pminub %xmm4, %xmm0 +; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0 +; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT:    pxor %xmm0, %xmm3 +; SSSE3-NEXT:    movdqa %xmm3, %xmm1 +; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    movdqa %xmm1, %xmm0 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT:    pslld $31, %xmm0 +; SSSE3-NEXT:    psrad $31, %xmm0 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm1 +; SSSE3-NEXT:    psrad $31, %xmm1 +; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT:    movdqa %xmm3, %xmm2 +; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT:    pslld $31, %xmm2 +; SSSE3-NEXT:    psrad $31, %xmm2 +; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT:    pslld $31, %xmm3 +; SSSE3-NEXT:    psrad $31, %xmm3 +; SSSE3-NEXT:    movdqa %xmm4, (%rdi) +; SSSE3-NEXT:    retq +; +; SSE41-LABEL: usubo_v16i8: +; SSE41:       # %bb.0: +; SSE41-NEXT:    movdqa %xmm0, %xmm4 +; SSE41-NEXT:    psubb %xmm1, %xmm4 +; SSE41-NEXT:    pminub %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0 +; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT:    pxor %xmm0, %xmm3 +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm0 +; SSE41-NEXT:    psrad $31, %xmm0 +; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm1 +; SSE41-NEXT:    psrad $31, %xmm1 +; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm2 +; SSE41-NEXT:    psrad $31, %xmm2 +; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT:    pslld $31, %xmm3 +; SSE41-NEXT:    psrad $31, %xmm3 +; SSE41-NEXT:    movdqa %xmm4, (%rdi) +; SSE41-NEXT:    retq +; +; AVX1-LABEL: usubo_v16i8: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT:    vpminub %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0 
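+; The <16 x i1> overflow mask is widened to <16 x i32> by unpacking bytes to
+; dwords and sign-filling each lane with a shift pair:
+;   pslld $31    ; move the mask bit into the sign bit
+;   psrad $31    ; arithmetic right shift replicates it across the lane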
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT:    vmovdqa %xmm2, (%rdi) +; AVX1-NEXT:    retq +; +; AVX2-LABEL: usubo_v16i8: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT:    vpminub %xmm0, %xmm2, %xmm0 +; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1 +; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT:    vmovdqa %xmm2, (%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: usubo_v16i8: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT:    vpcmpnleub %xmm0, %xmm1, %k1 +; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT:    vmovdqa %xmm1, (%rdi) +; AVX512-NEXT:    retq +  %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) +  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 +  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 +  %res = sext <16 x i1> %obit to <16 x i32> +  store <16 x i8> %val, <16 x i8>* %p2 +  ret <16 x i32> %res +} + +define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: usubo_v8i16: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT:    movdqa %xmm0, %xmm3 +; SSE2-NEXT:    pxor %xmm2, %xmm3 +; SSE2-NEXT:    psubw %xmm1, %xmm0 +; SSE2-NEXT:    pxor %xmm0, %xmm2 +; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT:    movdqa %xmm2, %xmm1 +; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT:    pslld $31, %xmm1 +; SSE2-NEXT:    psrad $31, %xmm1 +; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT:    pslld $31, %xmm2 +; SSE2-NEXT:    psrad $31, %xmm2 +; SSE2-NEXT:    movdqa %xmm0, (%rdi) +; SSE2-NEXT:    movdqa %xmm1, %xmm0 +; SSE2-NEXT:    movdqa %xmm2, %xmm1 +; SSE2-NEXT:    retq +; +; SSSE3-LABEL: usubo_v8i16: +; SSSE3:       # %bb.0: +; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT:    
movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    psubw %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    pslld $31, %xmm2
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psubw %xmm1, %xmm2
+; SSE41-NEXT:    pminuw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleuw %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
+  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
+  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
+  %res = sext <8 x i1> %obit to <8 x i32>
+  store <8 x i16> %val, <8 x i16>* %p2
+  ret <8 x i32> %res
+}
+
+define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
+; SSE-LABEL: usubo_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pxor %xmm2, %xmm3
+; SSE-NEXT:    psubq %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpcmpnleuq %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i64> %val, <2 x i64>* %p2
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
+; SSE2-LABEL: usubo_v4i24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %ecx
+; SSE2-NEXT:    movw %cx, 9(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %edx
+; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE2-NEXT:    movd %xmm1, %esi
+; SSE2-NEXT:    movw %si, 3(%rdi)
+; SSE2-NEXT:    shrl $16, %eax
+; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    shrl $16, %ecx
+; SSE2-NEXT:    movb %cl, 11(%rdi)
+; SSE2-NEXT:    shrl $16, %edx
+; SSE2-NEXT:    movb %dl, 8(%rdi)
+; SSE2-NEXT:    shrl $16, %esi
+; SSE2-NEXT:    movb %sil, 5(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v4i24:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm2
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm0
+; SSSE3-NEXT:    movd %xmm2, %eax
+; SSSE3-NEXT:    movw %ax, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %ecx
+; SSSE3-NEXT:    movw %cx, 9(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %edx
+; SSSE3-NEXT:    movw %dx, 6(%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSSE3-NEXT:    movd %xmm1, %esi
+; SSSE3-NEXT:    movw %si, 3(%rdi)
+; SSSE3-NEXT:    shrl $16, %eax
+; SSSE3-NEXT:    movb %al, 2(%rdi)
+; SSSE3-NEXT:    shrl $16, %ecx
+; SSSE3-NEXT:    movb %cl, 11(%rdi)
+; SSSE3-NEXT:    shrl $16, %edx
+; SSSE3-NEXT:    movb %dl, 8(%rdi)
+; SSSE3-NEXT:    shrl $16, %esi
+; SSSE3-NEXT:    movb %sil, 5(%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v4i24:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT:    pand %xmm2, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pextrd $3, %xmm0, %eax
+; SSE41-NEXT:    movw %ax, 9(%rdi)
+; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
+; SSE41-NEXT:    movw %cx, 6(%rdi)
+; SSE41-NEXT:    pextrd $1, %xmm0, %edx
+; SSE41-NEXT:    movw %dx, 3(%rdi)
+; SSE41-NEXT:    movd %xmm0, %esi
+; SSE41-NEXT:    movw %si, (%rdi)
+; SSE41-NEXT:    shrl $16, %eax
+; SSE41-NEXT:    movb %al, 11(%rdi)
+; SSE41-NEXT:    shrl $16, %ecx
+; SSE41-NEXT:    movb %cl, 8(%rdi)
+; SSE41-NEXT:    shrl $16, %edx
+; SSE41-NEXT:    movb %dl, 5(%rdi)
+; SSE41-NEXT:    shrl $16, %esi
+; SSE41-NEXT:    movb %sil, 2(%rdi)
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v4i24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38]
+; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX1-NEXT:    movw %ax, 9(%rdi)
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    movw %cx, 6(%rdi)
+; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX1-NEXT:    movw %dx, 3(%rdi)
+; AVX1-NEXT:    vmovd %xmm1, %esi
+; AVX1-NEXT:    movw %si, (%rdi)
+; AVX1-NEXT:    shrl $16, %eax
+; AVX1-NEXT:    movb %al, 11(%rdi)
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    movb %cl, 8(%rdi)
+; AVX1-NEXT:    shrl $16, %edx
+; AVX1-NEXT:    movb %dl, 5(%rdi)
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movb %sil, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v4i24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX2-NEXT:    movw %ax, 9(%rdi)
+; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT:    movw %cx, 6(%rdi)
+; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX2-NEXT:    movw %dx, 3(%rdi)
+; AVX2-NEXT:    vmovd %xmm1, %esi
+; AVX2-NEXT:    movw %si, (%rdi)
+; AVX2-NEXT:    shrl $16, %eax
+; AVX2-NEXT:    movb %al, 11(%rdi)
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    movb %cl, 8(%rdi)
+; AVX2-NEXT:    shrl $16, %edx
+; AVX2-NEXT:    movb %dl, 5(%rdi)
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movb %sil, 2(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v4i24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
+; AVX512-NEXT:    movw %ax, 9(%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX512-NEXT:    movw %cx, 6(%rdi)
+; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
+; AVX512-NEXT:    movw %dx, 3(%rdi)
+; AVX512-NEXT:    vmovd %xmm1, %esi
+; AVX512-NEXT:    movw %si, (%rdi)
+; AVX512-NEXT:    shrl $16, %eax
+; AVX512-NEXT:    movb %al, 11(%rdi)
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    movb %cl, 8(%rdi)
+; AVX512-NEXT:    shrl $16, %edx
+; AVX512-NEXT:    movb %dl, 5(%rdi)
+; AVX512-NEXT:    shrl $16, %esi
+; AVX512-NEXT:    movb %sil, 2(%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
+  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i24> %val, <4 x i24>* %p2
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
+; SSE-LABEL: usubo_v4i1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pslld $31, %xmm0
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v4i1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskps %xmm1, %eax
+; AVX1-NEXT:    movb %al, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v4i1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskps %xmm1, %eax
+; AVX2-NEXT:    movb %al, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v4i1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k1
+; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+; AVX512-NEXT:    kmovd %k1, %eax
+; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    retq
+  %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
+  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
+  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
+  %res = sext <4 x i1> %obit to <4 x i32>
+  store <4 x i1> %val, <4 x i1>* %p2
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
+; SSE2-LABEL: usubo_v2i128:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %r11d
+; SSE2-NEXT:    subq %r8, %rdi
+; SSE2-NEXT:    sbbq %r9, %rsi
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT:    movq %rdx, 16(%r10)
+; SSE2-NEXT:    movq %rdi, (%r10)
+; SSE2-NEXT:    movq %rcx, 24(%r10)
+; SSE2-NEXT:    movq %rsi, 8(%r10)
+; SSE2-NEXT:    psllq $63, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: usubo_v2i128:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT:    setb %al
+; SSSE3-NEXT:    movzbl %al, %r11d
+; SSSE3-NEXT:    subq %r8, %rdi
+; SSSE3-NEXT:    sbbq %r9, %rsi
+; SSSE3-NEXT:    setb %al
+; SSSE3-NEXT:    movzbl %al, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT:    movq %rdx, 16(%r10)
+; SSSE3-NEXT:    movq %rdi, (%r10)
+; SSSE3-NEXT:    movq %rcx, 24(%r10)
+; SSSE3-NEXT:    movq %rsi, 8(%r10)
+; SSSE3-NEXT:    psllq $63, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: usubo_v2i128:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    movzbl %al, %r11d
+; SSE41-NEXT:    subq %r8, %rdi
+; SSE41-NEXT:    sbbq %r9, %rsi
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT:    movq %rdx, 16(%r10)
+; SSE41-NEXT:    movq %rdi, (%r10)
+; SSE41-NEXT:    movq %rcx, 24(%r10)
+; SSE41-NEXT:    movq %rsi, 8(%r10)
+; SSE41-NEXT:    psllq $63, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: usubo_v2i128:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %r11d
+; AVX1-NEXT:    subq %r8, %rdi
+; AVX1-NEXT:    sbbq %r9, %rsi
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rdx, 16(%r10)
+; AVX1-NEXT:    movq %rdi, (%r10)
+; AVX1-NEXT:    movq %rcx, 24(%r10)
+; AVX1-NEXT:    movq %rsi, 8(%r10)
+; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v2i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %r11d
+; AVX2-NEXT:    subq %r8, %rdi
+; AVX2-NEXT:    sbbq %r9, %rsi
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rdx, 16(%r10)
+; AVX2-NEXT:    movq %rdi, (%r10)
+; AVX2-NEXT:    movq %rcx, 24(%r10)
+; AVX2-NEXT:    movq %rsi, 8(%r10)
+; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: usubo_v2i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    subq %r8, %rdi
+; AVX512-NEXT:    sbbq %r9, %rsi
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    kmovw -{{[0-9]+}}(%rsp), %k1
+; AVX512-NEXT:    movq %rdx, 16(%r10)
+; AVX512-NEXT:    movq %rdi, (%r10)
+; AVX512-NEXT:    movq %rcx, 24(%r10)
+; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
+  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
+  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
+  %res = sext <2 x i1> %obit to <2 x i32>
+  store <2 x i128> %val, <2 x i128>* %p2
+  ret <2 x i32> %res
+}

