 llvm/lib/Target/X86/X86ISelLowering.cpp |  54 ++++--
 llvm/test/CodeGen/X86/sad_variations.ll | 347 ++++++++++++++++++++++++++++++++
 2 files changed, 388 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50cd8ab1dbc..e3fee6fbb63 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29144,12 +29144,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOpcode() != ISD::SETCC)
return false;
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT)
+ if (CC != ISD::SETGT && CC != ISD::SETLT)
return false;
SDValue SelectOp1 = Select->getOperand(1);
SDValue SelectOp2 = Select->getOperand(2);
+ // The code below assumes SelectOp1 is the subtraction operand
+ // and SelectOp2 is the negation operand.
+ // In the SETLT case they are the other way around, so swap them.
+ if (CC == ISD::SETLT)
+ std::swap(SelectOp1, SelectOp2);
+
// The second operand of the select should be the negation of the first
// operand, which is implemented as 0 - SelectOp1.
if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -29162,8 +29168,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOperand(0) != SelectOp1)
return false;
- // The second operand of the comparison can be either -1 or 0.
- if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ // In the SETLT case, the second operand of the comparison can be either 1 or 0.
+ APInt SplatVal;
+ if ((CC == ISD::SETLT) &&
+ !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+ SplatVal == 1) ||
+ (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+ return false;
+
+ // In the SETGT case, the second operand of the comparison can be either -1 or 0.
+ if ((CC == ISD::SETGT) &&
+ !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
return false;
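
For reference, a minimal IR sketch of the two select forms this function now recognizes (the value names are illustrative only; the SETLT form mirrors the sad8_32bit_icmp_slt/sle tests added below):

    ; SETGT form: keep the difference when it is > -1, otherwise its negation.
    %diff = sub nsw <8 x i32> %a, %b
    %neg  = sub <8 x i32> zeroinitializer, %diff
    %cgt  = icmp sgt <8 x i32> %diff, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
    %abs  = select <8 x i1> %cgt, <8 x i32> %diff, <8 x i32> %neg

    ; SETLT form: the compare is inverted and the select operands are swapped,
    ; which the std::swap above undoes.
    %clt  = icmp slt <8 x i32> %diff, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    %abs2 = select <8 x i1> %clt, <8 x i32> %neg, <8 x i32> %diff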
@@ -29292,11 +29307,9 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is appropriate
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
+ // Verify the element type we're extracting from is an integer type wider than i16.
EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+ if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
@@ -29305,15 +29318,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
else if (Subtarget.hasAVX2())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
+ if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+ // The operand is expected to be zero-extended from i8
+ // (verified in detectZextAbsDiff).
+ // To widen the result to i64 and above, an additional any/zero/sign
+ // extend is expected.
+ // A zero extend from 32 bits has no mathematical effect on the result,
+ // and a sign extend behaves like a zero extend here
+ // (it extends the sign bit, which is zero).
+ // So it is correct to skip the sign/zero/any extend instruction.
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
+ Root = Root.getOperand(0);
+
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
if (!Root || (Root.getOpcode() != ISD::VSELECT))
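
As a concrete case (a sketch mirroring the sad8_64bit_icmp_zext_slt test added below; names are illustrative), the reduction root can be an extend of the absolute difference rather than the select itself, and that extend is now looked through:

    %abs      = select <8 x i1> %cmp, <8 x i32> %neg, <8 x i32> %diff
    %ext      = zext <8 x i32> %abs to <8 x i64>   ; skipped: the high bits are known to be zero
    %rdx.shuf = shufflevector <8 x i64> %ext, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx  = add <8 x i64> %ext, %rdx.shuf      ; shuffle+add pyramid continues in i64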
@@ -29324,7 +29350,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
- // Create the SAD instruction
+ // Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
@@ -29346,10 +29372,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- // Return the lowest i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+ MVT Type = Extract->getSimpleValueType(0);
+ unsigned TypeSizeInBits = Type.getSizeInBits();
+ // Return the lowest TypeSizeInBits bits.
+ MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
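
To illustrate the final step above (a sketch using the @llvm.x86.sse2.psad.bw intrinsic; the value names are hypothetical): PSADBW produces 64-bit partial sums, so the result is reinterpreted in the extract's element type and the low lane is read out, whether that type is i32 or i64:

    declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)

    %sad   = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %x, <16 x i8> %y)
    %res64 = extractelement <2 x i64> %sad, i32 0      ; i64 result
    %cast  = bitcast <2 x i64> %sad to <4 x i32>
    %res32 = extractelement <4 x i32> %cast, i32 0     ; i32 result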
diff --git a/llvm/test/CodeGen/X86/sad_variations.ll b/llvm/test/CodeGen/X86/sad_variations.ll
new file mode 100644
index 00000000000..1d826cf41a4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sad_variations.ll
@@ -0,0 +1,347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+
+define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
+; SSE2-LABEL: sad8_32bit_icmp_sge:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sge:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sge:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+
+entry:
+ %idx.ext = zext i32 %stride to i64
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = sub nsw <8 x i32> %2, %5
+ %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <8 x i32> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+ %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %9, %rdx.shuf
+ %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+ %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+ %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+ ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
+; SSE2-LABEL: sad8_32bit_icmp_sgt:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sgt:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sgt:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+entry:
+ %idx.ext = zext i32 %stride to i64
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = sub nsw <8 x i32> %2, %5
+ %7 = icmp sgt <8 x i32> %6, zeroinitializer
+ %8 = sub nsw <8 x i32> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+ %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %9, %rdx.shuf
+ %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+ %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+ %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+ ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
+; SSE2-LABEL: sad8_32bit_icmp_sle:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sle:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sle:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+entry:
+ %idx.ext = zext i32 %stride to i64
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = sub nsw <8 x i32> %2, %5
+ %7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = sub nsw <8 x i32> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+ %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %9, %rdx.shuf
+ %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+ %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+ %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+ ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
+; SSE2-LABEL: sad8_32bit_icmp_slt:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_slt:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_slt:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+entry:
+ %idx.ext = zext i32 %stride to i64
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = sub nsw <8 x i32> %2, %5
+ %7 = icmp slt <8 x i32> %6, zeroinitializer
+ %8 = sub nsw <8 x i32> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+ %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %9, %rdx.shuf
+ %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+ %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+ %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+ ret i32 %10
+}
+
+define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = sub nsw <8 x i32> %2, %5
+ %7 = icmp slt <8 x i32> %6, zeroinitializer
+ %8 = sub nsw <8 x i32> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+ %10 = sext <8 x i32> %9 to <8 x i64>
+ %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i64> %rdx.shuf, %10
+ %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+ %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+ %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+ ret i64 %11
+}
+
+define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = sub nsw <8 x i32> %2, %5
+ %7 = icmp slt <8 x i32> %6, zeroinitializer
+ %8 = sub nsw <8 x i32> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+ %10 = zext <8 x i32> %9 to <8 x i64>
+ %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i64> %rdx.shuf, %10
+ %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+ %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+ %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+ ret i64 %11
+}
+
+define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry
+ %0 = bitcast i8* %cur to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = zext <8 x i8> %1 to <8 x i64>
+ %3 = bitcast i8* %ref to <8 x i8>*
+ %4 = load <8 x i8>, <8 x i8>* %3, align 1
+ %5 = zext <8 x i8> %4 to <8 x i64>
+ %6 = sub nsw <8 x i64> %2, %5
+ %7 = icmp slt <8 x i64> %6, zeroinitializer
+ %8 = sub nsw <8 x i64> zeroinitializer, %6
+ %9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6
+ %rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i64> %rdx.shuf, %9
+ %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+ %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+ %10 = extractelement <8 x i64> %bin.rdx239, i32 0
+ ret i64 %10
+}