2 files changed, 165 insertions, 58 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5de8c7a4fdb..49bc662d129 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -325,6 +325,117 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
+// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
+static Value *simplifyX86varShift(const IntrinsicInst &II,
+                                  InstCombiner::BuilderTy &Builder) {
+  bool LogicalShift = false;
+  bool ShiftLeft = false;
+
+  switch (II.getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::x86_avx2_psrav_d:
+  case Intrinsic::x86_avx2_psrav_d_256:
+    LogicalShift = false;
+    ShiftLeft = false;
+    break;
+  case Intrinsic::x86_avx2_psrlv_d:
+  case Intrinsic::x86_avx2_psrlv_d_256:
+  case Intrinsic::x86_avx2_psrlv_q:
+  case Intrinsic::x86_avx2_psrlv_q_256:
+    LogicalShift = true;
+    ShiftLeft = false;
+    break;
+  case Intrinsic::x86_avx2_psllv_d:
+  case Intrinsic::x86_avx2_psllv_d_256:
+  case Intrinsic::x86_avx2_psllv_q:
+  case Intrinsic::x86_avx2_psllv_q_256:
+    LogicalShift = true;
+    ShiftLeft = true;
+    break;
+  }
+  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+  // Simplify if all shift amounts are constant/undef.
+  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
+  if (!CShift)
+    return nullptr;
+
+  auto Vec = II.getArgOperand(0);
+  auto VT = cast<VectorType>(II.getType());
+  auto SVT = VT->getVectorElementType();
+  int NumElts = VT->getNumElements();
+  int BitWidth = SVT->getIntegerBitWidth();
+
+  // Collect each element's shift amount.
+  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+  bool AnyOutOfRange = false;
+  SmallVector<int, 8> ShiftAmts;
+  for (int I = 0; I < NumElts; ++I) {
+    auto *CElt = CShift->getAggregateElement(I);
+    if (CElt && isa<UndefValue>(CElt)) {
+      ShiftAmts.push_back(-1);
+      continue;
+    }
+
+    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+    if (!COp)
+      return nullptr;
+
+    // Handle out of range shifts.
+    // If LogicalShift - set to BitWidth (special case).
+    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+    APInt ShiftVal = COp->getValue();
+    if (ShiftVal.uge(BitWidth)) {
+      AnyOutOfRange = LogicalShift;
+      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+      continue;
+    }
+
+    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+  }
+
+  // If all elements out of range or UNDEF, return vector of zeros/undefs.
+  // ArithmeticShift should only hit this if they are all UNDEF.
+  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+  if (llvm::all_of(ShiftAmts, OutOfRange)) {
+    SmallVector<Constant *, 8> ConstantVec;
+    for (int Idx : ShiftAmts) {
+      if (Idx < 0) {
+        ConstantVec.push_back(UndefValue::get(SVT));
+      } else {
+        assert(LogicalShift && "Logical shift expected");
+        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
+      }
+    }
+    return ConstantVector::get(ConstantVec);
+  }
+
+  // We can't handle only some out of range values with generic logical shifts.
+  if (AnyOutOfRange)
+    return nullptr;
+
+  // Build the shift amount constant vector.
+  SmallVector<Constant *, 8> ShiftVecAmts;
+  for (int Idx : ShiftAmts) {
+    if (Idx < 0)
+      ShiftVecAmts.push_back(UndefValue::get(SVT));
+    else
+      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
+  }
+  auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+  if (ShiftLeft)
+    return Builder.CreateShl(Vec, ShiftVec);
+
+  if (LogicalShift)
+    return Builder.CreateLShr(Vec, ShiftVec);
+
+  return Builder.CreateAShr(Vec, ShiftVec);
+}
+
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   Value *Arg = II.getArgOperand(0);
@@ -1656,6 +1767,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   }
 
+  case Intrinsic::x86_avx2_psllv_d:
+  case Intrinsic::x86_avx2_psllv_d_256:
+  case Intrinsic::x86_avx2_psllv_q:
+  case Intrinsic::x86_avx2_psllv_q_256:
+  case Intrinsic::x86_avx2_psrav_d:
+  case Intrinsic::x86_avx2_psrav_d_256:
+  case Intrinsic::x86_avx2_psrlv_d:
+  case Intrinsic::x86_avx2_psrlv_d_256:
+  case Intrinsic::x86_avx2_psrlv_q:
+  case Intrinsic::x86_avx2_psrlv_q_256:
+    if (Value *V = simplifyX86varShift(*II, *Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(*II, *Builder))
       return replaceInstUsesWith(*II, V);
diff --git a/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll
index 55b64b5afe9..012a826a765 100644
--- a/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -940,8 +940,7 @@ define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) {
 
 define <4 x i32> @avx2_psrav_d_128_0(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> %v
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> zeroinitializer)
   ret <4 x i32> %1
@@ -949,8 +948,7 @@ define <4 x i32> @avx2_psrav_d_128_0(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrav_d_256_0(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> %v
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
   ret <8 x i32> %1
@@ -958,7 +956,7 @@ define <8 x i32> @avx2_psrav_d_256_0(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrav_d_128_var(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 64>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 64>)
@@ -967,7 +965,7 @@ define <4 x i32> @avx2_psrav_d_128_var(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrav_d_256_var(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> %v, <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>)
@@ -976,7 +974,7 @@ define <8 x i32> @avx2_psrav_d_256_var(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrav_d_128_allbig(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> %v, <i32 31, i32 31, i32 31, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
@@ -985,7 +983,7 @@ define <4 x i32> @avx2_psrav_d_128_allbig(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrav_d_256_allbig(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> %v, <i32 undef, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
@@ -994,7 +992,7 @@ define <8 x i32> @avx2_psrav_d_256_allbig(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrav_d_128_undef(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32> <i32 undef, i32 8, i32 16, i32 64>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> %v, <i32 undef, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> <i32 0, i32 8, i32 16, i32 64>, i32 undef, i32 0
@@ -1004,7 +1002,7 @@ define <4 x i32> @avx2_psrav_d_128_undef(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrav_d_256_undef(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrav_d_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 undef, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> %v, <i32 0, i32 undef, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = insertelement <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 24, i32 8, i32 0>, i32 undef, i32 1
@@ -1018,8 +1016,7 @@ define <8 x i32> @avx2_psrav_d_256_undef(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrlv_d_128_0(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> %v
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
   ret <4 x i32> %1
@@ -1027,8 +1024,7 @@ define <4 x i32> @avx2_psrlv_d_128_0(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrlv_d_256_0(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> %v
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
   ret <8 x i32> %1
@@ -1036,7 +1032,7 @@ define <8 x i32> @avx2_psrlv_d_256_0(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrlv_d_128_var(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
@@ -1045,7 +1041,7 @@ define <4 x i32> @avx2_psrlv_d_128_var(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrlv_d_256_var(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> %v, <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
@@ -1072,8 +1068,7 @@ define <8 x i32> @avx2_psrlv_d_256_big(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrlv_d_128_allbig(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
   ret <4 x i32> %1
@@ -1081,8 +1076,7 @@ define <4 x i32> @avx2_psrlv_d_128_allbig(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrlv_d_256_allbig(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
   ret <8 x i32> %1
@@ -1090,7 +1084,7 @@ define <8 x i32> @avx2_psrlv_d_256_allbig(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psrlv_d_128_undef(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 undef, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> %v, <i32 undef, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> <i32 0, i32 8, i32 16, i32 31>, i32 undef, i32 0
@@ -1100,7 +1094,7 @@ define <4 x i32> @avx2_psrlv_d_128_undef(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psrlv_d_256_undef(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psrlv_d_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> %v, <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = insertelement <8 x i32> <i32 0, i32 8, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>, i32 undef, i32 1
@@ -1110,8 +1104,7 @@ define <8 x i32> @avx2_psrlv_d_256_undef(<8 x i32> %v) {
 
 define <2 x i64> @avx2_psrlv_q_128_0(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> %v
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
   ret <2 x i64> %1
@@ -1119,8 +1112,7 @@ define <2 x i64> @avx2_psrlv_q_128_0(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psrlv_q_256_0(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> %v
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
   ret <4 x i64> %1
@@ -1128,7 +1120,7 @@ define <4 x i64> @avx2_psrlv_q_256_0(<4 x i64> %v) {
 
 define <2 x i64> @avx2_psrlv_q_128_var(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i64> %v, <i64 0, i64 8>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
@@ -1137,7 +1129,7 @@ define <2 x i64> @avx2_psrlv_q_128_var(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psrlv_q_256_var(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> %v, <i64 0, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
@@ -1164,8 +1156,7 @@ define <4 x i64> @avx2_psrlv_q_256_big(<4 x i64> %v) {
 
 define <2 x i64> @avx2_psrlv_q_128_allbig(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
   ret <2 x i64> %1
@@ -1173,8 +1164,7 @@ define <2 x i64> @avx2_psrlv_q_128_allbig(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psrlv_q_256_allbig(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 undef, i64 0, i64 0>
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
   ret <4 x i64> %1
@@ -1182,7 +1172,7 @@ define <4 x i64> @avx2_psrlv_q_256_allbig(<4 x i64> %v) {
 
 define <2 x i64> @avx2_psrlv_q_128_undef(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 undef>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i64> %v, <i64 0, i64 undef>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = insertelement <2 x i64> <i64 0, i64 8>, i64 undef, i64 1
@@ -1192,7 +1182,7 @@ define <2 x i64> @avx2_psrlv_q_128_undef(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psrlv_q_256_undef(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psrlv_q_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> <i64 undef, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> %v, <i64 undef, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = insertelement <4 x i64> <i64 0, i64 8, i64 16, i64 31>, i64 undef, i64 0
@@ -1206,8 +1196,7 @@ define <4 x i64> @avx2_psrlv_q_256_undef(<4 x i64> %v) {
 
 define <4 x i32> @avx2_psllv_d_128_0(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> %v
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> zeroinitializer)
   ret <4 x i32> %1
@@ -1215,8 +1204,7 @@ define <4 x i32> @avx2_psllv_d_128_0(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psllv_d_256_0(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> %v
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> zeroinitializer)
   ret <8 x i32> %1
@@ -1224,7 +1212,7 @@ define <8 x i32> @avx2_psllv_d_256_0(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psllv_d_128_var(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> %v, <i32 0, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 0, i32 8, i32 16, i32 31>)
@@ -1233,7 +1221,7 @@ define <4 x i32> @avx2_psllv_d_128_var(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psllv_d_256_var(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> %v, <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 31, i32 24, i32 8, i32 0>)
@@ -1260,8 +1248,7 @@ define <8 x i32> @avx2_psllv_d_256_big(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psllv_d_128_allbig(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
 ;
   %1 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 32, i32 100, i32 -255, i32 undef>)
   ret <4 x i32> %1
@@ -1269,8 +1256,7 @@ define <4 x i32> @avx2_psllv_d_128_allbig(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psllv_d_256_allbig(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
-; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ;
   %1 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 undef, i32 100, i32 255, i32 55555, i32 -32, i32 -100, i32 -255, i32 -55555>)
   ret <8 x i32> %1
@@ -1278,7 +1264,7 @@ define <8 x i32> @avx2_psllv_d_256_allbig(<8 x i32> %v) {
 
 define <4 x i32> @avx2_psllv_d_128_undef(<4 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %v, <4 x i32> <i32 undef, i32 8, i32 16, i32 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> %v, <i32 undef, i32 8, i32 16, i32 31>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> <i32 0, i32 8, i32 16, i32 31>, i32 undef, i32 0
@@ -1288,7 +1274,7 @@ define <4 x i32> @avx2_psllv_d_128_undef(<4 x i32> %v) {
 
 define <8 x i32> @avx2_psllv_d_256_undef(<8 x i32> %v) {
 ; CHECK-LABEL: @avx2_psllv_d_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %v, <8 x i32> <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> %v, <i32 0, i32 undef, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %1 = insertelement <8 x i32> <i32 0, i32 8, i32 16, i32 31, i32 31, i32 24, i32 8, i32 0>, i32 undef, i32 1
@@ -1298,8 +1284,7 @@ define <8 x i32> @avx2_psllv_d_256_undef(<8 x i32> %v) {
 
 define <2 x i64> @avx2_psllv_q_128_0(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> %v
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> zeroinitializer)
   ret <2 x i64> %1
@@ -1307,8 +1292,7 @@ define <2 x i64> @avx2_psllv_q_128_0(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psllv_q_256_0(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> %v
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> zeroinitializer)
   ret <4 x i64> %1
@@ -1316,7 +1300,7 @@ define <4 x i64> @avx2_psllv_q_256_0(<4 x i64> %v) {
 
 define <2 x i64> @avx2_psllv_q_128_var(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> %v, <i64 0, i64 8>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 8>)
@@ -1325,7 +1309,7 @@ define <2 x i64> @avx2_psllv_q_128_var(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psllv_q_256_var(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_var(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i64> %v, <i64 0, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 0, i64 8, i64 16, i64 31>)
@@ -1352,8 +1336,7 @@ define <4 x i64> @avx2_psllv_q_256_big(<4 x i64> %v) {
 
 define <2 x i64> @avx2_psllv_q_128_allbig(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
 ;
   %1 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 128, i64 -64>)
   ret <2 x i64> %1
@@ -1361,8 +1344,7 @@ define <2 x i64> @avx2_psllv_q_128_allbig(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psllv_q_256_allbig(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_allbig(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 undef, i64 0, i64 0>
 ;
   %1 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 64, i64 undef, i64 -128, i64 -60>)
   ret <4 x i64> %1
@@ -1370,7 +1352,7 @@ define <4 x i64> @avx2_psllv_q_256_allbig(<4 x i64> %v) {
 
 define <2 x i64> @avx2_psllv_q_128_undef(<2 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_128_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> <i64 0, i64 undef>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> %v, <i64 0, i64 undef>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %1 = insertelement <2 x i64> <i64 0, i64 8>, i64 undef, i64 1
@@ -1380,7 +1362,7 @@ define <2 x i64> @avx2_psllv_q_128_undef(<2 x i64> %v) {
 
 define <4 x i64> @avx2_psllv_q_256_undef(<4 x i64> %v) {
 ; CHECK-LABEL: @avx2_psllv_q_256_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> <i64 undef, i64 8, i64 16, i64 31>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i64> %v, <i64 undef, i64 8, i64 16, i64 31>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
 ;
   %1 = insertelement <4 x i64> <i64 0, i64 8, i64 16, i64 31>, i64 undef, i64 0