Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/IR/AutoUpgrade.cpp              41
-rw-r--r--   llvm/lib/Target/X86/X86IntrinsicsInfo.h  12
2 files changed, 30 insertions, 23 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 93d5d921801..7c24f848850 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -360,6 +360,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
          Name == "sse42.crc32.64.8" || // Added in 3.4
          Name.startswith("avx.vbroadcast.s") || // Added in 3.5
          Name.startswith("avx512.mask.palignr.") || // Added in 3.9
+         Name.startswith("avx512.mask.valign.") || // Added in 4.0
          Name.startswith("sse2.psll.dq") || // Added in 3.7
          Name.startswith("sse2.psrl.dq") || // Added in 3.7
          Name.startswith("avx2.psll.dq") || // Added in 3.7
@@ -572,13 +573,23 @@ static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask,
   return Builder.CreateSelect(Mask, Op0, Op1);
 }
 
-static Value *UpgradeX86PALIGNRIntrinsics(IRBuilder<> &Builder,
-                                          Value *Op0, Value *Op1, Value *Shift,
-                                          Value *Passthru, Value *Mask) {
+// Handle autoupgrade for masked PALIGNR and VALIGND/Q intrinsics.
+// PALIGNR handles large immediates by shifting while VALIGN masks the immediate
+// so we need to handle both cases. VALIGN also doesn't have 128-bit lanes.
+static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0,
+                                        Value *Op1, Value *Shift,
+                                        Value *Passthru, Value *Mask,
+                                        bool IsVALIGN) {
   unsigned ShiftVal = cast<llvm::ConstantInt>(Shift)->getZExtValue();
 
   unsigned NumElts = Op0->getType()->getVectorNumElements();
-  assert(NumElts % 16 == 0);
+  assert((IsVALIGN || NumElts % 16 == 0) && "Illegal NumElts for PALIGNR!");
+  assert((!IsVALIGN || NumElts <= 16) && "NumElts too large for VALIGN!");
+  assert(isPowerOf2_32(NumElts) && "NumElts not a power of 2!");
+
+  // Mask the immediate for VALIGN.
+  if (IsVALIGN)
+    ShiftVal &= (NumElts - 1);
 
   // If palignr is shifting the pair of vectors more than the size of two
   // lanes, emit zero.
@@ -595,10 +606,10 @@ static Value *UpgradeX86PALIGNRIntrinsics(IRBuilder<> &Builder,
 
   uint32_t Indices[64];
   // 256-bit palignr operates on 128-bit lanes so we need to handle that
-  for (unsigned l = 0; l != NumElts; l += 16) {
+  for (unsigned l = 0; l < NumElts; l += 16) {
     for (unsigned i = 0; i != 16; ++i) {
       unsigned Idx = ShiftVal + i;
-      if (Idx >= 16)
+      if (!IsVALIGN && Idx >= 16) // Disable wrap for VALIGN.
         Idx += NumElts - 16; // End of lane, switch operand.
       Indices[l + i] = Idx + l;
     }
@@ -1071,11 +1082,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                             CI->getArgOperand(1));
     } else if (IsX86 && Name.startswith("avx512.mask.palignr.")) {
-      Rep = UpgradeX86PALIGNRIntrinsics(Builder, CI->getArgOperand(0),
-                                        CI->getArgOperand(1),
-                                        CI->getArgOperand(2),
-                                        CI->getArgOperand(3),
-                                        CI->getArgOperand(4));
+      Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0),
+                                      CI->getArgOperand(1),
+                                      CI->getArgOperand(2),
+                                      CI->getArgOperand(3),
+                                      CI->getArgOperand(4),
+                                      false);
+    } else if (IsX86 && Name.startswith("avx512.mask.valign.")) {
+      Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0),
+                                      CI->getArgOperand(1),
+                                      CI->getArgOperand(2),
+                                      CI->getArgOperand(3),
+                                      CI->getArgOperand(4),
+                                      true);
     } else if (IsX86 && (Name == "sse2.psll.dq" ||
                          Name == "avx2.psll.dq")) {
       // 128/256-bit shift left specified in bits.
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 3a0e7f101e4..515145a045a 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1171,18 +1171,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
-  X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::VALIGN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::VALIGN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::VALIGN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::VALIGN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::VALIGN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::VALIGN, 0),
   X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
                      X86ISD::CVTPH2PS, 0),
   X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
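
For readers who want the new index logic in isolation, below is a minimal standalone sketch of the loop added in UpgradeX86ALIGNIntrinsics. The function name alignShuffleIndices and the std::vector return type are ours, not part of the patch, and the sketch assumes the PALIGNR case has already passed the patch's earlier early-outs for immediates of 16 or more (which the real code turns into an all-zero result or a single-operand shift before building indices).

// Standalone sketch of the shuffle-index computation in the patch above
// (hypothetical helper, not the LLVM API).  Indices select from the two
// sources as if concatenated: PALIGNR wraps inside each 128-bit lane, while
// VALIGN masks the immediate and lets indices run into the second source.
#include <cstdint>
#include <vector>

std::vector<uint32_t> alignShuffleIndices(unsigned NumElts, unsigned ShiftVal,
                                          bool IsVALIGN) {
  // VALIGND/Q only honors the low log2(NumElts) bits of the immediate.
  if (IsVALIGN)
    ShiftVal &= (NumElts - 1);

  uint32_t Indices[64];
  for (unsigned l = 0; l < NumElts; l += 16) {
    for (unsigned i = 0; i != 16; ++i) {
      unsigned Idx = ShiftVal + i;
      // PALIGNR only: past the end of a 128-bit lane, switch to the other
      // source.  The wrap is disabled for VALIGN.
      if (!IsVALIGN && Idx >= 16)
        Idx += NumElts - 16;
      Indices[l + i] = Idx + l;
    }
  }
  // Only the first NumElts entries feed the emitted shufflevector.
  return std::vector<uint32_t>(Indices, Indices + NumElts);
}

For example, with IsVALIGN = true, NumElts = 4 and an immediate of 5, the immediate is masked down to 1 and the indices come out as {1, 2, 3, 4}, i.e. a one-element shift across the concatenation of the two sources, which is why the table-driven VALIGN entries in X86IntrinsicsInfo.h can be dropped in favor of the generic shuffle-plus-select upgrade path.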

