Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/IR/IntrinsicsX86.td                       |  60
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                                 |  30
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                     |  23
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h                     |  32
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll    | 118
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll      | 266
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll              | 152
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll  | 238
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll    | 226
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll            | 200
10 files changed, 1133 insertions(+), 212 deletions(-)
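
For reference, the IR-level effect of the auto-upgrade performed by this change can be sketched as follows: a call to one of the old masked intrinsics (four operands, trailing i8 mask) is rewritten into the new unmasked three-operand intrinsic followed by a mask-driven select, mirroring the pattern used in the updated avx512ifma-intrinsics.ll test. The function name below is illustrative only:

; Old form being upgraded:
;   %r = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m)
; Upgraded form (unmasked intrinsic + select on the bitcast mask):
define <8 x i64> @upgrade_example(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m) {
  %r  = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b)
  %mv = bitcast i8 %m to <8 x i1>
  %s  = select <8 x i1> %mv, <8 x i64> %r, <8 x i64> %acc
  ret <8 x i64> %s
}
declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
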
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 7ec9f830876..21bf565d10f 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2657,54 +2657,30 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq128_mask">,
+ def int_x86_avx512_vpmadd52h_uq_128 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq128_maskz">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq128_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq128_maskz">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_128 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq256_mask">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq256_maskz">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq256_mask">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52h_uq_256 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq256_maskz">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_256 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq512_mask">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq512_maskz">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52h_uq_512 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq512_maskz">,
+ llvm_v8i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_512 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ llvm_v8i64_ty], [IntrNoMem]>;
}
// VNNI
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index ba2f3fa9248..bd4638f147a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -265,6 +265,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
Name.startswith("avx512.mask.pternlog.") || // Added in 7.0
Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0
Name == "sse.cvtsi2ss" || // Added in 7.0
Name == "sse.cvtsi642ss" || // Added in 7.0
Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2569,6 +2571,34 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vpmadd52") ||
+ Name.startswith("avx512.maskz.vpmadd52"))) {
+ bool ZeroMask = Name[11] == 'z';
+ bool High = Name[20] == 'h' || Name[21] == 'h';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_128;
+ else if (VecWidth == 256 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_256;
+ else if (VecWidth == 512 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_512;
+ else if (VecWidth == 128 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_128;
+ else if (VecWidth == 256 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_256;
+ else if (VecWidth == 512 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : CI->getArgOperand(0);
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
} else if (IsX86 && Name.startswith("avx512.mask.") &&
upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
// Rep will be updated by the call in the condition.
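
The maskz path handled above differs only in the pass-through value: masked-off lanes take zeroinitializer instead of the first operand. A sketch of the upgraded IR for the high-half variant (function name illustrative, following the same pattern as the updated tests):

define <8 x i64> @upgrade_maskz_example(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m) {
  %r  = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b)
  %mv = bitcast i8 %m to <8 x i1>
  %s  = select <8 x i1> %mv, <8 x i64> %r, <8 x i64> zeroinitializer
  ret <8 x i64> %s
}
declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
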
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 218ba047a2e..5b699b18dd9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20624,26 +20624,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src3),
Mask, PassThru, Subtarget, DAG);
}
- case IFMA_OP_MASKZ:
- case IFMA_OP_MASK: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = Src1;
-
- // set PassThru element
- if (IntrData->Type == IFMA_OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
- // Node we need to swizzle the operands to pass the multiply operands
+ case IFMA_OP:
+ // NOTE: We need to swizzle the operands to pass the multiply operands
// first.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Src2, Src3, Src1),
- Mask, PassThru, Subtarget, DAG);
- }
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case CVTPD2PS:
// ISD::FP_ROUND has a second argument that indicates if the truncation
// does not change the value. Set it to 0 since it can change.
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index dca513bf7f8..d5263767db1 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -30,7 +30,7 @@ enum IntrinsicType : uint16_t {
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
- IFMA_OP_MASK, IFMA_OP_MASKZ,
+ IFMA_OP,
VPERM_2OP, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
@@ -1133,18 +1133,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
@@ -1325,18 +1313,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
@@ -1461,6 +1437,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..22e76025c60
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifma-builtins.c
+
+define <8 x i64> @test_mm512_madd52hi_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ ret <8 x i64> %0
+}
+
+define <8 x i64> @test_mm512_mask_madd52hi_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X32-LABEL: test_mm512_mask_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_madd52hi_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_madd52lo_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ ret <8 x i64> %0
+}
+
+define <8 x i64> @test_mm512_mask_madd52lo_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X32-LABEL: test_mm512_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_madd52lo_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+ ret <8 x i64> %2
+}
+
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
new file mode 100644
index 00000000000..85fbe40928a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x2load = load i64, i64* %x2ptr
+ %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
+ %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x1load = load i64, i64* %x1ptr
+ %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
+ %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x2load = load i64, i64* %x2ptr
+ %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
+ %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x1load = load i64, i64* %x1ptr
+ %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
+ %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x2load = load i64, i64* %x2ptr
+ %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
+ %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x1load = load i64, i64* %x1ptr
+ %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
+ %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 1217138b226..862a010e4fe 100644
--- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
-declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,28 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
-declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +51,30 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
-declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +85,28 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
-declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -103,12 +117,18 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
@@ -120,8 +140,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i
; CHECK-NEXT: retq
%x2 = load <8 x i64>, <8 x i64>* %x2ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) {
@@ -133,8 +153,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0,
%x2load = load i64, i64* %x2ptr
%x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
%x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) {
@@ -144,8 +164,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0
; CHECK-NEXT: retq
%x1 = load <8 x i64>, <8 x i64>* %x1ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
@@ -157,8 +177,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6
%x1load = load i64, i64* %x1ptr
%x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
%x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
@@ -169,8 +189,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <
; CHECK-NEXT: retq
%x2 = load <8 x i64>, <8 x i64>* %x2ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
@@ -183,8 +205,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64>
%x2load = load i64, i64* %x2ptr
%x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
%x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -195,8 +219,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64
; CHECK-NEXT: retq
%x1 = load <8 x i64>, <8 x i64>* %x1ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -209,8 +235,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8
%x1load = load i64, i64* %x1ptr
%x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
%x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
@@ -221,8 +249,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0,
; CHECK-NEXT: retq
%x2 = load <8 x i64>, <8 x i64>* %x2ptr
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
@@ -235,8 +265,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64>
%x2load = load i64, i64* %x2ptr
%x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
%x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -247,8 +279,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i6
; CHECK-NEXT: retq
%x1 = load <8 x i64>, <8 x i64>* %x1ptr
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -261,6 +295,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<
%x1load = load i64, i64* %x1ptr
%x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
%x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..0fe4cafa881
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifmavl-builtins.c
+
+define <2 x i64> @test_mm_madd52hi_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52hi_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52hi_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52hi_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52hi_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52hi_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+ ret <4 x i64> %2
+}
+
+define <2 x i64> @test_mm_madd52lo_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52lo_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52lo_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52lo_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52lo_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52lo_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+ ret <4 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll
new file mode 100644
index 00000000000..8795574539b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 40312c9f524..25c0cd16d4e 100644
--- a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,33 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +56,31 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +91,31 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -103,24 +126,33 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -131,24 +163,33 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -159,24 +200,31 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -187,24 +235,31 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -215,12 +270,21 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}