Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/IR/IntrinsicsX86.td                      |  60
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                                |  30
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                    |  23
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h                    |  32
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll   | 118
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll     | 266
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll             | 152
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll | 238
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll   | 226
-rw-r--r--  llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll           | 200
10 files changed, 1133 insertions, 212 deletions
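For reference, the net effect of this change on user-visible IR: the masked and maskz vpmadd52 intrinsics are removed, and AutoUpgrade rewrites old calls into the new unmasked intrinsic followed by a bitcast of the i8 mask and an explicit select, which is the pattern the updated tests below check for. A minimal before/after sketch (the function names here are illustrative only, not part of the change):

; Old form, now handled by AutoUpgrade:
declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @old_masked_madd52lo(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m) {
  %r = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m)
  ret <8 x i64> %r
}

; Upgraded form: unmasked intrinsic plus bitcast + select. The mask variant
; selects against the accumulator (passthru); the maskz variant selects
; against zeroinitializer instead.
declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @new_masked_madd52lo(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m) {
  %r = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b)
  %mask = bitcast i8 %m to <8 x i1>
  %sel = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %acc
  ret <8 x i64> %sel
}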
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 7ec9f830876..21bf565d10f 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2657,54 +2657,30 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vpmadd52h_uq_128 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq128_mask">, + def int_x86_avx512_vpmadd52h_uq_128 : + GCCBuiltin<"__builtin_ia32_vpmadd52huq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_maskz_vpmadd52h_uq_128 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq128_maskz">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vpmadd52l_uq_128 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq128_mask">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_maskz_vpmadd52l_uq_128 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq128_maskz">, + llvm_v2i64_ty], [IntrNoMem]>; + def int_x86_avx512_vpmadd52l_uq_128 : + GCCBuiltin<"__builtin_ia32_vpmadd52luq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vpmadd52h_uq_256 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq256_mask">, - Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, - llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_maskz_vpmadd52h_uq_256 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq256_maskz">, - Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, - llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vpmadd52l_uq_256 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq256_mask">, + llvm_v2i64_ty], [IntrNoMem]>; + def int_x86_avx512_vpmadd52h_uq_256 : + GCCBuiltin<"__builtin_ia32_vpmadd52huq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, - llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_maskz_vpmadd52l_uq_256 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq256_maskz">, + llvm_v4i64_ty], [IntrNoMem]>; + def int_x86_avx512_vpmadd52l_uq_256 : + GCCBuiltin<"__builtin_ia32_vpmadd52luq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, - llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vpmadd52h_uq_512 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq512_mask">, - Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, - llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_maskz_vpmadd52h_uq_512 : - GCCBuiltin<"__builtin_ia32_vpmadd52huq512_maskz">, - Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, - llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vpmadd52l_uq_512 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq512_mask">, + llvm_v4i64_ty], [IntrNoMem]>; + def int_x86_avx512_vpmadd52h_uq_512 : + GCCBuiltin<"__builtin_ia32_vpmadd52huq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, - llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_maskz_vpmadd52l_uq_512 : - GCCBuiltin<"__builtin_ia32_vpmadd52luq512_maskz">, + llvm_v8i64_ty], [IntrNoMem]>; + def int_x86_avx512_vpmadd52l_uq_512 : + GCCBuiltin<"__builtin_ia32_vpmadd52luq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, - llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; + llvm_v8i64_ty], [IntrNoMem]>; } // VNNI diff --git 
a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index ba2f3fa9248..bd4638f147a 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -265,6 +265,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0 Name.startswith("avx512.mask.pternlog.") || // Added in 7.0 Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0 + Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0 + Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0 Name == "sse.cvtsi2ss" || // Added in 7.0 Name == "sse.cvtsi642ss" || // Added in 7.0 Name == "sse2.cvtsi2sd" || // Added in 7.0 @@ -2569,6 +2571,34 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); + } else if (IsX86 && (Name.startswith("avx512.mask.vpmadd52") || + Name.startswith("avx512.maskz.vpmadd52"))) { + bool ZeroMask = Name[11] == 'z'; + bool High = Name[20] == 'h' || Name[21] == 'h'; + unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && !High) + IID = Intrinsic::x86_avx512_vpmadd52l_uq_128; + else if (VecWidth == 256 && !High) + IID = Intrinsic::x86_avx512_vpmadd52l_uq_256; + else if (VecWidth == 512 && !High) + IID = Intrinsic::x86_avx512_vpmadd52l_uq_512; + else if (VecWidth == 128 && High) + IID = Intrinsic::x86_avx512_vpmadd52h_uq_128; + else if (VecWidth == 256 && High) + IID = Intrinsic::x86_avx512_vpmadd52h_uq_256; + else if (VecWidth == 512 && High) + IID = Intrinsic::x86_avx512_vpmadd52h_uq_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1), + CI->getArgOperand(2) }; + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) + : CI->getArgOperand(0); + Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); } else if (IsX86 && Name.startswith("avx512.mask.") && upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { // Rep will be updated by the call in the condition. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 218ba047a2e..5b699b18dd9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20624,26 +20624,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Src3), Mask, PassThru, Subtarget, DAG); } - case IFMA_OP_MASKZ: - case IFMA_OP_MASK: { - SDValue Src1 = Op.getOperand(1); - SDValue Src2 = Op.getOperand(2); - SDValue Src3 = Op.getOperand(3); - SDValue Mask = Op.getOperand(4); - MVT VT = Op.getSimpleValueType(); - SDValue PassThru = Src1; - - // set PassThru element - if (IntrData->Type == IFMA_OP_MASKZ) - PassThru = getZeroVector(VT, Subtarget, DAG, dl); - - // Node we need to swizzle the operands to pass the multiply operands + case IFMA_OP: + // NOTE: We need to swizzle the operands to pass the multiply operands // first. - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, - dl, Op.getValueType(), - Src2, Src3, Src1), - Mask, PassThru, Subtarget, DAG); - } + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case CVTPD2PS: // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. 
Set it to 0 since it can change. diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index dca513bf7f8..d5263767db1 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -30,7 +30,7 @@ enum IntrinsicType : uint16_t { INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3, - IFMA_OP_MASK, IFMA_OP_MASKZ, + IFMA_OP, VPERM_2OP, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, @@ -1133,18 +1133,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK, - X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK, - X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK, - X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK, - X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK, - X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK, - X86ISD::VPMADD52L, 0), X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), @@ -1325,18 +1313,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ, - X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ, - X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ, - X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ, - X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ, - X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ, - X86ISD::VPMADD52L, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), @@ -1461,6 +1437,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0), + X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0), + X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0), + X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0), + X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0), + X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0), X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0), X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0), X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0), diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll 
b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll new file mode 100644 index 00000000000..22e76025c60 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifma-builtins.c + +define <8 x i64> @test_mm512_madd52hi_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) { +; X32-LABEL: test_mm512_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 +; X64-NEXT: retq +entry: + %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) + ret <8 x i64> %0 +} + +define <8 x i64> @test_mm512_mask_madd52hi_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) { +; X32-LABEL: test_mm512_mask_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y) + %1 = bitcast i8 %__M to <8 x i1> + %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W + ret <8 x i64> %2 +} + +define <8 x i64> @test_mm512_maskz_madd52hi_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) { +; X32-LABEL: test_mm512_maskz_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) + %1 = bitcast i8 %__M to <8 x i1> + %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer + ret <8 x i64> %2 +} + +define <8 x i64> @test_mm512_madd52lo_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) { +; X32-LABEL: test_mm512_madd52lo_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_madd52lo_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 +; X64-NEXT: retq +entry: + %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) + ret <8 x i64> %0 +} + +define <8 x i64> @test_mm512_mask_madd52lo_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) { +; X32-LABEL: test_mm512_mask_madd52lo_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_mask_madd52lo_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: 
kmovw %edi, %k1 +; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y) + %1 = bitcast i8 %__M to <8 x i1> + %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W + ret <8 x i64> %2 +} + +define <8 x i64> @test_mm512_maskz_madd52lo_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) { +; X32-LABEL: test_mm512_maskz_madd52lo_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_madd52lo_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) + %1 = bitcast i8 %__M to <8 x i1> + %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer + ret <8 x i64> %2 +} + +declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) +declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll new file mode 100644 index 00000000000..85fbe40928a --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll @@ -0,0 +1,266 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s + +declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res4 = add <8 x i64> %res, %res1 + %res5 = add <8 x i64> %res3, %res2 + %res6 = add <8 x i64> %res5, %res4 + ret <8 x i64> %res6 +} + +declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovdqa64 
%zmm0, %zmm4 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res4 = add <8 x i64> %res, %res1 + %res5 = add <8 x i64> %res3, %res2 + %res6 = add <8 x i64> %res5, %res4 + ret <8 x i64> %res6 +} + +declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res4 = add <8 x i64> %res, %res1 + %res5 = add <8 x i64> %res3, %res2 + %res6 = add <8 x i64> %res5, %res4 + ret <8 x i64> %res6 +} + +declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> 
@llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res4 = add <8 x i64> %res, %res1 + %res5 = add <8 x i64> %res3, %res2 + %res6 = add <8 x i64> %res5, %res4 + ret <8 x i64> %res6 +} + +define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr) { +; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + + %x2 = load <8 x i64>, <8 x i64>* %x2ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) { +; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 +; CHECK-NEXT: retq + + %x2load = load i64, i64* %x2ptr + %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0 + %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: retq + + %x1 = load <8 x i64>, <8 x i64>* %x1ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 +; CHECK-NEXT: retq + + %x1load = load i64, i64* %x1ptr + %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0 + %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + + %x2 = load <8 x i64>, <8 x i64>* %x2ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + + %x2load = load i64, i64* %x2ptr + %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0 + %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> 
@llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + + %x1 = load <8 x i64>, <8 x i64>* %x1ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + + %x1load = load i64, i64* %x1ptr + %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0 + %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + + %x2 = load <8 x i64>, <8 x i64>* %x2ptr + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + + %x2load = load i64, i64* %x2ptr + %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0 + %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + + %x1 = load <8 x i64>, <8 x i64>* %x1ptr + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + + %x1load = load i64, i64* %x1ptr + %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0 + %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> 
@llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll index 1217138b226..862a010e4fe 100644 --- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s -declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -19,24 +19,28 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res4 = add <8 x i64> %res, %res1 - %res5 = add <8 x i64> %res3, %res2 + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0 + %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0 + %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer) + %8 = bitcast i8 %x3 to <8 x i1> + %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer + %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %res4 = add <8 x i64> %3, %6 + %res5 = add <8 x i64> %10, %9 %res6 = add <8 x i64> %res5, %res4 ret <8 x i64> %res6 } -declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -47,24 +51,30 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x i64> 
@llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res4 = add <8 x i64> %res, %res1 - %res5 = add <8 x i64> %res3, %res2 + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer + %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer) + %8 = bitcast i8 %x3 to <8 x i1> + %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer + %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %res4 = add <8 x i64> %3, %6 + %res5 = add <8 x i64> %10, %9 %res6 = add <8 x i64> %res5, %res4 ret <8 x i64> %res6 } -declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>) define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -75,24 +85,28 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res4 = add <8 x i64> %res, %res1 - %res5 = add <8 x i64> %res3, %res2 + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0 + %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0 + %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer) + %8 = bitcast i8 %x3 to <8 x i1> + %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer + %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %res4 = add <8 x i64> %3, %6 + 
%res5 = add <8 x i64> %10, %9 %res6 = add <8 x i64> %res5, %res4 ret <8 x i64> %res6 } -declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -103,12 +117,18 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res4 = add <8 x i64> %res, %res1 - %res5 = add <8 x i64> %res3, %res2 + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer + %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer) + %8 = bitcast i8 %x3 to <8 x i1> + %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer + %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %res4 = add <8 x i64> %3, %6 + %res5 = add <8 x i64> %10, %9 %res6 = add <8 x i64> %res5, %res4 ret <8 x i64> %res6 } @@ -120,8 +140,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i ; CHECK-NEXT: retq %x2 = load <8 x i64>, <8 x i64>* %x2ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 } define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) { @@ -133,8 +153,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, %x2load = load i64, i64* %x2ptr %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0 %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 } define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) { @@ -144,8 +164,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0 ; 
CHECK-NEXT: retq %x1 = load <8 x i64>, <8 x i64>* %x1ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 } define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) { @@ -157,8 +177,8 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 %x1load = load i64, i64* %x1ptr %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0 %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 } define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) { @@ -169,8 +189,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, < ; CHECK-NEXT: retq %x2 = load <8 x i64>, <8 x i64>* %x2ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0 + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) { @@ -183,8 +205,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x2load = load i64, i64* %x2ptr %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0 %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0 + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) { @@ -195,8 +219,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64 ; CHECK-NEXT: retq %x1 = load <8 x i64>, <8 x i64>* %x1ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0 + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) { @@ -209,8 +235,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 %x1load = load i64, i64* %x1ptr %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0 %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 
x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0 + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) { @@ -221,8 +249,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, ; CHECK-NEXT: retq %x2 = load <8 x i64>, <8 x i64>* %x2ptr - %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) { @@ -235,8 +265,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x2load = load i64, i64* %x2ptr %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0 %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) { @@ -247,8 +279,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i6 ; CHECK-NEXT: retq %x1 = load <8 x i64>, <8 x i64>* %x1ptr - %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 } define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) { @@ -261,6 +295,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< %x1load = load i64, i64* %x1ptr %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0 %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 } diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll new file mode 100644 index 00000000000..0fe4cafa881 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: llc < %s 
-fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifmavl-builtins.c + +define <2 x i64> @test_mm_madd52hi_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) { +; X32-LABEL: test_mm_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 +; X64-NEXT: retq +entry: + %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) + ret <2 x i64> %0 +} + +define <2 x i64> @test_mm_mask_madd52hi_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) { +; X32-LABEL: test_mm_mask_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mask_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y) + %1 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W + ret <2 x i64> %2 +} + +define <2 x i64> @test_mm_maskz_madd52hi_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) { +; X32-LABEL: test_mm_maskz_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} +; X32-NEXT: retl +; +; X64-LABEL: test_mm_maskz_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) + %1 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer + ret <2 x i64> %2 +} + +define <4 x i64> @test_mm256_madd52hi_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) { +; X32-LABEL: test_mm256_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 +; X64-NEXT: retq +entry: + %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) + ret <4 x i64> %0 +} + +define <4 x i64> @test_mm256_mask_madd52hi_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) { +; X32-LABEL: test_mm256_mask_madd52hi_epu64: +; X32: # %bb.0: # %entry +; X32-NEXT: movb {{[0-9]+}}(%esp), %al +; X32-NEXT: kmovw %eax, %k1 +; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} +; X32-NEXT: retl +; +; X64-LABEL: test_mm256_mask_madd52hi_epu64: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} +; X64-NEXT: retq +entry: + %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y) + %1 = bitcast i8 %__M to <8 x i1> + %extract.i = shufflevector 
<8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52hi_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+ ret <4 x i64> %2
+}
+
+define <2 x i64> @test_mm_madd52lo_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52lo_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52lo_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52lo_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52lo_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52lo_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+ ret <4 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll
new file mode 100644
index 00000000000..8795574539b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
diff --git a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 40312c9f524..25c0cd16d4e 100644
--- a/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
 
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,33 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 
- %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
 %res6 = add <2 x i64> %res5, %res4
 ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
 
 define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +56,31 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 
- %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
 %res6 = add <4 x i64> %res5, %res4
 ret <4 x i64> %res6
 }
 
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
 define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +91,31 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 
- %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
 %res6 = add <2 x i64> %res5, %res4
 ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
 define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -103,24 +126,33 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 
- %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
 %res6 = add <4 x i64> %res5, %res4
 ret <4 x i64> %res6
 }
 
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -131,24 +163,33 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 
- %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
 %res6 = add <2 x i64> %res5, %res4
 ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
 
 define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -159,24 +200,31 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 
- %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
 %res6 = add <4 x i64> %res5, %res4
 ret <4 x i64> %res6
 }
 
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
 define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -187,24 +235,31 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 
- %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
 %res6 = add <2 x i64> %res5, %res4
 ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
 define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -215,12 +270,21 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
 
- %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
 %res6 = add <4 x i64> %res5, %res4
 ret <4 x i64> %res6
 }

