| author | Craig Topper <craig.topper@intel.com> | 2018-06-10 06:01:36 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-06-10 06:01:36 +0000 |
| commit | 98a79934af6cf88547ad3cc0ee6f08a746354c51 (patch) | |
| tree | b5fd58ca6c0e149433e605a97513503342e50286 | |
| parent | 69d6418d607fdfc36f621cb5fbd8c4ce3f4a89bc (diff) | |
| download | bcm5719-llvm-98a79934af6cf88547ad3cc0ee6f08a746354c51.tar.gz bcm5719-llvm-98a79934af6cf88547ad3cc0ee6f08a746354c51.zip | |
[X86] Remove masking from the 512-bit masked floating point add/sub/mul/div intrinsics. Use a select in IR instead.
llvm-svn: 334358
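In other words: instead of one intrinsic that bundles the operation, the passthru vector, the mask, and the rounding mode, the 512-bit FP add/sub/mul/div intrinsics now take only the two operands plus the rounding mode, and masking becomes a generic IR `select`. As a rough sketch (hypothetical function and value names; the intrinsic signatures follow the IntrinsicsX86.td hunk below), the auto-upgrader rewrites a call to the old intrinsic like this:

```llvm
; Old shape: passthru, mask and rounding mode all live inside the intrinsic.
declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>,
                                                      <16 x float>, i16, i32)

; New shape: the intrinsic carries only the rounding mode; masking is a plain
; select that the rest of the optimizer already understands.
declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @upgraded_masked_add(<16 x float> %a, <16 x float> %b,
                                         <16 x float> %passthru, i16 %mask) {
  ; i32 0 selects {rn-sae}; the rounding-mode constants appear in the tests below.
  %op  = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 0)
  %m   = bitcast i16 %mask to <16 x i1>
  %sel = select <16 x i1> %m, <16 x float> %op, <16 x float> %passthru
  ret <16 x float> %sel
}
```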
| -rw-r--r-- | llvm/include/llvm/IR/IntrinsicsX86.td | 33 |
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 78 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 19 |
| -rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 24 |
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 67 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 861 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 367 |
| -rw-r--r-- | llvm/test/Transforms/InstCombine/X86/x86-avx512.ll | 208 |
8 files changed, 1319 insertions, 338 deletions
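One consequence worth calling out before the diff (a sketch under the same assumptions as above): with masking gone from the intrinsic, the rounding operand moves to position 2, and InstCombine can fold a call with rounding mode 4, i.e. CUR_DIRECTION, straight into the corresponding IR operation with no mask handling left to do:

```llvm
declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)

; Rounding mode 4 (CUR_DIRECTION) means "no special rounding requested", so
; InstCombine replaces the call with a regular fadd ...
define <16 x float> @fold_cur_direction(<16 x float> %a, <16 x float> %b) {
  %res = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
  ret <16 x float> %res
}
; ... leaving simply:
;   %res = fadd <16 x float> %a, %b
```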
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 7c3f0a31496..c36068423bb 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -3744,30 +3744,31 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Arithmetic ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">,
+  def int_x86_avx512_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
+                     llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+                     llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                      llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 1172464ad03..eefd034b0b9 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -216,22 +216,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
          Name.startswith("avx512.cvtw2mask.") || // Added in 7.0
          Name.startswith("avx512.cvtd2mask.") || // Added in 7.0
          Name.startswith("avx512.cvtq2mask.") || // Added in 7.0
-         Name == "avx512.mask.add.pd.128" || // Added in 4.0
-         Name == "avx512.mask.add.pd.256" || // Added in 4.0
-         Name == "avx512.mask.add.ps.128" || // Added in 4.0
-         Name == "avx512.mask.add.ps.256" || // Added in 4.0
-         Name == "avx512.mask.div.pd.128" || // Added in 4.0
-         Name == "avx512.mask.div.pd.256" || // Added in 4.0
-         Name == "avx512.mask.div.ps.128" || // Added in 4.0
-         Name == "avx512.mask.div.ps.256" || // Added in 4.0
-         Name == "avx512.mask.mul.pd.128" || // Added in 4.0
-         Name == "avx512.mask.mul.pd.256" || // Added in 4.0
-         Name == "avx512.mask.mul.ps.128" || // Added in 4.0
-         Name == "avx512.mask.mul.ps.256" || // Added in 4.0
-         Name == "avx512.mask.sub.pd.128" || // Added in 4.0
-         Name == "avx512.mask.sub.pd.256" || // Added in 4.0
-         Name == "avx512.mask.sub.ps.128" || // Added in 4.0
-         Name == "avx512.mask.sub.ps.256" || // Added in 4.0
          Name == "avx512.mask.max.pd.128" || // Added in 5.0
          Name == "avx512.mask.max.pd.256" || // Added in 5.0
          Name == "avx512.mask.max.ps.128" || // Added in 5.0
@@ -278,6 +262,10 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
          Name.startswith("avx512.maskz.vpdpwssd.") || // Added in 7.0
          Name.startswith("avx512.mask.vpdpwssds.") || // Added in 7.0
          Name.startswith("avx512.maskz.vpdpwssds.") || // Added in 7.0
+         Name.startswith("avx512.mask.add.p") || // Added in 7.0
+         Name.startswith("avx512.mask.sub.p") || // Added in 7.0
+         Name.startswith("avx512.mask.mul.p") || // Added in 7.0
+         Name.startswith("avx512.mask.div.p") || // Added in 7.0
          Name == "sse.cvtsi2ss" || // Added in 7.0
          Name == "sse.cvtsi642ss" || // Added in 7.0
          Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2213,20 +2201,68 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1));
     Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                         CI->getArgOperand(2));
-  } else if (IsX86 && (Name.startswith("avx512.mask.add.p"))) {
-    Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
+  } else if (IsX86 && Name.startswith("avx512.mask.add.p")) {
+    if (Name.endswith(".512")) {
+      Intrinsic::ID IID;
+      if (Name[17] == 's')
+        IID = Intrinsic::x86_avx512_add_ps_512;
+      else
+        IID = Intrinsic::x86_avx512_add_pd_512;
+
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                               { CI->getArgOperand(0), CI->getArgOperand(1),
+                                 CI->getArgOperand(4) });
+    } else {
+      Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
+    }
     Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                         CI->getArgOperand(2));
   } else if (IsX86 && Name.startswith("avx512.mask.div.p")) {
-    Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
+    if (Name.endswith(".512")) {
+      Intrinsic::ID IID;
+      if (Name[17] == 's')
+        IID = Intrinsic::x86_avx512_div_ps_512;
+      else
+        IID = Intrinsic::x86_avx512_div_pd_512;
+
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                               { CI->getArgOperand(0), CI->getArgOperand(1),
+                                 CI->getArgOperand(4) });
+    } else {
+      Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
+    }
     Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                         CI->getArgOperand(2));
   } else if (IsX86 && Name.startswith("avx512.mask.mul.p")) {
-    Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
+    if (Name.endswith(".512")) {
+      Intrinsic::ID IID;
+      if (Name[17] == 's')
+        IID = Intrinsic::x86_avx512_mul_ps_512;
+      else
+        IID = Intrinsic::x86_avx512_mul_pd_512;
+
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                               { CI->getArgOperand(0), CI->getArgOperand(1),
+                                 CI->getArgOperand(4) });
+    } else {
+      Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
+    }
     Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                         CI->getArgOperand(2));
   } else if (IsX86 && Name.startswith("avx512.mask.sub.p")) {
-    Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
+    if (Name.endswith(".512")) {
+      Intrinsic::ID IID;
+      if (Name[17] == 's')
+        IID = Intrinsic::x86_avx512_sub_ps_512;
+      else
+        IID = Intrinsic::x86_avx512_sub_pd_512;
+
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                               { CI->getArgOperand(0), CI->getArgOperand(1),
+                                 CI->getArgOperand(4) });
+    } else {
+      Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
+    }
     Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                         CI->getArgOperand(2));
   } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 72c455132b4..87fa5b23b07 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20312,9 +20312,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   switch(IntrData->Type) {
   case INTR_TYPE_1OP:
     return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
-  case INTR_TYPE_2OP:
-    return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
-                       Op.getOperand(2));
+  case INTR_TYPE_2OP: {
+    // We specify 2 possible opcodes for intrinsics with rounding modes.
+    // First, we check if the intrinsic may have non-default rounding mode,
+    // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+    unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+    if (IntrWithRoundingModeOpcode != 0) {
+      SDValue Rnd = Op.getOperand(3);
+      if (!isRoundModeCurDirection(Rnd)) {
+        return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+                           Op.getOperand(1), Op.getOperand(2), Rnd);
+      }
+    }
+
+    return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
   case INTR_TYPE_3OP:
     return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 9e3810b10ca..124c2d8dd0a 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -449,6 +449,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+  X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
@@ -463,12 +465,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
   X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+  X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
   X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
   X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
-  X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
-                     X86ISD::FADD_RND),
-  X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
-                     X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FADDS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -658,10 +658,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::DBPSADBW, 0),
   X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
                      X86ISD::DBPSADBW, 0),
-  X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
-                     X86ISD::FDIV_RND),
-  X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
-                     X86ISD::FDIV_RND),
   X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FDIVS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -766,10 +762,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::FMINS, X86ISD::FMINS_RND),
   X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
                      X86ISD::FMINS, X86ISD::FMINS_RND),
-  X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
-                     X86ISD::FMUL_RND),
-  X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
-                     X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMULS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -990,10 +982,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::FSQRTS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FSQRTS_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
-                     X86ISD::FSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
-                     X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FSUBS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
@@ -1197,6 +1185,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+  X86_INTRINSIC_DATA(avx512_mul_pd_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+  X86_INTRINSIC_DATA(avx512_mul_ps_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -1289,6 +1279,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+  X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+  X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index ca2b38dce28..9eb8d5d1aea 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1034,36 +1034,6 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II,
   return nullptr;
 }

-// Emit a select instruction and appropriate bitcasts to help simplify
-// masked intrinsics.
-static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
-                                InstCombiner::BuilderTy &Builder) {
-  unsigned VWidth = Op0->getType()->getVectorNumElements();
-
-  // If the mask is all ones we don't need the select. But we need to check
-  // only the bit thats will be used in case VWidth is less than 8.
-  if (auto *C = dyn_cast<ConstantInt>(Mask))
-    if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
-      return Op0;
-
-  auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
-                                 cast<IntegerType>(Mask->getType())->getBitWidth());
-  Mask = Builder.CreateBitCast(Mask, MaskTy);
-
-  // If we have less than 8 elements, then the starting mask was an i8 and
-  // we need to extract down to the right number of elements.
-  if (VWidth < 8) {
-    uint32_t Indices[4];
-    for (unsigned i = 0; i != VWidth; ++i)
-      Indices[i] = i;
-    Mask = Builder.CreateShuffleVector(Mask, Mask,
-                                       makeArrayRef(Indices, VWidth),
-                                       "extract");
-  }
-
-  return Builder.CreateSelect(Mask, Op0, Op1);
-}
-
 static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
@@ -2341,17 +2311,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   }

-  case Intrinsic::x86_avx512_mask_add_ps_512:
-  case Intrinsic::x86_avx512_mask_div_ps_512:
-  case Intrinsic::x86_avx512_mask_mul_ps_512:
-  case Intrinsic::x86_avx512_mask_sub_ps_512:
-  case Intrinsic::x86_avx512_mask_add_pd_512:
-  case Intrinsic::x86_avx512_mask_div_pd_512:
-  case Intrinsic::x86_avx512_mask_mul_pd_512:
-  case Intrinsic::x86_avx512_mask_sub_pd_512:
+  case Intrinsic::x86_avx512_add_ps_512:
+  case Intrinsic::x86_avx512_div_ps_512:
+  case Intrinsic::x86_avx512_mul_ps_512:
+  case Intrinsic::x86_avx512_sub_ps_512:
+  case Intrinsic::x86_avx512_add_pd_512:
+  case Intrinsic::x86_avx512_div_pd_512:
+  case Intrinsic::x86_avx512_mul_pd_512:
+  case Intrinsic::x86_avx512_sub_pd_512:
     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
     // IR operations.
-    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
       if (R->getValue() == 4) {
         Value *Arg0 = II->getArgOperand(0);
         Value *Arg1 = II->getArgOperand(1);
@@ -2359,27 +2329,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
         Value *V;
         switch (II->getIntrinsicID()) {
         default: llvm_unreachable("Case stmts out of sync!");
-        case Intrinsic::x86_avx512_mask_add_ps_512:
-        case Intrinsic::x86_avx512_mask_add_pd_512:
+        case Intrinsic::x86_avx512_add_ps_512:
+        case Intrinsic::x86_avx512_add_pd_512:
           V = Builder.CreateFAdd(Arg0, Arg1);
           break;
-        case Intrinsic::x86_avx512_mask_sub_ps_512:
-        case Intrinsic::x86_avx512_mask_sub_pd_512:
+        case Intrinsic::x86_avx512_sub_ps_512:
+        case Intrinsic::x86_avx512_sub_pd_512:
          V = Builder.CreateFSub(Arg0, Arg1);
           break;
-        case Intrinsic::x86_avx512_mask_mul_ps_512:
-        case Intrinsic::x86_avx512_mask_mul_pd_512:
+        case Intrinsic::x86_avx512_mul_ps_512:
+        case Intrinsic::x86_avx512_mul_pd_512:
          V = Builder.CreateFMul(Arg0, Arg1);
           break;
-        case Intrinsic::x86_avx512_mask_div_ps_512:
-        case Intrinsic::x86_avx512_mask_div_pd_512:
+        case Intrinsic::x86_avx512_div_ps_512:
+        case Intrinsic::x86_avx512_div_pd_512:
           V = Builder.CreateFDiv(Arg0, Arg1);
           break;
         }

-        // Create a select for the masking.
-        V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
-                              Builder);
         return replaceInstUsesWith(*II, V);
       }
     }
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 9b6a1e8f633..7803ff75012 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -6941,3 +6941,864 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
   %res2 = add <16 x i32> %res, %res1
   ret <16 x i32> %res2
 }
+
+declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vsubps_rn:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vsubps_rd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vsubps_ru:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vsubps_rz:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x78,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 3)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vmulps_rn:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x59,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vmulps_rd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x59,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vmulps_ru:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x59,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
+; CHECK-LABEL: test_vmulps_rz:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x78,0x59,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 -1, i32 3)
+  ret <16 x float> %res
+}
+
+;; mask float
+define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_rn:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_rn:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_rd:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_rd:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_ru:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_ru:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_rz:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_rz:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> zeroinitializer, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+;; With Passthru value
+define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_passthru_rn:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x59,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_passthru_rn:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x59,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_passthru_rd:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x59,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_passthru_rd:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x59,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_passthru_ru:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x59,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_passthru_ru:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x59,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
+; X86-LABEL: test_vmulps_mask_passthru_rz:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x59,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulps_mask_passthru_rz:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x59,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
+                    <16 x float> %passthru, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+;; mask double
+define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+; X86-LABEL: test_vmulpd_mask_rn:
+; X86: ## %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulpd_mask_rn:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 0)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+; X86-LABEL: test_vmulpd_mask_rd:
+; X86: ## %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulpd_mask_rd:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 1)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+; X86-LABEL: test_vmulpd_mask_ru:
+; X86: ## %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulpd_mask_ru:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 2)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
+; X86-LABEL: test_vmulpd_mask_rz:
+; X86: ## %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xf9,0x59,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vmulpd_mask_rz:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xf9,0x59,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+                    <8 x double> zeroinitializer, i8 %mask, i32 3)
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x58,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x58,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x58,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x58,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x58,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x58,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x58,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x58,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_add_round_ps_current:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x58,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_add_round_ps_current:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x58,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_add_round_ps_rn_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x58,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_add_round_ps_rn_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x58,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_add_round_ps_rd_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x58,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_add_round_ps_rd_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x58,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_add_round_ps_ru_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x58,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_add_round_ps_ru_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x58,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_add_round_ps_rz_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x58,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_add_round_ps_rz_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x58,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_add_round_ps_current:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x58,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_add_round_ps_current:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x58,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x58,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x58,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x58,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x78,0x58,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_add_round_ps_current:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5c,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5c,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x5c,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x5c,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x5c,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x5c,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x5c,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x5c,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_sub_round_ps_current:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5c,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_sub_round_ps_current:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5c,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x78,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_sub_round_ps_current:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x5c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x5e,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x5e,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x5e,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x5e,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x5e,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x5e,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x5e,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x5e,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; X86-LABEL: test_mm512_maskz_div_round_ps_current:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x5e,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_div_round_ps_current:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x5e,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_div_round_ps_rn_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5e,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_div_round_ps_rn_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x5e,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_div_round_ps_rd_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x5e,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_div_round_ps_rd_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x5e,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_div_round_ps_ru_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x5e,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_div_round_ps_ru_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x5e,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_div_round_ps_rz_sae:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x5e,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_div_round_ps_rz_sae:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x5e,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
+; X86-LABEL: test_mm512_mask_div_round_ps_current:
+; X86: ## %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5e,0xd1]
+; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_div_round_ps_current:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x5e,0xd1]
+; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x5e,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x5e,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
+  ret <16 x
float> %res +} +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_ru_sae: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_rz_sae: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x78,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { +; CHECK-LABEL: test_mm512_div_round_ps_current: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x5e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index daf97b2d133..5c656031f91 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1186,18 +1186,17 @@ define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) { declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 ) -declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) -declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) -declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) +declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) +declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) +declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: test_vsubps_rn: ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + ret <16 x float> %1 } define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) { @@ -1205,9 +1204,8 @@ define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + ret <16 x 
float> %1 } define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) { @@ -1215,9 +1213,8 @@ define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + ret <16 x float> %1 } define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) { @@ -1225,9 +1222,8 @@ define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + ret <16 x float> %1 } define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) { @@ -1235,9 +1231,8 @@ define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + ret <16 x float> %1 } define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) { @@ -1245,9 +1240,8 @@ define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + ret <16 x float> %1 } define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) { @@ -1255,9 +1249,8 @@ define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + ret <16 x float> %1 } define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) { @@ -1265,9 +1258,8 @@ define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 -1, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + ret <16 x float> %1 } ;; mask float @@ -1277,9 +1269,10 @@ define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> 
@llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1288,9 +1281,10 @@ define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1299,9 +1293,10 @@ define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1310,9 +1305,10 @@ define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> zeroinitializer, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } ;; With Passthru value @@ -1323,9 +1319,10 @@ define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> %passthru, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 } define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { @@ -1335,9 +1332,10 @@ define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, 
- <16 x float> %passthru, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 } define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { @@ -1347,9 +1345,10 @@ define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> %passthru, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 } define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) { @@ -1359,9 +1358,10 @@ define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, - <16 x float> %passthru, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 } ;; mask double @@ -1371,9 +1371,10 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, - <8 x double> zeroinitializer, i8 %mask, i32 0) - ret <8 x double> %res + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 } define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) { @@ -1382,9 +1383,10 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, - <8 x double> zeroinitializer, i8 %mask, i32 1) - ret <8 x double> %res + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 1) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 } define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) { @@ -1393,9 +1395,10 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, - <8 x double> zeroinitializer, i8 %mask, i32 2) - ret <8 x double> %res + %1 = 
call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 2) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 } define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) { @@ -1404,9 +1407,10 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, - <8 x double> zeroinitializer, i8 %mask, i32 3) - ret <8 x double> %res + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 3) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 } define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1415,26 +1419,34 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } + define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } + define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1443,19 +1455,22 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> 
@llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } - define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_add_round_ps_current: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1465,9 +1480,12 @@ define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } + define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae: ; CHECK: ## %bb.0: @@ -1475,9 +1493,12 @@ define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } + define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae: ; CHECK: ## %bb.0: @@ -1485,8 +1506,10 @@ define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1496,11 +1519,12 @@ define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x 
float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } - define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_add_round_ps_current: ; CHECK: ## %bb.0: @@ -1508,34 +1532,37 @@ define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } - define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_add_round_ps_rn_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + ret <16 x float> %1 } + define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_add_round_ps_rd_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + ret <16 x float> %1 } + define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_add_round_ps_ru_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + ret <16 x float> %1 } define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1543,8 +1570,8 @@ define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x floa ; CHECK: ## %bb.0: ; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + ret <16 x float> %1 } define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1552,10 +1579,10 @@ define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK: ## %bb.0: ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 
; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 } -declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae: @@ -1564,9 +1591,12 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } + define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae: ; CHECK: ## %bb.0: @@ -1574,9 +1604,12 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } + define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae: ; CHECK: ## %bb.0: @@ -1584,8 +1617,10 @@ define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1595,11 +1630,12 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x
float> %3 } - define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_sub_round_ps_current: ; CHECK: ## %bb.0: @@ -1607,8 +1643,10 @@ define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1616,24 +1654,26 @@ define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x floa ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + ret <16 x float> %1 } + define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + ret <16 x float> %1 } + define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + ret <16 x float> %1 } define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1641,8 +1681,8 @@ define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x floa ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + ret <16 x float> %1 } define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1650,8 +1690,8 @@ define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK: ## %bb.0: ; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x 
float> %1 } define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1660,26 +1700,34 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } + define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } + define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1688,19 +1736,22 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } - define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_div_round_ps_current: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } define <16 x float> 
@test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1710,9 +1761,12 @@ define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } + define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae: ; CHECK: ## %bb.0: @@ -1720,9 +1774,12 @@ define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } + define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae: ; CHECK: ## %bb.0: @@ -1730,8 +1787,10 @@ define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { @@ -1741,11 +1800,12 @@ define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x ; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } - define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) { ; CHECK-LABEL: test_mm512_mask_div_round_ps_current: ; CHECK: ## %bb.0: @@ -1753,34 +1813,37 @@ define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x 
float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 } - define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_div_round_ps_rn_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0) + ret <16 x float> %1 } + define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_div_round_ps_rd_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 1) + ret <16 x float> %1 } + define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_div_round_ps_ru_sae: ; CHECK: ## %bb.0: ; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 2) + ret <16 x float> %1 } define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1788,8 +1851,8 @@ define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x floa ; CHECK: ## %bb.0: ; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 3) + ret <16 x float> %1 } define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) { @@ -1797,10 +1860,10 @@ define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK: ## %bb.0: ; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 } -declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae: diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll index e13ffc50535..4a5ae932ca1 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll +++ 
b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -2554,23 +2554,23 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt ret <64 x i8> %3 } -declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_add_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; - %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4) + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) ret <16 x float> %1 } define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_add_ps_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; - %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8) + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8) ret <16 x float> %1 } @@ -2581,36 +2581,42 @@ define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP3]] ; - %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) - ret <16 x float> %1 + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c + ret <16 x float> %3 } define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; CHECK-LABEL: @test_add_ps_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8) -; CHECK-NEXT: ret <16 x float> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]] +; CHECK-NEXT: ret <16 x float> [[TMP3]] ; - %1 = tail call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8) - ret <16 x float> %1 + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c + ret <16 x float> %3 } -declare <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) +declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_add_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 
x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; - %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4) + %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4) ret <8 x double> %1 } define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @test_add_pd_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] ; - %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8) + %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 8) ret <8 x double> %1 } @@ -2621,36 +2627,42 @@ define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP3]] ; - %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) - ret <8 x double> %1 + %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c + ret <8 x double> %3 } define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_add_pd_mask_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) -; CHECK-NEXT: ret <8 x double> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]] +; CHECK-NEXT: ret <8 x double> [[TMP3]] ; - %1 = tail call <8 x double> @llvm.x86.avx512.mask.add.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8) - ret <8 x double> %1 + %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 8) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c + ret <8 x double> %3 } -declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_sub_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] ; - %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4) + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4) ret <16 x float> %1 } define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @test_sub_ps_round( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x float> 
@llvm.x86.avx512.mask.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8)
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
   ret <16 x float> %1
 }
 
@@ -2661,36 +2673,42 @@ define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP3]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
-  ret <16 x float> %1
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
 }
 
 define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; CHECK-LABEL: @test_sub_ps_mask_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8)
-  ret <16 x float> %1
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @test_sub_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4)
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
   ret <8 x double> %1
 }
 
 define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @test_sub_pd_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8)
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
   ret <8 x double> %1
 }
 
@@ -2701,36 +2719,42 @@ define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP3]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
-  ret <8 x double> %1
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
 }
 
 define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_sub_pd_mask_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.sub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8)
-  ret <8 x double> %1
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
 }
 
-declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @test_mul_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4)
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
   ret <16 x float> %1
 }
 
 define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @test_mul_ps_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8)
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
   ret <16 x float> %1
 }
 
@@ -2741,36 +2765,42 @@ define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP3]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
-  ret <16 x float> %1
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
 }
 
 define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; CHECK-LABEL: @test_mul_ps_mask_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8)
-  ret <16 x float> %1
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @test_mul_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4)
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
   ret <8 x double> %1
 }
 
 define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @test_mul_pd_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8)
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
  ret <8 x double> %1
 }
 
@@ -2781,36 +2811,42 @@ define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP3]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
-  ret <8 x double> %1
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
 }
 
 define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mul_pd_mask_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8)
-  ret <8 x double> %1
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
 }
 
-declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @test_div_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 4)
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
   ret <16 x float> %1
 }
 
 define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @test_div_ps_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> undef, i16 -1, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> undef, i16 -1, i32 8)
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
   ret <16 x float> %1
 }
 
@@ -2821,36 +2857,42 @@ define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP3]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
-  ret <16 x float> %1
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
 }
 
 define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; CHECK-LABEL: @test_div_ps_mask_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], <16 x float> [[C:%.*]], i16 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
 ;
-  %1 = tail call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 8)
-  ret <16 x float> %1
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @test_div_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 4)
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
   ret <8 x double> %1
 }
 
 define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @test_div_pd_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> undef, i8 -1, i32 8)
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
   ret <8 x double> %1
 }
 
@@ -2861,17 +2903,23 @@ define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP3]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
-  ret <8 x double> %1
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
 }
 
 define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_div_pd_mask_round(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], <8 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
 ;
-  %1 = tail call <8 x double> @llvm.x86.avx512.mask.div.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 8)
-  ret <8 x double> %1
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
 }
 
 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
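The shape these updated tests check is the same one AutoUpgrade now emits when it rewrites the retired llvm.x86.avx512.mask.* intrinsics: run the unmasked operation, then blend with a select. Below is a minimal standalone sketch of that pattern; the function name @example_masked_add and the %passthru parameter are illustrative only (not taken from the patch), while the declared intrinsic matches the new signature in IntrinsicsX86.td. Per the *_mask_round tests above, InstCombine leaves this form alone when the rounding operand is i32 8, and folds the call to a plain FP instruction when it is i32 4 (CUR_DIRECTION).

; Masking expressed in generic IR around the unmasked 512-bit intrinsic.
declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @example_masked_add(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
  ; Unmasked add; i32 8 requests static rounding, so the intrinsic is kept.
  %res = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
  ; The i16 mask becomes one i1 per lane...
  %m = bitcast i16 %mask to <16 x i1>
  ; ...and select keeps the passthru value in masked-off lanes.
  %blend = select <16 x i1> %m, <16 x float> %res, <16 x float> %passthru
  ret <16 x float> %blend
}

Expressing the mask as a bitcast-plus-select, rather than baking it into each intrinsic, is what lets the generic select and vector optimizations handle masked arithmetic without per-intrinsic special cases.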

