Diffstat (limited to 'llvm')
 llvm/include/llvm/IR/IntrinsicsX86.td                            |  51
 llvm/lib/IR/AutoUpgrade.cpp                                      | 139
 llvm/lib/Target/X86/X86ISelLowering.cpp                          |  33
 llvm/lib/Target/X86/X86InstrAVX512.td                            |  22
 llvm/lib/Target/X86/X86InstrFMA.td                               |   7
 llvm/lib/Target/X86/X86IntrinsicsInfo.h                          |  17
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp             |  10
 llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp  |  37
 llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll               | 667
 llvm/test/CodeGen/X86/avx512-intrinsics.ll                       | 487
 llvm/test/CodeGen/X86/avx512-scalar_mask.ll                      |  16
 llvm/test/CodeGen/X86/fma-fneg-combine.ll                        |  26
 llvm/test/Transforms/InstCombine/X86/x86-avx512.ll               | 660
13 files changed, 1710 insertions(+), 462 deletions(-)
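
In brief, this change removes the ten FIXME-marked masked scalar FMA intrinsics (avx512.mask/maskz/mask3 vfmadd, mask3 vfmsub, and mask3 vfnmsub, for both ss and sd) and teaches AutoUpgrade.cpp to rewrite old calls into plain IR. The following is a rough sketch of what the upgrader now emits for the "mask" vfmadd.sd form with the default rounding mode (i32 4); the function name is illustrative, and the pattern mirrors the rewritten IR visible in the avx512-intrinsics.ll hunk below:

    declare double @llvm.fma.f64(double, double, double)

    ; Was: %r = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(
    ;          <2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %m, i32 4)
    define <2 x double> @upgraded_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1,
                                                 <2 x double> %x2, i8 %m) {
      ; The FMA is performed on the low elements only.
      %a = extractelement <2 x double> %x0, i64 0
      %b = extractelement <2 x double> %x1, i64 0
      %c = extractelement <2 x double> %x2, i64 0
      %f = call double @llvm.fma.f64(double %a, double %b, double %c)
      ; Bit 0 of the mask selects between the result and the passthru,
      ; which for the "mask" form is element 0 of %x0.
      %mv = bitcast i8 %m to <8 x i1>
      %m0 = extractelement <8 x i1> %mv, i64 0
      %sel = select i1 %m0, double %f, double %a
      %r = insertelement <2 x double> %x0, double %sel, i64 0
      ret <2 x double> %r
    }

For the maskz forms the passthru is 0.0, and for the mask3 forms it is element 0 of %x2 (with the result inserted back into %x2); calls carrying a non-default rounding mode are upgraded to the llvm.x86.avx512.vfmadd.f64/.f32 intrinsics instead of llvm.fma.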
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 95a4c341d64..905afc130d8 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1933,57 +1933,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_vfmadd_sd : // FIXME: Remove - Intrinsic<[llvm_v2f64_ty], - [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_vfmadd_ss : // FIXME: Remove - Intrinsic<[llvm_v4f32_ty], - [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_maskz_vfmadd_sd : // FIXME: Remove - Intrinsic<[llvm_v2f64_ty], - [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_maskz_vfmadd_ss : // FIXME: Remove - Intrinsic<[llvm_v4f32_ty], - [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask3_vfmadd_sd : // FIXME: Remove - Intrinsic<[llvm_v2f64_ty], - [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask3_vfmadd_ss : // FIXME: Remove - Intrinsic<[llvm_v4f32_ty], - [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask3_vfmsub_sd : // FIXME: Remove - Intrinsic<[llvm_v2f64_ty], - [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask3_vfmsub_ss : // FIXME: Remove - Intrinsic<[llvm_v4f32_ty], - [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask3_vfnmsub_sd : // FIXME: Remove - Intrinsic<[llvm_v2f64_ty], - [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - - def int_x86_avx512_mask3_vfnmsub_ss : // FIXME: Remove - Intrinsic<[llvm_v4f32_ty], - [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, - llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_vpmadd52h_uq_128 : GCCBuiltin<"__builtin_ia32_vpmadd52huq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 9c9e5570184..ef62a23b535 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -81,17 +81,17 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("fma.vfmsubadd.") || // Added in 7.0 Name.startswith("fma.vfnmadd.") || // Added in 7.0 Name.startswith("fma.vfnmsub.") || // Added in 7.0 - Name.startswith("avx512.mask.vfmadd.p") || // Added in 7.0 - Name.startswith("avx512.mask.vfnmadd.p") || // Added in 7.0 - Name.startswith("avx512.mask.vfnmsub.p") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmadd.p") || // Added in 7.0 - Name.startswith("avx512.maskz.vfmadd.p") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmsub.p") || // Added in 7.0 - Name.startswith("avx512.mask3.vfnmsub.p") || // Added in 7.0 - Name.startswith("avx512.mask.vfmaddsub.p") || // Added in 7.0 - Name.startswith("avx512.maskz.vfmaddsub.p") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmaddsub.p") || // Added in 7.0 - Name.startswith("avx512.mask3.vfmsubadd.p") || // Added in 7.0 + Name.startswith("avx512.mask.vfmadd.") || // Added in 7.0 + Name.startswith("avx512.mask.vfnmadd.") || // Added in 7.0 + 
Name.startswith("avx512.mask.vfnmsub.") || // Added in 7.0 + Name.startswith("avx512.mask3.vfmadd.") || // Added in 7.0 + Name.startswith("avx512.maskz.vfmadd.") || // Added in 7.0 + Name.startswith("avx512.mask3.vfmsub.") || // Added in 7.0 + Name.startswith("avx512.mask3.vfnmsub.") || // Added in 7.0 + Name.startswith("avx512.mask.vfmaddsub.") || // Added in 7.0 + Name.startswith("avx512.maskz.vfmaddsub.") || // Added in 7.0 + Name.startswith("avx512.mask3.vfmaddsub.") || // Added in 7.0 + Name.startswith("avx512.mask3.vfmsubadd.") || // Added in 7.0 Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 Name.startswith("avx512.mask.shuf.f") || // Added in 6.0 Name.startswith("avx512.kunpck") || //added in 6.0 @@ -826,7 +826,7 @@ static Value *getX86MaskVec(IRBuilder<> &Builder, Value *Mask, static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask, Value *Op0, Value *Op1) { - // If the mask is all ones just emit the align operation. + // If the mask is all ones just emit the first operation. if (const auto *C = dyn_cast<Constant>(Mask)) if (C->isAllOnesValue()) return Op0; @@ -835,6 +835,21 @@ static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask, return Builder.CreateSelect(Mask, Op0, Op1); } +static Value *EmitX86ScalarSelect(IRBuilder<> &Builder, Value *Mask, + Value *Op0, Value *Op1) { + // If the mask is all ones just emit the first operation. + if (const auto *C = dyn_cast<Constant>(Mask)) + if (C->isAllOnesValue()) + return Op0; + + llvm::VectorType *MaskTy = + llvm::VectorType::get(Builder.getInt1Ty(), + Mask->getType()->getIntegerBitWidth()); + Mask = Builder.CreateBitCast(Mask, MaskTy); + Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); + return Builder.CreateSelect(Mask, Op0, Op1); +} + // Handle autoupgrade for masked PALIGNR and VALIGND/Q intrinsics. // PALIGNR handles large immediates by shifting while VALIGN masks the immediate // so we need to handle both cases. VALIGN also doesn't have 128-bit lanes. @@ -2806,6 +2821,64 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); + } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.s") || + Name.startswith("avx512.maskz.vfmadd.s") || + Name.startswith("avx512.mask3.vfmadd.s") || + Name.startswith("avx512.mask3.vfmsub.s") || + Name.startswith("avx512.mask3.vfnmsub.s"))) { + bool IsMask3 = Name[11] == '3'; + bool IsMaskZ = Name[11] == 'z'; + // Drop the "avx512.mask." to make it easier. + Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12); + bool NegMul = Name[2] == 'n'; + bool NegAcc = NegMul ? 
Name[4] == 's' : Name[3] == 's'; + + Value *A = CI->getArgOperand(0); + Value *B = CI->getArgOperand(1); + Value *C = CI->getArgOperand(2); + + if (NegMul && (IsMask3 || IsMaskZ)) + A = Builder.CreateFNeg(A); + if (NegMul && !(IsMask3 || IsMaskZ)) + B = Builder.CreateFNeg(B); + if (NegAcc) + C = Builder.CreateFNeg(C); + + A = Builder.CreateExtractElement(A, (uint64_t)0); + B = Builder.CreateExtractElement(B, (uint64_t)0); + C = Builder.CreateExtractElement(C, (uint64_t)0); + + if (!isa<ConstantInt>(CI->getArgOperand(4)) || + cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4) { + Value *Ops[] = { A, B, C, CI->getArgOperand(4) }; + + Intrinsic::ID IID; + if (Name.back() == 'd') + IID = Intrinsic::x86_avx512_vfmadd_f64; + else + IID = Intrinsic::x86_avx512_vfmadd_f32; + Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID); + Rep = Builder.CreateCall(FMA, Ops); + } else { + Function *FMA = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::fma, + A->getType()); + Rep = Builder.CreateCall(FMA, { A, B, C }); + } + + Value *PassThru = IsMaskZ ? Constant::getNullValue(Rep->getType()) : + IsMask3 ? C : A; + + // For Mask3 with NegAcc, we need to create a new extractelement that + // avoids the negation above. + if (NegAcc && IsMask3) + PassThru = Builder.CreateExtractElement(CI->getArgOperand(2), + (uint64_t)0); + + Rep = EmitX86ScalarSelect(Builder, CI->getArgOperand(3), + Rep, PassThru); + Rep = Builder.CreateInsertElement(CI->getArgOperand(IsMask3 ? 2 : 0), + Rep, (uint64_t)0); } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.p") || Name.startswith("avx512.mask.vfnmadd.p") || Name.startswith("avx512.mask.vfnmsub.p") || @@ -2820,6 +2893,17 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { bool NegMul = Name[2] == 'n'; bool NegAcc = NegMul ? Name[4] == 's' : Name[3] == 's'; + Value *A = CI->getArgOperand(0); + Value *B = CI->getArgOperand(1); + Value *C = CI->getArgOperand(2); + + if (NegMul && (IsMask3 || IsMaskZ)) + A = Builder.CreateFNeg(A); + if (NegMul && !(IsMask3 || IsMaskZ)) + B = Builder.CreateFNeg(B); + if (NegAcc) + C = Builder.CreateFNeg(C); + if (CI->getNumArgOperands() == 5 && (!isa<ConstantInt>(CI->getArgOperand(4)) || cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) { @@ -2830,38 +2914,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(4) }; - - if (NegMul) { - if (IsMaskZ || IsMask3) - Ops[0] = Builder.CreateFNeg(Ops[0]); - else - Ops[1] = Builder.CreateFNeg(Ops[1]); - } - if (NegAcc) - Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Ops); + { A, B, C, CI->getArgOperand(4) }); } else { - - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - - if (NegMul) { - if (IsMaskZ || IsMask3) - Ops[0] = Builder.CreateFNeg(Ops[0]); - else - Ops[1] = Builder.CreateFNeg(Ops[1]); - } - if (NegAcc) - Ops[2] = Builder.CreateFNeg(Ops[2]); - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()); - Rep = Builder.CreateCall(FMA, Ops); + A->getType()); + Rep = Builder.CreateCall(FMA, { A, B, C }); } Value *PassThru = IsMaskZ ? 
llvm::Constant::getNullValue(CI->getType()) : diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fb923436959..50c616d382e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20710,39 +20710,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } - case FMA_OP_SCALAR_MASK: - case FMA_OP_SCALAR_MASK3: - case FMA_OP_SCALAR_MASKZ: { - SDValue Src1 = Op.getOperand(1); - SDValue Src2 = Op.getOperand(2); - SDValue Src3 = Op.getOperand(3); - SDValue Mask = Op.getOperand(4); - MVT VT = Op.getSimpleValueType(); - SDValue PassThru = SDValue(); - - // set PassThru element - if (IntrData->Type == FMA_OP_SCALAR_MASKZ) - PassThru = getZeroVector(VT, Subtarget, DAG, dl); - else if (IntrData->Type == FMA_OP_SCALAR_MASK3) - PassThru = Src3; - else - PassThru = Src1; - - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) - return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, - Op.getValueType(), Src1, Src2, - Src3, Rnd), - Mask, PassThru, Subtarget, DAG); - } - - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, - Op.getValueType(), Src1, Src2, - Src3), - Mask, PassThru, Subtarget, DAG); - } case IFMA_OP: // NOTE: We need to swizzle the operands to pass the multiply operands // first. diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 98390cea49d..57899034bd6 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6826,6 +6826,13 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zr_Int") + VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), (_.ScalarLdFrag addr:$src3)))))), @@ -6841,6 +6848,13 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zm_Int") + VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (Op _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), @@ -6948,6 +6962,14 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (RndOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (i32 imm:$rc)))))), + (!cast<I>(Prefix#"231"#Suffix#"Zrb_Int") + VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (RndOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 
0))), diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td index 376f643050f..e6fdac6832b 100644 --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -355,6 +355,13 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, (!cast<Instruction>(Prefix#"132"#Suffix#"m_Int") VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128), addr:$src3)>; + + def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector + (Op RC:$src2, (mem_frag addr:$src3), + (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))), + (!cast<Instruction>(Prefix#"231"#Suffix#"m_Int") + VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128), + addr:$src3)>; } } diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 0413fc9dfba..2dd60a1b8b5 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -28,8 +28,7 @@ enum IntrinsicType : uint16_t { INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_3OP_MASK, - FMA_OP_MASK, FMA_OP_MASKZ, - FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_SCALAR, IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK, INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK, COMPRESS_EXPAND_IN_REG, @@ -879,9 +878,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK, X86ISD::CVTPS2PH, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0), @@ -908,14 +904,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK, X86ISD::VPSHUFBITQMB, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND), - - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND), - - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, @@ -933,9 +921,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ, X86ISD::VFIXUPIMMS, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), diff --git 
a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 29ae67af7b7..cdf5746bb97 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2535,16 +2535,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx512_mask_min_ss_round: case Intrinsic::x86_avx512_mask_max_sd_round: case Intrinsic::x86_avx512_mask_min_sd_round: - case Intrinsic::x86_avx512_mask_vfmadd_ss: - case Intrinsic::x86_avx512_mask_vfmadd_sd: - case Intrinsic::x86_avx512_maskz_vfmadd_ss: - case Intrinsic::x86_avx512_maskz_vfmadd_sd: - case Intrinsic::x86_avx512_mask3_vfmadd_ss: - case Intrinsic::x86_avx512_mask3_vfmadd_sd: - case Intrinsic::x86_avx512_mask3_vfmsub_ss: - case Intrinsic::x86_avx512_mask3_vfmsub_sd: - case Intrinsic::x86_avx512_mask3_vfnmsub_ss: - case Intrinsic::x86_avx512_mask3_vfnmsub_sd: case Intrinsic::x86_sse_cmp_ss: case Intrinsic::x86_sse_min_ss: case Intrinsic::x86_sse_max_ss: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 97d24019eb6..425f5ce384b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1497,10 +1497,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::x86_avx512_mask_sub_sd_round: case Intrinsic::x86_avx512_mask_max_sd_round: case Intrinsic::x86_avx512_mask_min_sd_round: - case Intrinsic::x86_avx512_mask_vfmadd_ss: - case Intrinsic::x86_avx512_mask_vfmadd_sd: - case Intrinsic::x86_avx512_maskz_vfmadd_ss: - case Intrinsic::x86_avx512_maskz_vfmadd_sd: TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, UndefElts, Depth + 1); if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } @@ -1527,39 +1523,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; - case Intrinsic::x86_avx512_mask3_vfmadd_ss: - case Intrinsic::x86_avx512_mask3_vfmadd_sd: - case Intrinsic::x86_avx512_mask3_vfmsub_ss: - case Intrinsic::x86_avx512_mask3_vfmsub_sd: - case Intrinsic::x86_avx512_mask3_vfnmsub_ss: - case Intrinsic::x86_avx512_mask3_vfnmsub_sd: - // These intrinsics get the passthru bits from operand 2. - TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts, - UndefElts, Depth + 1); - if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; } - - // If lowest element of a scalar op isn't used then use Arg2. - if (!DemandedElts[0]) { - Worklist.Add(II); - return II->getArgOperand(2); - } - - // Only lower element is used for operand 0 and 1. - DemandedElts = 1; - TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, - UndefElts2, Depth + 1); - if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } - TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, - UndefElts3, Depth + 1); - if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } - - // Lower element is undefined if all three lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. 
- if (!UndefElts2[0] || !UndefElts3[0]) - UndefElts.clearBit(0); - - break; - case Intrinsic::x86_sse2_packssdw_128: case Intrinsic::x86_sse2_packsswb_128: case Intrinsic::x86_sse2_packuswb_128: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 77dacadbd36..a37129aaf69 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -8784,3 +8784,670 @@ define <8 x i64>@test_int_x86_avx512_mask_pror_q_512(<8 x i64> %x0, i32 %x1, <8 %res4 = add <8 x i64> %res3, %res2 ret <8 x i64> %res4 } + +declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask_vfmadd_sd: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8] +; X86-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xda] +; X86-NEXT: ## xmm3 = (xmm1 * xmm3) + xmm2 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0] +; X86-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2] +; X86-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 +; X86-NEXT: vmovapd %xmm0, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xe8] +; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xea] +; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa9,0xc2] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x58,0xcc] +; X86-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd1,0x58,0xc0] +; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x58,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfmadd_sd: +; X64: ## %bb.0: +; X64-NEXT: vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8] +; X64-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xda] +; X64-NEXT: ## xmm3 = (xmm1 * xmm3) + xmm2 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0] +; X64-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2] +; X64-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 +; X64-NEXT: vmovapd %xmm0, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xe8] +; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xea] +; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa9,0xc2] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x58,0xcc] +; X64-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd1,0x58,0xc0] +; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x58,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> 
@llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xda] +; X86-NEXT: ## xmm3 = (xmm1 * xmm3) + xmm2 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0] +; X86-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2] +; X86-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 +; X86-NEXT: vmovaps %xmm0, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xe8] +; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xea] +; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa9,0xc2] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe0,0x58,0xcc] +; X86-NEXT: vaddps %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd0,0x58,0xc0] +; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss: +; X64: ## %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xda] +; X64-NEXT: ## xmm3 = (xmm1 * xmm3) + xmm2 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0] +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2] +; X64-NEXT: ## xmm4 = (xmm1 * xmm4) + xmm2 +; X64-NEXT: vmovaps %xmm0, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xe8] +; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xea] +; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa9,0xc2] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm1 ## encoding: [0xc5,0xe0,0x58,0xcc] +; X64-NEXT: vaddps %xmm0, %xmm5, %xmm0 ## encoding: [0xc5,0xd0,0x58,0xc0] +; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x 
double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sd: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8] +; X86-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xda] +; X86-NEXT: ## xmm3 = (xmm1 * xmm3) + xmm2 +; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xf9,0xa9,0xc2] +; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sd: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8] +; X64-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xda] +; X64-NEXT: ## xmm3 = (xmm1 * xmm3) + xmm2 +; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xf9,0xa9,0xc2] +; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2] +; X86-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2 +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2] +; X64-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2 +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res +} +declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda] +; X86-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0xd9] +; X86-NEXT: ## xmm3 = (xmm0 * xmm1) + xmm3 +; X86-NEXT: kmovw %eax, %k1 ## 
encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X86-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1] +; X86-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 +; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] +; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe9] +; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xb9,0xd1] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] +; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: +; X64: ## %bb.0: +; X64-NEXT: vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda] +; X64-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0xd9] +; X64-NEXT: ## xmm3 = (xmm0 * xmm1) + xmm3 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X64-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1] +; X64-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 +; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] +; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe9] +; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xb9,0xd1] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] +; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda] +; X86-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0xd9] +; X86-NEXT: ## xmm3 = (xmm0 * xmm1) + xmm3 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X86-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1] +; X86-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 +; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] +; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## 
encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe9] +; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xb9,0xd1] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] +; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] +; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: +; X64: ## %bb.0: +; X64-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda] +; X64-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0xd9] +; X64-NEXT: ## xmm3 = (xmm0 * xmm1) + xmm3 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X64-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1] +; X64-NEXT: ## xmm4 = (xmm0 * xmm1) + xmm4 +; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] +; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe9] +; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xb9,0xd1] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] +; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] +; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) { +; X86-LABEL: fmadd_ss_mask_memfold: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: vmovss (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] +; X86-NEXT: ## xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss (%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x09] +; X86-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vfmadd213ss %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xc8] +; X86-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; X86-NEXT: vmovss %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x02] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: fmadd_ss_mask_memfold: +; X64: ## %bb.0: +; X64-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; X64-NEXT: ## xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vmovss (%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0e] +; X64-NEXT: ## xmm1 = 
mem[0],zero,zero,zero +; X64-NEXT: vfmadd213ss %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xc8] +; X64-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 +; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] +; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; X64-NEXT: vmovss %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07] +; X64-NEXT: retq ## encoding: [0xc3] + %a.val = load float, float* %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, float* %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + + %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4) + + %sr = extractelement <4 x float> %vr, i32 0 + store float %sr, float* %a + ret void +} + +define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) { +; X86-LABEL: fmadd_ss_maskz_memfold: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: vmovss (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] +; X86-NEXT: ## xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vfmadd231ss (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x01] +; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] +; X86-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc8] +; X86-NEXT: vmovss %xmm1, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x0a] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: fmadd_ss_maskz_memfold: +; X64: ## %bb.0: +; X64-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; X64-NEXT: ## xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vfmadd231ss (%rsi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x06] +; X64-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 +; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] +; X64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc8] +; X64-NEXT: vmovss %xmm1, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x0f] +; X64-NEXT: retq ## encoding: [0xc3] + %a.val = load float, float* %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, float* %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, 
float 0.000000e+00, i32 3 + + %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4) + + %sr = extractelement <4 x float> %vr, i32 0 + store float %sr, float* %a + ret void +} + +define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) { +; X86-LABEL: fmadd_sd_mask_memfold: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: vmovsd (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] +; X86-NEXT: ## xmm0 = mem[0],zero +; X86-NEXT: vmovsd (%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x09] +; X86-NEXT: ## xmm1 = mem[0],zero +; X86-NEXT: vfmadd213sd %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xc8] +; X86-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; X86-NEXT: vmovsd %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x02] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: fmadd_sd_mask_memfold: +; X64: ## %bb.0: +; X64-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; X64-NEXT: ## xmm0 = mem[0],zero +; X64-NEXT: vmovsd (%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0e] +; X64-NEXT: ## xmm1 = mem[0],zero +; X64-NEXT: vfmadd213sd %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xc8] +; X64-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 +; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] +; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; X64-NEXT: vmovsd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] +; X64-NEXT: retq ## encoding: [0xc3] + %a.val = load double, double* %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, double* %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + + %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, double* %a + ret void +} + +define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) { +; X86-LABEL: fmadd_sd_maskz_memfold: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: vmovsd (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] +; X86-NEXT: ## xmm0 = mem[0],zero +; X86-NEXT: vfmadd231sd (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x01] +; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9] +; X86-NEXT: vmovsd %xmm0, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x10,0xc8] +; X86-NEXT: vmovsd %xmm1, (%edx) ## EVEX TO VEX Compression encoding: 
[0xc5,0xfb,0x11,0x0a] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: fmadd_sd_maskz_memfold: +; X64: ## %bb.0: +; X64-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; X64-NEXT: ## xmm0 = mem[0],zero +; X64-NEXT: vfmadd231sd (%rsi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x06] +; X64-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 +; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] +; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9] +; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x10,0xc8] +; X64-NEXT: vmovsd %xmm1, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x0f] +; X64-NEXT: retq ## encoding: [0xc3] + %a.val = load double, double* %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, double* %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + + %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, double* %a + ret void +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda] +; X86-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbb,0xd9] +; X86-NEXT: ## xmm3 = (xmm0 * xmm1) - xmm3 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X86-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1] +; X86-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 +; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] +; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe9] +; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbb,0xd1] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] +; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: +; X64: ## %bb.0: +; X64-NEXT: vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda] +; X64-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbb,0xd9] +; X64-NEXT: ## xmm3 = (xmm0 * xmm1) - xmm3 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X64-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1] +; X64-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 +; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] +; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe9] +; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x79,0xbb,0xd1] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] +; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda] +; X86-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbb,0xd9] +; X86-NEXT: ## xmm3 = (xmm0 * xmm1) - xmm3 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X86-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1] +; X86-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 +; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] +; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe9] +; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbb,0xd1] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] +; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] +; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: +; X64: ## %bb.0: +; X64-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda] +; X64-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbb,0xd9] +; X64-NEXT: ## xmm3 = (xmm0 * xmm1) - xmm3 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X64-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1] +; X64-NEXT: ## xmm4 = (xmm0 * xmm1) - xmm4 +; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] +; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe9] +; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbb,0xd1] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] +; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] +; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 
x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda] +; X86-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbf,0xd9] +; X86-NEXT: ## xmm3 = -(xmm0 * xmm1) - xmm3 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X86-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1] +; X86-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 +; X86-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] +; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe9] +; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbf,0xd1] +; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] +; X86-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: +; X64: ## %bb.0: +; X64-NEXT: vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda] +; X64-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbf,0xd9] +; X64-NEXT: ## xmm3 = -(xmm0 * xmm1) - xmm3 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2] +; X64-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1] +; X64-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 +; X64-NEXT: vmovapd %xmm2, %xmm5 ## encoding: [0xc5,0xf9,0x28,0xea] +; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe9] +; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x79,0xbf,0xd1] +; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc4] +; X64-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd1,0x58,0xca] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) + %res3 = call <2 x double> 
@llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) + %res4 = fadd <2 x double> %res, %res1 + %res5 = fadd <2 x double> %res2, %res3 + %res6 = fadd <2 x double> %res4, %res5 + ret <2 x double> %res6 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: +; X86: ## %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda] +; X86-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbf,0xd9] +; X86-NEXT: ## xmm3 = -(xmm0 * xmm1) - xmm3 +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X86-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1] +; X86-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 +; X86-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] +; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe9] +; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbf,0xd1] +; X86-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] +; X86-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] +; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: +; X64: ## %bb.0: +; X64-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda] +; X64-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbf,0xd9] +; X64-NEXT: ## xmm3 = -(xmm0 * xmm1) - xmm3 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2] +; X64-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1] +; X64-NEXT: ## xmm4 = -(xmm0 * xmm1) - xmm4 +; X64-NEXT: vmovaps %xmm2, %xmm5 ## encoding: [0xc5,0xf8,0x28,0xea] +; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe9] +; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x79,0xbf,0xd1] +; X64-NEXT: vaddps %xmm4, %xmm3, %xmm0 ## encoding: [0xc5,0xe0,0x58,0xc4] +; X64-NEXT: vaddps %xmm2, %xmm5, %xmm1 ## encoding: [0xc5,0xd0,0x58,0xca] +; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) + %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) + %res4 = fadd <4 x float> %res, %res1 + %res5 = fadd <4 x float> %res2, %res3 + %res6 = fadd <4 x float> %res4, %res5 + ret <4 x float> %res6 +} + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 
%x4) {
+; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ## encoding: [0x8a,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vfmadd231ss (%eax), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0x08]
+; X86-NEXT: ## xmm1 = (xmm0 * mem) + xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc1]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0x0f]
+; X64-NEXT: ## xmm1 = (xmm0 * mem) + xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc1]
+; X64-NEXT: retq ## encoding: [0xc3]
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
+; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ## encoding: [0x8a,0x4c,0x24,0x08]
+; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT: vfmadd132ss (%eax), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x99,0x00]
+; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm1
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
+; X64: ## %bb.0:
+; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x99,0x07]
+; X64-NEXT: ## xmm0 = (xmm0 * mem) + xmm1
+; X64-NEXT: retq ## encoding: [0xc3]
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+ ret < 4 x float> %res
+}
+
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; CHECK-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
+ ret < 4 x float> %res
+}
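The upgrade tests above all pin down the same expansion, so it is worth stating once in a self-contained form. The sketch below is a minimal illustration, assuming the round-control argument is 4 (CUR_DIRECTION) so the plain llvm.fma.f32 form applies; the function name @upgraded_mask_vfmadd_ss and the value names are invented for the example and are not part of the patch. Calls with a non-default rounding mode are upgraded to llvm.x86.avx512.vfmadd.f32/f64 instead, as the avx512-intrinsics.ll hunks below show.

; Sketch of what AutoUpgrade emits for the removed
; llvm.x86.avx512.mask.vfmadd.ss intrinsic (merge masking, default rounding).
declare float @llvm.fma.f32(float, float, float)

define <4 x float> @upgraded_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
  %a0 = extractelement <4 x float> %a, i64 0   ; scalar operands come from lane 0
  %b0 = extractelement <4 x float> %b, i64 0
  %c0 = extractelement <4 x float> %c, i64 0
  %f  = call float @llvm.fma.f32(float %a0, float %b0, float %c0)
  %mv = bitcast i8 %mask to <8 x i1>           ; the k-register as a bit vector
  %m0 = extractelement <8 x i1> %mv, i64 0     ; only bit 0 matters for scalar ops
  %r0 = select i1 %m0, float %f, float %a0     ; merge masking falls back to src1
  %r  = insertelement <4 x float> %a, float %r0, i64 0
  ret <4 x float> %r
}

diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index bca1326bd8d..c538972266b 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4340,7 +4340,8 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
 ret <2 x double> %res4
 }
-declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+declare double 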
@llvm.fma.f64(double, double, double) #1 +declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0 define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: @@ -4357,18 +4358,38 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) - %res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) - %res4 = fadd <2 x double> %res, %res1 - %res5 = fadd <2 x double> %res2, %res3 + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = insertelement <2 x double> %x0, double %4, i64 0 + %6 = extractelement <2 x double> %x0, i64 0 + %7 = extractelement <2 x double> %x1, i64 0 + %8 = extractelement <2 x double> %x2, i64 0 + %9 = call double @llvm.fma.f64(double %6, double %7, double %8) + %10 = bitcast i8 %x3 to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, double %9, double %6 + %13 = insertelement <2 x double> %x0, double %12, i64 0 + %14 = extractelement <2 x double> %x0, i64 0 + %15 = extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %x2, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 3) + %18 = insertelement <2 x double> %x0, double %17, i64 0 + %19 = extractelement <2 x double> %x0, i64 0 + %20 = extractelement <2 x double> %x1, i64 0 + %21 = extractelement <2 x double> %x2, i64 0 + %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 3) + %23 = bitcast i8 %x3 to <8 x i1> + %24 = extractelement <8 x i1> %23, i64 0 + %25 = select i1 %24, double %22, double %19 + %26 = insertelement <2 x double> %x0, double %25, i64 0 + %res4 = fadd <2 x double> %5, %13 + %res5 = fadd <2 x double> %18, %26 %res6 = fadd <2 x double> %res4, %res5 ret <2 x double> %res6 } -declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss: ; CHECK: ## %bb.0: @@ -4384,18 +4405,38 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: vaddps %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) - %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x 
float> %x2, i8 %x3, i32 3) - %res4 = fadd <4 x float> %res, %res1 - %res5 = fadd <4 x float> %res2, %res3 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = insertelement <4 x float> %x0, float %4, i64 0 + %6 = extractelement <4 x float> %x0, i64 0 + %7 = extractelement <4 x float> %x1, i64 0 + %8 = extractelement <4 x float> %x2, i64 0 + %9 = call float @llvm.fma.f32(float %6, float %7, float %8) + %10 = bitcast i8 %x3 to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, float %9, float %6 + %13 = insertelement <4 x float> %x0, float %12, i64 0 + %14 = extractelement <4 x float> %x0, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %x2, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 3) + %18 = insertelement <4 x float> %x0, float %17, i64 0 + %19 = extractelement <4 x float> %x0, i64 0 + %20 = extractelement <4 x float> %x1, i64 0 + %21 = extractelement <4 x float> %x2, i64 0 + %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 3) + %23 = bitcast i8 %x3 to <8 x i1> + %24 = extractelement <8 x i1> %23, i64 0 + %25 = select i1 %24, float %22, float %19 + %26 = insertelement <4 x float> %x0, float %25, i64 0 + %res4 = fadd <4 x float> %5, %13 + %res5 = fadd <4 x float> %18, %26 %res6 = fadd <4 x float> %res4, %res5 ret <4 x float> %res6 } -declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd: ; CHECK: ## %bb.0: @@ -4405,13 +4446,28 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) - %res2 = fadd <2 x double> %res, %res1 + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double 0.000000e+00 + %8 = insertelement <2 x double> %x0, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 3) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, double %12, double 0.000000e+00 + %16 = insertelement <2 x double> %x0, double %15, i64 0 + %res2 = fadd <2 x double> %8, %16 ret <2 x double> %res2 } -declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 
x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss: @@ -4419,12 +4475,25 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float 0.000000e+00 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 3) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, float %12, float 0.000000e+00 + %16 = insertelement <4 x float> %x0, float %15, i64 0 + %res2 = fadd <4 x float> %8, %16 + ret <4 x float> %8 } -declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: @@ -4441,18 +4510,38 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) - %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) - %res4 = fadd <2 x double> %res, %res1 - %res5 = fadd <2 x double> %res2, %res3 + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = insertelement <2 x double> %x2, double %4, i64 0 + %6 = extractelement <2 x double> %x0, i64 0 + %7 = extractelement <2 x double> %x1, i64 0 + %8 = extractelement <2 x double> %x2, i64 0 + %9 = call double @llvm.fma.f64(double %6, double %7, double %8) + %10 = bitcast i8 %x3 to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, double %9, double %8 + %13 = insertelement <2 x double> %x2, double %12, i64 0 + %14 = extractelement <2 x double> %x0, i64 0 + %15 = extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %x2, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 3) + %18 = insertelement <2 x double> %x2, double %17, i64 0 + %19 = extractelement <2 x double> %x0, i64 0 + %20 = 
extractelement <2 x double> %x1, i64 0 + %21 = extractelement <2 x double> %x2, i64 0 + %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 3) + %23 = bitcast i8 %x3 to <8 x i1> + %24 = extractelement <8 x i1> %23, i64 0 + %25 = select i1 %24, double %22, double %21 + %26 = insertelement <2 x double> %x2, double %25, i64 0 + %res4 = fadd <2 x double> %5, %13 + %res5 = fadd <2 x double> %18, %26 %res6 = fadd <2 x double> %res4, %res5 ret <2 x double> %res6 } -declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: ; CHECK: ## %bb.0: @@ -4468,12 +4557,34 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) - %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) - %res4 = fadd <4 x float> %res, %res1 - %res5 = fadd <4 x float> %res2, %res3 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = insertelement <4 x float> %x2, float %4, i64 0 + %6 = extractelement <4 x float> %x0, i64 0 + %7 = extractelement <4 x float> %x1, i64 0 + %8 = extractelement <4 x float> %x2, i64 0 + %9 = call float @llvm.fma.f32(float %6, float %7, float %8) + %10 = bitcast i8 %x3 to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, float %9, float %8 + %13 = insertelement <4 x float> %x2, float %12, i64 0 + %14 = extractelement <4 x float> %x0, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %x2, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 3) + %18 = insertelement <4 x float> %x2, float %17, i64 0 + %19 = extractelement <4 x float> %x0, i64 0 + %20 = extractelement <4 x float> %x1, i64 0 + %21 = extractelement <4 x float> %x2, i64 0 + %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 3) + %23 = bitcast i8 %x3 to <8 x i1> + %24 = extractelement <8 x i1> %23, i64 0 + %25 = select i1 %24, float %22, float %21 + %26 = insertelement <4 x float> %x2, float %25, i64 0 + %res4 = fadd <4 x float> %5, %13 + %res5 = fadd <4 x float> %18, %26 %res6 = fadd <4 x float> %res4, %res5 ret <4 x float> %res6 } @@ -4482,8 +4593,10 @@ define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) { ; CHECK-LABEL: fmadd_ss_mask_memfold: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0 ; CHECK-NEXT: kmovw %edx, %k1 -; CHECK-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; CHECK-NEXT: vmovss %xmm0, (%rdi) ; CHECK-NEXT: retq %a.val = 
load float, float* %a @@ -4496,11 +4609,16 @@ define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) { %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 - %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 - - %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4) - - %sr = extractelement <4 x float> %vr, i32 0 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + %1 = extractelement <4 x float> %av, i64 0 + %2 = extractelement <4 x float> %bv, i64 0 + %3 = extractelement <4 x float> %av, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %av, float %7, i64 0 + %sr = extractelement <4 x float> %8, i32 0 store float %sr, float* %a ret void } @@ -4509,9 +4627,11 @@ define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) { ; CHECK-LABEL: fmadd_ss_maskz_memfold: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; CHECK-NEXT: kmovw %edx, %k1 -; CHECK-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovss %xmm1, (%rdi) ; CHECK-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -4524,10 +4644,15 @@ define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) { %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 - - %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4) - - %sr = extractelement <4 x float> %vr, i32 0 + %1 = extractelement <4 x float> %av, i64 0 + %2 = extractelement <4 x float> %bv, i64 0 + %3 = extractelement <4 x float> %av, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float 0.000000e+00 + %8 = insertelement <4 x float> %av, float %7, i64 0 + %sr = extractelement <4 x float> %8, i32 0 store float %sr, float* %a ret void } @@ -4536,9 +4661,11 @@ define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) { ; CHECK-LABEL: fmadd_sd_mask_memfold: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0 ; CHECK-NEXT: kmovw %edx, %k1 -; CHECK-NEXT: vfmadd132sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 -; CHECK-NEXT: vmovlpd %xmm0, (%rdi) +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: vmovsd %xmm0, (%rdi) ; CHECK-NEXT: retq %a.val = load double, double* %a %av0 = insertelement <2 x double> undef, double %a.val, i32 0 @@ -4547,10 +4674,15 @@ define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) { %b.val = load double, double* %b %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 - - %vr = call <2 x double> 
@llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) - - %sr = extractelement <2 x double> %vr, i32 0 + %1 = extractelement <2 x double> %av, i64 0 + %2 = extractelement <2 x double> %bv, i64 0 + %3 = extractelement <2 x double> %av, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %1 + %8 = insertelement <2 x double> %av, double %7, i64 0 + %sr = extractelement <2 x double> %8, i32 0 store double %sr, double* %a ret void } @@ -4559,9 +4691,11 @@ define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) { ; CHECK-LABEL: fmadd_sd_maskz_memfold: ; CHECK: ## %bb.0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; CHECK-NEXT: kmovw %edx, %k1 -; CHECK-NEXT: vfmadd132sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 -; CHECK-NEXT: vmovlpd %xmm0, (%rdi) +; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vmovsd %xmm0, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovsd %xmm1, (%rdi) ; CHECK-NEXT: retq %a.val = load double, double* %a %av0 = insertelement <2 x double> undef, double %a.val, i32 0 @@ -4570,16 +4704,19 @@ define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) { %b.val = load double, double* %b %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 - - %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) - - %sr = extractelement <2 x double> %vr, i32 0 + %1 = extractelement <2 x double> %av, i64 0 + %2 = extractelement <2 x double> %bv, i64 0 + %3 = extractelement <2 x double> %av, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double 0.000000e+00 + %8 = insertelement <2 x double> %av, double %7, i64 0 + %sr = extractelement <2 x double> %8, i32 0 store double %sr, double* %a ret void } -declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: ; CHECK: ## %bb.0: @@ -4595,18 +4732,46 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) - %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) - %res4 = fadd <2 x double> %res, %res1 - %res5 = fadd <2 x double> %res2, %res3 + %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %2 = extractelement <2 x double> %x0, i64 0 + %3 = extractelement <2 x double> %x1, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = 
extractelement <2 x double> %x2, i64 0 + %7 = insertelement <2 x double> %x2, double %5, i64 0 + %8 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %8, i64 0 + %12 = call double @llvm.fma.f64(double %9, double %10, double %11) + %13 = extractelement <2 x double> %x2, i64 0 + %14 = bitcast i8 %x3 to <8 x i1> + %15 = extractelement <8 x i1> %14, i64 0 + %16 = select i1 %15, double %12, double %13 + %17 = insertelement <2 x double> %x2, double %16, i64 0 + %18 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %19 = extractelement <2 x double> %x0, i64 0 + %20 = extractelement <2 x double> %x1, i64 0 + %21 = extractelement <2 x double> %18, i64 0 + %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 3) + %23 = extractelement <2 x double> %x2, i64 0 + %24 = insertelement <2 x double> %x2, double %22, i64 0 + %25 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %26 = extractelement <2 x double> %x0, i64 0 + %27 = extractelement <2 x double> %x1, i64 0 + %28 = extractelement <2 x double> %25, i64 0 + %29 = call double @llvm.x86.avx512.vfmadd.f64(double %26, double %27, double %28, i32 3) + %30 = extractelement <2 x double> %x2, i64 0 + %31 = bitcast i8 %x3 to <8 x i1> + %32 = extractelement <8 x i1> %31, i64 0 + %33 = select i1 %32, double %29, double %30 + %34 = insertelement <2 x double> %x2, double %33, i64 0 + %res4 = fadd <2 x double> %7, %17 + %res5 = fadd <2 x double> %24, %34 %res6 = fadd <2 x double> %res4, %res5 ret <2 x double> %res6 } -declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: ; CHECK: ## %bb.0: @@ -4622,18 +4787,46 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) - %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) - %res4 = fadd <4 x float> %res, %res1 - %res5 = fadd <4 x float> %res2, %res3 + %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %2 = extractelement <4 x float> %x0, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = extractelement <4 x float> %1, i64 0 + %5 = call float @llvm.fma.f32(float %2, float %3, float %4) + %6 = extractelement <4 x float> %x2, i64 0 + %7 = insertelement <4 x float> %x2, float %5, i64 0 + %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %8, i64 0 + %12 = call float @llvm.fma.f32(float %9, float %10, float %11) + %13 = extractelement <4 x float> %x2, i64 0 + %14 
= bitcast i8 %x3 to <8 x i1> + %15 = extractelement <8 x i1> %14, i64 0 + %16 = select i1 %15, float %12, float %13 + %17 = insertelement <4 x float> %x2, float %16, i64 0 + %18 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %19 = extractelement <4 x float> %x0, i64 0 + %20 = extractelement <4 x float> %x1, i64 0 + %21 = extractelement <4 x float> %18, i64 0 + %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 3) + %23 = extractelement <4 x float> %x2, i64 0 + %24 = insertelement <4 x float> %x2, float %22, i64 0 + %25 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %26 = extractelement <4 x float> %x0, i64 0 + %27 = extractelement <4 x float> %x1, i64 0 + %28 = extractelement <4 x float> %25, i64 0 + %29 = call float @llvm.x86.avx512.vfmadd.f32(float %26, float %27, float %28, i32 3) + %30 = extractelement <4 x float> %x2, i64 0 + %31 = bitcast i8 %x3 to <8 x i1> + %32 = extractelement <8 x i1> %31, i64 0 + %33 = select i1 %32, float %29, float %30 + %34 = insertelement <4 x float> %x2, float %33, i64 0 + %res4 = fadd <4 x float> %7, %17 + %res5 = fadd <4 x float> %24, %34 %res6 = fadd <4 x float> %res4, %res5 ret <4 x float> %res6 } -declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: ; CHECK: ## %bb.0: @@ -4649,18 +4842,50 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3) - %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) - %res4 = fadd <2 x double> %res, %res1 - %res5 = fadd <2 x double> %res2, %res3 + %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0 + %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %3 = extractelement <2 x double> %1, i64 0 + %4 = extractelement <2 x double> %x1, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = extractelement <2 x double> %x2, i64 0 + %8 = insertelement <2 x double> %x2, double %6, i64 0 + %9 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0 + %10 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %11 = extractelement <2 x double> %9, i64 0 + %12 = extractelement <2 x double> %x1, i64 0 + %13 = extractelement <2 x double> %10, i64 0 + %14 = call double @llvm.fma.f64(double %11, double %12, double %13) + %15 = extractelement <2 x double> %x2, i64 0 + %16 = bitcast i8 %x3 to <8 x i1> + %17 = extractelement <8 x i1> %16, i64 0 + %18 = select i1 %17, double %14, double %15 + %19 = insertelement <2 x double> %x2, double %18, i64 0 + %20 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0 + %21 
= fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %22 = extractelement <2 x double> %20, i64 0 + %23 = extractelement <2 x double> %x1, i64 0 + %24 = extractelement <2 x double> %21, i64 0 + %25 = call double @llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 3) + %26 = extractelement <2 x double> %x2, i64 0 + %27 = insertelement <2 x double> %x2, double %25, i64 0 + %28 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0 + %29 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2 + %30 = extractelement <2 x double> %28, i64 0 + %31 = extractelement <2 x double> %x1, i64 0 + %32 = extractelement <2 x double> %29, i64 0 + %33 = call double @llvm.x86.avx512.vfmadd.f64(double %30, double %31, double %32, i32 3) + %34 = extractelement <2 x double> %x2, i64 0 + %35 = bitcast i8 %x3 to <8 x i1> + %36 = extractelement <8 x i1> %35, i64 0 + %37 = select i1 %36, double %33, double %34 + %38 = insertelement <2 x double> %x2, double %37, i64 0 + %res4 = fadd <2 x double> %8, %19 + %res5 = fadd <2 x double> %27, %38 %res6 = fadd <2 x double> %res4, %res5 ret <2 x double> %res6 } -declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: ; CHECK: ## %bb.0: @@ -4676,12 +4901,46 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3) - %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) - %res4 = fadd <4 x float> %res, %res1 - %res5 = fadd <4 x float> %res2, %res3 + %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0 + %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %3 = extractelement <4 x float> %1, i64 0 + %4 = extractelement <4 x float> %x1, i64 0 + %5 = extractelement <4 x float> %2, i64 0 + %6 = call float @llvm.fma.f32(float %3, float %4, float %5) + %7 = extractelement <4 x float> %x2, i64 0 + %8 = insertelement <4 x float> %x2, float %6, i64 0 + %9 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0 + %10 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %11 = extractelement <4 x float> %9, i64 0 + %12 = extractelement <4 x float> %x1, i64 0 + %13 = extractelement <4 x float> %10, i64 0 + %14 = call float @llvm.fma.f32(float %11, float %12, float %13) + %15 = extractelement <4 x float> %x2, i64 0 + %16 = bitcast i8 %x3 to <8 x i1> + %17 = extractelement <8 x i1> %16, i64 0 + %18 = select i1 %17, float %14, float %15 + %19 = insertelement <4 x float> %x2, float %18, i64 0 + %20 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0 + %21 = fsub 
<4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %22 = extractelement <4 x float> %20, i64 0 + %23 = extractelement <4 x float> %x1, i64 0 + %24 = extractelement <4 x float> %21, i64 0 + %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 3) + %26 = extractelement <4 x float> %x2, i64 0 + %27 = insertelement <4 x float> %x2, float %25, i64 0 + %28 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0 + %29 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2 + %30 = extractelement <4 x float> %28, i64 0 + %31 = extractelement <4 x float> %x1, i64 0 + %32 = extractelement <4 x float> %29, i64 0 + %33 = call float @llvm.x86.avx512.vfmadd.f32(float %30, float %31, float %32, i32 3) + %34 = extractelement <4 x float> %x2, i64 0 + %35 = bitcast i8 %x3 to <8 x i1> + %36 = extractelement <8 x i1> %35, i64 0 + %37 = select i1 %36, float %33, float %34 + %38 = insertelement <4 x float> %x2, float %37, i64 0 + %res4 = fadd <4 x float> %8, %19 + %res5 = fadd <4 x float> %27, %38 %res6 = fadd <4 x float> %res4, %res5 ret <4 x float> %res6 } @@ -4695,8 +4954,15 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x ; CHECK-NEXT: retq %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) - ret < 4 x float> %res + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %vecinit.i, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %3 + %8 = insertelement <4 x float> %x1, float %7, i64 0 + ret <4 x float> %8 } define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { @@ -4707,22 +4973,33 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f ; CHECK-NEXT: retq %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) - ret < 4 x float> %res + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %vecinit.i, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + ret <4 x float> %8 } define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: ; CHECK: ## %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: retq %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 
0, i32 4)
- ret < 4 x float> %res
+ %1 = extractelement <4 x float> %x0, i64 0
+ %2 = extractelement <4 x float> %x1, i64 0
+ %3 = extractelement <4 x float> %vecinit.i, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = select i1 false, float %4, float 0.000000e+00
+ %6 = insertelement <4 x float> %x0, float %5, i64 0
+ ret <4 x float> %6
 }
 define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
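The avx512-scalar_mask.ll changes that follow fall out of the same expansion: once the mask is a plain i8 constant in the IR, the select on bit 0 folds away long before instruction selection, so the masked FMA vanishes for any mask whose bit 0 is clear (0 and 2 in these tests). A hedged sketch of the zero-masking case, with illustrative names:

declare float @llvm.fma.f32(float, float, float)

define <4 x float> @maskz_fmadd_ss_const0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
  %a0 = extractelement <4 x float> %a, i64 0
  %b0 = extractelement <4 x float> %b, i64 0
  %c0 = extractelement <4 x float> %c, i64 0
  %f  = call float @llvm.fma.f32(float %a0, float %b0, float %c0)
  ; bit 0 of the constant mask 0 is false, so this select folds to 0.0
  %r0 = select i1 false, float %f, float 0.000000e+00
  %r  = insertelement <4 x float> %a, float %r0, i64 0  ; lane 0 becomes 0.0
  ret <4 x float> %r                                    ; codegen: vxorps + vblendps
}

diff --git a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll
index e0a91575636..1a98bd958e3 100644
--- a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -28,9 +28,6 @@ define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float>
 define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
 ; CHECK-LABEL: test_const0_mask:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
 ret < 4 x float> %res
@@ -40,9 +37,8 @@ define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
 define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
 ; CHECK-LABEL: test_const0_maskz:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
 ret < 4 x float> %res
@@ -52,9 +48,6 @@ define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa
 define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
 ; CHECK-LABEL: test_const2_mask:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movb $2, %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
 ret < 4 x float> %res
@@ -64,9 +57,8 @@ define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
 define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
 ; CHECK-LABEL: test_const2_maskz:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movb $2, %al
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
 ret < 4 x float> %res
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine.ll b/llvm/test/CodeGen/X86/fma-fneg-combine.ll
index 6d02eaec36f..ce368744562 100644
--- a/llvm/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine.ll
@@ -132,19 +132,21 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x doubl
 define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
 ; SKX-LABEL: test11:
 ; SKX: # %bb.0: # %entry
-; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, 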
%xmm2, %xmm3
+; SKX-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
-; SKX-NEXT: vmovaps %xmm2, %xmm0
+; SKX-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; SKX-NEXT: vmovaps %xmm3, %xmm0
 ; SKX-NEXT: retq
 ;
 ; KNL-LABEL: test11:
 ; KNL: # %bb.0: # %entry
 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
-; KNL-NEXT: vxorps %xmm3, %xmm2, %xmm2
+; KNL-NEXT: vxorps %xmm3, %xmm2, %xmm3
+; KNL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
-; KNL-NEXT: vmovaps %xmm2, %xmm0
+; KNL-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; KNL-NEXT: vmovaps %xmm3, %xmm0
 ; KNL-NEXT: retq
 entry:
 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -199,16 +201,20 @@ entry:
 define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
 ; SKX-LABEL: test13:
 ; SKX: # %bb.0: # %entry
-; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm3
+; SKX-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2
 ; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovsd %xmm1, %xmm3, %xmm3 {%k1}
+; SKX-NEXT: vmovapd %xmm3, %xmm0
 ; SKX-NEXT: retq
 ;
 ; KNL-LABEL: test13:
 ; KNL: # %bb.0: # %entry
-; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm3
+; KNL-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2
 ; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: vmovsd %xmm1, %xmm3, %xmm3 {%k1}
+; KNL-NEXT: vmovapd %xmm3, %xmm0
 ; KNL-NEXT: retq
 entry:
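The InstCombine tests that follow now exercise the expanded pattern directly. One simplification they rely on, sketched here with illustrative names: only lane 0 of the result vector is rewritten, so extracting any other lane bypasses the fma/select chain entirely, which is why tests such as test_mask_vfmadd_ss_1 fold to a constant.

declare float @llvm.fma.f32(float, float, float)

define float @extract_lane1_folds(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
  %a1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1  ; lane 1 is a known constant
  %a0 = extractelement <4 x float> %a1, i64 0
  %b0 = extractelement <4 x float> %b, i64 0
  %c0 = extractelement <4 x float> %c, i64 0
  %f  = call float @llvm.fma.f32(float %a0, float %b0, float %c0)
  %mv = bitcast i8 %mask to <8 x i1>
  %m0 = extractelement <8 x i1> %mv, i64 0
  %r0 = select i1 %m0, float %f, float %a0
  %r  = insertelement <4 x float> %a1, float %r0, i64 0  ; only lane 0 changes
  %e  = extractelement <4 x float> %r, i32 1             ; InstCombine folds this to 1.0
  ret float %e
}

diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
index 32612ba36f1..6a966203e07 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -1123,12 +1123,19 @@ define <8 x double> @test_rndscale_pd_512_ceil(<8 x double> %src, <8 x double> %
 ret <8 x double> %1
 }
-declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare float @llvm.fma.f32(float, float, float) #1
 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask_vfmadd_ss(
-; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: ret <4 x float> [[RES]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT: ret <4 x float> [[TMP8]]
 ;
 %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
@@ -1136,22 +1143,41 @@
 %4 = 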
insertelement <4 x float> %c, float 4.000000e+00, i32 1 %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2 %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3 - %res = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %3, <4 x float> %6, i8 %mask, i32 4) - ret <4 x float> %res + %7 = extractelement <4 x float> %a, i64 0 + %8 = extractelement <4 x float> %3, i64 0 + %9 = extractelement <4 x float> %6, i64 0 + %10 = call float @llvm.fma.f32(float %7, float %8, float %9) + %11 = bitcast i8 %mask to <8 x i1> + %12 = extractelement <8 x i1> %11, i64 0 + %13 = select i1 %12, float %10, float %7 + %14 = insertelement <4 x float> %a, float %13, i64 0 + ret <4 x float> %14 } define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask_vfmadd_ss_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: ret float [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP1]] +; CHECK-NEXT: ret float [[TMP7]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - ret float %5 + %4 = extractelement <4 x float> %3, i64 0 + %5 = extractelement <4 x float> %b, i64 0 + %6 = extractelement <4 x float> %c, i64 0 + %7 = call float @llvm.fma.f32(float %4, float %5, float %6) + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %7, float %4 + %11 = insertelement <4 x float> %3, float %10, i64 0 + %12 = extractelement <4 x float> %11, i32 0 + ret float %12 } define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { @@ -1161,34 +1187,67 @@ define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 1 - ret float %5 + %4 = extractelement <4 x float> %3, i64 0 + %5 = extractelement <4 x float> %b, i64 0 + %6 = extractelement <4 x float> %c, i64 0 + %7 = call float @llvm.fma.f32(float %4, float %5, float %6) + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %7, float %4 + %11 = insertelement <4 x float> %3, float %10, i64 0 + %12 = extractelement <4 x float> %11, i32 1 + ret float %12 } -declare <2 x double> 
@llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) +declare double @llvm.fma.f64(double, double, double) #1 define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask_vfmadd_sd( -; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <2 x double> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP8]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1 - %res = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %1, <2 x double> %2, i8 %mask, i32 4) - ret <2 x double> %res + %3 = extractelement <2 x double> %a, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = bitcast i8 %mask to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, double %6, double %3 + %10 = insertelement <2 x double> %a, double %9, i64 0 + ret <2 x double> %10 } define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask_vfmadd_sd_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: ret double [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP1]] +; CHECK-NEXT: ret double [[TMP7]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 0 - ret double %3 + %2 = extractelement <2 x double> %1, i64 0 + %3 = extractelement <2 x double> %b, i64 0 + %4 = extractelement <2 x double> %c, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = bitcast i8 %mask to <8 x i1> + %7 = extractelement <8 x i1> %6, i64 0 + %8 = select i1 %7, double %5, double %2 + %9 = insertelement <2 x double> %1, double %8, i64 0 + %10 = extractelement <2 x double> %9, i32 
0 + ret double %10 } define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { @@ -1196,17 +1255,29 @@ define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x doub ; CHECK-NEXT: ret double 1.000000e+00 ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 1 - ret double %3 + %2 = extractelement <2 x double> %1, i64 0 + %3 = extractelement <2 x double> %b, i64 0 + %4 = extractelement <2 x double> %c, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = bitcast i8 %mask to <8 x i1> + %7 = extractelement <8 x i1> %6, i64 0 + %8 = select i1 %7, double %5, double %2 + %9 = insertelement <2 x double> %1, double %8, i64 0 + %10 = extractelement <2 x double> %9, i32 1 + ret double %10 } -declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_maskz_vfmadd_ss( -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <4 x float> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP8]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -1214,22 +1285,41 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1 %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2 %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3 - %res = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %a, <4 x float> %3, <4 x float> %6, i8 %mask, i32 4) - ret <4 x float> %res + %7 = extractelement <4 x float> %a, i64 0 + %8 = extractelement <4 x float> %3, i64 0 + %9 = extractelement <4 x float> %6, i64 0 + %10 = call float @llvm.fma.f32(float %7, float %8, float %9) + %11 = bitcast i8 %mask to <8 x i1> + %12 = extractelement <8 x i1> %11, i64 0 + %13 = select i1 %12, float %10, float 0.000000e+00 + %14 = insertelement <4 x float> %a, float %13, i64 0 + ret <4 x float> %14 } define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_maskz_vfmadd_ss_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: ret float [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 
0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00 +; CHECK-NEXT: ret float [[TMP7]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - ret float %5 + %4 = extractelement <4 x float> %3, i64 0 + %5 = extractelement <4 x float> %b, i64 0 + %6 = extractelement <4 x float> %c, i64 0 + %7 = call float @llvm.fma.f32(float %4, float %5, float %6) + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %7, float 0.000000e+00 + %11 = insertelement <4 x float> %3, float %10, i64 0 + %12 = extractelement <4 x float> %11, i32 0 + ret float %12 } define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { @@ -1239,34 +1329,65 @@ define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 1 - ret float %5 + %4 = extractelement <4 x float> %3, i64 0 + %5 = extractelement <4 x float> %b, i64 0 + %6 = extractelement <4 x float> %c, i64 0 + %7 = call float @llvm.fma.f32(float %4, float %5, float %6) + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %7, float 0.000000e+00 + %11 = insertelement <4 x float> %3, float %10, i64 0 + %12 = extractelement <4 x float> %11, i32 1 + ret float %12 } -declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_maskz_vfmadd_sd( -; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <2 x double> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP8]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 
= insertelement <2 x double> %c, double 2.000000e+00, i32 1 - %res = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %a, <2 x double> %1, <2 x double> %2, i8 %mask, i32 4) - ret <2 x double> %res + %3 = extractelement <2 x double> %a, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = bitcast i8 %mask to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, double %6, double 0.000000e+00 + %10 = insertelement <2 x double> %a, double %9, i64 0 + ret <2 x double> %10 } define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_maskz_vfmadd_sd_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: ret double [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00 +; CHECK-NEXT: ret double [[TMP7]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 0 - ret double %3 + %2 = extractelement <2 x double> %1, i64 0 + %3 = extractelement <2 x double> %b, i64 0 + %4 = extractelement <2 x double> %c, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = bitcast i8 %mask to <8 x i1> + %7 = extractelement <8 x i1> %6, i64 0 + %8 = select i1 %7, double %5, double 0.000000e+00 + %9 = insertelement <2 x double> %1, double %8, i64 0 + %10 = extractelement <2 x double> %9, i32 0 + ret double %10 } define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { @@ -1274,17 +1395,29 @@ define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: ret double 1.000000e+00 ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 1 - ret double %3 + %2 = extractelement <2 x double> %1, i64 0 + %3 = extractelement <2 x double> %b, i64 0 + %4 = extractelement <2 x double> %c, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = bitcast i8 %mask to <8 x i1> + %7 = extractelement <8 x i1> %6, i64 0 + %8 = select i1 %7, double %5, double 0.000000e+00 + %9 = insertelement <2 x double> %1, double %8, i64 0 + %10 = extractelement <2 x double> %9, i32 1 + ret double %10 } -declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmadd_ss( -; 
CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <4 x float> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[C]], float [[TMP7]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP8]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -1292,22 +1425,41 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1 %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2 %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3 - %res = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %3, <4 x float> %6, <4 x float> %c, i8 %mask, i32 4) - ret <4 x float> %res + %7 = extractelement <4 x float> %3, i64 0 + %8 = extractelement <4 x float> %6, i64 0 + %9 = extractelement <4 x float> %c, i64 0 + %10 = call float @llvm.fma.f32(float %7, float %8, float %9) + %11 = bitcast i8 %mask to <8 x i1> + %12 = extractelement <8 x i1> %11, i64 0 + %13 = select i1 %12, float %10, float %9 + %14 = insertelement <4 x float> %c, float %13, i64 0 + ret <4 x float> %14 } define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmadd_ss_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: ret float [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP3]] +; CHECK-NEXT: ret float [[TMP7]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - ret float %5 + %4 = extractelement <4 x float> %a, i64 0 + %5 = extractelement <4 x float> %b, i64 0 + %6 = extractelement <4 x float> %3, i64 0 + %7 = call float @llvm.fma.f32(float %4, float %5, float %6) + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %7, float %6 + %11 = insertelement <4 x 
float> %3, float %10, i64 0 + %12 = extractelement <4 x float> %11, i32 0 + ret float %12 } define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { @@ -1317,34 +1469,65 @@ define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 1 - ret float %5 + %4 = extractelement <4 x float> %a, i64 0 + %5 = extractelement <4 x float> %b, i64 0 + %6 = extractelement <4 x float> %3, i64 0 + %7 = call float @llvm.fma.f32(float %4, float %5, float %6) + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %7, float %6 + %11 = insertelement <4 x float> %3, float %10, i64 0 + %12 = extractelement <4 x float> %11, i32 1 + ret float %12 } -declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmadd_sd( -; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <2 x double> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[C]], double [[TMP7]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP8]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 - %res = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %1, <2 x double> %2, <2 x double> %c, i8 %mask, i32 4) - ret <2 x double> %res + %3 = extractelement <2 x double> %1, i64 0 + %4 = extractelement <2 x double> %2, i64 0 + %5 = extractelement <2 x double> %c, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = bitcast i8 %mask to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, double %6, double %5 + %10 = insertelement <2 x double> %c, double %9, i64 0 + ret <2 x double> %10 } define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmadd_sd_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: ret double [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP3]] +; CHECK-NEXT: ret double [[TMP7]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 0 - ret double %3 + %2 = extractelement <2 x double> %a, i64 0 + %3 = extractelement <2 x double> %b, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = bitcast i8 %mask to <8 x i1> + %7 = extractelement <8 x i1> %6, i64 0 + %8 = select i1 %7, double %5, double %4 + %9 = insertelement <2 x double> %1, double %8, i64 0 + %10 = extractelement <2 x double> %9, i32 0 + ret double %10 } define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { @@ -1352,17 +1535,31 @@ define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: ret double 1.000000e+00 ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 1 - ret double %3 + %2 = extractelement <2 x double> %a, i64 0 + %3 = extractelement <2 x double> %b, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = bitcast i8 %mask to <8 x i1> + %7 = extractelement <8 x i1> %6, i64 0 + %8 = select i1 %7, double %5, double %4 + %9 = insertelement <2 x double> %1, double %8, i64 0 + %10 = extractelement <2 x double> %9, i32 1 + ret double %10 } -declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmsub_ss( -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <4 x float> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub float -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[C]], float [[TMP8]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP9]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -1370,22 +1567,47 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl %4 = 
insertelement <4 x float> %b, float 4.000000e+00, i32 1 %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2 %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3 - %res = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %3, <4 x float> %6, <4 x float> %c, i8 %mask, i32 4) - ret <4 x float> %res + %7 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c + %8 = extractelement <4 x float> %3, i64 0 + %9 = extractelement <4 x float> %6, i64 0 + %10 = extractelement <4 x float> %7, i64 0 + %11 = call float @llvm.fma.f32(float %8, float %9, float %10) + %12 = extractelement <4 x float> %c, i64 0 + %13 = bitcast i8 %mask to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, float %11, float %12 + %16 = insertelement <4 x float> %c, float %15, i64 0 + ret <4 x float> %16 } define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmsub_ss_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: ret float [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub float -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP5]] +; CHECK-NEXT: ret float [[TMP8]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - ret float %5 + %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3 + %5 = extractelement <4 x float> %a, i64 0 + %6 = extractelement <4 x float> %b, i64 0 + %7 = extractelement <4 x float> %4, i64 0 + %8 = call float @llvm.fma.f32(float %5, float %6, float %7) + %9 = extractelement <4 x float> %3, i64 0 + %10 = bitcast i8 %mask to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, float %8, float %9 + %13 = insertelement <4 x float> %3, float %12, i64 0 + %14 = extractelement <4 x float> %13, i32 0 + ret float %14 } define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { @@ -1395,34 +1617,75 @@ define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 1 - ret 
float %5 + %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3 + %5 = extractelement <4 x float> %a, i64 0 + %6 = extractelement <4 x float> %b, i64 0 + %7 = extractelement <4 x float> %4, i64 0 + %8 = call float @llvm.fma.f32(float %5, float %6, float %7) + %9 = extractelement <4 x float> %3, i64 0 + %10 = bitcast i8 %mask to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, float %8, float %9 + %13 = insertelement <4 x float> %3, float %12, i64 0 + %14 = extractelement <4 x float> %13, i32 1 + ret float %14 } -declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmsub_sd( -; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <2 x double> [[RES]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub double -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[C]], double [[TMP8]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP9]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 - %res = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %1, <2 x double> %2, <2 x double> %c, i8 %mask, i32 4) - ret <2 x double> %res + %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c + %4 = extractelement <2 x double> %1, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = extractelement <2 x double> %3, i64 0 + %7 = call double @llvm.fma.f64(double %4, double %5, double %6) + %8 = extractelement <2 x double> %c, i64 0 + %9 = bitcast i8 %mask to <8 x i1> + %10 = extractelement <8 x i1> %9, i64 0 + %11 = select i1 %10, double %7, double %8 + %12 = insertelement <2 x double> %c, double %11, i64 0 + ret <2 x double> %12 } define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfmsub_sd_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: ret double [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub double -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: 
[[TMP5:%.*]] = extractelement <2 x double> [[C]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP5]] +; CHECK-NEXT: ret double [[TMP8]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 0 - ret double %3 + %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1 + %3 = extractelement <2 x double> %a, i64 0 + %4 = extractelement <2 x double> %b, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = extractelement <2 x double> %1, i64 0 + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, double %6, double %7 + %11 = insertelement <2 x double> %1, double %10, i64 0 + %12 = extractelement <2 x double> %11, i32 0 + ret double %12 } define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { @@ -1430,17 +1693,34 @@ define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: ret double 1.000000e+00 ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 - %2 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) - %3 = extractelement <2 x double> %2, i32 1 - ret double %3 + %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1 + %3 = extractelement <2 x double> %a, i64 0 + %4 = extractelement <2 x double> %b, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = extractelement <2 x double> %1, i64 0 + %8 = bitcast i8 %mask to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, double %6, double %7 + %11 = insertelement <2 x double> %1, double %10, i64 0 + %12 = extractelement <2 x double> %11, i32 1 + ret double %12 } -declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfnmsub_ss( -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <4 x float> [[RES]] +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = fsub float -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 +; CHECK-NEXT: [[DOTRHS1:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub float -0.000000e+00, [[DOTRHS1]] +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[C]], float [[TMP8]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP9]] ; %1 = insertelement 
<4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -1448,22 +1728,50 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1 %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2 %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3 - %res = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %3, <4 x float> %6, <4 x float> %c, i8 %mask, i32 4) - ret <4 x float> %res + %7 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3 + %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c + %9 = extractelement <4 x float> %7, i64 0 + %10 = extractelement <4 x float> %6, i64 0 + %11 = extractelement <4 x float> %8, i64 0 + %12 = call float @llvm.fma.f32(float %9, float %10, float %11) + %13 = extractelement <4 x float> %c, i64 0 + %14 = bitcast i8 %mask to <8 x i1> + %15 = extractelement <8 x i1> %14, i64 0 + %16 = select i1 %15, float %12, float %13 + %17 = insertelement <4 x float> %c, float %16, i64 0 + ret <4 x float> %17 } define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfnmsub_ss_0( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: ret float [[TMP2]] +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = fsub float -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 +; CHECK-NEXT: [[DOTRHS1:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub float -0.000000e+00, [[DOTRHS1]] +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP4]], float [[TMP5]] +; CHECK-NEXT: ret float [[TMP8]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - ret float %5 + %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a + %5 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3 + %6 = extractelement <4 x float> %4, i64 0 + %7 = extractelement <4 x float> %b, i64 0 + %8 = extractelement <4 x float> %5, i64 0 + %9 = call float @llvm.fma.f32(float %6, float %7, float %8) + %10 = extractelement <4 x float> %3, i64 0 + %11 = bitcast i8 %mask to <8 x i1> + %12 = extractelement <8 x i1> %11, i64 0 + %13 = select i1 %12, float %9, float %10 + %14 = insertelement <4 x float> %3, float %13, i64 0 + %15 = extractelement <4 x float> %14, i32 0 + ret float %15 } define float @test_mask3_vfnmsub_ss_1(<4 x 
float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { @@ -1473,34 +1781,80 @@ define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 - %4 = tail call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4) - %5 = extractelement <4 x float> %4, i32 1 - ret float %5 + %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a + %5 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3 + %6 = extractelement <4 x float> %4, i64 0 + %7 = extractelement <4 x float> %b, i64 0 + %8 = extractelement <4 x float> %5, i64 0 + %9 = call float @llvm.fma.f32(float %6, float %7, float %8) + %10 = extractelement <4 x float> %3, i64 0 + %11 = bitcast i8 %mask to <8 x i1> + %12 = extractelement <8 x i1> %11, i64 0 + %13 = select i1 %12, float %9, float %10 + %14 = insertelement <4 x float> %3, float %13, i64 0 + %15 = extractelement <4 x float> %14, i32 1 + ret float %15 } -declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mask3_vfnmsub_sd( -; CHECK-NEXT: [[RES:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: ret <2 x double> [[RES]] +; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = fsub double -0.000000e+00, [[DOTRHS]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 +; CHECK-NEXT: [[DOTRHS1:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fsub double -0.000000e+00, [[DOTRHS1]] +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[C]], double [[TMP8]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP9]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 - %res = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %1, <2 x double> %2, <2 x double> %c, i8 %mask, i32 4) - ret <2 x double> %res + %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1 + %4 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c + %5 = extractelement <2 x double> %3, i64 0 + %6 = extractelement <2 x double> %2, i64 0 + %7 = extractelement <2 x double> %4, i64 0 + %8 = call double @llvm.fma.f64(double %5, double %6, double %7) + %9 = extractelement <2 x double> %c, i64 0 + %10 = bitcast i8 %mask to <8 x i1> + %11 = extractelement <8 x i1> %10, i64 0 + %12 = select i1 %11, double %8, double %9 + %13 = insertelement <2 x double> %c, double %12, i64 0 + ret <2 x double> %13 } define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x 
double> %b, <2 x double> %c, i8 %mask) {
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_0(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT: ret double [[TMP2]]
+; CHECK-NEXT: [[DOTRHS:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = fsub double -0.000000e+00, [[DOTRHS]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT: [[DOTRHS1:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = fsub double -0.000000e+00, [[DOTRHS1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP6]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP4]], double [[TMP5]]
+; CHECK-NEXT: ret double [[TMP8]]
 ;
  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
- %2 = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
- %3 = extractelement <2 x double> %2, i32 0
- ret double %3
+ %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+ %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+ %4 = extractelement <2 x double> %2, i64 0
+ %5 = extractelement <2 x double> %b, i64 0
+ %6 = extractelement <2 x double> %3, i64 0
+ %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
+ %8 = extractelement <2 x double> %1, i64 0
+ %9 = bitcast i8 %mask to <8 x i1>
+ %10 = extractelement <8 x i1> %9, i64 0
+ %11 = select i1 %10, double %7, double %8
+ %12 = insertelement <2 x double> %1, double %11, i64 0
+ %13 = extractelement <2 x double> %12, i32 0
+ ret double %13
 }
 
 define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
@@ -1508,9 +1862,19 @@ define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x do
 ; CHECK-NEXT: ret double 1.000000e+00
 ;
  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
- %2 = tail call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
- %3 = extractelement <2 x double> %2, i32 1
- ret double %3
+ %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+ %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+ %4 = extractelement <2 x double> %2, i64 0
+ %5 = extractelement <2 x double> %b, i64 0
+ %6 = extractelement <2 x double> %3, i64 0
+ %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
+ %8 = extractelement <2 x double> %1, i64 0
+ %9 = bitcast i8 %mask to <8 x i1>
+ %10 = extractelement <8 x i1> %9, i64 0
+ %11 = select i1 %10, double %7, double %8
+ %12 = insertelement <2 x double> %1, double %11, i64 0
+ %13 = extractelement <2 x double> %12, i32 1
+ ret double %13
 }
 
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
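
Every maskz test above rewrites to the same upgraded shape: extract lane 0 of the three sources, call the target-independent fma intrinsic, and use bit 0 of the k-register to select between the result and 0.0 before reinserting into the first source. A minimal sketch of that IR, with a function name chosen here purely for illustration:

define <4 x float> @maskz_fmadd_ss_pattern(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
  ; scalar operands are lane 0 of each source vector
  %op.a = extractelement <4 x float> %a, i64 0
  %op.b = extractelement <4 x float> %b, i64 0
  %op.c = extractelement <4 x float> %c, i64 0
  ; the arithmetic itself is the generic fma intrinsic
  %fma = call float @llvm.fma.f32(float %op.a, float %op.b, float %op.c)
  ; zero masking: bit 0 of the i8 mask picks the result or 0.0
  %kvec = bitcast i8 %mask to <8 x i1>
  %k0 = extractelement <8 x i1> %kvec, i64 0
  %sel = select i1 %k0, float %fma, float 0.000000e+00
  ; the masked scalar goes back into lane 0 of the first operand
  %res = insertelement <4 x float> %a, float %sel, i64 0
  ret <4 x float> %res
}

declare float @llvm.fma.f32(float, float, float)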
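The mask3 tests differ only in the merge semantics: on a clear mask bit the destination keeps the original element of the third source, and the result is written back into that source rather than the first. The same sketch for the sd form, again under an illustrative name:

define <2 x double> @mask3_fmadd_sd_pattern(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
  %op.a = extractelement <2 x double> %a, i64 0
  %op.b = extractelement <2 x double> %b, i64 0
  %op.c = extractelement <2 x double> %c, i64 0
  %fma = call double @llvm.fma.f64(double %op.a, double %op.b, double %op.c)
  %kvec = bitcast i8 %mask to <8 x i1>
  %k0 = extractelement <8 x i1> %kvec, i64 0
  ; merge masking: fall back to the addend element, not to zero
  %sel = select i1 %k0, double %fma, double %op.c
  ; mask3 forms update the third source, %c
  %res = insertelement <2 x double> %c, double %sel, i64 0
  ret <2 x double> %res
}

declare double @llvm.fma.f64(double, double, double)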
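The fmsub and fnmsub variants likewise need no dedicated intrinsic after the upgrade: each negated operand is spelled as an fsub from a splat of -0.0, which InstCombine then narrows to the scalar fsub visible in the [[DOTRHS]] CHECK lines above. A sketch of the fnmsub shape under the same naming assumption:

define <4 x float> @mask3_fnmsub_ss_pattern(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
  ; negate the multiplicand and the addend as whole-vector fsubs from -0.0
  %neg.a = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
  %neg.c = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %op.a = extractelement <4 x float> %neg.a, i64 0
  %op.b = extractelement <4 x float> %b, i64 0
  %op.c = extractelement <4 x float> %neg.c, i64 0
  %fma = call float @llvm.fma.f32(float %op.a, float %op.b, float %op.c)
  ; merge masking falls back to the unnegated element of %c
  %orig.c = extractelement <4 x float> %c, i64 0
  %kvec = bitcast i8 %mask to <8 x i1>
  %k0 = extractelement <8 x i1> %kvec, i64 0
  %sel = select i1 %k0, float %fma, float %orig.c
  %res = insertelement <4 x float> %c, float %sel, i64 0
  ret <4 x float> %res
}

declare float @llvm.fma.f32(float, float, float)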
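The *_1 tests fold to the constant 1.0 because the upgraded sequence only ever writes lane 0: once an insertelement pins lane 1 to a known constant, the final extract of lane 1 sees straight through the lane-0 insert. A reduced illustration, with a hypothetical function name:

define float @extract_untouched_lane(<4 x float> %v, float %x) {
  ; pin lane 1 to a known constant, then overwrite only lane 0
  %1 = insertelement <4 x float> %v, float 1.000000e+00, i32 1
  %2 = insertelement <4 x float> %1, float %x, i64 0
  ; InstCombine folds this extract to ret float 1.000000e+00
  %3 = extractelement <4 x float> %2, i32 1
  ret float %3
}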

