 llvm/include/llvm/IR/IntrinsicsX86.td      |  37 ++++
 llvm/lib/Target/X86/X86ISelLowering.cpp    |  24 +++
 llvm/lib/Target/X86/X86InstrAVX512.td      |  18 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h    |  11 +-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll | 186 ++++++++++++
 5 files changed, 265 insertions(+), 11 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index f66e2c829f7..033edc3d067 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -3941,6 +3941,43 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
           llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vfmadd_sd :
+        GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vfmadd_ss :
+        GCCBuiltin<"__builtin_ia32_vfmaddss3_mask">,
+          Intrinsic<[llvm_v4f32_ty],
+          [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_maskz_vfmadd_sd :
+        GCCBuiltin<"__builtin_ia32_vfmaddsd3_maskz">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_maskz_vfmadd_ss :
+        GCCBuiltin<"__builtin_ia32_vfmaddss3_maskz">,
+          Intrinsic<[llvm_v4f32_ty],
+          [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask3_vfmadd_sd :
+        GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask3">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask3_vfmadd_ss :
+        GCCBuiltin<"__builtin_ia32_vfmaddss3_mask3">,
+          Intrinsic<[llvm_v4f32_ty],
+          [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask3_vfmsub_pd_128 :
          GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
           Intrinsic<[llvm_v2f64_ty],
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c97d304b97c..c0e8a72d1f9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16938,6 +16938,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
                                               Src1, Src2, Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case FMA_OP_SCALAR_MASK:
+    case FMA_OP_SCALAR_MASK3:
+    case FMA_OP_SCALAR_MASKZ: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      MVT VT = Op.getSimpleValueType();
+      SDValue PassThru = SDValue();
+
+      // set PassThru element
+      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
+        PassThru = Src3;
+      else
+        PassThru = Src1;
+
+      SDValue Rnd = Op.getOperand(5);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
+                                              Op.getValueType(), Src1, Src2,
+                                              Src3, Rnd),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
     case TERLOG_OP_MASK:
     case TERLOG_OP_MASKZ: {
       SDValue Src1 = Op.getOperand(1);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index ad19cf7c5b3..368651935a3 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4713,9 +4713,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                             string SUFF> {
   defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
-                (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
-                (_.VT (OpNode _.RC:$src2, _.RC:$src1,
-                         (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))),
+                (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
+                         (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
                 (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -4724,10 +4724,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                          (_.ScalarLdFrag addr:$src3))))>;
 
   defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
-                (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)),
-                (_.VT (OpNode _.RC:$src2,
+                (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnd _.RC:$src2,
                          (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
-                         _.RC:$src1)),
+                         _.RC:$src1, (i32 FROUND_CURRENT))),
                 (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -4736,10 +4736,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                          (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
 
   defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
-                (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)),
-                (_.VT (OpNode _.RC:$src1,
+                (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnd _.RC:$src1,
                          (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
-                         _.RC:$src2)),
+                         _.RC:$src2, (i32 FROUND_CURRENT))),
                 (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 7b35da17a9e..cc77f9161a2 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -27,8 +27,9 @@ enum IntrinsicType {
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
   INTR_TYPE_2OP_IMM8_MASK, INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
-  FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
-  VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+  FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
+  FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
+  VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
@@ -1748,6 +1749,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1898,6 +1901,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1970,6 +1975,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 12d400fd581..c21379f0840 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -7356,3 +7356,189 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
   %res2 = add i8 %res, %res1
   ret i8 %res2
 }
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+  %res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+  %res4 = fadd <2 x double> %res, %res1
+  %res5 = fadd <2 x double> %res2, %res3
+  %res6 = fadd <2 x double> %res4, %res5
+  ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+  %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+  %res4 = fadd <4 x float> %res, %res1
+  %res5 = fadd <4 x float> %res2, %res3
+  %res6 = fadd <4 x float> %res4, %res5
+  ret <4 x float> %res6
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+  %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+  %res4 = fadd <2 x double> %res, %res1
+  %res5 = fadd <2 x double> %res2, %res3
+  %res6 = fadd <2 x double> %res4, %res5
+  ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+  %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+  %res4 = fadd <4 x float> %res, %res1
+  %res5 = fadd <4 x float> %res2, %res3
+  %res6 = fadd <4 x float> %res4, %res5
+  ret <4 x float> %res6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %q = load float, float* %ptr_b
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+  %q = load float, float* %ptr_b
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+  ret <4 x float> %res
+}
+
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b, i8 %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %q = load float, float* %ptr_b
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
+  ret <4 x float> %res
+}
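For orientation, below is a minimal usage sketch that is not part of the patch. The declaration is copied from the test file above; the function name @fmadd_sd_example and its arguments are hypothetical. Following the lowering added in X86ISelLowering.cpp, the plain "mask" variant passes the first source operand through when the mask bit is clear, and the final i32 4 requests the current rounding mode (FROUND_CURRENT), while i32 3 in the tests selects round-toward-zero with suppressed exceptions ({rz-sae}).

; Hypothetical standalone example of calling the new masked scalar FMA
; intrinsic from LLVM IR.
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)

define <2 x double> @fmadd_sd_example(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
  ; Element 0: if bit 0 of %mask is set, a[0]*b[0]+c[0]; otherwise a[0] is
  ; passed through (the "mask" variant uses the first source as pass-through).
  ; The upper element follows the usual SD scalar convention and comes from %a.
  %r = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
  ret <2 x double> %r
}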