summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@gmail.com>2016-10-26 04:59:58 +0000
committerCraig Topper <craig.topper@gmail.com>2016-10-26 04:59:58 +0000
commit812d3d30aede80b5f9e5b1fc0484700ecaf1afef (patch)
tree029ecae74275eb078a716136ec5ad8ca1fa66b1e
parenta3b7527ccb4400be6978b6ed6283afc202ecd753 (diff)
downloadbcm5719-llvm-812d3d30aede80b5f9e5b1fc0484700ecaf1afef.tar.gz
bcm5719-llvm-812d3d30aede80b5f9e5b1fc0484700ecaf1afef.zip
[AVX-512] Add scalar vfmsub/vfnmsub mask3 intrinsics
Summary: Clang's intrinsic header currently tries to negate the third operand of a vfmadd mask3 in order to create vfmsub, but this fails isel. This patch adds scalar vfmsub and vfnmsub mask3 that we can use instead to avoid the negate. This is consistent with the packed instructions. Reviewers: igorb, delena Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D25933 llvm-svn: 285173
-rw-r--r--llvm/include/llvm/IR/IntrinsicsX86.td24
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h4
-rw-r--r--llvm/test/CodeGen/X86/avx512-intrinsics.ll112
3 files changed, 140 insertions, 0 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 05f7056be82..08209890802 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -3074,6 +3074,18 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask3_vfmsub_sd :
+ GCCBuiltin<"__builtin_ia32_vfmsubsd3_mask3">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask3_vfmsub_ss :
+ GCCBuiltin<"__builtin_ia32_vfmsubss3_mask3">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
def int_x86_avx512_mask3_vfmsub_pd_128 :
GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
Intrinsic<[llvm_v2f64_ty],
@@ -3182,6 +3194,18 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask3_vfnmsub_sd :
+ GCCBuiltin<"__builtin_ia32_vfnmsubsd3_mask3">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask3_vfnmsub_ss :
+ GCCBuiltin<"__builtin_ia32_vfnmsubss3_mask3">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
def int_x86_avx512_mask_vfnmsub_pd_128 :
GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">,
Intrinsic<[llvm_v2f64_ty],
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index fa2f613468a..a8d3a6c9cb5 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1521,6 +1521,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1539,6 +1541,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 0c739763f54..711a6e6c297 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -5694,6 +5694,118 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
ret <4 x float> %res6
}
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
+; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
+; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res4 = fadd <2 x double> %res, %res1
+ %res5 = fadd <2 x double> %res2, %res3
+ %res6 = fadd <2 x double> %res4, %res5
+ ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
+; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
+; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res4 = fadd <4 x float> %res, %res1
+ %res5 = fadd <4 x float> %res2, %res3
+ %res6 = fadd <4 x float> %res4, %res5
+ ret <4 x float> %res6
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
+; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
+; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfnmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res4 = fadd <2 x double> %res, %res1
+ %res5 = fadd <2 x double> %res2, %res3
+ %res6 = fadd <2 x double> %res4, %res5
+ ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
+; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
+; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfnmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res4 = fadd <4 x float> %res, %res1
+ %res5 = fadd <4 x float> %res2, %res3
+ %res6 = fadd <4 x float> %res4, %res5
+ ret <4 x float> %res6
+}
+
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; CHECK: ## BB#0:
OpenPOWER on IntegriCloud