author      Craig Topper <craig.topper@intel.com>   2018-07-16 23:10:58 +0000
committer   Craig Topper <craig.topper@intel.com>   2018-07-16 23:10:58 +0000
commit      6751727d7689c3c7975e6accd60a4b24aa817d44 (patch)
tree        d4f665b2c238dd6d733c4a2a52fc1a8637d74151
parent      cf2a9e28b1bb37181ae916043df155cede38ff18 (diff)
download    bcm5719-llvm-6751727d7689c3c7975e6accd60a4b24aa817d44.tar.gz
            bcm5719-llvm-6751727d7689c3c7975e6accd60a4b24aa817d44.zip
[X86] Add a missing FMA3 scalar intrinsic pattern.
This allows us to use the 231 form to fold an insertelement on the add input to the fma. There is technically no software intrinsic that can use this until AVX512F, but it can be manually built up from other intrinsics.

llvm-svn: 337223
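As a rough illustration of "manually built up from other intrinsics": the following C sketch (not part of the patch; the function name fmadd_ss_231 and the use of fmaf are assumptions for illustration, and it requires an FMA3-capable target, e.g. -mfma) produces the same shape as the IR in the new test below: a scalar fma whose result is inserted back into element 0 of the addend operand, which is what makes the 231 form a natural match.

#include <immintrin.h>
#include <math.h>

/* Sketch only: scalar fma on element 0, result re-inserted into the
 * addend vector c so its upper elements are preserved. */
__m128 fmadd_ss_231(__m128 a, __m128 b, __m128 c) {
    /* fmaf on the low elements corresponds to llvm.fma.f32 in the test IR. */
    float r = fmaf(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c));
    /* _mm_move_ss(c, x) takes element 0 from x and elements 1..3 from c,
     * i.e. the insertelement on the add input described above. */
    return _mm_move_ss(c, _mm_set_ss(r));
}

Whether this actually lowers to a single vfmadd231ss depends on the compiler turning the scalar fmaf call into an llvm.fma.f32 node; the TableGen pattern and test added here cover that IR form directly.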
-rw-r--r--   llvm/lib/Target/X86/X86InstrFMA.td            7
-rw-r--r--   llvm/test/CodeGen/X86/fma-intrinsics-x86.ll  31
2 files changed, 38 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 7a35d07495c..a559f62c8f3 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -341,6 +341,13 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
(VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
+ (Op RC:$src2, RC:$src3,
+ (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))),
+ (!cast<Instruction>(Prefix#"231"#Suffix#"r_Int")
+ VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)),
+ (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>;
+
+ def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector
(Op RC:$src2,
(EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
(mem_frag addr:$src3)))))),
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
index ec4936cf4ce..055e2bb16c1 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -64,6 +64,37 @@ define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1,
ret <4 x float> %5
}
+define <4 x float> @test_x86_fma_vfmadd_ss_231(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss_231:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 # encoding: [0xc4,0xe2,0x79,0xb9,0xd1]
+; CHECK-FMA-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2
+; CHECK-FMA-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss_231:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0xd1]
+; CHECK-AVX512VL-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2
+; CHECK-AVX512VL-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss_231:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%r8), %xmm0 # encoding: [0xc4,0xc1,0x78,0x28,0x00]
+; CHECK-FMA-WIN-NEXT: vmovss (%rcx), %xmm1 # encoding: [0xc5,0xfa,0x10,0x09]
+; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vfmadd231ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xb9,0x02]
+; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * mem) + xmm0
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
+ %1 = extractelement <4 x float> %a0, i64 0
+ %2 = extractelement <4 x float> %a1, i64 0
+ %3 = extractelement <4 x float> %a2, i64 0
+ %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
+ %5 = insertelement <4 x float> %a2, float %4, i64 0
+ ret <4 x float> %5
+}
+
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA: # %bb.0: