[X86] When pattern-matching scalar FMA3 intrinsics, don't re-arrange the first and second operands.

The semantics of the scalar FMA intrinsics are that the high vector elements are copied from the first source. The existing pattern switches src1 and src2 around, to match the "213" order, which ends up tying the original src2 to the dest. Since the actual scalar fma3 instructions copy the high elements from the dest register, the wrong values are copied. This modifies the pattern to leave src1 and src2 in their original order.

Differential Revision: http://reviews.llvm.org/D9908

llvm-svn: 238131
author: Michael Kuperstein <michael.m.kuperstein@intel.com> 2015-05-25 12:35:25 +0000
committer: Michael Kuperstein <michael.m.kuperstein@intel.com> 2015-05-25 12:35:25 +0000
commit: f1452286764e89f5bc5aa6407e166bd949f98b1b (patch)
tree: d148fa6c18f520119cff5dfe57b2ba0cb3e31347
parent: 1c1391ba2499c0255427aaf0d831103c6fcd108e (diff)
download: bcm5719-llvm-f1452286764e89f5bc5aa6407e166bd949f98b1b.tar.gz
bcm5719-llvm-f1452286764e89f5bc5aa6407e166bd949f98b1b.zip
2 files changed, 31 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 2993e42443d..7cc3b599a73 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -183,19 +183,24 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
   defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
                         FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
 
+// These patterns use the 123 ordering, instead of 213, even though
+// they match the intrinsic to the 213 version of the instruction.
+// This is because src1 is tied to dest, and the scalar intrinsics
+// require the pass-through values to come from the first source
+// operand, not the second.
   def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
             (COPY_TO_REGCLASS
               (!cast<Instruction>(NAME#"SSr213r")
-                (COPY_TO_REGCLASS $src2, FR32),
                 (COPY_TO_REGCLASS $src1, FR32),
+                (COPY_TO_REGCLASS $src2, FR32),
                 (COPY_TO_REGCLASS $src3, FR32)),
               VR128)>;
 
   def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
             (COPY_TO_REGCLASS
               (!cast<Instruction>(NAME#"SDr213r")
-                (COPY_TO_REGCLASS $src2, FR64),
                 (COPY_TO_REGCLASS $src1, FR64),
+                (COPY_TO_REGCLASS $src2, FR64),
                 (COPY_TO_REGCLASS $src3, FR64)),
               VR128)>;
 }
diff --git a/llvm/test/CodeGen/X86/fma3-intrinsics.ll b/llvm/test/CodeGen/X86/fma3-intrinsics.ll
index 9a25096c7a5..fa9c252f30b 100644
--- a/llvm/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fma3-intrinsics.ll
@@ -3,7 +3,9 @@
 ; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
 
 define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fmadd213ss (%r8), %xmm
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -24,7 +26,9 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
 
 define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fnmadd213ss (%r8), %xmm
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -46,7 +50,9 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x
 
 
 define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fmsub213ss
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -60,7 +66,9 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
 
 define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ; CHECK: fnmsub213ss
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]]
   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
   ret <4 x float> %res
 }
@@ -76,7 +84,9 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
 ;;;;
 
 define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fmadd213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmadd213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
@@ -90,7 +100,9 @@ define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x
 declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
 
 define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fnmadd213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
@@ -106,7 +118,9 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d
 
 
 define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fmsub213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
@@ -120,7 +134,9 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
 declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
 
 define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-  ; CHECK: fnmsub213sd
+  ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
+  ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
+  ; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]]
   %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
   ret <2 x double> %res
 }
author	Michael Kuperstein <michael.m.kuperstein@intel.com>	2015-05-25 12:35:25 +0000
committer	Michael Kuperstein <michael.m.kuperstein@intel.com>	2015-05-25 12:35:25 +0000
commit	f1452286764e89f5bc5aa6407e166bd949f98b1b (patch)
tree	d148fa6c18f520119cff5dfe57b2ba0cb3e31347
parent	1c1391ba2499c0255427aaf0d831103c6fcd108e (diff)
download	bcm5719-llvm-f1452286764e89f5bc5aa6407e166bd949f98b1b.tar.gz bcm5719-llvm-f1452286764e89f5bc5aa6407e166bd949f98b1b.zip