summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
authorAndrew Kaylor <andrew.kaylor@intel.com>2015-11-04 18:10:41 +0000
committerAndrew Kaylor <andrew.kaylor@intel.com>2015-11-04 18:10:41 +0000
commite41a8c4182f403b4f98f6b03f1803484a1f60201 (patch)
tree2622019fe649a2f9f562157ac145937b27004552 /llvm/test/CodeGen
parente028e4e3bbfe631b62f9c0127777ed9c56dfc4b7 (diff)
downloadbcm5719-llvm-e41a8c4182f403b4f98f6b03f1803484a1f60201.tar.gz
bcm5719-llvm-e41a8c4182f403b4f98f6b03f1803484a1f60201.zip
Created new X86 FMA3 opcodes (FMA*_Int) that are used now for lowering of scalar FMA intrinsics.
Patch by Slava Klochkov The key difference between FMA* and FMA*_Int opcodes is that FMA*_Int opcodes are handled more conservatively. It is illegal to commute the 1st operand of FMA*_Int instructions as the upper bits of scalar FMA intrinsic result must be taken from the 1st operand, but such commute transformation would change those upper bits and invalidate the intrinsic's result. Reviewers: Quentin Colombet, Elena Demikhovsky Differential Revision: http://reviews.llvm.org/D13710 llvm-svn: 252060
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll499
-rw-r--r--llvm/test/CodeGen/X86/fma-intrinsics-x86.ll688
2 files changed, 939 insertions, 248 deletions
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
index f7d0cdf3c65..8d0318bb93e 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
@@ -1,8 +1,337 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
-; CHECK-LABEL: fmaddsubpd_loop:
-; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: fmaddsubpd_loop_128:
+; CHECK: vfmaddsub231pd %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <2 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddpd_loop_128:
+; CHECK: vfmsubadd231pd %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <2 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddpd_loop_128:
+; CHECK: vfmadd231pd %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <2 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubpd_loop_128:
+; CHECK: vfmsub231pd %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <2 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fnmaddpd_loop_128:
+; CHECK: vfnmadd231pd %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <2 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fnmsubpd_loop_128:
+; CHECK: vfnmsub231pd %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <2 x double> %c.addr.0
+}
+
+declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+
+; CHECK-LABEL: fmaddsubps_loop_128:
+; CHECK: vfmaddsub231ps %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddps_loop_128:
+; CHECK: vfmsubadd231ps %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddps_loop_128:
+; CHECK: vfmadd231ps %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubps_loop_128:
+; CHECK: vfmsub231ps %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fnmaddps_loop_128:
+; CHECK: vfnmadd231ps %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fnmsubps_loop_128:
+; CHECK: vfnmsub231ps %xmm1, %xmm0, %xmm2
+; CHECK: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x float> %c.addr.0
+}
+
+declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+; CHECK-LABEL: fmaddsubpd_loop_256:
+; CHECK: vfmaddsub231pd %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
br label %for.cond
@@ -24,9 +353,11 @@ for.end:
ret <4 x double> %c.addr.0
}
-; CHECK-LABEL: fmsubaddpd_loop:
-; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: fmsubaddpd_loop_256:
+; CHECK: vfmsubadd231pd %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
br label %for.cond
@@ -48,9 +379,11 @@ for.end:
ret <4 x double> %c.addr.0
}
-; CHECK-LABEL: fmaddpd_loop:
-; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: fmaddpd_loop_256:
+; CHECK: vfmadd231pd %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
br label %for.cond
@@ -72,9 +405,11 @@ for.end:
ret <4 x double> %c.addr.0
}
-; CHECK-LABEL: fmsubpd_loop:
-; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+; CHECK-LABEL: fmsubpd_loop_256:
+; CHECK: vfmsub231pd %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
br label %for.cond
@@ -96,15 +431,71 @@ for.end:
ret <4 x double> %c.addr.0
}
+; CHECK-LABEL: fnmaddpd_loop_256:
+; CHECK: vfnmadd231pd %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fnmsubpd_loop_256:
+; CHECK: vfnmsub231pd %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
-; CHECK-LABEL: fmaddsubps_loop:
-; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: fmaddsubps_loop_256:
+; CHECK: vfmaddsub231ps %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
br label %for.cond
@@ -126,9 +517,11 @@ for.end:
ret <8 x float> %c.addr.0
}
-; CHECK-LABEL: fmsubaddps_loop:
-; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: fmsubaddps_loop_256:
+; CHECK: vfmsubadd231ps %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
br label %for.cond
@@ -150,9 +543,11 @@ for.end:
ret <8 x float> %c.addr.0
}
-; CHECK-LABEL: fmaddps_loop:
-; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: fmaddps_loop_256:
+; CHECK: vfmadd231ps %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
br label %for.cond
@@ -174,9 +569,11 @@ for.end:
ret <8 x float> %c.addr.0
}
-; CHECK-LABEL: fmsubps_loop:
-; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: fmsubps_loop_256:
+; CHECK: vfmsub231ps %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
br label %for.cond
@@ -198,7 +595,61 @@ for.end:
ret <8 x float> %c.addr.0
}
+; CHECK-LABEL: fnmaddps_loop_256:
+; CHECK: vfnmadd231ps %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fnmsubps_loop_256:
+; CHECK: vfnmsub231ps %ymm1, %ymm0, %ymm2
+; CHECK: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
index 881436386ba..d2d41c766cb 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -1,95 +1,149 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-pc-windows -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
; VFMADD
define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdi)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
+
+define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
+ ret <4 x float> %res
+}
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
+
+define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
+ ret <2 x double> %res
+}
declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -97,90 +151,144 @@ declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4
; VFMSUB
define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
+
+define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
+ ret <4 x float> %res
+}
declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
+
+define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
+ ret <2 x double> %res
+}
declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -188,90 +296,144 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4
; VFNMADD
define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
+
+define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
+ ret <4 x float> %res
+}
declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
+
+define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
+ ret <2 x double> %res
+}
declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -279,90 +441,144 @@ declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4
; VFNMSUB
define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
+
+define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
+ ret <4 x float> %res
+}
declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
+
+define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
+;
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
+;
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
+;
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
+ ret <2 x double> %res
+}
declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -370,60 +586,72 @@ declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4
; VFMADDSUB
define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -431,60 +659,72 @@ declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>,
; VFMSUBADD
define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
+;
; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+;
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
-; CHECK-FMA: # BB#0:
+; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-NEXT: # BB#0:
+;
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
+;
; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
;
-; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256:
-; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+;
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
OpenPOWER on IntegriCloud