| | | |
|---|---|---|
| author | Ahmed Bougacha <ahmed.bougacha@gmail.com> | 2015-09-02 23:25:39 +0000 |
| committer | Ahmed Bougacha <ahmed.bougacha@gmail.com> | 2015-09-02 23:25:39 +0000 |
| commit | b03ea02479c82430b4149609e0ac3e0490d5ca12 (patch) | |
| tree | 1eba0a16380443680a0e0d9fd40640fbd1b7cb8e | /llvm/test/CodeGen |
| parent | 78425200ee5a51df88ed62eafc51ae70a5ecc112 (diff) | |
| download | bcm5719-llvm-b03ea02479c82430b4149609e0ac3e0490d5ca12.tar.gz, bcm5719-llvm-b03ea02479c82430b4149609e0ac3e0490d5ca12.zip | |
[X86] Require 32-byte alignment for 32-byte VMOVNTs.
We used to accept (and even test, and generate) 16-byte alignment
for 32-byte nontemporal stores, but they require 32-byte alignment,
per SDM. Found by inspection.
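For context, the store in question is the one behind `_mm256_stream_ps`: a 32-byte vmovntps whose memory operand, per the SDM, must be 32-byte aligned. A minimal C++ sketch of that constraint (illustrative only, not part of this patch):

```cpp
#include <immintrin.h>

// vmovntps with a %ymm source writes 32 bytes nontemporally; the SDM requires
// the destination to be 32-byte aligned, otherwise the store faults (#GP).
void nt_store_v8f32(float *dst, __m256 v) {
  // dst is assumed to be 32-byte aligned here; 16-byte alignment is not
  // enough, which is what the test updates below reflect (align 16 -> 32).
  _mm256_stream_ps(dst, v); // typically lowers to: vmovntps %ymm0, (%rdi)
}
```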
Instead of hardcoding 16 in the patfrag, check for natural alignment.
Also fix the autoupgrade and the various tests.
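The patfrag change itself is outside this diffstat (which is limited to llvm/test/CodeGen). As a rough sketch of what checking natural alignment on the store node could look like, assuming the standard SelectionDAG accessors (the helper name and placement are illustrative, not the actual patch):

```cpp
#include "llvm/CodeGen/SelectionDAGNodes.h"

// Illustrative helper, not the actual patch: a nontemporal vector store is
// only eligible for a VMOVNT* pattern when it is naturally aligned, i.e. its
// alignment is at least the size of the stored type, rather than a fixed 16.
static bool isNaturallyAlignedNonTemporalStore(const llvm::StoreSDNode *St) {
  return St->isNonTemporal() &&
         St->getAlignment() >= St->getMemoryVT().getStoreSize();
}
```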
Also, use explicit -mattr instead of -mcpu: I stared at the output for
several minutes wondering why I got 2x movntps for the unaligned case
(which is the ideal output, but needs some work: see the FIXME), until I
remembered that corei7-avx implies +slow-unaligned-mem-32.
llvm-svn: 246733
Diffstat (limited to 'llvm/test/CodeGen')
| | | |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/avx2-nontemporal.ll | 6 |
| -rw-r--r-- | llvm/test/CodeGen/X86/movntdq-no-avx.ll | 2 |
| -rw-r--r-- | llvm/test/CodeGen/X86/nontemporal-2.ll | 21 |
3 files changed, 22 insertions, 7 deletions
diff --git a/llvm/test/CodeGen/X86/avx2-nontemporal.ll b/llvm/test/CodeGen/X86/avx2-nontemporal.ll
index 544c096c52d..058358f13b8 100644
--- a/llvm/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/llvm/test/CodeGen/X86/avx2-nontemporal.ll
@@ -4,15 +4,15 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E) {
 ; CHECK: vmovntps %y
   %cast = bitcast i8* %B to <8 x float>*
   %A2 = fadd <8 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
-  store <8 x float> %A2, <8 x float>* %cast, align 16, !nontemporal !0
+  store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
 ; CHECK: vmovntdq %y
   %cast1 = bitcast i8* %B to <4 x i64>*
   %E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
-  store <4 x i64> %E2, <4 x i64>* %cast1, align 16, !nontemporal !0
+  store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
 ; CHECK: vmovntpd %y
   %cast2 = bitcast i8* %B to <4 x double>*
   %C2 = fadd <4 x double> %C, <double 0x0, double 0x0, double 0x0, double 0x4200000000000000>
-  store <4 x double> %C2, <4 x double>* %cast2, align 16, !nontemporal !0
+  store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/movntdq-no-avx.ll b/llvm/test/CodeGen/X86/movntdq-no-avx.ll
index cc35e201e6b..2bf09dd6f58 100644
--- a/llvm/test/CodeGen/X86/movntdq-no-avx.ll
+++ b/llvm/test/CodeGen/X86/movntdq-no-avx.ll
@@ -5,7 +5,7 @@ define void @test(<2 x i64>* nocapture %a, <2 x i64> %b) nounwind optsize {
 entry:
-  store <2 x i64> %b, <2 x i64>* %a, align 16, !nontemporal !0
+  store <2 x i64> %b, <2 x i64>* %a, align 32, !nontemporal !0
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll
index 8c08b3c163c..c9767f88488 100644
--- a/llvm/test/CodeGen/X86/nontemporal-2.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
 
 ; Make sure that we generate non-temporal stores for the test cases below.
 ; We use xorps for zeroing, so domain information isn't available anymore.
@@ -300,4 +300,19 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
   ret void
 }
 
+; 256-bit NT stores require 256-bit alignment.
+; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
+; could even scalarize to movnti when we have 1-alignment: nontemporal is
+; probably always worth even some 20 instruction scalarization.
+define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
+; CHECK-LABEL: test_unaligned_v8f32:
+; SSE: movntps %xmm
+; SSE: movntps %xmm
+; AVX-NOT: movnt
+; AVX: vmovups %ymm
+  %r = fadd <8 x float> %a, %b
+  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
 !1 = !{i32 1}

