author     Jim Grosbach <grosbach@apple.com>   2014-12-23 00:35:23 +0000
committer  Jim Grosbach <grosbach@apple.com>   2014-12-23 00:35:23 +0000
commit     1bd0f3530ec14748b45639c1bf6dc0213bde8111 (patch)
tree       ed93e2f9b8d12e8b27137911ba72e7a15e0cb2f2
parent     ce0093344fa1f9a10831038bfd47703b699db5f4 (diff)
X86: Don't over-align combined loads.
When combining consecutive loads+inserts into a single vector load, we
should keep the alignment of the base load. Doing otherwise can, and does,
lead to using overly aligned instructions. In the included test case, for
example, using a 32-byte vmovaps on a 16-byte aligned value. Oops.

rdar://19190968

llvm-svn: 224746
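The policy change is small but easy to misread in the diff below, so here is a
minimal standalone sketch of it. The names and the helper are assumptions made
for illustration only; this is not the SelectionDAG API, just the alignment
rule the patch enforces: the merged wide load keeps the base load's recorded
alignment instead of being treated as naturally aligned for the wide vector
type.

// Sketch only -- assumed names, not the actual SelectionDAG interface.
#include <cstdint>

struct MergedLoadInputs {
  uint64_t InferredPtrAlign; // what alignment analysis can prove for the pointer
  uint64_t BaseLoadAlign;    // alignment recorded on the original base load
};

// Old behavior (simplified): once the pointer looked at least 16-byte aligned,
// the merged load was created with alignment 0, which SelectionDAG interprets
// as "natural alignment of the type" -- 32 bytes for a 256-bit vector. That is
// an over-claim when only 16 bytes are actually guaranteed, and it lets the
// backend pick an aligned move such as vmovaps.
uint64_t oldMergedAlign(const MergedLoadInputs &In, uint64_t VectorTypeAlign) {
  return In.InferredPtrAlign >= 16 ? VectorTypeAlign : In.BaseLoadAlign;
}

// New behavior: always keep the base load's alignment, so a 16-byte-aligned
// source yields an unaligned move such as vmovups.
uint64_t newMergedAlign(const MergedLoadInputs &In) {
  return In.BaseLoadAlign;
}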
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp             11
-rw-r--r--  llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll   35
2 files changed, 38 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7440b5decbb..f154bd66556 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6042,15 +6042,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
     SDValue NewLd = SDValue();
-    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
-      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                          LDBase->getPointerInfo(),
-                          LDBase->isVolatile(), LDBase->isNonTemporal(),
-                          LDBase->isInvariant(), 0);
     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                        LDBase->getPointerInfo(),
-                        LDBase->isVolatile(), LDBase->isNonTemporal(),
-                        LDBase->isInvariant(), LDBase->getAlignment());
+                        LDBase->getPointerInfo(), LDBase->isVolatile(),
+                        LDBase->isNonTemporal(), LDBase->isInvariant(),
+                        LDBase->getAlignment());
     if (LDBase->hasAnyUseOfValue(1)) {
       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
diff --git a/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll b/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll
new file mode 100644
index 00000000000..6aa2adb228e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
+
+@e = global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8], align 16
+@d = global [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
+
+; The global 'e' has 16 byte alignment, so make sure we don't generate an
+; aligned 32-byte load instruction when we combine the load+insert sequence.
+
+define i32 @subb() nounwind ssp {
+; CHECK-LABEL: subb:
+; CHECK: vmovups e(%rip), %ymm
+entry:
+  %0 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 7), align 4
+  %1 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 6), align 8
+  %2 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 5), align 4
+  %3 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 4), align 16
+  %4 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 3), align 4
+  %5 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 2), align 8
+  %6 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 1), align 4
+  %7 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 0), align 16
+  %vecinit.i = insertelement <8 x i32> undef, i32 %7, i32 0
+  %vecinit1.i = insertelement <8 x i32> %vecinit.i, i32 %6, i32 1
+  %vecinit2.i = insertelement <8 x i32> %vecinit1.i, i32 %5, i32 2
+  %vecinit3.i = insertelement <8 x i32> %vecinit2.i, i32 %4, i32 3
+  %vecinit4.i = insertelement <8 x i32> %vecinit3.i, i32 %3, i32 4
+  %vecinit5.i = insertelement <8 x i32> %vecinit4.i, i32 %2, i32 5
+  %vecinit6.i = insertelement <8 x i32> %vecinit5.i, i32 %1, i32 6
+  %vecinit7.i = insertelement <8 x i32> %vecinit6.i, i32 %0, i32 7
+  %8 = bitcast <8 x i32> %vecinit7.i to <32 x i8>
+  tail call void @llvm.x86.avx.storeu.dq.256(i8* bitcast ([8 x i32]* @d to i8*), <32 x i8> %8)
+  ret i32 0
+}
+
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
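For readers less familiar with the AVX move variants the message and test refer
to: a 256-bit vmovaps requires its memory operand to be 32-byte aligned and
raises #GP otherwise, while vmovups has no alignment requirement. The following
standalone snippet, which is not part of this patch and only illustrates that
distinction with compiler intrinsics, mirrors the test's situation of a
16-byte-aligned array read as a 256-bit vector (build with AVX enabled, e.g.
-mavx2).

// Illustration only: aligned vs. unaligned 256-bit loads from a 16-byte-aligned buffer.
#include <immintrin.h>
#include <cstdio>

int main() {
  // Only 16-byte alignment is guaranteed here, like the global @e in the test.
  alignas(16) int e[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  // Unaligned 256-bit load (vmovups/vmovdqu): always legal, and what the fixed
  // compiler should emit for a source known to be only 16-byte aligned.
  __m256i ok = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(e));

  // Aligned 256-bit load (vmovaps/vmovdqa) would fault if the address is not
  // 32-byte aligned, which is exactly why the combined load must not claim
  // 32-byte alignment. Left commented out because it may crash:
  // __m256i bad = _mm256_load_si256(reinterpret_cast<const __m256i *>(e));

  printf("%d\n", _mm256_extract_epi32(ok, 0));
  return 0;
}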