Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 24 +++++++++++++++++++-----
-rw-r--r--  llvm/test/CodeGen/X86/chain_order.ll           | 16 ++++++++--------
-rw-r--r--  llvm/test/CodeGen/X86/vec_loadsingles.ll       | 31 +++++++++++++++++++++++++--
3 files changed, 56 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f9131e7f2b2..20eaa2965e2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6553,11 +6553,25 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
}
- // Handle X+C
- if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
- cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
- return true;
-
+ // Handle X + C.
+ if (isBaseWithConstantOffset(Loc)) {
+ int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+ if (Loc.getOperand(0) == BaseLoc) {
+ // If the base location is a simple address with no offset itself, then
+ // the second load's first add operand should be the base address.
+ if (LocOffset == Dist * (int)Bytes)
+ return true;
+ } else if (isBaseWithConstantOffset(BaseLoc)) {
+ // The base location itself has an offset, so subtract that value from the
+ // second load's offset before comparing to distance * size.
+ int64_t BOffset =
+ cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+ if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+ if ((LocOffset - BOffset) == Dist * (int)Bytes)
+ return true;
+ }
+ }
+ }
const GlobalValue *GV1 = nullptr;
const GlobalValue *GV2 = nullptr;
int64_t Offset1 = 0;
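The new logic above reduces to a simple arithmetic test: two loads that share a base pointer are Dist elements apart exactly when the difference of their constant byte offsets equals Dist * Bytes. A minimal standalone sketch of that comparison (plain C++ with illustrative names; this is not LLVM's SDValue-based implementation):

#include <cassert>
#include <cstdint>

// Stand-in for the check above: given two addresses of the form
// Base + Offset with a common Base, decide whether Loc is exactly Dist
// elements of size Bytes past BaseLoc. This mirrors only the new
// (LocOffset - BOffset) == Dist * Bytes comparison.
static bool isConsecutiveOffset(int64_t LocOffset, int64_t BaseOffset,
                                int Dist, unsigned Bytes) {
  return LocOffset - BaseOffset == int64_t(Dist) * int64_t(Bytes);
}

int main() {
  // Doubles at indices 4 and 5 (byte offsets 32 and 40) are 1 element apart.
  assert(isConsecutiveOffset(40, 32, /*Dist=*/1, /*Bytes=*/8));
  // Indices 4 and 7 (offsets 32 and 56) are 3 elements apart.
  assert(isConsecutiveOffset(56, 32, /*Dist=*/3, /*Bytes=*/8));
  // Offsets 32 and 44 are not a whole number of 8-byte elements apart.
  assert(!isConsecutiveOffset(44, 32, /*Dist=*/1, /*Bytes=*/8));
  return 0;
}

Note that the pre-patch code only handled the case where BaseLoc had no offset of its own, which is why loads starting at a nonzero offset from the base (as in the new test below) were not recognized as consecutive.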
diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll
index c88726e75a8..72e6f78bdef 100644
--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@@ -1,13 +1,13 @@
; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s
-;CHECK-LABEL: cftx020:
-;CHECK: vmovsd (%rdi), %xmm{{.*}}
-;CHECK: vmovsd 16(%rdi), %xmm{{.*}}
-;CHECK: vmovsd 24(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd 8(%rdi), %xmm{{.*}}
-;CHECK: vmovupd %xmm{{.*}}, (%rdi)
-;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
-;CHECK: ret
+; CHECK-LABEL: cftx020:
+; CHECK: vmovsd (%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 24(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 8(%rdi), %xmm{{.*}}
+; CHECK: vmovupd %xmm{{.*}}, (%rdi)
+; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi)
+; CHECK: ret
; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
define void @cftx020(double* nocapture %a) {
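The substantive change in this hunk is the switch from plain CHECK to CHECK-NEXT on the merged loads: CHECK matches anywhere at or after the previous match, while CHECK-NEXT requires the pattern on the immediately following output line, so the test now also pins down instruction ordering. A tiny illustration with hypothetical llc output (standard FileCheck directives):

; Hypothetical llc output:
;   vmovsd (%rdi), %xmm0
;   vmovsd 16(%rdi), %xmm1
;   vzeroupper
;   retq
;
; CHECK: vmovsd (%rdi)
; CHECK-NEXT: vmovsd 16(%rdi)
; CHECK: retq
;
; The CHECK-NEXT passes only because the second load is on the very next
; line; the final CHECK still passes even though vzeroupper sits between
; it and the previous match.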
diff --git a/llvm/test/CodeGen/X86/vec_loadsingles.ll b/llvm/test/CodeGen/X86/vec_loadsingles.ll
index af4d6fa61fd..fd132a52b8f 100644
--- a/llvm/test/CodeGen/X86/vec_loadsingles.ll
+++ b/llvm/test/CodeGen/X86/vec_loadsingles.ll
@@ -89,7 +89,7 @@ define <8 x float> @merge_8_floats(float* %ptr) {
; FAST32-NEXT: retq
; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
; SLOW32-NEXT: retq
}
@@ -112,7 +112,34 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
; FAST32-NEXT: retq
; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; Recognize and combine consecutive loads even when the
+; first of the combined loads is offset from the base address.
+define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+ %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
+ %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
+ %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
+ %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
+ %e = load double* %arrayidx4, align 8
+ %f = load double* %arrayidx5, align 8
+ %g = load double* %arrayidx6, align 8
+ %h = load double* %arrayidx7, align 8
+ %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
+ %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
+ %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
+ %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
+ ret <4 x double> %vecinit7
+
+; ALL-LABEL: merge_4_doubles_offset
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
; SLOW32-NEXT: retq
}
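For reference, the transformation the FAST32 checks expect (a single vmovups) is equivalent to replacing the four scalar loads with one vector load of the slice starting at element 4. A hand-written IR equivalent, in the same typed-pointer syntax as the test above (illustrative only; the combine happens in the SelectionDAG, not at the IR level):

define <4 x double> @merge_4_doubles_offset_byhand(double* %ptr) {
  %p4 = getelementptr inbounds double* %ptr, i64 4
  %vp = bitcast double* %p4 to <4 x double>*
  %v = load <4 x double>* %vp, align 8
  ret <4 x double> %v
}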