Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 24
-rw-r--r--  llvm/test/CodeGen/X86/chain_order.ll           | 16
-rw-r--r--  llvm/test/CodeGen/X86/vec_loadsingles.ll       | 31
3 files changed, 56 insertions, 15 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f9131e7f2b2..20eaa2965e2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6553,11 +6553,25 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
     return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
   }
 
-  // Handle X+C
-  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
-      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
-    return true;
-
+  // Handle X + C.
+  if (isBaseWithConstantOffset(Loc)) {
+    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+    if (Loc.getOperand(0) == BaseLoc) {
+      // If the base location is a simple address with no offset itself, then
+      // the second load's first add operand should be the base address.
+      if (LocOffset == Dist * (int)Bytes)
+        return true;
+    } else if (isBaseWithConstantOffset(BaseLoc)) {
+      // The base location itself has an offset, so subtract that value from
+      // the second load's offset before comparing to distance * size.
+      int64_t BOffset =
+        cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+        if ((LocOffset - BOffset) == Dist * (int)Bytes)
+          return true;
+      }
+    }
+  }
   const GlobalValue *GV1 = nullptr;
   const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll
index c88726e75a8..72e6f78bdef 100644
--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s
 
-;CHECK-LABEL: cftx020:
-;CHECK: vmovsd (%rdi), %xmm{{.*}}
-;CHECK: vmovsd 16(%rdi), %xmm{{.*}}
-;CHECK: vmovsd 24(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd 8(%rdi), %xmm{{.*}}
-;CHECK: vmovupd %xmm{{.*}}, (%rdi)
-;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
-;CHECK: ret
+; CHECK-LABEL: cftx020:
+; CHECK: vmovsd (%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 24(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 8(%rdi), %xmm{{.*}}
+; CHECK: vmovupd %xmm{{.*}}, (%rdi)
+; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi)
+; CHECK: ret
 
 ; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
 define void @cftx020(double* nocapture %a) {
diff --git a/llvm/test/CodeGen/X86/vec_loadsingles.ll b/llvm/test/CodeGen/X86/vec_loadsingles.ll
index af4d6fa61fd..fd132a52b8f 100644
--- a/llvm/test/CodeGen/X86/vec_loadsingles.ll
+++ b/llvm/test/CodeGen/X86/vec_loadsingles.ll
@@ -89,7 +89,7 @@ define <8 x float> @merge_8_floats(float* %ptr) {
 ; FAST32-NEXT: retq
 
 ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
 ; SLOW32-NEXT: retq
 }
 
@@ -112,7 +112,34 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
 ; FAST32-NEXT: retq
 
 ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; Recognize and combine consecutive loads even when the
+; first of the combined loads is offset from the base address.
+define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+  %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
+  %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
+  %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
+  %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
+  %e = load double* %arrayidx4, align 8
+  %f = load double* %arrayidx5, align 8
+  %g = load double* %arrayidx6, align 8
+  %h = load double* %arrayidx7, align 8
+  %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
+  %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
+  %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
+  %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
+  ret <4 x double> %vecinit7
+
+; ALL-LABEL: merge_4_doubles_offset
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
 ; SLOW32-NEXT: retq
 }
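
A minimal standalone sketch of the offset arithmetic this patch adds to isConsecutiveLoad(), written as plain C++ against integer inputs rather than the LLVM SDNode API; the helper name isConsecutiveOffsets and the main() driver are hypothetical. Two addresses of the form Base + LocOffset and Base + BaseOffset are Dist loads of Bytes bytes apart exactly when their constant offsets differ by Dist * Bytes, which is what the new else-if branch checks after subtracting the base load's own offset:

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the patched comparison: subtract the base
// load's own offset before comparing against distance * size.
static bool isConsecutiveOffsets(int64_t LocOffset, int64_t BaseOffset,
                                 int Dist, unsigned Bytes) {
  return (LocOffset - BaseOffset) == Dist * (int)Bytes;
}

int main() {
  // merge_4_doubles_offset loads doubles (Bytes = 8) at %ptr offsets
  // 32, 40, 48, 56. Relative to the base load at offset 32:
  assert(isConsecutiveOffsets(40, 32, 1, 8));   // element 5: consecutive
  assert(isConsecutiveOffsets(56, 32, 3, 8));   // element 7: three apart
  assert(!isConsecutiveOffsets(48, 32, 1, 8));  // 16-byte gap, not Dist = 1
  return 0;
}

The pre-patch code compared the second load's offset directly against Dist * Bytes, which implicitly assumed the base load had offset zero; that is why the loads in merge_4_doubles_offset, whose base load sits at %ptr + 32, were not merged before this change.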