Recommit r265309 after fixed an invalid memory reference bug happened

when DenseMap growed and moved memory. I verified it fixed the bootstrap problem on x86_64-linux-gnu but I cannot verify whether it fixes the bootstrap error on clang-ppc64be-linux. I will watch the build-bot result closely. Replace analyzeSiblingValues with new algorithm to fix its compile time issue. The patch is to solve PR17409 and its duplicates. analyzeSiblingValues is a N x N complexity algorithm where N is the number of siblings generated by reg splitting. Although it causes siginificant compile time issue when N is large, it is also important for performance since it removes redundent spills and enables rematerialization. To solve the compile time issue, the patch removes analyzeSiblingValues and replaces it with lower cost alternatives containing two parts. The first part creates a new spill hoisting method in postOptimization of register allocation. It does spill hoisting at once after all the spills are generated instead of inside every instance of selectOrSplit. The second part queries the define expr of the original register for rematerializaiton and keep it always available during register allocation even if it is already dead. It deletes those dead instructions only in postOptimization. With the two parts in the patch, it can remove analyzeSiblingValues without sacrificing performance. Differential Revision: http://reviews.llvm.org/D15302 llvm-svn: 265547
author: Wei Mi <wmi@google.com> 2016-04-06 15:41:07 +0000
committer: Wei Mi <wmi@google.com> 2016-04-06 15:41:07 +0000
commit: 18293bef4e4efaafff57da67cef87ea47dc26cae (patch)
tree: 979c8be87f939b1a7c5c65feeb71ed537bb3424c /llvm/test/CodeGen/X86
parent: 506f295a109918ae7449688e5d6eb0c024f895d0 (diff)
download: bcm5719-llvm-18293bef4e4efaafff57da67cef87ea47dc26cae.tar.gz
bcm5719-llvm-18293bef4e4efaafff57da67cef87ea47dc26cae.zip
4 files changed, 199 insertions, 4 deletions
diff --git a/llvm/test/CodeGen/X86/fp128-compare.ll b/llvm/test/CodeGen/X86/fp128-compare.ll
index b5d4fbe1b74..d9a48c5c13e 100644
--- a/llvm/test/CodeGen/X86/fp128-compare.ll
+++ b/llvm/test/CodeGen/X86/fp128-compare.ll
@@ -86,8 +86,8 @@ entry:
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
 ; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm1
 ; CHECK: movaps %xmm0
+; CHECK: movaps %xmm1
 ; CHECK: callq __gttf2
 ; CHECK: movaps {{.*}}, %xmm0
 ; CHECK: testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/hoist-spill.ll b/llvm/test/CodeGen/X86/hoist-spill.ll
new file mode 100644
index 00000000000..db9c4105a02
--- /dev/null
+++ b/llvm/test/CodeGen/X86/hoist-spill.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+
+; grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/); exit -1}'
+; Check no spills to the same stack slot after hoisting.
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32*, align 8
+@b = external global i32, align 4
+@d = external global i32*, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @fn1(i32 %p1) {
+entry:
+  %tmp = load i32*, i32** @d, align 8
+  %tmp1 = load i32*, i32** @a, align 8
+  %tmp2 = sext i32 %p1 to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc14, %entry
+  %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ]
+  %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ]
+  %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
+  %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
+  %tmp3 = icmp sgt i32 undef, 0
+  %smax52 = select i1 %tmp3, i32 undef, i32 0
+  %tmp4 = zext i32 %smax52 to i64
+  %tmp5 = icmp sgt i64 undef, %tmp4
+  %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+  %tmp6 = add nsw i64 %smax53, 1
+  %tmp7 = sub nsw i64 %tmp6, %tmp4
+  %tmp8 = add nsw i64 %tmp7, -8
+  %tmp9 = sub i32 undef, %indvar
+  %tmp10 = icmp sgt i64 %tmp2, 0
+  %smax40 = select i1 %tmp10, i64 %tmp2, i64 0
+  %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40
+  %indvars.iv30 = add i32 %indvars.iv30.in, -1
+  %tmp11 = icmp sgt i32 %indvars.iv30, 0
+  %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0
+  %tmp12 = zext i32 %smax to i64
+  %sub = sub nsw i32 %p1, %c.0
+  %cmp = icmp sgt i32 %sub, 0
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  %cmp326 = icmp sgt i32 %k.0, %p1
+  br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.cond
+  br label %for.body
+
+for.cond4.preheader:                              ; preds = %for.body, %for.cond
+  %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ]
+  %cmp528 = icmp sgt i32 %sub., %p1
+  br i1 %cmp528, label %for.inc14, label %for.body6.preheader
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br i1 undef, label %for.body6, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %for.body6.preheader
+  br i1 undef, label %for.body6, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %bound1 = icmp ule i32* undef, %scevgep41
+  %memcheck.conflict = and i1 undef, %bound1
+  br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %lcmp.mod = icmp eq i64 undef, 0
+  br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.prol:                                 ; preds = %vector.body.prol, %vector.body.preheader
+  %prol.iter.cmp = icmp eq i64 undef, 0
+  br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.preheader.split:                      ; preds = %vector.body.prol, %vector.body.preheader
+  %tmp13 = icmp ult i64 %tmp8, 24
+  br i1 %tmp13, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.body.preheader.split
+  %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ]
+  %index.next = add i64 %index, 8
+  %offset.idx.1 = add i64 %tmp12, %index.next
+  %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4
+  %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1
+  %tmp17 = bitcast i32* %tmp16 to <4 x i32>*
+  store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4
+  %index.next.3 = add i64 %index, 32
+  br i1 undef, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %vector.body.preheader.split
+  br i1 undef, label %for.inc14, label %for.body6
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ]
+  %add = add nsw i32 %k.127, 1
+  %tmp18 = load i32, i32* undef, align 4
+  store i32 %tmp18, i32* @b, align 4
+  br i1 undef, label %for.body, label %for.cond4.preheader
+
+for.body6:                                        ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
+  %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32
+  %tmp19 = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32
+  store i32 %tmp19, i32* %arrayidx10, align 4
+  %cmp5 = icmp slt i64 %indvars.iv32, undef
+  br i1 %cmp5, label %for.body6, label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6, %middle.block, %for.cond4.preheader
+  %inc15 = add nuw nsw i32 %c.0, 1
+  %indvar.next = add i32 %indvar, 1
+  br label %for.cond
+}
diff --git a/llvm/test/CodeGen/X86/new-remat.ll b/llvm/test/CodeGen/X86/new-remat.ll
new file mode 100644
index 00000000000..d06ac72e3ec
--- /dev/null
+++ b/llvm/test/CodeGen/X86/new-remat.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s | FileCheck %s
+; Check all spills are rematerialized.
+; CHECK-NOT: Spill
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global double 0.000000e+00, align 8
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @uniform_testdata(i32 %p1) {
+entry:
+  %cmp3 = icmp sgt i32 %p1, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = add i32 %p1, -1
+  %xtraiter = and i32 %p1, 7
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader
+
+for.body.prol.preheader:                          ; preds = %for.body.preheader
+  br label %for.body.prol
+
+for.body.prol:                                    ; preds = %for.body.prol, %for.body.prol.preheader
+  %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ]
+  %tmp1 = load double, double* @b, align 8
+  %call.prol = tail call double @pow(double %tmp1, double 2.500000e-01)
+  %inc.prol = add nuw nsw i32 %i.04.prol, 1
+  %prol.iter.sub = add i32 %prol.iter, -1
+  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+  br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol
+
+for.body.preheader.split.loopexit:                ; preds = %for.body.prol
+  %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ]
+  br label %for.body.preheader.split
+
+for.body.preheader.split:                         ; preds = %for.body.preheader.split.loopexit, %for.body.preheader
+  %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ]
+  %tmp2 = icmp ult i32 %tmp, 7
+  br i1 %tmp2, label %for.end.loopexit, label %for.body.preheader.split.split
+
+for.body.preheader.split.split:                   ; preds = %for.body.preheader.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.split.split
+  %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ]
+  %tmp3 = load double, double* @b, align 8
+  %call = tail call double @pow(double %tmp3, double 2.500000e-01)
+  %tmp4 = load double, double* @b, align 8
+  %call.1 = tail call double @pow(double %tmp4, double 2.500000e-01)
+  %inc.7 = add nsw i32 %i.04, 8
+  %exitcond.7 = icmp eq i32 %inc.7, %p1
+  br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body
+
+for.end.loopexit.unr-lcssa:                       ; preds = %for.body
+  br label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.end.loopexit.unr-lcssa, %for.body.preheader.split
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %tmp5 = load i32, i32* @a, align 4
+  ret i32 %tmp5
+}
+
+; Function Attrs: nounwind
+declare double @pow(double, double)
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index 46b65bd24fc..1d6b4f94731 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
 
 ; This testing case is reduced from 254.gap SyFgets function.
-; We make sure a spill is not hoisted to a hotter outer loop.
+; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
 
 %struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
 %struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
@@ -181,6 +181,10 @@ sw.bb474:
   br i1 %cmp476, label %if.end517, label %do.body479.preheader
 
 do.body479.preheader:
+  ; CHECK: do.body479.preheader
+  ; spill is hoisted here. Although loop depth1 is even hotter than loop depth2, do.body479.preheader is cold.
+  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; CHECK: land.rhs485
   %cmp4833314 = icmp eq i8 undef, 0
   br i1 %cmp4833314, label %if.end517, label %land.rhs485
 
@@ -200,8 +204,8 @@ land.lhs.true490:
 
 lor.rhs500:
   ; CHECK: lor.rhs500
-  ; Make sure that we don't hoist the spill to outer loops.
-  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; Make sure spill is hoisted to a cold preheader in outside loop.
+  ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp)
   ; CHECK: callq {{.*}}maskrune
   %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
   br i1 undef, label %land.lhs.true504, label %do.body479.backedge
author	Wei Mi <wmi@google.com>	2016-04-06 15:41:07 +0000
committer	Wei Mi <wmi@google.com>	2016-04-06 15:41:07 +0000
commit	18293bef4e4efaafff57da67cef87ea47dc26cae (patch)
tree	979c8be87f939b1a7c5c65feeb71ed537bb3424c /llvm/test/CodeGen/X86
parent	506f295a109918ae7449688e5d6eb0c024f895d0 (diff)
download	bcm5719-llvm-18293bef4e4efaafff57da67cef87ea47dc26cae.tar.gz bcm5719-llvm-18293bef4e4efaafff57da67cef87ea47dc26cae.zip