[UnrollAndJam] New Unroll and Jam pass

This is a simple implementation of the unroll-and-jam classical loop optimisation. The basic idea is that we take an outer loop of the form: for i.. ForeBlocks(i) for j.. SubLoopBlocks(i, j) AftBlocks(i) Instead of doing normal inner or outer unrolling, we unroll as follows: for i... i+=2 ForeBlocks(i) ForeBlocks(i+1) for j.. SubLoopBlocks(i, j) SubLoopBlocks(i+1, j) AftBlocks(i) AftBlocks(i+1) Remainder Loop So we have unrolled the outer loop, then jammed the two inner loops into one. This can lead to a simpler inner loop if memory accesses can be shared between the now jammed loops. To do this we have to prove that this is all safe, both for the memory accesses (using dependence analysis) and that ForeBlocks(i+1) can move before AftBlocks(i) and SubLoopBlocks(i, j). Differential Revision: https://reviews.llvm.org/D41953 llvm-svn: 336062
author: David Green <david.green@arm.com> 2018-07-01 12:47:30 +0000
committer: David Green <david.green@arm.com> 2018-07-01 12:47:30 +0000
commit: 963401d2be2d5ba1cfe3471f241f4c7585c33010 (patch)
tree: a24eb44f15e16e9b5240a358168503101ac9b5c3 /llvm/test/Transforms/LoopUnrollAndJam/pragma.ll
parent: 8dabda70afba00d82f30e83bf0d513076eba37ab (diff)
download: bcm5719-llvm-963401d2be2d5ba1cfe3471f241f4c7585c33010.tar.gz
bcm5719-llvm-963401d2be2d5ba1cfe3471f241f4c7585c33010.zip
1 files changed, 319 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/pragma.ll b/llvm/test/Transforms/LoopUnrollAndJam/pragma.ll
new file mode 100644
index 00000000000..d45a04cdea8
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/pragma.ll
@@ -0,0 +1,319 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime < %s -S | FileCheck %s
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-and-jam-threshold=15 < %s -S | FileCheck %s --check-prefix=CHECK-LOWTHRES
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: test1
+; Basic check that these loops are by default UnJ'd
+define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: nounroll_and_jam
+; #pragma nounroll_and_jam
+define void @nounroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !1
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unroll_and_jam_count
+; #pragma unroll_and_jam(8)
+define void @unroll_and_jam_count(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.7, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !3
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unroll_and_jam
+; #pragma unroll_and_jam
+define void @unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !5
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: nounroll
+; #pragma nounroll (which we take to mean disable unroll and jam too)
+define void @nounroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !7
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: unroll
+; #pragma unroll (which we take to mean disable unroll and jam)
+define void @unroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !9
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; CHECK-LABEL: nounroll_plus_unroll_and_jam
+; #pragma clang loop nounroll, unroll_and_jam (which we take to mean do unroll_and_jam)
+define void @nounroll_plus_unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
+; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmp122 = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmp122
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
+  %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %add.us = add i32 %0, %sum1.us
+  %inc.us = add nuw i32 %j.us, 1
+  %exitcond = icmp eq i32 %inc.us, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
+  %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
+  store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
+  %add8.us = add nuw i32 %i.us, 1
+  %exitcond25 = icmp eq i32 %add8.us, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !11
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = distinct !{!"llvm.loop.unroll_and_jam.disable"}
+!3 = distinct !{!3, !4}
+!4 = distinct !{!"llvm.loop.unroll_and_jam.count", i32 8}
+!5 = distinct !{!5, !6}
+!6 = distinct !{!"llvm.loop.unroll_and_jam.enable"}
+!7 = distinct !{!7, !8}
+!8 = distinct !{!"llvm.loop.unroll.disable"}
+!9 = distinct !{!9, !10}
+!10 = distinct !{!"llvm.loop.unroll.enable"}
+!11 = distinct !{!11, !8, !6}
+\ No newline at end of file
author	David Green <david.green@arm.com>	2018-07-01 12:47:30 +0000
committer	David Green <david.green@arm.com>	2018-07-01 12:47:30 +0000
commit	963401d2be2d5ba1cfe3471f241f4c7585c33010 (patch)
tree	a24eb44f15e16e9b5240a358168503101ac9b5c3 /llvm/test/Transforms/LoopUnrollAndJam/pragma.ll
parent	8dabda70afba00d82f30e83bf0d513076eba37ab (diff)
download	bcm5719-llvm-963401d2be2d5ba1cfe3471f241f4c7585c33010.tar.gz bcm5719-llvm-963401d2be2d5ba1cfe3471f241f4c7585c33010.zip