MachinePipeliner pass that implements Swing Modulo Scheduling

Software pipelining is an optimization for improving ILP by overlapping loop iterations. Swing Modulo Scheduling (SMS) is an implementation of software pipelining that attempts to reduce register pressure and generate efficient pipelines with a low compile-time cost. This implementation of SMS is a target-independent back-end pass. When enabled, the pass should run just prior to the register allocation pass, while the machine IR is in SSA form. If the pass is successful, then the original loop is replaced by the optimized loop. The optimized loop contains one or more prolog blocks, the pipelined kernel, and one or more epilog blocks. This pass is enabled for Hexagon only. To enable for other targets, a couple of target-specific hooks must be implemented, and the pass needs to be called from the target's TargetMachine implementation. Differential Review: http://reviews.llvm.org/D16829 llvm-svn: 277169
author: Brendon Cahoon <bcahoon@codeaurora.org> 2016-07-29 16:44:44 +0000
committer: Brendon Cahoon <bcahoon@codeaurora.org> 2016-07-29 16:44:44 +0000
commit: 254f889dc54672f6765791078a5c22a71e2b7cb3 (patch)
tree: f1e9c5895906c2bf954cbd915ebd061396abb1c2 /llvm/test/CodeGen
parent: 0bd55a7608098ebacd4f8bd4a83a15bc1b3e7206 (diff)
download: bcm5719-llvm-254f889dc54672f6765791078a5c22a71e2b7cb3.tar.gz
bcm5719-llvm-254f889dc54672f6765791078a5c22a71e2b7cb3.zip
12 files changed, 456 insertions, 3 deletions
diff --git a/llvm/test/CodeGen/Hexagon/bit-gen-rseq.ll b/llvm/test/CodeGen/Hexagon/bit-gen-rseq.ll
index 6b5a5ad8ec3..08d4b787715 100644
--- a/llvm/test/CodeGen/Hexagon/bit-gen-rseq.ll
+++ b/llvm/test/CodeGen/Hexagon/bit-gen-rseq.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -disable-hsdr < %s | FileCheck %s
+; RUN: llc -march=hexagon -disable-hsdr -hexagon-subreg-liveness < %s | FileCheck %s
 ; Check that we don't generate any bitwise operations.
 
 ; CHECK-NOT: = or(
diff --git a/llvm/test/CodeGen/Hexagon/hwloop1.ll b/llvm/test/CodeGen/Hexagon/hwloop1.ll
index 97b779cf962..68af3b34eee 100644
--- a/llvm/test/CodeGen/Hexagon/hwloop1.ll
+++ b/llvm/test/CodeGen/Hexagon/hwloop1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
 ; Check that we generate hardware loop instructions.
 
 ; Case 1 : Loop with a constant number of iterations.
diff --git a/llvm/test/CodeGen/Hexagon/swp-const-tc.ll b/llvm/test/CodeGen/Hexagon/swp-const-tc.ll
new file mode 100644
index 00000000000..3113094d2ba
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-const-tc.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -verify-machineinstrs < %s | FileCheck %s
+
+; If the trip count is a compile-time constant, then decrement it instead
+; of computing a new LC0 value.
+
+; CHECK-LABEL: @test
+; CHECK: loop0(.LBB0_1, #998)
+
+define i32 @test(i32* %A, i32* %B, i32 %count) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %A, %entry ], [ %arrayidx.inc, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.02
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
+
+; The constant trip count is small enough that the kernel is not executed.
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: loop0(
+
+define i32 @test1(i32* %A, i32* %B, i32 %count) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %A, %entry ], [ %arrayidx.inc, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.02
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 1
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
+
diff --git a/llvm/test/CodeGen/Hexagon/swp-dag-phi.ll b/llvm/test/CodeGen/Hexagon/swp-dag-phi.ll
new file mode 100644
index 00000000000..54d9492ebac
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-dag-phi.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 < %s
+; REQUIRES: asserts
+
+; This test checks that a dependence is created between a Phi and its uses.
+; An assert occurs if the Phi dependences are not correct.
+
+define void @test1(i32* %f2, i32 %nc) {
+entry:
+  %i.011 = add i32 %nc, -1
+  %cmp12 = icmp sgt i32 %i.011, 1
+  br i1 %cmp12, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %0 = add i32 %nc, -2
+  %scevgep = getelementptr i32, i32* %f2, i32 %0
+  %sri = load i32, i32* %scevgep, align 4
+  %scevgep15 = getelementptr i32, i32* %f2, i32 %i.011
+  %sri16 = load i32, i32* %scevgep15, align 4
+  br label %for.body
+
+for.body:
+  %i.014 = phi i32 [ %i.0, %for.body ], [ %i.011, %for.body.preheader ]
+  %i.0.in13 = phi i32 [ %i.014, %for.body ], [ %nc, %for.body.preheader ]
+  %sr = phi i32 [ %1, %for.body ], [ %sri, %for.body.preheader ]
+  %sr17 = phi i32 [ %sr, %for.body ], [ %sri16, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %f2, i32 %i.014
+  %sub1 = add nsw i32 %i.0.in13, -3
+  %arrayidx2 = getelementptr inbounds i32, i32* %f2, i32 %sub1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %sub3 = sub nsw i32 %sr17, %1
+  store i32 %sub3, i32* %arrayidx, align 4
+  %i.0 = add nsw i32 %i.014, -1
+  %cmp = icmp sgt i32 %i.0, 1
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-reuse.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-reuse.ll
new file mode 100644
index 00000000000..6a2ad73f209
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-reuse.ll
@@ -0,0 +1,65 @@
+; RUN: llc -fp-contract=fast -O3 -march=hexagon -mcpu=hexagonv5 < %s
+; REQUIRES: asserts
+
+; Test that the pipeliner doesn't ICE because the PHI generation
+; code in the epilog does not attempt to reuse an existing PHI.
+
+define void @test(float* noalias %srcImg, i32 %width, float* noalias %dstImg) {
+entry.split:
+  %shr = lshr i32 %width, 1
+  %incdec.ptr253 = getelementptr inbounds float, float* %dstImg, i32 2
+  br i1 undef, label %for.body, label %for.end
+
+for.body:
+  %dst.21518.reg2mem.0 = phi float* [ null, %while.end712 ], [ %incdec.ptr253, %entry.split ]
+  %dstEnd.01519 = phi float* [ %add.ptr725, %while.end712 ], [ undef, %entry.split ]
+  %add.ptr367 = getelementptr inbounds float, float* %srcImg, i32 undef
+  %dst.31487 = getelementptr inbounds float, float* %dst.21518.reg2mem.0, i32 1
+  br i1 undef, label %while.body661.preheader, label %while.end712
+
+while.body661.preheader:
+  %scevgep1941 = getelementptr float, float* %add.ptr367, i32 1
+  br label %while.body661.ur
+
+while.body661.ur:
+  %lsr.iv1942 = phi float* [ %scevgep1941, %while.body661.preheader ], [ undef, %while.body661.ur ]
+  %col1.31508.reg2mem.0.ur = phi float [ %col3.31506.reg2mem.0.ur, %while.body661.ur ], [ undef, %while.body661.preheader ]
+  %col4.31507.reg2mem.0.ur = phi float [ %add710.ur, %while.body661.ur ], [ 0.000000e+00, %while.body661.preheader ]
+  %col3.31506.reg2mem.0.ur = phi float [ %add689.ur, %while.body661.ur ], [ undef, %while.body661.preheader ]
+  %dst.41511.ur = phi float* [ %incdec.ptr674.ur, %while.body661.ur ], [ %dst.31487, %while.body661.preheader ]
+  %mul662.ur = fmul float %col1.31508.reg2mem.0.ur, 4.000000e+00
+  %add663.ur = fadd float undef, %mul662.ur
+  %add665.ur = fadd float %add663.ur, undef
+  %add667.ur = fadd float undef, %add665.ur
+  %add669.ur = fadd float undef, %add667.ur
+  %add670.ur = fadd float %col4.31507.reg2mem.0.ur, %add669.ur
+  %conv673.ur = fmul float %add670.ur, 3.906250e-03
+  %incdec.ptr674.ur = getelementptr inbounds float, float* %dst.41511.ur, i32 1
+  store float %conv673.ur, float* %dst.41511.ur, align 4
+  %scevgep1959 = getelementptr float, float* %lsr.iv1942, i32 -1
+  %0 = load float, float* %scevgep1959, align 4
+  %mul680.ur = fmul float %0, 4.000000e+00
+  %add681.ur = fadd float undef, %mul680.ur
+  %add684.ur = fadd float undef, %add681.ur
+  %add687.ur = fadd float undef, %add684.ur
+  %add689.ur = fadd float undef, %add687.ur
+  %add699.ur = fadd float undef, undef
+  %add703.ur = fadd float undef, %add699.ur
+  %add707.ur = fadd float undef, %add703.ur
+  %add710.ur = fadd float undef, %add707.ur
+  %cmp660.ur = icmp ult float* %incdec.ptr674.ur, %dstEnd.01519
+  br i1 %cmp660.ur, label %while.body661.ur, label %while.end712
+
+while.end712:
+  %dst.4.lcssa.reg2mem.0 = phi float* [ %dst.31487, %for.body ], [ undef, %while.body661.ur ]
+  %conv721 = fpext float undef to double
+  %mul722 = fmul double %conv721, 0x3F7111112119E8FB
+  %conv723 = fptrunc double %mul722 to float
+  store float %conv723, float* %dst.4.lcssa.reg2mem.0, align 4
+  %add.ptr725 = getelementptr inbounds float, float* %dstEnd.01519, i32 %shr
+  %cmp259 = icmp ult i32 undef, undef
+  br i1 %cmp259, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
new file mode 100644
index 00000000000..db5bb96d0bc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-bsb-sched=0 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+
+; From coremark. Test that we pipeline the matrix multiplication bitextract
+; function. The pipelined code should have two packets.
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: = extractu([[REG2:(r[0-9]+)]],
+; CHECK: = extractu([[REG2]],
+; CHECK: [[REG0:(r[0-9]+)]] = memh
+; CHECK: [[REG1:(r[0-9]+)]] = memh
+; CHECK: += mpyi
+; CHECK: [[REG2]] = mpyi([[REG0]], [[REG1]])
+; CHECK: endloop0
+
+%union_h2_sem_t = type { i32 }
+
+@sem_i = common global [0 x %union_h2_sem_t] zeroinitializer, align 4
+
+define void @matrix_mul_matrix_bitextract(i32 %N, i32* %C, i16* %A, i16* %B) {
+entry:
+  %cmp53 = icmp eq i32 %N, 0
+  br i1 %cmp53, label %for_end27, label %for_body3_lr_ph_us
+
+for_body3_lr_ph_us:
+  %i_054_us = phi i32 [ %inc26_us, %for_cond1_for_inc25_crit_edge_us ], [ 0, %entry ]
+  %0 = mul i32 %i_054_us, %N
+  %arrayidx9_us_us_gep = getelementptr i16, i16* %A, i32 %0
+  br label %for_body3_us_us
+
+for_cond1_for_inc25_crit_edge_us:
+  %inc26_us = add i32 %i_054_us, 1
+  %exitcond89 = icmp eq i32 %inc26_us, %N
+  br i1 %exitcond89, label %for_end27, label %for_body3_lr_ph_us
+
+for_body3_us_us:
+  %j_052_us_us = phi i32 [ %inc23_us_us, %for_cond4_for_inc22_crit_edge_us_us ], [ 0, %for_body3_lr_ph_us ]
+  %add_us_us = add i32 %j_052_us_us, %0
+  %arrayidx_us_us = getelementptr inbounds i32, i32* %C, i32 %add_us_us
+  store i32 0, i32* %arrayidx_us_us, align 4
+  br label %for_body6_us_us
+
+for_cond4_for_inc22_crit_edge_us_us:
+  store i32 %add21_us_us, i32* %arrayidx_us_us, align 4
+  %inc23_us_us = add i32 %j_052_us_us, 1
+  %exitcond88 = icmp eq i32 %inc23_us_us, %N
+  br i1 %exitcond88, label %for_cond1_for_inc25_crit_edge_us, label %for_body3_us_us
+
+for_body6_us_us:
+  %1 = phi i32 [ 0, %for_body3_us_us ], [ %add21_us_us, %for_body6_us_us ]
+  %arrayidx9_us_us_phi = phi i16* [ %arrayidx9_us_us_gep, %for_body3_us_us ], [ %arrayidx9_us_us_inc, %for_body6_us_us ]
+  %k_050_us_us = phi i32 [ 0, %for_body3_us_us ], [ %inc_us_us, %for_body6_us_us ]
+  %2 = load i16, i16* %arrayidx9_us_us_phi, align 2
+  %conv_us_us = sext i16 %2 to i32
+  %mul10_us_us = mul i32 %k_050_us_us, %N
+  %add11_us_us = add i32 %mul10_us_us, %j_052_us_us
+  %arrayidx12_us_us = getelementptr inbounds i16, i16* %B, i32 %add11_us_us
+  %3 = load i16, i16* %arrayidx12_us_us, align 2
+  %conv13_us_us = sext i16 %3 to i32
+  %mul14_us_us = mul nsw i32 %conv13_us_us, %conv_us_us
+  %shr47_us_us = lshr i32 %mul14_us_us, 2
+  %and_us_us = and i32 %shr47_us_us, 15
+  %shr1548_us_us = lshr i32 %mul14_us_us, 5
+  %and16_us_us = and i32 %shr1548_us_us, 127
+  %mul17_us_us = mul i32 %and_us_us, %and16_us_us
+  %add21_us_us = add i32 %mul17_us_us, %1
+  %inc_us_us = add i32 %k_050_us_us, 1
+  %exitcond87 = icmp eq i32 %inc_us_us, %N
+  %arrayidx9_us_us_inc = getelementptr i16, i16* %arrayidx9_us_us_phi, i32 1
+  br i1 %exitcond87, label %for_cond4_for_inc22_crit_edge_us_us, label %for_body6_us_us
+
+for_end27:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-max.ll b/llvm/test/CodeGen/Hexagon/swp-max.ll
new file mode 100644
index 00000000000..038138ff256
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-max.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \
+; RUN:     -pipeliner-max-stages=2 < %s | FileCheck %s
+
+@A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8
+
+define i32 @test(i32 %Left, i32 %Right) {
+entry:
+  %add = add nsw i32 %Right, %Left
+  %div = sdiv i32 %add, 2
+  %cmp9 = icmp slt i32 %div, %Left
+  br i1 %cmp9, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: [[REG1:(r[0-9]+)]] = max(r{{[0-9]+}}, [[REG1]])
+; CHECK: [[REG0:(r[0-9]+)]] = add([[REG2:(r[0-9]+)]], [[REG0]])
+; CHECK: [[REG2]] = memw
+; CHECK: endloop0
+
+for.body:
+  %MaxLeftBorderSum.012 = phi i32 [ %MaxLeftBorderSum.1, %for.body ], [ 0, %for.body.preheader ]
+  %i.011 = phi i32 [ %dec, %for.body ], [ %div, %for.body.preheader ]
+  %LeftBorderSum.010 = phi i32 [ %add1, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @A, i32 0, i32 %i.011
+  %0 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %0, %LeftBorderSum.010
+  %cmp2 = icmp sgt i32 %add1, %MaxLeftBorderSum.012
+  %MaxLeftBorderSum.1 = select i1 %cmp2, i32 %add1, i32 %MaxLeftBorderSum.012
+  %dec = add nsw i32 %i.011, -1
+  %cmp = icmp slt i32 %dec, %Left
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  %MaxLeftBorderSum.0.lcssa = phi i32 [ 0, %entry ], [ %MaxLeftBorderSum.1, %for.end.loopexit ]
+  ret i32 %MaxLeftBorderSum.0.lcssa
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll b/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll
new file mode 100644
index 00000000000..56e8c651100
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+
+; Make sure we attempt to pipeline all innermost loops.
+
+; Check if the first loop is pipelined.
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
+; CHECK-NEXT: endloop0
+
+; Check if the second loop is pipelined.
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
+; CHECK-NEXT: endloop0
+
+define i32 @test(i32* %a, i32 %n, i32 %l) {
+entry:
+  %cmp23 = icmp sgt i32 %n, 0
+  br i1 %cmp23, label %for.body3.lr.ph.preheader, label %for.end14
+
+for.body3.lr.ph.preheader:
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+  %sum1.026 = phi i32 [ %add8, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
+  %sum.025 = phi i32 [ %add, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
+  %j.024 = phi i32 [ %inc13, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
+  br label %for.body3
+
+for.body3:
+  %sum.118 = phi i32 [ %sum.025, %for.body3.lr.ph ], [ %add, %for.body3 ]
+  %arrayidx.phi = phi i32* [ %a, %for.body3.lr.ph ], [ %arrayidx.inc, %for.body3 ]
+  %i.017 = phi i32 [ 0, %for.body3.lr.ph ], [ %inc, %for.body3 ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.118
+  %inc = add nsw i32 %i.017, 1
+  %exitcond = icmp eq i32 %inc, %n
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body3
+
+for.end:
+  tail call void @bar(i32* %a) #2
+  br label %for.body6
+
+for.body6:
+  %sum1.121 = phi i32 [ %sum1.026, %for.end ], [ %add8, %for.body6 ]
+  %arrayidx7.phi = phi i32* [ %a, %for.end ], [ %arrayidx7.inc, %for.body6 ]
+  %i.120 = phi i32 [ 0, %for.end ], [ %inc10, %for.body6 ]
+  %1 = load i32, i32* %arrayidx7.phi, align 4
+  %add8 = add nsw i32 %1, %sum1.121
+  %inc10 = add nsw i32 %i.120, 1
+  %exitcond29 = icmp eq i32 %inc10, %n
+  %arrayidx7.inc = getelementptr i32, i32* %arrayidx7.phi, i32 1
+  br i1 %exitcond29, label %for.inc12, label %for.body6
+
+for.inc12:
+  %inc13 = add nsw i32 %j.024, 1
+  %exitcond30 = icmp eq i32 %inc13, %n
+  br i1 %exitcond30, label %for.end14.loopexit, label %for.body3.lr.ph
+
+for.end14.loopexit:
+  br label %for.end14
+
+for.end14:
+  %sum1.0.lcssa = phi i32 [ 0, %entry ], [ %add8, %for.end14.loopexit ]
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end14.loopexit ]
+  %add15 = add nsw i32 %sum1.0.lcssa, %sum.0.lcssa
+  ret i32 %add15
+}
+
+declare void @bar(i32*)
+
diff --git a/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll b/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll
new file mode 100644
index 00000000000..3ff88452499
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+;
+; Check that we pipeline a vectorized dot product in a single packet.
+;
+; CHECK: {
+; CHECK: += mpyi
+; CHECK: += mpyi
+; CHECK: memd
+; CHECK: memd
+; CHECK: }      :endloop0
+
+@a = common global [5000 x i32] zeroinitializer, align 8
+@b = common global [5000 x i32] zeroinitializer, align 8
+
+define i32 @vecMultGlobal() {
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:
+  %0 = extractelement <2 x i32> %addp_vec, i32 0
+  %1 = extractelement <2 x i32> %addp_vec, i32 1
+  %add_sum = add i32 %0, %1
+  ret i32 %add_sum
+
+polly.loop_body:
+  %polly.loopiv13 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %reduction.012 = phi <2 x i32> [ zeroinitializer, %entry ], [ %addp_vec, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv13, 2
+  %p_arrayidx1 = getelementptr [5000 x i32], [5000 x i32]* @b, i32 0, i32 %polly.loopiv13
+  %p_arrayidx = getelementptr [5000 x i32], [5000 x i32]* @a, i32 0, i32 %polly.loopiv13
+  %vector_ptr = bitcast i32* %p_arrayidx1 to <2 x i32>*
+  %_p_vec_full = load <2 x i32>, <2 x i32>* %vector_ptr, align 8
+  %vector_ptr7 = bitcast i32* %p_arrayidx to <2 x i32>*
+  %_p_vec_full8 = load <2 x i32>, <2 x i32>* %vector_ptr7, align 8
+  %mulp_vec = mul <2 x i32> %_p_vec_full8, %_p_vec_full
+  %addp_vec = add <2 x i32> %mulp_vec, %reduction.012
+  %2 = icmp slt i32 %polly.next_loopiv, 5000
+  br i1 %2, label %polly.loop_body, label %polly.loop_after
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-vmult.ll b/llvm/test/CodeGen/Hexagon/swp-vmult.ll
new file mode 100644
index 00000000000..9018405274c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-vmult.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+
+; Multiply and accumulate
+; CHECK: mpyi([[REG0:r([0-9]+)]], [[REG1:r([0-9]+)]])
+; CHECK-NEXT: add(r{{[0-9]+}}, #4)
+; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK-NEXT: endloop0
+
+define i32 @foo(i32* %a, i32* %b, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.03 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
+  %arrayidx1.phi = phi i32* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
+  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %1 = load i32, i32* %arrayidx1.phi, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %sum.03
+  %inc = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  %arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
+
diff --git a/llvm/test/CodeGen/Hexagon/swp-vsum.ll b/llvm/test/CodeGen/Hexagon/swp-vsum.ll
new file mode 100644
index 00000000000..4756c644709
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-vsum.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+
+; Simple vector total.
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: add([[REG:r([0-9]+)]], r{{[0-9]+}})
+; CHECK-NEXT: add(r{{[0-9]+}}, #4)
+; CHECK-NEXT: [[REG]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK-NEXT: endloop0
+
+define i32 @foo(i32* %a, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.02
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/Hexagon/v60-cur.ll b/llvm/test/CodeGen/Hexagon/v60-cur.ll
index fe24309f5b8..a7d4f6d310e 100644
--- a/llvm/test/CodeGen/Hexagon/v60-cur.ll
+++ b/llvm/test/CodeGen/Hexagon/v60-cur.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
 
 ; Test that we generate a .cur
author	Brendon Cahoon <bcahoon@codeaurora.org>	2016-07-29 16:44:44 +0000
committer	Brendon Cahoon <bcahoon@codeaurora.org>	2016-07-29 16:44:44 +0000
commit	254f889dc54672f6765791078a5c22a71e2b7cb3 (patch)
tree	f1e9c5895906c2bf954cbd915ebd061396abb1c2 /llvm/test/CodeGen
parent	0bd55a7608098ebacd4f8bd4a83a15bc1b3e7206 (diff)
download	bcm5719-llvm-254f889dc54672f6765791078a5c22a71e2b7cb3.tar.gz bcm5719-llvm-254f889dc54672f6765791078a5c22a71e2b7cb3.zip