summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/CodeGen/ModuloSchedule.h9
-rw-r--r--llvm/lib/CodeGen/ModuloSchedule.cpp150
-rw-r--r--llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll2
-rw-r--r--llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll54
-rw-r--r--llvm/test/CodeGen/Hexagon/swp-stages4.ll1
5 files changed, 195 insertions, 21 deletions
diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index f3b2424bd68..f041d9e48bd 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -299,6 +299,8 @@ class PeelingModuloScheduleExpander {
/// State passed from peelKernel to peelPrologAndEpilogs().
std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
+ /// Illegal phis that need to be deleted once we re-link stages.
+ SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
public:
PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
@@ -321,6 +323,13 @@ private:
/// Peels one iteration of the rewritten kernel (BB) in the specified
/// direction.
MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
+ // Delete instructions whose stage is less than MinStage in the given basic
+ // block.
+ void filterInstructions(MachineBasicBlock *MB, int MinStage);
+ // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
+ // instructions to keep a valid IR.
+ void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
+ MachineBasicBlock *SourceBB, unsigned Stage);
/// Peel the kernel forwards and backwards to produce prologs and epilogs,
/// and stitch them together.
void peelPrologAndEpilogs();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 98de0816f27..c654f0b616e 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -1582,6 +1582,99 @@ PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
return NewBB;
}
+void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
+ int MinStage) {
+ for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
+ I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
+ MachineInstr *MI = &*I++;
+ int Stage = getStage(MI);
+ if (Stage == -1 || Stage >= MinStage)
+ continue;
+
+ for (MachineOperand &DefMO : MI->defs()) {
+ SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+ for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+ // Only PHIs can use values from this block by construction.
+ // Match with the equivalent PHI in B.
+ assert(UseMI.isPHI());
+ Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+ MI->getParent());
+ Subs.emplace_back(&UseMI, Reg);
+ }
+ for (auto &Sub : Subs)
+ Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ }
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+}
+
+void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
+ MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
+ auto InsertPt = DestBB->getFirstNonPHI();
+ DenseMap<Register, Register> Remaps;
+ for (auto I = SourceBB->getFirstNonPHI(); I != SourceBB->end();) {
+ MachineInstr *MI = &*I++;
+ if (MI->isPHI()) {
+ // This is an illegal PHI. If we move any instructions using an illegal
+ // PHI, we need to create a legal Phi
+ Register PhiR = MI->getOperand(0).getReg();
+ auto RC = MRI.getRegClass(PhiR);
+ Register NR = MRI.createVirtualRegister(RC);
+ MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NR)
+ .addReg(PhiR)
+ .addMBB(SourceBB);
+ BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
+ CanonicalMIs[NI] = CanonicalMIs[MI];
+ Remaps[PhiR] = NR;
+ continue;
+ }
+ if (getStage(MI) != Stage)
+ continue;
+ MI->removeFromParent();
+ DestBB->insert(InsertPt, MI);
+ auto *KernelMI = CanonicalMIs[MI];
+ BlockMIs[{DestBB, KernelMI}] = MI;
+ BlockMIs.erase({SourceBB, KernelMI});
+ }
+ SmallVector<MachineInstr *, 4> PhiToDelete;
+ for (MachineInstr &MI : DestBB->phis()) {
+ assert(MI.getNumOperands() == 3);
+ MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
+ // If the instruction referenced by the phi is moved inside the block
+ // we don't need the phi anymore.
+ if (getStage(Def) == Stage) {
+ Register PhiReg = MI.getOperand(0).getReg();
+ MRI.replaceRegWith(MI.getOperand(0).getReg(),
+ Def->getOperand(0).getReg());
+ MI.getOperand(0).setReg(PhiReg);
+ PhiToDelete.push_back(&MI);
+ }
+ }
+ for (auto *P : PhiToDelete)
+ P->eraseFromParent();
+ InsertPt = DestBB->getFirstNonPHI();
+ for (MachineInstr &MI : SourceBB->phis()) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+ DestBB->insert(InsertPt, NewMI);
+ Register OrigR = MI.getOperand(0).getReg();
+ Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+ NewMI->getOperand(0).setReg(R);
+ NewMI->getOperand(1).setReg(OrigR);
+ NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
+ Remaps[OrigR] = R;
+ CanonicalMIs[NewMI] = CanonicalMIs[&MI];
+ BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NewMI;
+ }
+ for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I)
+ for (MachineOperand &MO : I->uses())
+ if (MO.isReg() && Remaps.count(MO.getReg()))
+ MO.setReg(Remaps[MO.getReg()]);
+}
+
void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
BitVector LS(Schedule.getNumStages(), true);
BitVector AS(Schedule.getNumStages(), true);
@@ -1607,23 +1700,36 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
// Push out the epilogs, again in reverse order.
// We can't assume anything about the minumum loop trip count at this point,
- // so emit a fairly complex epilog:
- // K[0, 1, 2] // Kernel runs stages 0, 1, 2
- // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0].
- // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
- //
- // This creates a single-successor single-predecessor sequence of blocks for
- // each epilog, which are kept this way for simplicity at this stage and
- // cleaned up by the optimizer later.
+ // so emit a fairly complex epilog.
+
+ // We first peel number of stages minus one epilogue. Then we remove dead
+ // stages and reorder instructions based on their stage. If we have 3 stages
+ // we generate first:
+ // E0[3, 2, 1]
+ // E1[3', 2']
+ // E2[3'']
+ // And then we move instructions based on their stages to have:
+ // E0[3]
+ // E1[2, 3']
+ // E2[1, 2', 3'']
+ // The transformation is legal because we only move instructions past
+ // instructions of a previous loop iteration.
for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
- Epilogs.push_back(nullptr);
- for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
- LS.reset();
- LS[J] = 1;
- Epilogs.back() = peelKernel(LPD_Back);
- LiveStages[Epilogs.back()] = LS;
- AvailableStages[Epilogs.back()] = AS;
+ Epilogs.push_back(peelKernel(LPD_Back));
+ filterInstructions(Epilogs.back(), Schedule.getNumStages() - I);
+ }
+ for (size_t I = 0; I < Epilogs.size(); I++) {
+ LS.reset();
+ for (size_t J = I; J < Epilogs.size(); J++) {
+ int Iteration = J;
+ unsigned Stage = Schedule.getNumStages() - 1 + I - J;
+ // Move stage one block at a time so that Phi nodes are updated correctly.
+ for (size_t K = Iteration; K > I; K--)
+ moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
+ LS[Stage] = 1;
}
+ LiveStages[Epilogs[I]] = LS;
+ AvailableStages[Epilogs[I]] = AS;
}
// Now we've defined all the prolog and epilog blocks as a fallthrough
@@ -1659,6 +1765,13 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
rewriteUsesOf(MI);
}
}
+ for (auto *MI : IllegalPhisToDelete) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ IllegalPhisToDelete.clear();
+
// Now all remapping has been done, we're free to optimize the generated code.
for (MachineBasicBlock *B : reverse(Blocks))
EliminateDeadPhis(B, MRI, LIS);
@@ -1727,9 +1840,10 @@ void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
R = MI->getOperand(1).getReg();
MRI.setRegClass(R, MRI.getRegClass(PhiR));
MRI.replaceRegWith(PhiR, R);
- if (LIS)
- LIS->RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
+ // Postpone deleting the Phi as it may be referenced by BlockMIs and used
+ // later to figure out how to remap registers.
+ MI->getOperand(0).setReg(PhiR);
+ IllegalPhisToDelete.push_back(MI);
return;
}
diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
index d14177cc684..2b69c3da8d0 100644
--- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@@ -1,6 +1,4 @@
; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
-; XFAIL: *
-; LSR changes required.
; This version of the conv3x3 test has both loops. This test checks that the
; inner loop has 13 packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
new file mode 100644
index 00000000000..d3689cc9d8f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true < %s | FileCheck %s
+
+; Test epilogue generation when reading loop-carried dependency from a previous
+; stage. The first epilogue should read value from iteration N-1 of the kernel.
+
+; CHECK: loop0
+; CHECK: r{{[0-9]+}} = add([[REG0:r([0-9]+)]],#8)
+; CHECK: [[REG0:r([0-9]+)]] = [[REG1:r([0-9]+)]]
+; CHECK: endloop0
+; CHECK: = add([[REG1]],#8)
+
+; Function Attrs: nounwind
+define i32* @f0(i16* nocapture readonly %a0, i32 %a1) #0 {
+b0:
+ %v0 = alloca [129 x i32], align 8
+ br i1 undef, label %b1, label %b3
+
+b1: ; preds = %b0
+ br label %b2
+
+b2: ; preds = %b2, %b1
+ %v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ]
+ %v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ]
+ %v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ]
+ %v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ]
+ %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ]
+ %v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ]
+ %v7 = load i16, i16* %v2, align 2
+ %v8 = sext i16 %v7 to i32
+ %v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2
+ %v10 = load i16, i16* %v6, align 2
+ %v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2
+ store i32 %v11, i32* %v4, align 4
+ %v12 = getelementptr inbounds i16, i16* %v6, i32 -1
+ %v13 = add i32 %v5, 1
+ %v14 = getelementptr inbounds i32, i32* %v3, i32 2
+ %v15 = getelementptr inbounds i16, i16* %v1, i32 2
+ %v16 = icmp slt i32 %v13, %a1
+ br i1 %v16, label %b2, label %b3
+
+b3: ; preds = %b2, %b0
+ %out = phi i32* [ null, %b0 ], [ %v14, %b2 ]
+ ret i32* %out
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
index 1b96aca2a48..a10775512dc 100644
--- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
@@ -5,7 +5,6 @@
; CHECK: = and
; CHECK: = and
-; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
; CHECK: loop0(.LBB0_[[LOOP:.]],
OpenPOWER on IntegriCloud