5 files changed, 157 insertions, 14 deletions
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index d0616fd678b..20141f7f8d6 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -369,8 +369,9 @@ public:
   /// Set the Minimum Initiation Interval for this schedule attempt.
   void setMII(unsigned mii) { MII = mii; }
 
-  MachineInstr *applyInstrChange(MachineInstr *MI, SMSchedule &Schedule,
-                                 bool UpdateDAG = false);
+  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
+
+  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
 
   /// Return the new base register that was stored away for the changed
   /// instruction.
@@ -3390,9 +3391,8 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
 
 /// Apply changes to the instruction if needed. The changes are need
 /// to improve the scheduling and depend up on the final schedule.
-MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
-                                                  SMSchedule &Schedule,
-                                                  bool UpdateDAG) {
+void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
+                                         SMSchedule &Schedule) {
   SUnit *SU = getSUnit(MI);
   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
       InstrChanges.find(SU);
@@ -3400,7 +3400,7 @@ MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
     std::pair<unsigned, int64_t> RegAndOffset = It->second;
     unsigned BasePos, OffsetPos;
     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
-      return nullptr;
+      return;
     unsigned BaseReg = MI->getOperand(BasePos).getReg();
     MachineInstr *LoopDef = findDefInLoop(BaseReg);
     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
@@ -3418,15 +3418,11 @@ MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
       int64_t NewOffset =
           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
       NewMI->getOperand(OffsetPos).setImm(NewOffset);
-      if (UpdateDAG) {
-        SU->setInstr(NewMI);
-        MISUnitMap[NewMI] = SU;
-      }
+      SU->setInstr(NewMI);
+      MISUnitMap[NewMI] = SU;
       NewMIs.insert(NewMI);
-      return NewMI;
     }
   }
-  return nullptr;
 }
 
 /// Return true for an order dependence that is loop carried potentially.
@@ -3872,6 +3868,53 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
   return true;
 }
 
+/// Attempt to fix the degenerate cases when the instruction serialization
+/// causes the register lifetimes to overlap. For example,
+///   p' = store_pi(p, b)
+///      = load p, offset
+/// In this case p and p' overlap, which means that two registers are needed.
+/// Instead, this function changes the load to use p' and updates the offset.
+void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
+  unsigned OverlapReg = 0;
+  unsigned NewBaseReg = 0;
+  for (SUnit *SU : Instrs) {
+    MachineInstr *MI = SU->getInstr();
+    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      // Look for an instruction that uses p. The instruction occurs in the
+      // same cycle but occurs later in the serialized order.
+      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
+        // Check that the instruction appears in the InstrChanges structure,
+        // which contains instructions that can have the offset updated.
+        DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+          InstrChanges.find(SU);
+        if (It != InstrChanges.end()) {
+          unsigned BasePos, OffsetPos;
+          // Update the base register and adjust the offset.
+          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
+            MI->getOperand(BasePos).setReg(NewBaseReg);
+            int64_t Offset = MI->getOperand(OffsetPos).getImm();
+            MI->getOperand(OffsetPos).setImm(Offset - It->second.second);
+          }
+        }
+        OverlapReg = 0;
+        NewBaseReg = 0;
+        break;
+      }
+      // Look for an instruction of the form p' = op(p), which uses and defines
+      // two virtual registers that get allocated to the same physical register.
+      unsigned TiedUseIdx = 0;
+      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
+        // OverlapReg is p in the example above.
+        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
+        // NewBaseReg is p' in the example above.
+        NewBaseReg = MI->getOperand(i).getReg();
+        break;
+      }
+    }
+  }
+}
+
 /// After the schedule has been formed, call this function to combine
 /// the instructions from the different stages/cycles.  That is, this
 /// function creates a schedule that represents a single iteration.
@@ -3932,7 +3975,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
   // map. We need to use the new registers to create the correct order.
   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
     SUnit *SU = &SSD->SUnits[i];
-    SSD->applyInstrChange(SU->getInstr(), *this, true);
+    SSD->applyInstrChange(SU->getInstr(), *this);
   }
 
   // Reorder the instructions in each cycle to fix and improve the
@@ -3956,6 +3999,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
     // Replace the old order with the new order.
     cycleInstrs.swap(newOrderZC);
     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
+    SSD->fixupRegisterOverlaps(cycleInstrs);
   }
 
   DEBUG(dump(););
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index bcd1a5089c7..c2125bec3a5 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1651,8 +1651,13 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
 bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
       int &Value) const {
   if (isPostIncrement(MI)) {
+    // For a post-increment, the offset is zero and the increment value is
+    // determined by the instruction's access size.
+    int Zero;
     unsigned AccessSize;
-    return getBaseAndOffset(MI, Value, AccessSize);
+    bool RetVal = getBaseAndOffset(MI, Zero, AccessSize);
+    Value = (int) AccessSize;
+    return RetVal;
   }
   if (MI.getOpcode() == Hexagon::A2_addi) {
     Value = MI.getOperand(2).getImm();
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 0f407c2d836..2555b50f91c 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -496,6 +496,48 @@ void HexagonPacketizerList::useCalleesSP(MachineInstr &MI) {
   Off.setImm(Off.getImm() + FrameSize + HEXAGON_LRFP_SIZE);
 }
 
+/// Return true if we can update the offset in MI so that MI and MJ
+/// can be packetized together.
+bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) {
+  assert(SUI->getInstr() && SUJ->getInstr());
+  MachineInstr &MI = *SUI->getInstr();
+  MachineInstr &MJ = *SUJ->getInstr();
+
+  unsigned BPI, OPI;
+  if (!HII->getBaseAndOffsetPosition(MI, BPI, OPI))
+    return false;
+  unsigned BPJ, OPJ;
+  if (!HII->getBaseAndOffsetPosition(MJ, BPJ, OPJ))
+    return false;
+  unsigned Reg = MI.getOperand(BPI).getReg();
+  if (Reg != MJ.getOperand(BPJ).getReg())
+    return false;
+  // Make sure that the dependences do not restrict adding MI to the packet.
+  // That is, ignore anti dependences, and make sure the only data dependence
+  // involves the specific register.
+  for (const auto &PI : SUI->Preds)
+    if (PI.getKind() != SDep::Anti &&
+        (PI.getKind() != SDep::Data || PI.getReg() != Reg))
+      return false;
+  int Incr;
+  if (!HII->getIncrementValue(MJ, Incr))
+    return false;
+
+  int64_t Offset = MI.getOperand(OPI).getImm();
+  MI.getOperand(OPI).setImm(Offset + Incr);
+  ChangedOffset = Offset;
+  return true;
+}
+
+/// Undo the changed offset. This is needed if the instruction cannot be
+/// added to the current packet due to a different instruction.
+void HexagonPacketizerList::undoChangedOffset(MachineInstr &MI) {
+  unsigned BP, OP;
+  if (!HII->getBaseAndOffsetPosition(MI, BP, OP))
+    llvm_unreachable("Unable to find base and offset operands.");
+  MI.getOperand(OP).setImm(ChangedOffset);
+}
+
 enum PredicateKind {
   PK_False,
   PK_True,
@@ -980,6 +1022,7 @@ void HexagonPacketizerList::initPacketizerState() {
   GlueToNewValueJump = false;
   GlueAllocframeStore = false;
   FoundSequentialDependence = false;
+  ChangedOffset = INT64_MAX;
 }
 
 // Ignore bundling of pseudo instructions.
@@ -1567,6 +1610,15 @@ bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
     useCalleesSP(I);
     GlueAllocframeStore = false;
   }
+
+  if (ChangedOffset != INT64_MAX)
+    undoChangedOffset(I);
+  else if (updateOffset(SUI, SUJ)) {
+    FoundSequentialDependence = false;
+    Dependence = false;
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 3a7bdf5101e..cbdd2367429 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -38,6 +38,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   // Has the feeder instruction been glued to new value jump.
   bool GlueToNewValueJump;
 
+  // This holds the offset value, when pruning the dependences.
+  int64_t ChangedOffset;
+
   // Check if there is a dependence between some instruction already in this
   // packet and this instruction.
   bool Dependence;
@@ -117,6 +120,8 @@ protected:
   bool demoteToDotOld(MachineInstr &MI);
   bool useCallersSP(MachineInstr &MI);
   void useCalleesSP(MachineInstr &MI);
+  bool updateOffset(SUnit *SUI, SUnit *SUJ);
+  void undoChangedOffset(MachineInstr &MI);
   bool arePredicatesComplements(MachineInstr &MI1, MachineInstr &MI2);
   bool restrictingDepExistInPacket(MachineInstr&, unsigned);
   bool isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC);
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
new file mode 100644
index 00000000000..5de0717654f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Test that the instruction ordering code in the pipeliner fixes up dependences
+; between post-increment register definitions and uses so that the register
+; allocator does not allocate an additional register. The following test case
+; should generate a single packet.
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: {
+; CHECK-NOT: {
+; CHECK: :endloop0
+
+define void @test(i64* nocapture %v1, i64 %v2, i32 %len) local_unnamed_addr #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:
+  %arrayidx.phi = phi i64* [ %arrayidx.inc, %for.body ], [ %v1, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %0 = load i64, i64* %arrayidx.phi, align 8
+  %1 = tail call i64 @llvm.hexagon.M2.mmpyul.rs1(i64 %0, i64 %v2)
+  store i64 %1, i64* %arrayidx.phi, align 8
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %len
+  %arrayidx.inc = getelementptr i64, i64* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare i64 @llvm.hexagon.M2.mmpyul.rs1(i64, i64) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+attributes #1 = { nounwind readnone }