| | | |
|---|---|---|
| author | Philip Reames <listmail@philipreames.com> | 2016-12-13 01:38:41 +0000 |
| committer | Philip Reames <listmail@philipreames.com> | 2016-12-13 01:38:41 +0000 |
| commit | 1f1bbac8dac99605128981effa3edebb03099916 (patch) | |
| tree | ca5bdef2ba7734d606841a178dd32dc2be72be57 | |
| parent | 51387a8c28ca43ceadd29aa58fde8fe1103f9a2a (diff) | |
| download | bcm5719-llvm-1f1bbac8dac99605128981effa3edebb03099916.tar.gz bcm5719-llvm-1f1bbac8dac99605128981effa3edebb03099916.zip | |
[peephole] Enhance folding logic to work for STATEPOINTs
The general idea here is to get enough of the existing restrictions out of the way that the folding logic already present in foldMemoryOperand can kick in for STATEPOINTs and fold references to immutable stack slots. The key changes are:
- Support for folding multiple operands at once which reference the same load
- Support for folding multiple loads into a single instruction
- Walk all the operands of the instruction for variadic instructions (this is a bug fix!)
Once this lands, I'll post another patch which refactors the TII interface here. There's nothing actually x86-specific about the x86 code used here.
Differential Revision: https://reviews.llvm.org/D24103
llvm-svn: 289510
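For readers skimming the diff, here is a condensed sketch of the reworked core of X86InstrInfo::optimizeLoadInstr (an illustrative excerpt simplified from the X86InstrInfo.cpp hunk below; the DefMI lookup and the surrounding safety and profitability checks are omitted). Instead of tracking a single source operand index, every use of the load-defined vreg is collected and handed to foldMemoryOperand in one call:

```cpp
// Condensed sketch of the new multi-operand collection in optimizeLoadInstr
// (excerpt for illustration only, not the complete member function).
SmallVector<unsigned, 1> SrcOperandIds;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
  MachineOperand &MO = MI.getOperand(i);
  if (!MO.isReg() || MO.getReg() != FoldAsLoadDefReg)
    continue;
  // A subreg use or a def of the candidate register blocks the fold entirely.
  if (MO.getSubReg() || MO.isDef())
    return nullptr;
  // Record every use instead of bailing out when a second one is found.
  SrcOperandIds.push_back(i);
}
if (SrcOperandIds.empty())
  return nullptr;
// Rewrite all recorded uses to memory operands in a single call.
if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
  FoldAsLoadDefReg = 0;
  return FoldMI;
}
return nullptr;
```

On the PeepholeOptimizer side, the pass keeps scanning operands after a successful fold (continuing with FoldMI as the current instruction), which is what allows several distinct loads to be folded into the same variadic instruction such as a STATEPOINT.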
| | | |
|---|---|---|
| -rw-r--r-- | llvm/lib/CodeGen/PeepholeOptimizer.cpp | 28 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 25 |
| -rw-r--r-- | llvm/test/CodeGen/X86/anyregcc.ll | 52 |
| -rw-r--r-- | llvm/test/CodeGen/X86/stackmap.ll | 4 |
| -rw-r--r-- | llvm/test/CodeGen/X86/statepoint-live-in.ll | 31 |
5 files changed, 64 insertions, 76 deletions
```diff
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 3cf67eeaa64..1b8106d1fa1 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1540,11 +1540,6 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
       if (MI->isDebugValue())
         continue;
 
-      // If we run into an instruction we can't fold across, discard
-      // the load candidates.
-      if (MI->isLoadFoldBarrier())
-        FoldAsLoadDefCandidates.clear();
-
       if (MI->isPosition() || MI->isPHI())
         continue;
 
@@ -1588,7 +1583,6 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
           DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
                        << '\n');
           NAPhysToVirtMIs.clear();
-          continue;
         }
 
         if ((isUncoalescableCopy(*MI) &&
@@ -1639,8 +1633,14 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
       // earlier load into MI.
       if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
           !FoldAsLoadDefCandidates.empty()) {
+
+        // We visit each operand even after successfully folding a previous
+        // one. This allows us to fold multiple loads into a single
+        // instruction. We do assume that optimizeLoadInstr doesn't insert
+        // foldable uses earlier in the argument list. Since we don't restart
+        // iteration, we'd miss such cases.
         const MCInstrDesc &MIDesc = MI->getDesc();
-        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
+        for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
              ++i) {
           const MachineOperand &MOp = MI->getOperand(i);
           if (!MOp.isReg())
@@ -1667,13 +1667,23 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
             MRI->markUsesInDebugValueAsUndef(FoldedReg);
             FoldAsLoadDefCandidates.erase(FoldedReg);
             ++NumLoadFold;
-            // MI is replaced with FoldMI.
+
+            // MI is replaced with FoldMI so we can continue trying to fold
             Changed = true;
-            break;
+            MI = FoldMI;
           }
         }
       }
     }
+
+    // If we run into an instruction we can't fold across, discard
+    // the load candidates. Note: We might be able to fold *into* this
+    // instruction, so this needs to be after the folding logic.
+    if (MI->isLoadFoldBarrier()) {
+      DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n");
+      FoldAsLoadDefCandidates.clear();
+    }
+  }
 }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ca0004abc4c..dfd88e4deda 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6576,14 +6576,6 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
                                               const MachineRegisterInfo *MRI,
                                               unsigned &FoldAsLoadDefReg,
                                               MachineInstr *&DefMI) const {
-  if (FoldAsLoadDefReg == 0)
-    return nullptr;
-  // To be conservative, if there exists another load, clear the load candidate.
-  if (MI.mayLoad()) {
-    FoldAsLoadDefReg = 0;
-    return nullptr;
-  }
-
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
@@ -6592,27 +6584,24 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
     return nullptr;
 
   // Collect information about virtual register operands of MI.
-  unsigned SrcOperandId = 0;
-  bool FoundSrcOperand = false;
-  for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+  SmallVector<unsigned, 1> SrcOperandIds;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (Reg != FoldAsLoadDefReg)
       continue;
-    // Do not fold if we have a subreg use or a def or multiple uses.
-    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+    // Do not fold if we have a subreg use or a def.
+    if (MO.getSubReg() || MO.isDef())
       return nullptr;
-
-    SrcOperandId = i;
-    FoundSrcOperand = true;
+    SrcOperandIds.push_back(i);
   }
-  if (!FoundSrcOperand)
+  if (SrcOperandIds.empty())
     return nullptr;
 
   // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
   }
diff --git a/llvm/test/CodeGen/X86/anyregcc.ll b/llvm/test/CodeGen/X86/anyregcc.ll
index 018a92a52f7..1b51b53bd22 100644
--- a/llvm/test/CodeGen/X86/anyregcc.ll
+++ b/llvm/test/CodeGen/X86/anyregcc.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT: .quad 56
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _anyreg_test2
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _patchpoint_spilldef
 ; CHECK-NEXT: .quad 56
@@ -272,31 +272,31 @@ entry:
 ; CHECK-NEXT: .byte 8
 ; CHECK-NEXT: .short {{[0-9]+}}
 ; CHECK-NEXT: .long 0
-; Loc 9: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 10: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 11: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 12: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
-; Loc 13: Register
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; Loc 9: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 10: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 11: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 12: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 13: Argument, still on stack
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll
index fe3846254be..9818d3547fc 100644
--- a/llvm/test/CodeGen/X86/stackmap.ll
+++ b/llvm/test/CodeGen/X86/stackmap.ll
@@ -38,10 +38,10 @@
 ; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spilledValue
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spilledStackMapValue
-; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad 8
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad _spillSubReg
 ; CHECK-NEXT: .quad 56
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index a1718f0f193..b236393e9f4 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -34,35 +34,24 @@ define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 entry:
 ; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: @test3
-; CHECK: movl 32(%rsp), %r10d
-; CHECK-NEXT: movl 24(%rsp), %r11d
-; CHECK-NEXT: movl 16(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Lcfi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
   ret void
 }
 
 ; This case just confirms that we don't crash when given more live values
-; than registers. This is a case where we *have* to use a stack slot.
+; than registers. This is a case where we *have* to use a stack slot. This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
 define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
 entry:
-; TODO: We should have folded the reload into the statepoint.
 ; CHECK-LABEL: test4
-; CHECK: pushq %r15
-; CHECK: pushq %r14
-; CHECK: pushq %r13
-; CHECK: pushq %r12
-; CHECK: pushq %rbx
-; CHECK: pushq %rax
-; CHECK: movl 128(%rsp), %r13d
-; CHECK-NEXT: movl 120(%rsp), %r12d
-; CHECK-NEXT: movl 112(%rsp), %r15d
-; CHECK-NEXT: movl 104(%rsp), %r14d
-; CHECK-NEXT: movl 96(%rsp), %ebp
-; CHECK-NEXT: movl 88(%rsp), %ebx
-; CHECK-NEXT: movl 80(%rsp), %r11d
-; CHECK-NEXT: movl 72(%rsp), %r10d
-; CHECK-NEXT: movl 64(%rsp), %eax
+; CHECK: pushq %rax
+; CHECK-NEXT: Lcfi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: callq _bar
   %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
   ret void
@@ -90,7 +79,7 @@ entry:
 ; CHECK: movl %edi, %ebx
 ; CHECK: movl %ebx, 12(%rsp)
 ; CHECK-NEXT: callq _baz
-; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: Ltmp
 ; CHECK-NEXT: callq _bar
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
```

