path: root/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
author     Matt Arsenault <Matthew.Arsenault@amd.com>  2019-06-24 14:53:56 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>  2019-06-24 14:53:56 +0000
commit     60957cb74c8869e2abd8996f616261bf4103305d (patch)
tree       2f5a78b31d959b60a1629736a66828dacd40eb57 /llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
parent     f27f794d473e777340ee96ac5d751a698039caf8 (diff)
AMDGPU: Fold frame index into MUBUF
This matters for byval uses outside of the entry block, which appear as copies.

Previously, the only folding was done during selection, which could not see the underlying frame index. For any use of the frame index outside the entry block, the frame index was materialized in the entry block relative to the global scratch wave offset. This may produce worse code in cases where the offset ends up not fitting in the MUBUF offset field. A better heuristic would be helpful for extreme frames.

llvm-svn: 364185
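For orientation before reading the diff, the check-and-rewrite the patch performs can be condensed into a single helper. This is only a sketch: the name tryFoldFrameIndexIntoMUBUF is hypothetical (the patch itself splits the logic between the new frameIndexMayFold() predicate and SIFoldOperands::foldOperand() below), and the surrounding pass plumbing is omitted.

// Condensed sketch of the fold added by this patch (hypothetical helper name;
// the real change spreads this across frameIndexMayFold and foldOperand).
static bool tryFoldFrameIndexIntoMUBUF(const SIInstrInfo *TII,
                                       const SIMachineFunctionInfo *MFI,
                                       MachineInstr &UseMI, int OpNo,
                                       MachineOperand &OpToFold) {
  // Only a frame index feeding the vaddr operand of a MUBUF (or flat
  // scratch) instruction is a candidate.
  if (!OpToFold.isFI() ||
      !(TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) ||
      OpNo != AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                         AMDGPU::OpName::vaddr))
    return false;

  // Sanity check that this really is a stack access: soffset must be the
  // scratch wave offset or the stack pointer, and srsrc the scratch rsrc.
  MachineOperand *SOff = TII->getNamedOperand(UseMI, AMDGPU::OpName::soffset);
  if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
                         SOff->getReg() != MFI->getStackPtrOffsetReg()))
    return false;
  if (TII->getNamedOperand(UseMI, AMDGPU::OpName::srsrc)->getReg() !=
      MFI->getScratchRSrcReg())
    return false;

  // A frame index resolves to a positive constant, so folding it into the
  // addressing mode is safe; rebase soffset on the stack pointer.
  UseMI.getOperand(OpNo).ChangeToFrameIndex(OpToFold.getIndex());
  SOff->setReg(MFI->getStackPtrOffsetReg());
  return true;
}

In the patch, the predicate half of this (frameIndexMayFold) also gates the fold in foldInstOperand, so frame-index uses that are not inline constants are still routed into foldOperand instead of being treated as literal uses.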
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp  54
1 file changed, 44 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3f566884f6b..78e6e39b05a 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -87,10 +87,11 @@ public:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -159,6 +160,17 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
   }
 }
 
+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+                              const MachineInstr &UseMI,
+                              int OpNo,
+                              const MachineOperand &OpToFold) {
+  return OpToFold.isFI() &&
+    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
 FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
@@ -290,7 +302,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -403,7 +414,7 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
   MachineInstr *UseMI,
-  unsigned UseOpIdx,
+  int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -453,10 +464,28 @@ void SIFoldOperands::foldOperand(
     return;
   }
 
+  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+    // Sanity check that this is a stack access.
+    // FIXME: Should probably use stack pseudos before frame lowering.
+    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
+      return;
+
+    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+        MFI->getScratchRSrcReg())
+      return;
 
-  bool FoldingImm = OpToFold.isImm();
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    SOff->setReg(MFI->getStackPtrOffsetReg());
+    return;
+  }
 
-  if (FoldingImm && UseMI->isCopy()) {
+  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+  if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -517,7 +546,7 @@ void SIFoldOperands::foldOperand(
     // %sgpr = V_READFIRSTLANE_B32 %vgpr
     // =>
     // %sgpr = S_MOV_B32 imm
-    if (FoldingImm) {
+    if (FoldingImmLike) {
       if (execMayBeModifiedBeforeUse(*MRI,
                                      UseMI->getOperand(UseOpIdx).getReg(),
                                      *OpToFold.getParent(),
@@ -528,7 +557,10 @@ void SIFoldOperands::foldOperand(
       // FIXME: ChangeToImmediate should clear subreg
       UseMI->getOperand(1).setSubReg(0);
-      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      if (OpToFold.isImm())
+        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      else
+        UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
       UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
       return;
     }
@@ -560,7 +592,7 @@ void SIFoldOperands::foldOperand(
     return;
   }
 
-  if (!FoldingImm) {
+  if (!FoldingImmLike) {
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 
     // FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -904,6 +936,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+        foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -1170,8 +1205,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.