commit 254ad3de5cd433df78d44503f48c52102e17db7d
tree   3a68a0d2906663cfc48803fc963e42ae990125ec
parent 1cc47f8413b38f075bbc7a1e0e38ead00700efdf
Author: Matt Arsenault <Matthew.Arsenault@amd.com>
Date:   2017-07-18 16:44:58 +0000
AMDGPU: Annotate necessity of flat-scratch-init
This is an approximation of the existing handling, intended to avoid
regressions. It fixes the use of too many registers when calls are
present on subtargets with the SGPR allocation bug.
llvm-svn: 308326
4 files changed, 69 insertions, 11 deletions
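
In essence, the annotator walks every call site in a function: inline asm is ignored, intrinsics are handled separately, and any other direct or indirect call marks the function as containing a real call, which (for entry functions on subtargets with a flat address space) forces the "amdgpu-flat-scratch" attribute. Below is a minimal standalone sketch of that decision; the Call and Func structs are simplified stand-ins for LLVM's CallSite and Function, not the real API:

#include <string>
#include <vector>

// Simplified stand-ins for the LLVM types used by the pass; the real code
// walks llvm::Function / CallSite objects instead.
struct Call {
  bool IsInlineAsm;   // inline asm does not count as a real call
  bool IsIntrinsic;   // intrinsics are handled separately
  bool IsIndirect;    // no statically known callee
};

struct Func {
  bool IsEntryFunction;            // e.g. amdgpu_kernel
  std::vector<Call> Calls;
  std::vector<std::string> Attrs;  // function attributes
};

// Mirrors the HaveCall logic added to addFeatureAttributes(): an entry
// function on a subtarget with a flat address space gets the
// "amdgpu-flat-scratch" attribute if it contains any real call.
bool annotateFlatScratch(Func &F, bool HasFlatAddressSpace) {
  if (!HasFlatAddressSpace || !F.IsEntryFunction)
    return false;

  bool HaveCall = false;
  for (const Call &C : F.Calls) {
    if (C.IsInlineAsm)
      continue;              // calls to inline asm never need scratch init
    if (C.IsIndirect || !C.IsIntrinsic) {
      HaveCall = true;       // indirect or ordinary direct call
      break;
    }
  }

  if (!HaveCall)
    return false;
  F.Attrs.push_back("amdgpu-flat-scratch");
  return true;
}

Keying the decision off an IR attribute lets later machine passes see the result without re-scanning the IR, at the cost of the imprecision the in-tree TODO acknowledges.
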
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 66249276cd8..c68e5861ff2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -201,11 +201,14 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
 }
 
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
-  bool HasApertureRegs = TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+  bool HasFlat = ST.hasFlatAddressSpace();
+  bool HasApertureRegs = ST.hasApertureRegs();
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
   bool Changed = false;
   bool NeedQueuePtr = false;
+  bool HaveCall = false;
   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
 
   for (BasicBlock &BB : F) {
@@ -215,11 +218,15 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
       Function *Callee = CS.getCalledFunction();
 
       // TODO: Do something with indirect calls.
-      if (!Callee)
+      if (!Callee) {
+        if (!CS.isInlineAsm())
+          HaveCall = true;
         continue;
+      }
 
       Intrinsic::ID IID = Callee->getIntrinsicID();
       if (IID == Intrinsic::not_intrinsic) {
+        HaveCall = true;
         copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
         Changed = true;
       } else {
@@ -261,6 +268,14 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
     Changed = true;
   }
 
+  // TODO: We could refine this to captured pointers that could possibly be
+  // accessed by flat instructions. For now this is mostly a poor way of
+  // estimating whether there are calls before argument lowering.
+  if (HasFlat && !IsFunc && HaveCall) {
+    F.addFnAttr("amdgpu-flat-scratch");
+    Changed = true;
+  }
+
   return Changed;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index ed962ac38b8..7334781916d 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -246,7 +246,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   // this point it appears we need the setup. This part of the prolog should be
   // emitted after frame indices are eliminated.
 
-  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
+  if (MFI->hasFlatScratchInit())
     emitFlatScratchInit(ST, MF, MBB);
 
   unsigned SPReg = MFI->getStackPtrOffsetReg();
@@ -254,7 +254,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
 
     DebugLoc DL;
-    int64_t StackSize = MF.getFrameInfo().getStackSize();
+    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+    int64_t StackSize = FrameInfo.getStackSize();
 
     if (StackSize == 0) {
       BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 01456a124fb..a7c8166ff6d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -152,7 +152,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     }
   }
 
-  if (ST.isAmdCodeObjectV2(MF)) {
+  bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+  if (IsCOV2) {
     if (HasStackObjects || MaySpill)
       PrivateSegmentBuffer = true;
 
@@ -172,12 +173,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
     KernargSegmentPtr = true;
 
-  // We don't need to worry about accessing spills with flat instructions.
-  // TODO: On VI where we must use flat for global, we should be able to omit
-  // this if it is never used for generic access.
-  if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS() &&
-      isEntryFunction())
-    FlatScratchInit = true;
+  if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+    // TODO: This could be refined a lot. The attribute is a poor way of
+    // detecting calls that may require it before argument lowering.
+    if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
+      FlatScratchInit = true;
+  }
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 32bcb21279c..e9797eff712 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -250,9 +250,48 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
   ret void
 }
 
+; HSA: declare void @external.func() #15
+declare void @external.func() #3
+
+; HSA: define internal void @defined.func() #15 {
+define internal void @defined.func() #3 {
+  ret void
+}
+
+; HSA: define void @func_call_external() #15 {
+define void @func_call_external() #3 {
+  call void @external.func()
+  ret void
+}
+
+; HSA: define void @func_call_defined() #15 {
+define void @func_call_defined() #3 {
+  call void @defined.func()
+  ret void
+}
+
+; HSA: define void @func_call_asm() #15 {
+define void @func_call_asm() #3 {
+  call void asm sideeffect "", ""() #3
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @kern_call_external() #16 {
+define amdgpu_kernel void @kern_call_external() #3 {
+  call void @external.func()
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @func_kern_defined() #16 {
+define amdgpu_kernel void @func_kern_defined() #3 {
+  call void @defined.func()
+  ret void
+}
+
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind "target-cpu"="fiji" }
 attributes #2 = { nounwind "target-cpu"="gfx900" }
+attributes #3 = { nounwind }
 
 ; HSA: attributes #0 = { nounwind readnone speculatable }
 ; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" }
@@ -269,3 +308,5 @@ attributes #2 = { nounwind "target-cpu"="gfx900" }
 ; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" }
 ; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }
 ; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" }
+; HSA: attributes #15 = { nounwind }
+; HSA: attributes #16 = { nounwind "amdgpu-flat-scratch" }
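
On the consuming side, SIMachineFunctionInfo then requires flat scratch initialization either for local stack objects or when the annotation is present. A compact model of that predicate under the same simplified assumptions as the sketch above (the real check reads the attribute off llvm::Function and queries the subtarget):

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical model of the post-patch FlatScratchInit decision in
// SIMachineFunctionInfo: stack objects *or* the "amdgpu-flat-scratch"
// attribute trigger the init, but only for entry functions with a flat
// address space under code object v2.
bool needsFlatScratchInit(const std::vector<std::string> &Attrs,
                          bool HasStackObjects, bool HasFlatAddressSpace,
                          bool IsEntryFunction, bool IsCodeObjectV2) {
  if (!HasFlatAddressSpace || !IsEntryFunction || !IsCodeObjectV2)
    return false;
  bool HasAttr = std::find(Attrs.begin(), Attrs.end(),
                           "amdgpu-flat-scratch") != Attrs.end();
  return HasStackObjects || HasAttr;
}

Splitting the producer (the IR annotation pass) from the consumer (the machine function info) this way is what lets entry functions with no stack objects of their own still get flat scratch set up when they make calls.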