diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-06-29 17:31:42 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-06-29 17:31:42 +0000 |
commit | f5be3ad7f86c31a0e185250581f53f50c220a524 (patch) | |
tree | 7230382f6d0df405f5e551b1396b1c956a986a09 /llvm/lib | |
parent | 87b107dd698fcf0678e65208d670c04cfa570355 (diff) | |
download | bcm5719-llvm-f5be3ad7f86c31a0e185250581f53f50c220a524.tar.gz bcm5719-llvm-f5be3ad7f86c31a0e185250581f53f50c220a524.zip |
AMDGPU: Don't use struct type for argument layout
This was introducing unnecessary padding after the explicit
arguments, depending on the alignment of the total struct type.
Also has the side effect of avoiding creating an extra GEP for
the offset from the base kernel argument to the explicit kernel
argument offset.
llvm-svn: 335999
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 84 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 26 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 |
4 files changed, 71 insertions, 50 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index e6f25144e04..4221da96996 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1203,9 +1203,13 @@ AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps( const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); HSAMD::Kernel::CodeProps::Metadata HSACodeProps; + const Function &F = MF.getFunction(); - HSACodeProps.mKernargSegmentSize = - STM.getKernArgSegmentSize(MF.getFunction(), MFI.getExplicitKernArgSize()); + // Avoid asserting on erroneous cases. + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) + return HSACodeProps; + + HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F); HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; HSACodeProps.mKernargSegmentAlign = diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 438d6c8ab32..b06f3014ac6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -74,31 +74,11 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&*EntryBlock.begin()); - SmallVector<Type *, 16> ArgTypes; - for (Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - unsigned Size = DL.getTypeStoreSizeInBits(ArgTy); - bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && - !ST.isAmdHsaOS(); - - // Clover seems to always pad i8/i16 to i32, but doesn't properly align - // them? - // Make sure the struct elements have correct size and alignment for ext - // args. These seem to be padded up to 4-bytes but not correctly aligned. - ArgTypes.push_back( - IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType()); - } - - StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName()); - const StructLayout *Layout = DL.getStructLayout(ArgStructTy); - - // Minimum alignment for kern segment is 16. - unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy)); + const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); // FIXME: Alignment is broken broken with explicit arg offset.; - const uint64_t TotalKernArgSize = BaseOffset + - ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy)); + const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F); if (TotalKernArgSize == 0) return false; @@ -109,23 +89,34 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); - KernArgSegment->addAttribute(AttributeList::ReturnIndex, - Attribute::getWithAlignment(Ctx, KernArgBaseAlign)); - - Value *KernArgBase = KernArgSegment; - if (BaseOffset != 0) { - KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset); - KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset); - } unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); - Value *CastStruct = Builder.CreateBitCast(KernArgBase, - ArgStructTy->getPointerTo(AS)); + unsigned MaxAlign = 1; + uint64_t ExplicitArgOffset = 0; + for (Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(ArgTy); + MaxAlign = std::max(Align, MaxAlign); + unsigned Size = DL.getTypeSizeInBits(ArgTy); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + + + // Clover seems to always pad i8/i16 to i32, but doesn't properly align + // them? + // Make sure the struct elements have correct size and alignment for ext + // args. These seem to be padded up to 4-bytes but not correctly aligned. + bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && + !ST.isAmdHsaOS(); + if (IsExtArg) + AllocSize = 4; + + uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + if (Arg.use_empty()) continue; - Type *ArgTy = Arg.getType(); if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) { // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing // modes on SI to know the high bits are 0 so pointer adds don't wrap. We @@ -145,10 +136,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { bool IsV3 = VT && VT->getNumElements() == 3; VectorType *V4Ty = nullptr; - unsigned Size = DL.getTypeSizeInBits(ArgTy); - bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && - !ST.isAmdHsaOS(); - int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo()); int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset); @@ -162,19 +149,24 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // Additionally widen any sub-dword load to i32 even if suitably aligned, // so that CSE between different argument loads works easily. - ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset); - ArgPtr = Builder.CreateBitCast( - ArgPtr, - Builder.getInt32Ty()->getPointerTo(AS), + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, Arg.getName() + ".kernarg.offset.align.down"); + ArgPtr = Builder.CreateBitCast(ArgPtr, + Builder.getInt32Ty()->getPointerTo(AS), + ArgPtr->getName() + ".cast"); } else { - ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(), - Arg.getName() + ".kernarg.offset"); + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, + Arg.getName() + ".kernarg.offset"); + ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), + ArgPtr->getName() + ".cast"); } assert((!IsExtArg || !IsV3) && "incompatible situation"); - if (IsV3 && Size >= 32) { V4Ty = VectorType::get(VT->getVectorElementType(), 4); // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads @@ -254,6 +246,10 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } } + KernArgSegment->addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index be775a1ae6b..9ad16e0b8a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -451,11 +451,31 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } +uint64_t SISubtarget::getExplicitKernArgSize(const Function &F) const { + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL); + + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t ExplicitArgBytes = 0; + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + + unsigned Align = DL.getABITypeAlignment(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; + } + + return ExplicitArgBytes; +} + unsigned SISubtarget::getKernArgSegmentSize(const Function &F, - unsigned ExplicitArgBytes) const { - uint64_t TotalSize = ExplicitArgBytes; - unsigned ImplicitBytes = getImplicitArgNumBytes(F); + int64_t ExplicitArgBytes) const { + if (ExplicitArgBytes == -1) + ExplicitArgBytes = getExplicitKernArgSize(F); + unsigned ExplicitOffset = getExplicitKernelArgOffset(F); + + uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; + unsigned ImplicitBytes = getImplicitArgNumBytes(F); if (ImplicitBytes != 0) { unsigned Alignment = getAlignmentForImplicitArgPtr(); TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 9c8b82c2834..d21301a6a9f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -862,8 +862,9 @@ public: return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; } + uint64_t getExplicitKernArgSize(const Function &F) const; unsigned getKernArgSegmentSize(const Function &F, - unsigned ExplictArgBytes) const; + int64_t ExplicitArgBytes = -1) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs |