diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-07-19 14:15:18 +0000 | 
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2019-07-19 14:15:18 +0000 | 
| commit | fecf43eba3630aeb55c56e5d308f99a7bd05bfbe (patch) | |
| tree | f65a92ab47e7aed8b45afa1e0396de4197491f85 /llvm/lib | |
| parent | 3fd917d8860e9bdcabc14c536da4377307906be0 (diff) | |
| download | bcm5719-llvm-fecf43eba3630aeb55c56e5d308f99a7bd05bfbe.tar.gz bcm5719-llvm-fecf43eba3630aeb55c56e5d308f99a7bd05bfbe.zip | |
AMDGPU/GlobalISel: Rewrite lowerFormalArguments
This should now handle everything except structs passed as multiple
registers.
I think most of the packing logic should be handled by
handleAssignments, but I'm unclear on what the contract is for
multiple registers. For now, this copies how x86 handles that case.
This does change the behavior of the test_sgpr_alignment0 amdgpu_vs
test. I don't think shader arguments should try to follow the
alignment, and registers need to be repacked. I also don't think it
matters, since the pointers should be packed at the beginning of the
argument list anyway.
llvm-svn: 366582
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 471 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 11 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 65 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 27 | 
4 files changed, 374 insertions, 200 deletions
| diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index b107c357196..a118743f4d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -61,12 +61,126 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {    }  }; +struct IncomingArgHandler : public CallLowering::ValueHandler { +  uint64_t StackUsed = 0; + +  IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, +                     CCAssignFn *AssignFn) +    : ValueHandler(MIRBuilder, MRI, AssignFn) {} + +  Register getStackAddress(uint64_t Size, int64_t Offset, +                           MachinePointerInfo &MPO) override { +    auto &MFI = MIRBuilder.getMF().getFrameInfo(); +    int FI = MFI.CreateFixedObject(Size, Offset, true); +    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); +    Register AddrReg = MRI.createGenericVirtualRegister( +      LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32)); +    MIRBuilder.buildFrameIndex(AddrReg, FI); +    StackUsed = std::max(StackUsed, Size + Offset); +    return AddrReg; +  } + +  void assignValueToReg(Register ValVReg, Register PhysReg, +                        CCValAssign &VA) override { +    markPhysRegUsed(PhysReg); + +    if (VA.getLocVT().getSizeInBits() < 32) { +      // 16-bit types are reported as legal for 32-bit registers. We need to do +      // a 32-bit copy, and truncate to avoid the verifier complaining about it. 
+      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg); +      MIRBuilder.buildTrunc(ValVReg, Copy); +      return; +    } + +    switch (VA.getLocInfo()) { +    case CCValAssign::LocInfo::SExt: +    case CCValAssign::LocInfo::ZExt: +    case CCValAssign::LocInfo::AExt: { +      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); +      MIRBuilder.buildTrunc(ValVReg, Copy); +      break; +    } +    default: +      MIRBuilder.buildCopy(ValVReg, PhysReg); +      break; +    } +  } + +  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, +                            MachinePointerInfo &MPO, CCValAssign &VA) override { +    // FIXME: Get alignment +    auto MMO = MIRBuilder.getMF().getMachineMemOperand( +      MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1); +    MIRBuilder.buildLoad(ValVReg, Addr, *MMO); +  } + +  /// How the physical register gets marked varies between formal +  /// parameters (it's a basic-block live-in), and a call instruction +  /// (it's an implicit-def of the BL). +  virtual void markPhysRegUsed(unsigned PhysReg) = 0; + +  // FIXME: What is the point of this being a callback? 
+  bool isArgumentHandler() const override { return true; } +}; + +struct FormalArgHandler : public IncomingArgHandler { +  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, +                   CCAssignFn *AssignFn) +    : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} + +  void markPhysRegUsed(unsigned PhysReg) override { +    MIRBuilder.getMBB().addLiveIn(PhysReg); +  } +}; +  }  AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)    : CallLowering(&TLI) {  } +void AMDGPUCallLowering::splitToValueTypes( +    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, +    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv, +    SplitArgTy PerformArgSplit) const { +  const SITargetLowering &TLI = *getTLI<SITargetLowering>(); +  LLVMContext &Ctx = OrigArg.Ty->getContext(); + +  if (OrigArg.Ty->isVoidTy()) +    return; + +  SmallVector<EVT, 4> SplitVTs; +  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs); + +  EVT VT = SplitVTs[0]; +  unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + +  if (NumParts == 1) { +    // No splitting to do, but we want to replace the original type (e.g. [1 x +    // double] -> double). +    SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Ctx), +                           OrigArg.Flags, OrigArg.IsFixed); +    return; +  } + +  LLT LLTy = getLLTForType(*OrigArg.Ty, DL); +  SmallVector<Register, 8> SplitRegs; + +  EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); +  Type *PartTy = PartVT.getTypeForEVT(Ctx); +  LLT PartLLT = getLLTForType(*PartTy, DL); + +  // FIXME: Should we be reporting all of the part registers for a single +  // argument, and let handleAssignments take care of the repacking? 
+  for (unsigned i = 0; i < NumParts; ++i) { +    Register PartReg = MRI.createGenericVirtualRegister(PartLLT); +    SplitRegs.push_back(PartReg); +    SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags); +  } + +  PerformArgSplit(SplitRegs, LLTy, PartLLT); +} +  bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,                                       const Value *Val,                                       ArrayRef<Register> VRegs) const { @@ -156,48 +270,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,    MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);  } -static Register findFirstFreeSGPR(CCState &CCInfo) { -  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); -  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { -    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { -      return AMDGPU::SGPR0 + Reg; -    } -  } -  llvm_unreachable("Cannot allocate sgpr"); -} - -static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, -                                           MachineFunction &MF, -                                           const SIRegisterInfo &TRI, -                                           SIMachineFunctionInfo &Info) { -  const LLT S32 = LLT::scalar(32); -  MachineRegisterInfo &MRI = MF.getRegInfo(); - -  if (Info.hasWorkItemIDX()) { -    Register Reg = AMDGPU::VGPR0; -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - -    CCInfo.AllocateReg(Reg); -    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); -  } - -  if (Info.hasWorkItemIDY()) { -    Register Reg = AMDGPU::VGPR1; -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - -    CCInfo.AllocateReg(Reg); -    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); -  } - -  if (Info.hasWorkItemIDZ()) { -    Register Reg = AMDGPU::VGPR2; -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - -    CCInfo.AllocateReg(Reg); -    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); -  
} -} -  // Allocate special inputs passed in user SGPRs.  static void allocateHSAUserSGPRs(CCState &CCInfo,                                   MachineIRBuilder &MIRBuilder, @@ -250,60 +322,6 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,    // these from the dispatch pointer.  } -static void allocateSystemSGPRs(CCState &CCInfo, -                                MachineFunction &MF, -                                SIMachineFunctionInfo &Info, -                                CallingConv::ID CallConv, -                                bool IsShader) { -  const LLT S32 = LLT::scalar(32); -  MachineRegisterInfo &MRI = MF.getRegInfo(); - -  if (Info.hasWorkGroupIDX()) { -    Register Reg = Info.addWorkGroupIDX(); -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); -    CCInfo.AllocateReg(Reg); -  } - -  if (Info.hasWorkGroupIDY()) { -    Register Reg = Info.addWorkGroupIDY(); -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); -    CCInfo.AllocateReg(Reg); -  } - -  if (Info.hasWorkGroupIDZ()) { -    unsigned Reg = Info.addWorkGroupIDZ(); -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); -    CCInfo.AllocateReg(Reg); -  } - -  if (Info.hasWorkGroupInfo()) { -    unsigned Reg = Info.addWorkGroupInfo(); -    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); -    CCInfo.AllocateReg(Reg); -  } - -  if (Info.hasPrivateSegmentWaveByteOffset()) { -    // Scratch wave offset passed in system SGPR. -    unsigned PrivateSegmentWaveByteOffsetReg; - -    if (IsShader) { -      PrivateSegmentWaveByteOffsetReg = -        Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); - -      // This is true if the scratch wave byte offset doesn't have a fixed -      // location. 
-      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { -        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); -        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); -      } -    } else -      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); - -    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); -    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); -  } -} -  bool AMDGPUCallLowering::lowerFormalArgumentsKernel(      MachineIRBuilder &MIRBuilder, const Function &F,      ArrayRef<ArrayRef<Register>> VRegs) const { @@ -311,7 +329,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(    const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();    MachineRegisterInfo &MRI = MF.getRegInfo();    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); -  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); +  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); +  const SITargetLowering &TLI = *getTLI<SITargetLowering>(); +    const DataLayout &DL = F.getParent()->getDataLayout();    SmallVector<CCValAssign, 16> ArgLocs; @@ -349,117 +369,228 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(      ++i;    } -  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); -  allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); +  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); +  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);    return true;  } +static void packSplitRegsToOrigType(MachineIRBuilder &MIRBuilder, +                                    ArrayRef<Register> OrigRegs, +                                    ArrayRef<Register> Regs, +                                    LLT LLTy, +                                    LLT PartLLT) { +  if (!LLTy.isVector() && !PartLLT.isVector()) { +    MIRBuilder.buildMerge(OrigRegs[0], Regs); +    return; +  } + +  if 
(LLTy.isVector() && PartLLT.isVector()) { +    assert(LLTy.getElementType() == PartLLT.getElementType()); + +    int DstElts = LLTy.getNumElements(); +    int PartElts = PartLLT.getNumElements(); +    if (DstElts % PartElts == 0) +      MIRBuilder.buildConcatVectors(OrigRegs[0], Regs); +    else { +      // Deal with v3s16 split into v2s16 +      assert(PartElts == 2 && DstElts % 2 != 0); +      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts); + +      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType()); +      auto RoundedConcat = MIRBuilder.buildConcatVectors(RoundedDestTy, Regs); +      MIRBuilder.buildExtract(OrigRegs[0], RoundedConcat, 0); +    } + +    return; +  } + +  assert(LLTy.isVector() && !PartLLT.isVector()); + +  LLT DstEltTy = LLTy.getElementType(); +  if (DstEltTy == PartLLT) { +    // Vector was trivially scalarized. +    MIRBuilder.buildBuildVector(OrigRegs[0], Regs); +  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) { +    // Deal with vector with 64-bit elements decomposed to 32-bit +    // registers. Need to create intermediate 64-bit elements. +    SmallVector<Register, 8> EltMerges; +    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits(); + +    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0); + +    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I)  { +      auto Merge = MIRBuilder.buildMerge(DstEltTy, +                                         Regs.take_front(PartsPerElt)); +      EltMerges.push_back(Merge.getReg(0)); +      Regs = Regs.drop_front(PartsPerElt); +    } + +    MIRBuilder.buildBuildVector(OrigRegs[0], EltMerges); +  } else { +    // Vector was split, and elements promoted to a wider type. 
+    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT); +    auto BV = MIRBuilder.buildBuildVector(BVType, Regs); +    MIRBuilder.buildTrunc(OrigRegs[0], BV); +  } +} +  bool AMDGPUCallLowering::lowerFormalArguments(      MachineIRBuilder &MIRBuilder, const Function &F,      ArrayRef<ArrayRef<Register>> VRegs) const { +  CallingConv::ID CC = F.getCallingConv(); +    // The infrastructure for normal calling convention lowering is essentially    // useless for kernels. We want to avoid any kind of legalization or argument    // splitting. -  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) +  if (CC == CallingConv::AMDGPU_KERNEL)      return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);    // AMDGPU_GS and AMDGP_HS are not supported yet. -  if (F.getCallingConv() == CallingConv::AMDGPU_GS || -      F.getCallingConv() == CallingConv::AMDGPU_HS) +  if (CC == CallingConv::AMDGPU_GS || CC == CallingConv::AMDGPU_HS)      return false; +  const bool IsShader = AMDGPU::isShader(CC); +  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC); +    MachineFunction &MF = MIRBuilder.getMF(); +  MachineBasicBlock &MBB = MIRBuilder.getMBB();    MachineRegisterInfo &MRI = MF.getRegInfo();    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); -  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); +  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); +  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();    const DataLayout &DL = F.getParent()->getDataLayout(); -  bool IsShader = AMDGPU::isShader(F.getCallingConv());    SmallVector<CCValAssign, 16> ArgLocs; -  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); +  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());    if (Info->hasImplicitBufferPtr()) { -    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); +    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);      
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);      CCInfo.AllocateReg(ImplicitBufferPtrReg);    } -  unsigned NumArgs = F.arg_size(); -  Function::const_arg_iterator CurOrigArg = F.arg_begin(); -  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + +  SmallVector<ArgInfo, 32> SplitArgs; +  unsigned Idx = 0;    unsigned PSInputNum = 0; -  BitVector Skipped(NumArgs); -  for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { -    EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); - -    // We can only hanlde simple value types at the moment. -    ISD::ArgFlagsTy Flags; -    assert(VRegs[i].size() == 1 && "Can't lower into more than one register"); -    ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()}; -    setArgFlags(OrigArg, i + 1, DL, F); -    Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); - -    if (F.getCallingConv() == CallingConv::AMDGPU_PS && -        !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() && -        PSInputNum <= 15) { -      if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) { -        Skipped.set(i); -        ++PSInputNum; + +  for (auto &Arg : F.args()) { +    if (DL.getTypeStoreSize(Arg.getType()) == 0) +      continue; + +    const bool InReg = Arg.hasAttribute(Attribute::InReg); + +    // SGPR arguments to functions not implemented. +    if (!IsShader && InReg) +      return false; + +    // TODO: Handle multiple registers and sret. 
+    if (Arg.hasAttribute(Attribute::StructRet) || +        Arg.hasAttribute(Attribute::SwiftSelf) || +        Arg.hasAttribute(Attribute::SwiftError) || +        Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1) +      return false; + +    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) { +      const bool ArgUsed = !Arg.use_empty(); +      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum); + +      if (!SkipArg) { +        Info->markPSInputAllocated(PSInputNum); +        if (ArgUsed) +          Info->markPSInputEnabled(PSInputNum); +      } + +      ++PSInputNum; + +      if (SkipArg) { +        MIRBuilder.buildUndef(VRegs[Idx][0]); +        ++Idx;          continue;        } +    } -      Info->markPSInputAllocated(PSInputNum); -      if (!CurOrigArg->use_empty()) -        Info->markPSInputEnabled(PSInputNum); +    ArgInfo OrigArg(VRegs[Idx], Arg.getType()); +    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); +    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CC, +      // FIXME: We should probably be passing multiple registers to +      // handleAssignments to do this +      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) { +        packSplitRegsToOrigType(MIRBuilder, VRegs[Idx], Regs, LLTy, PartLLT); +      }); -      ++PSInputNum; +    ++Idx; +  } + +  // At least one interpolation mode must be enabled or else the GPU will +  // hang. +  // +  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user +  // set PSInputAddr, the user wants to enable some bits after the compilation +  // based on run-time states. Since we can't know what the final PSInputEna +  // will look like, so we shouldn't do anything here and the user should take +  // responsibility for the correct programming. +  // +  // Otherwise, the following restrictions apply: +  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. 
+  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be +  //   enabled too. +  if (CC == CallingConv::AMDGPU_PS) { +    if ((Info->getPSInputAddr() & 0x7F) == 0 || +        ((Info->getPSInputAddr() & 0xF) == 0 && +         Info->isPSInputAllocated(11))) { +      CCInfo.AllocateReg(AMDGPU::VGPR0); +      CCInfo.AllocateReg(AMDGPU::VGPR1); +      Info->markPSInputAllocated(0); +      Info->markPSInputEnabled(0);      } -    CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), -                                             /*IsVarArg=*/false); - -    if (ValEVT.isVector()) { -      EVT ElemVT = ValEVT.getVectorElementType(); -      if (!ValEVT.isSimple()) -        return false; -      MVT ValVT = ElemVT.getSimpleVT(); -      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, -                          OrigArg.Flags, CCInfo); -      if (!Res) -        return false; -    } else { -      MVT ValVT = ValEVT.getSimpleVT(); -      if (!ValEVT.isSimple()) -        return false; -      bool Res = -          AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); - -      // Fail if we don't know how to handle this type. -      if (Res) -        return false; +    if (Subtarget.isAmdPalOS()) { +      // For isAmdPalOS, the user does not enable some bits after compilation +      // based on run-time states; the register values being generated here are +      // the final ones set in hardware. Therefore we need to apply the +      // workaround to PSInputAddr and PSInputEnable together.  (The case where +      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend +      // set up an input arg for a particular interpolation mode, but nothing +      // uses that input arg. Really we should have an earlier pass that removes +      // such an arg.) 
+      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); +      if ((PsInputBits & 0x7F) == 0 || +          ((PsInputBits & 0xF) == 0 && +           (PsInputBits >> 11 & 1))) +        Info->markPSInputEnabled( +          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));      }    } -  Function::const_arg_iterator Arg = F.arg_begin(); - -  if (F.getCallingConv() == CallingConv::AMDGPU_VS || -      F.getCallingConv() == CallingConv::AMDGPU_PS) { -    for (unsigned i = 0, OrigArgIdx = 0; -         OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { -       if (Skipped.test(OrigArgIdx)) -          continue; -       assert(VRegs[OrigArgIdx].size() == 1 && -              "Can't lower into more than 1 reg"); -       CCValAssign &VA = ArgLocs[i++]; -       MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]); -       MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); -       MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg()); -    } +  const SITargetLowering &TLI = *getTLI<SITargetLowering>(); +  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg()); -    allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); -    return true; +  if (!MBB.empty()) +    MIRBuilder.setInstr(*MBB.begin()); + +  FormalArgHandler Handler(MIRBuilder, MRI, AssignFn); +  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, SplitArgs, Handler)) +    return false; + +  if (!IsEntryFunc) { +    // Special inputs come after user arguments. +    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);    } -  return false; +  // Start adding system SGPRs. +  if (IsEntryFunc) { +    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); +  } else { +    CCInfo.AllocateReg(Info->getScratchRSrcReg()); +    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); +    CCInfo.AllocateReg(Info->getFrameOffsetReg()); +    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); +  } + +  // Move back to the end of the basic block. 
+  MIRBuilder.setMBB(MBB); + +  return true;  } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 3599659cac6..97f8c85b9e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -29,7 +29,16 @@ class AMDGPUCallLowering: public CallLowering {                        uint64_t Offset, unsigned Align,                        Register DstReg) const; - public: +  /// A function of this type is used to perform value split action. +  using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT)>; + +  void splitToValueTypes(const ArgInfo &OrigArgInfo, +                         SmallVectorImpl<ArgInfo> &SplitArgs, +                         const DataLayout &DL, MachineRegisterInfo &MRI, +                         CallingConv::ID CallConv, +                         SplitArgTy SplitArg) const; + +public:    AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);    bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b8c7bd648f4..40b93f4ae3b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1596,29 +1596,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,  }  // Allocate special inputs passed in VGPRs. 
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, -                                           MachineFunction &MF, -                                           const SIRegisterInfo &TRI, -                                           SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, +                                                      MachineFunction &MF, +                                                      const SIRegisterInfo &TRI, +                                                      SIMachineFunctionInfo &Info) const { +  const LLT S32 = LLT::scalar(32); +  MachineRegisterInfo &MRI = MF.getRegInfo(); +    if (Info.hasWorkItemIDX()) { -    unsigned Reg = AMDGPU::VGPR0; -    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); +    Register Reg = AMDGPU::VGPR0; +    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);      CCInfo.AllocateReg(Reg);      Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));    }    if (Info.hasWorkItemIDY()) { -    unsigned Reg = AMDGPU::VGPR1; -    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); +    Register Reg = AMDGPU::VGPR1; +    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);      CCInfo.AllocateReg(Reg);      Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));    }    if (Info.hasWorkItemIDZ()) { -    unsigned Reg = AMDGPU::VGPR2; -    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); +    Register Reg = AMDGPU::VGPR2; +    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);      CCInfo.AllocateReg(Reg);      Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); @@ -1678,10 +1681,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {    return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);  } -static void allocateSpecialInputVGPRs(CCState &CCInfo, -                                      MachineFunction &MF, -                                      const SIRegisterInfo &TRI, -                                
      SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, +                                                 MachineFunction &MF, +                                                 const SIRegisterInfo &TRI, +                                                 SIMachineFunctionInfo &Info) const {    const unsigned Mask = 0x3ff;    ArgDescriptor Arg; @@ -1699,10 +1702,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,      Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));  } -static void allocateSpecialInputSGPRs(CCState &CCInfo, -                                      MachineFunction &MF, -                                      const SIRegisterInfo &TRI, -                                      SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputSGPRs( +  CCState &CCInfo, +  MachineFunction &MF, +  const SIRegisterInfo &TRI, +  SIMachineFunctionInfo &Info) const {    auto &ArgInfo = Info.getArgInfo();    // TODO: Unify handling with private memory pointers. @@ -1735,10 +1739,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo,  }  // Allocate special inputs passed in user SGPRs. 
-static void allocateHSAUserSGPRs(CCState &CCInfo, -                                 MachineFunction &MF, -                                 const SIRegisterInfo &TRI, -                                 SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, +                                            MachineFunction &MF, +                                            const SIRegisterInfo &TRI, +                                            SIMachineFunctionInfo &Info) const {    if (Info.hasImplicitBufferPtr()) {      unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);      MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); @@ -1765,9 +1769,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,    }    if (Info.hasKernargSegmentPtr()) { -    unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); -    MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); +    MachineRegisterInfo &MRI = MF.getRegInfo(); +    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);      CCInfo.AllocateReg(InputPtrReg); + +    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); +    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));    }    if (Info.hasDispatchID()) { @@ -1787,11 +1794,11 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,  }  // Allocate special input registers that are initialized per-wave. 
-static void allocateSystemSGPRs(CCState &CCInfo, -                                MachineFunction &MF, -                                SIMachineFunctionInfo &Info, -                                CallingConv::ID CallConv, -                                bool IsShader) { +void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, +                                           MachineFunction &MF, +                                           SIMachineFunctionInfo &Info, +                                           CallingConv::ID CallConv, +                                           bool IsShader) const {    if (Info.hasWorkGroupIDX()) {      unsigned Reg = Info.addWorkGroupIDX();      MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 21a215e16ce..1f8cf4ffe1a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -375,6 +375,33 @@ public:    AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;    unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + + +  void allocateHSAUserSGPRs(CCState &CCInfo, +                            MachineFunction &MF, +                            const SIRegisterInfo &TRI, +                            SIMachineFunctionInfo &Info) const; + +  void allocateSystemSGPRs(CCState &CCInfo, +                           MachineFunction &MF, +                           SIMachineFunctionInfo &Info, +                           CallingConv::ID CallConv, +                           bool IsShader) const; + +  void allocateSpecialEntryInputVGPRs(CCState &CCInfo, +                                      MachineFunction &MF, +                                      const SIRegisterInfo &TRI, +                                      SIMachineFunctionInfo &Info) const; +  void allocateSpecialInputSGPRs( +    CCState &CCInfo, +    MachineFunction &MF, +    const SIRegisterInfo &TRI, +   
 SIMachineFunctionInfo &Info) const; + +  void allocateSpecialInputVGPRs(CCState &CCInfo, +                                 MachineFunction &MF, +                                 const SIRegisterInfo &TRI, +                                 SIMachineFunctionInfo &Info) const;  };  } // End namespace llvm | 

