Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp  471
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h     11
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp        65
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h          27
4 files changed, 374 insertions, 200 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index b107c357196..a118743f4d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -61,12 +61,126 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
   }
 };
+struct IncomingArgHandler : public CallLowering::ValueHandler {
+  uint64_t StackUsed = 0;
+
+  IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                     CCAssignFn *AssignFn)
+    : ValueHandler(MIRBuilder, MRI, AssignFn) {}
+
+  Register getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    auto &MFI = MIRBuilder.getMF().getFrameInfo();
+    int FI = MFI.CreateFixedObject(Size, Offset, true);
+    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+    Register AddrReg = MRI.createGenericVirtualRegister(
+      LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
+    MIRBuilder.buildFrameIndex(AddrReg, FI);
+    StackUsed = std::max(StackUsed, Size + Offset);
+    return AddrReg;
+  }
+
+  void assignValueToReg(Register ValVReg, Register PhysReg,
+                        CCValAssign &VA) override {
+    markPhysRegUsed(PhysReg);
+
+    if (VA.getLocVT().getSizeInBits() < 32) {
+      // 16-bit types are reported as legal for 32-bit registers. We need to do
+      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
+      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+      MIRBuilder.buildTrunc(ValVReg, Copy);
+      return;
+    }
+
+    switch (VA.getLocInfo()) {
+    case CCValAssign::LocInfo::SExt:
+    case CCValAssign::LocInfo::ZExt:
+    case CCValAssign::LocInfo::AExt: {
+      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+      MIRBuilder.buildTrunc(ValVReg, Copy);
+      break;
+    }
+    default:
+      MIRBuilder.buildCopy(ValVReg, PhysReg);
+      break;
+    }
+  }
+
+  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    // FIXME: Get alignment
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+      MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
+    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+  }
+
+  /// How the physical register gets marked varies between formal
+  /// parameters (it's a basic-block live-in), and a call instruction
+  /// (it's an implicit-def of the BL).
+  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+  // FIXME: What is the point of this being a callback?
+  bool isArgumentHandler() const override { return true; }
+};
+
+struct FormalArgHandler : public IncomingArgHandler {
+  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                   CCAssignFn *AssignFn)
+    : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
+
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIRBuilder.getMBB().addLiveIn(PhysReg);
+  }
+};
+
 }
 
 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
   : CallLowering(&TLI) {
 }
 
+void AMDGPUCallLowering::splitToValueTypes(
+    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
+    SplitArgTy PerformArgSplit) const {
+  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+  LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+  if (OrigArg.Ty->isVoidTy())
+    return;
+
+  SmallVector<EVT, 4> SplitVTs;
+  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
+
+  EVT VT = SplitVTs[0];
+  unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+
+  if (NumParts == 1) {
+    // No splitting to do, but we want to replace the original type (e.g. [1 x
+    // double] -> double).
+    SplitArgs.emplace_back(OrigArg.Regs[0], VT.getTypeForEVT(Ctx),
+                           OrigArg.Flags, OrigArg.IsFixed);
+    return;
+  }
+
+  LLT LLTy = getLLTForType(*OrigArg.Ty, DL);
+  SmallVector<Register, 8> SplitRegs;
+
+  EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
+  Type *PartTy = PartVT.getTypeForEVT(Ctx);
+  LLT PartLLT = getLLTForType(*PartTy, DL);
+
+  // FIXME: Should we be reporting all of the part registers for a single
+  // argument, and let handleAssignments take care of the repacking?
+  for (unsigned i = 0; i < NumParts; ++i) {
+    Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
+    SplitRegs.push_back(PartReg);
+    SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
+  }
+
+  PerformArgSplit(SplitRegs, LLTy, PartLLT);
+}
+
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                      const Value *Val,
                                      ArrayRef<Register> VRegs) const {
@@ -156,48 +270,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
   MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
 }
 
-static Register findFirstFreeSGPR(CCState &CCInfo) {
-  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
-  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
-    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
-      return AMDGPU::SGPR0 + Reg;
-    }
-  }
-  llvm_unreachable("Cannot allocate sgpr");
-}
-
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
-                                           MachineFunction &MF,
-                                           const SIRegisterInfo &TRI,
-                                           SIMachineFunctionInfo &Info) {
-  const LLT S32 = LLT::scalar(32);
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  if (Info.hasWorkItemIDX()) {
-    Register Reg = AMDGPU::VGPR0;
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
-    CCInfo.AllocateReg(Reg);
-    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
-  }
-
-  if (Info.hasWorkItemIDY()) {
-    Register Reg = AMDGPU::VGPR1;
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
-    CCInfo.AllocateReg(Reg);
-    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
-  }
-
-  if (Info.hasWorkItemIDZ()) {
-    Register Reg = AMDGPU::VGPR2;
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
-    CCInfo.AllocateReg(Reg);
-    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
-  }
-}
-
 // Allocate special inputs passed in user SGPRs.
 static void allocateHSAUserSGPRs(CCState &CCInfo,
                                  MachineIRBuilder &MIRBuilder,
@@ -250,60 +322,6 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
-static void allocateSystemSGPRs(CCState &CCInfo,
-                                MachineFunction &MF,
-                                SIMachineFunctionInfo &Info,
-                                CallingConv::ID CallConv,
-                                bool IsShader) {
-  const LLT S32 = LLT::scalar(32);
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  if (Info.hasWorkGroupIDX()) {
-    Register Reg = Info.addWorkGroupIDX();
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
-    CCInfo.AllocateReg(Reg);
-  }
-
-  if (Info.hasWorkGroupIDY()) {
-    Register Reg = Info.addWorkGroupIDY();
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
-    CCInfo.AllocateReg(Reg);
-  }
-
-  if (Info.hasWorkGroupIDZ()) {
-    unsigned Reg = Info.addWorkGroupIDZ();
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
-    CCInfo.AllocateReg(Reg);
-  }
-
-  if (Info.hasWorkGroupInfo()) {
-    unsigned Reg = Info.addWorkGroupInfo();
-    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
-    CCInfo.AllocateReg(Reg);
-  }
-
-  if (Info.hasPrivateSegmentWaveByteOffset()) {
-    // Scratch wave offset passed in system SGPR.
-    unsigned PrivateSegmentWaveByteOffsetReg;
-
-    if (IsShader) {
-      PrivateSegmentWaveByteOffsetReg =
-        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
-
-      // This is true if the scratch wave byte offset doesn't have a fixed
-      // location.
-      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
-        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
-        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
-      }
-    } else
-      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
-
-    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
-    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
-  }
-}
-
 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
     MachineIRBuilder &MIRBuilder, const Function &F,
     ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -311,7 +329,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
   const DataLayout &DL = F.getParent()->getDataLayout();
 
   SmallVector<CCValAssign, 16> ArgLocs;
@@ -349,117 +369,228 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
     ++i;
   }
 
-  allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
-  allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
+  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
 
   return true;
 }
 
+static void packSplitRegsToOrigType(MachineIRBuilder &MIRBuilder,
+                                    ArrayRef<Register> OrigRegs,
+                                    ArrayRef<Register> Regs,
+                                    LLT LLTy,
+                                    LLT PartLLT) {
+  if (!LLTy.isVector() && !PartLLT.isVector()) {
+    MIRBuilder.buildMerge(OrigRegs[0], Regs);
+    return;
+  }
+
+  if (LLTy.isVector() && PartLLT.isVector()) {
+    assert(LLTy.getElementType() == PartLLT.getElementType());
+
+    int DstElts = LLTy.getNumElements();
+    int PartElts = PartLLT.getNumElements();
+    if (DstElts % PartElts == 0)
+      MIRBuilder.buildConcatVectors(OrigRegs[0], Regs);
+    else {
+      // Deal with v3s16 split into v2s16
+      assert(PartElts == 2 && DstElts % 2 != 0);
+      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
+
+      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
+      auto RoundedConcat = MIRBuilder.buildConcatVectors(RoundedDestTy, Regs);
+      MIRBuilder.buildExtract(OrigRegs[0], RoundedConcat, 0);
+    }
+
+    return;
+  }
+
+  assert(LLTy.isVector() && !PartLLT.isVector());
+
+  LLT DstEltTy = LLTy.getElementType();
+  if (DstEltTy == PartLLT) {
+    // Vector was trivially scalarized.
+    MIRBuilder.buildBuildVector(OrigRegs[0], Regs);
+  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
+    // Deal with vector with 64-bit elements decomposed to 32-bit
+    // registers. Need to create intermediate 64-bit elements.
+    SmallVector<Register, 8> EltMerges;
+    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
+
+    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
+
+    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I)  {
+      auto Merge = MIRBuilder.buildMerge(DstEltTy,
+                                         Regs.take_front(PartsPerElt));
+      EltMerges.push_back(Merge.getReg(0));
+      Regs = Regs.drop_front(PartsPerElt);
+    }
+
+    MIRBuilder.buildBuildVector(OrigRegs[0], EltMerges);
+  } else {
+    // Vector was split, and elements promoted to a wider type.
+    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
+    auto BV = MIRBuilder.buildBuildVector(BVType, Regs);
+    MIRBuilder.buildTrunc(OrigRegs[0], BV);
+  }
+}
+
 bool AMDGPUCallLowering::lowerFormalArguments(
     MachineIRBuilder &MIRBuilder, const Function &F,
     ArrayRef<ArrayRef<Register>> VRegs) const {
+  CallingConv::ID CC = F.getCallingConv();
+
   // The infrastructure for normal calling convention lowering is essentially
   // useless for kernels. We want to avoid any kind of legalization or argument
   // splitting.
-  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+  if (CC == CallingConv::AMDGPU_KERNEL)
     return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);
 
   // AMDGPU_GS and AMDGP_HS are not supported yet.
-  if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
-      F.getCallingConv() == CallingConv::AMDGPU_HS)
+  if (CC == CallingConv::AMDGPU_GS || CC == CallingConv::AMDGPU_HS)
     return false;
 
+  const bool IsShader = AMDGPU::isShader(CC);
+  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
+
   MachineFunction &MF = MIRBuilder.getMF();
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
-  bool IsShader = AMDGPU::isShader(F.getCallingConv());
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
 
   if (Info->hasImplicitBufferPtr()) {
-    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(ImplicitBufferPtrReg);
   }
 
-  unsigned NumArgs = F.arg_size();
-  Function::const_arg_iterator CurOrigArg = F.arg_begin();
-  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+
+  SmallVector<ArgInfo, 32> SplitArgs;
+  unsigned Idx = 0;
   unsigned PSInputNum = 0;
-  BitVector Skipped(NumArgs);
-  for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
-    EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
-
-    // We can only hanlde simple value types at the moment.
-    ISD::ArgFlagsTy Flags;
-    assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
-    ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
-    setArgFlags(OrigArg, i + 1, DL, F);
-    Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
-
-    if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
-        !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
-        PSInputNum <= 15) {
-      if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
-        Skipped.set(i);
-        ++PSInputNum;
+
+  for (auto &Arg : F.args()) {
+    if (DL.getTypeStoreSize(Arg.getType()) == 0)
+      continue;
+
+    const bool InReg = Arg.hasAttribute(Attribute::InReg);
+
+    // SGPR arguments to functions not implemented.
+    if (!IsShader && InReg)
+      return false;
+
+    // TODO: Handle multiple registers and sret.
+    if (Arg.hasAttribute(Attribute::StructRet) ||
+        Arg.hasAttribute(Attribute::SwiftSelf) ||
+        Arg.hasAttribute(Attribute::SwiftError) ||
+        Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1)
+      return false;
+
+    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
+      const bool ArgUsed = !Arg.use_empty();
+      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
+
+      if (!SkipArg) {
+        Info->markPSInputAllocated(PSInputNum);
+        if (ArgUsed)
+          Info->markPSInputEnabled(PSInputNum);
+      }
+
+      ++PSInputNum;
+
+      if (SkipArg) {
+        MIRBuilder.buildUndef(VRegs[Idx][0]);
+        ++Idx;
         continue;
       }
+    }
 
-      Info->markPSInputAllocated(PSInputNum);
-      if (!CurOrigArg->use_empty())
-        Info->markPSInputEnabled(PSInputNum);
+    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CC,
+      // FIXME: We should probably be passing multiple registers to
+      // handleAssignments to do this
+      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) {
+        packSplitRegsToOrigType(MIRBuilder, VRegs[Idx], Regs, LLTy, PartLLT);
+      });
 
-      ++PSInputNum;
+    ++Idx;
+  }
+
+  // At least one interpolation mode must be enabled or else the GPU will
+  // hang.
+  //
+  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+  // set PSInputAddr, the user wants to enable some bits after the compilation
+  // based on run-time states. Since we can't know what the final PSInputEna
+  // will look like, so we shouldn't do anything here and the user should take
+  // responsibility for the correct programming.
+  //
+  // Otherwise, the following restrictions apply:
+  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+  //   enabled too.
+  if (CC == CallingConv::AMDGPU_PS) {
+    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+        ((Info->getPSInputAddr() & 0xF) == 0 &&
+         Info->isPSInputAllocated(11))) {
+      CCInfo.AllocateReg(AMDGPU::VGPR0);
+      CCInfo.AllocateReg(AMDGPU::VGPR1);
+      Info->markPSInputAllocated(0);
+      Info->markPSInputEnabled(0);
     }
 
-    CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
-                                             /*IsVarArg=*/false);
-
-    if (ValEVT.isVector()) {
-      EVT ElemVT = ValEVT.getVectorElementType();
-      if (!ValEVT.isSimple())
-        return false;
-      MVT ValVT = ElemVT.getSimpleVT();
-      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
-                          OrigArg.Flags, CCInfo);
-      if (!Res)
-        return false;
-    } else {
-      MVT ValVT = ValEVT.getSimpleVT();
-      if (!ValEVT.isSimple())
-        return false;
-      bool Res =
-          AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
-
-      // Fail if we don't know how to handle this type.
-      if (Res)
-        return false;
+    if (Subtarget.isAmdPalOS()) {
+      // For isAmdPalOS, the user does not enable some bits after compilation
+      // based on run-time states; the register values being generated here are
+      // the final ones set in hardware. Therefore we need to apply the
+      // workaround to PSInputAddr and PSInputEnable together.  (The case where
+      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
+      // set up an input arg for a particular interpolation mode, but nothing
+      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
+      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+      if ((PsInputBits & 0x7F) == 0 ||
+          ((PsInputBits & 0xF) == 0 &&
+           (PsInputBits >> 11 & 1)))
+        Info->markPSInputEnabled(
+          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
     }
   }
 
-  Function::const_arg_iterator Arg = F.arg_begin();
-
-  if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
-      F.getCallingConv() == CallingConv::AMDGPU_PS) {
-    for (unsigned i = 0, OrigArgIdx = 0;
-         OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
-       if (Skipped.test(OrigArgIdx))
-          continue;
-       assert(VRegs[OrigArgIdx].size() == 1 &&
-              "Can't lower into more than 1 reg");
-       CCValAssign &VA = ArgLocs[i++];
-       MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
-       MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
-       MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
-    }
+  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
 
-    allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
-    return true;
+  if (!MBB.empty())
+    MIRBuilder.setInstr(*MBB.begin());
+
+  FormalArgHandler Handler(MIRBuilder, MRI, AssignFn);
+  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, SplitArgs, Handler))
+    return false;
+
+  if (!IsEntryFunc) {
+    // Special inputs come after user arguments.
+    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
   }
 
-  return false;
+  // Start adding system SGPRs.
+  if (IsEntryFunc) {
+    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
+  } else {
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+    CCInfo.AllocateReg(Info->getFrameOffsetReg());
+    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+  }
+
+  // Move back to the end of the basic block.
+  MIRBuilder.setMBB(MBB);
+
+  return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 3599659cac6..97f8c85b9e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -29,7 +29,16 @@ class AMDGPUCallLowering: public CallLowering {
                       uint64_t Offset, unsigned Align,
                       Register DstReg) const;
 
- public:
+  /// A function of this type is used to perform value split action.
+  using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT)>;
+
+  void splitToValueTypes(const ArgInfo &OrigArgInfo,
+                         SmallVectorImpl<ArgInfo> &SplitArgs,
+                         const DataLayout &DL, MachineRegisterInfo &MRI,
+                         CallingConv::ID CallConv,
+                         SplitArgTy SplitArg) const;
+
+public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
 
   bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b8c7bd648f4..40b93f4ae3b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1596,29 +1596,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
 }
 
 // Allocate special inputs passed in VGPRs.
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
-                                           MachineFunction &MF,
-                                           const SIRegisterInfo &TRI,
-                                           SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+                                                      MachineFunction &MF,
+                                                      const SIRegisterInfo &TRI,
+                                                      SIMachineFunctionInfo &Info) const {
+  const LLT S32 = LLT::scalar(32);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
   if (Info.hasWorkItemIDX()) {
-    unsigned Reg = AMDGPU::VGPR0;
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    Register Reg = AMDGPU::VGPR0;
+    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
     CCInfo.AllocateReg(Reg);
     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
   }
 
   if (Info.hasWorkItemIDY()) {
-    unsigned Reg = AMDGPU::VGPR1;
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    Register Reg = AMDGPU::VGPR1;
+    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
     CCInfo.AllocateReg(Reg);
     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
   }
 
   if (Info.hasWorkItemIDZ()) {
-    unsigned Reg = AMDGPU::VGPR2;
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    Register Reg = AMDGPU::VGPR2;
+    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
     CCInfo.AllocateReg(Reg);
     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
@@ -1678,10 +1681,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
 }
 
-static void allocateSpecialInputVGPRs(CCState &CCInfo,
-                                      MachineFunction &MF,
-                                      const SIRegisterInfo &TRI,
-                                      SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
+                                                 MachineFunction &MF,
+                                                 const SIRegisterInfo &TRI,
+                                                 SIMachineFunctionInfo &Info) const {
   const unsigned Mask = 0x3ff;
   ArgDescriptor Arg;
 
@@ -1699,10 +1702,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
 }
 
-static void allocateSpecialInputSGPRs(CCState &CCInfo,
-                                      MachineFunction &MF,
-                                      const SIRegisterInfo &TRI,
-                                      SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialInputSGPRs(
+  CCState &CCInfo,
+  MachineFunction &MF,
+  const SIRegisterInfo &TRI,
+  SIMachineFunctionInfo &Info) const {
   auto &ArgInfo = Info.getArgInfo();
 
   // TODO: Unify handling with private memory pointers.
@@ -1735,10 +1739,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo,
 }
 
 // Allocate special inputs passed in user SGPRs.
-static void allocateHSAUserSGPRs(CCState &CCInfo,
-                                 MachineFunction &MF,
-                                 const SIRegisterInfo &TRI,
-                                 SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
+                                            MachineFunction &MF,
+                                            const SIRegisterInfo &TRI,
+                                            SIMachineFunctionInfo &Info) const {
   if (Info.hasImplicitBufferPtr()) {
     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
@@ -1765,9 +1769,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
   }
 
   if (Info.hasKernargSegmentPtr()) {
-    unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
-    MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
     CCInfo.AllocateReg(InputPtrReg);
+
+    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
   }
 
   if (Info.hasDispatchID()) {
@@ -1787,11 +1794,11 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
 }
 
 // Allocate special input registers that are initialized per-wave.
-static void allocateSystemSGPRs(CCState &CCInfo,
-                                MachineFunction &MF,
-                                SIMachineFunctionInfo &Info,
-                                CallingConv::ID CallConv,
-                                bool IsShader) {
+void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
+                                           MachineFunction &MF,
+                                           SIMachineFunctionInfo &Info,
+                                           CallingConv::ID CallConv,
+                                           bool IsShader) const {
   if (Info.hasWorkGroupIDX()) {
     unsigned Reg = Info.addWorkGroupIDX();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 21a215e16ce..1f8cf4ffe1a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -375,6 +375,33 @@ public:
   AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
 
   unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+
+
+  void allocateHSAUserSGPRs(CCState &CCInfo,
+                            MachineFunction &MF,
+                            const SIRegisterInfo &TRI,
+                            SIMachineFunctionInfo &Info) const;
+
+  void allocateSystemSGPRs(CCState &CCInfo,
+                           MachineFunction &MF,
+                           SIMachineFunctionInfo &Info,
+                           CallingConv::ID CallConv,
+                           bool IsShader) const;
+
+  void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) const;
+  void allocateSpecialInputSGPRs(
+    CCState &CCInfo,
+    MachineFunction &MF,
+    const SIRegisterInfo &TRI,
+    SIMachineFunctionInfo &Info) const;
+
+  void allocateSpecialInputVGPRs(CCState &CCInfo,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) const;
 };
 } // End namespace llvm
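
As a rough illustration of what the new packSplitRegsToOrigType path produces (this fragment is not part of the patch): when a formal argument such as a <2 x s64> value arrives split into four 32-bit part registers, the handler first copies each incoming physical register, then rebuilds the 64-bit elements with G_MERGE_VALUES and the vector with G_BUILD_VECTOR. The virtual register names and the choice of $vgpr0-$vgpr3 below are made up for the example and depend on the calling convention in use.

    ; hypothetical generic MIR fragment for one <2 x s64> argument
    ; (register assignment chosen for illustration only)
    %p0:_(s32) = COPY $vgpr0
    %p1:_(s32) = COPY $vgpr1
    %p2:_(s32) = COPY $vgpr2
    %p3:_(s32) = COPY $vgpr3
    ; packSplitRegsToOrigType: rebuild each 64-bit element, then the vector
    %e0:_(s64) = G_MERGE_VALUES %p0(s32), %p1(s32)
    %e1:_(s64) = G_MERGE_VALUES %p2(s32), %p3(s32)
    %arg:_(<2 x s64>) = G_BUILD_VECTOR %e0(s64), %e1(s64)

A <3 x s16> argument takes the other vector path in the same function: its two <2 x s16> parts are concatenated into a rounded-up <4 x s16> value and the original type is then recovered with a G_EXTRACT at offset 0.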

