| author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-06-28 01:52:13 +0000 |
|---|---|---|
| committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-06-28 01:52:13 +0000 |
| commit | 07fd88d7358bbcc2d4a3bfdc05700ad68541cf44 (patch) | |
| tree | 2b512fe95be0cad910fc989d0a40442bcbd12521 /llvm/lib | |
| parent | 3018d1845b66a5cc09e5a1feb84b1725ff0ed4af (diff) | |
| download | bcm5719-llvm-07fd88d7358bbcc2d4a3bfdc05700ad68541cf44.tar.gz bcm5719-llvm-07fd88d7358bbcc2d4a3bfdc05700ad68541cf44.zip | |
[AMDGPU] Packed thread ids in function call ABI
Differential Revision: https://reviews.llvm.org/D63851
llvm-svn: 364619
Diffstat (limited to 'llvm/lib')
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 12 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 29 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 97 |
4 files changed, 132 insertions, 22 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 81c3356ab9e..99a01ca3a2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUArgumentUsageInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -26,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS,
   }
 
   if (isRegister())
-    OS << "Reg " << printReg(getRegister(), TRI) << '\n';
+    OS << "Reg " << printReg(getRegister(), TRI);
   else
-    OS << "Stack offset " << getStackOffset() << '\n';
+    OS << "Stack offset " << getStackOffset();
+
+  if (isMasked()) {
+    OS << " & ";
+    llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower);
+  }
+
+  OS << '\n';
 }
 
 char AMDGPUArgumentUsageInfo::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 277f3616031..ab0024b50be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -32,18 +32,27 @@ private:
     unsigned StackOffset;
   };
 
+  // Bitmask to locate argument within the register.
+  unsigned Mask;
+
   bool IsStack : 1;
   bool IsSet : 1;
 
-  ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
-    : Register(Val), IsStack(IsStack), IsSet(IsSet) {}
 public:
-  static ArgDescriptor createRegister(unsigned Reg) {
-    return ArgDescriptor(Reg, false, true);
+  ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+                bool IsStack = false, bool IsSet = false)
+    : Register(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+
+  static ArgDescriptor createRegister(unsigned Reg, unsigned Mask = ~0u) {
+    return ArgDescriptor(Reg, Mask, false, true);
+  }
+
+  static ArgDescriptor createStack(unsigned Reg, unsigned Mask = ~0u) {
+    return ArgDescriptor(Reg, Mask, true, true);
   }
 
-  static ArgDescriptor createStack(unsigned Reg) {
-    return ArgDescriptor(Reg, true, true);
+  static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+    return ArgDescriptor(Arg.Register, Mask, Arg.IsStack, Arg.IsSet);
   }
 
   bool isSet() const {
@@ -68,6 +77,14 @@ public:
     return StackOffset;
   }
 
+  unsigned getMask() const {
+    return Mask;
+  }
+
+  bool isMasked() const {
+    return Mask != ~0u;
+  }
+
   void print(raw_ostream &OS,
              const TargetRegisterInfo *TRI = nullptr) const;
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d0af336a00b..766294dee23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4233,9 +4233,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
                                              const ArgDescriptor &Arg) const {
   assert(Arg && "Attempting to load missing argument");
 
-  if (Arg.isRegister())
-    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
-  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+  SDValue V = Arg.isRegister() ?
+    CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
+    loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+
+  if (!Arg.isMasked())
+    return V;
+
+  unsigned Mask = Arg.getMask();
+  unsigned Shift = countTrailingZeros<unsigned>(Mask);
+  V = DAG.getNode(ISD::SRL, SL, VT, V,
+                  DAG.getShiftAmountConstant(Shift, VT, SL));
+  return DAG.getNode(ISD::AND, SL, VT, V,
+                     DAG.getConstant(Mask >> Shift, SL, VT));
 }
 
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 25000506f2c..398f6887644 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1585,7 +1585,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
 
 // Try to allocate a VGPR at the end of the argument list, or if no argument
 // VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given it indicates bitfield position in the register.
+// If \p Arg is given use it with new \p Mask instead of allocating new.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+                                         ArgDescriptor Arg = ArgDescriptor()) {
+  if (Arg.isSet())
+    return ArgDescriptor::createArg(Arg, Mask);
+
   ArrayRef<MCPhysReg> ArgVGPRs
     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1593,7 +1599,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
     // Spill to stack required.
     int64_t Offset = CCInfo.AllocateStack(4, 4);
 
-    return ArgDescriptor::createStack(Offset);
+    return ArgDescriptor::createStack(Offset, Mask);
   }
 
   unsigned Reg = ArgVGPRs[RegIdx];
@@ -1602,7 +1608,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
 
   MachineFunction &MF = CCInfo.getMachineFunction();
   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-  return ArgDescriptor::createRegister(Reg);
+  return ArgDescriptor::createRegister(Reg, Mask);
 }
 
 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1634,14 +1640,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) {
-  if (Info.hasWorkItemIDX())
-    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+  const unsigned Mask = 0x3ff;
+  ArgDescriptor Arg;
 
-  if (Info.hasWorkItemIDY())
-    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+  if (Info.hasWorkItemIDX()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask);
+    Info.setWorkItemIDX(Arg);
+  }
+
+  if (Info.hasWorkItemIDY()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+    Info.setWorkItemIDY(Arg);
+  }
 
   if (Info.hasWorkItemIDZ())
-    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
 }
 
 static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -2387,9 +2400,6 @@ void SITargetLowering::passSpecialInputs(
     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
   };
 
@@ -2429,6 +2439,71 @@ void SITargetLowering::passSpecialInputs(
       MemOpChains.push_back(ArgStore);
     }
   }
+
+  // Pack workitem IDs into a single register or pass it as is if already
+  // packed.
+  const ArgDescriptor *OutgoingArg;
+  const TargetRegisterClass *ArgRC;
+
+  std::tie(OutgoingArg, ArgRC) =
+    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+  if (!OutgoingArg)
+    return;
+
+  const ArgDescriptor *IncomingArgX
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+  const ArgDescriptor *IncomingArgY
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+  const ArgDescriptor *IncomingArgZ
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+  SDValue InputReg;
+  SDLoc SL;
+
+  // If incoming ids are not packed we need to pack them.
+  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+      DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+  }
+
+  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+      DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+  }
+
+  if (!InputReg.getNode()) {
+    // Workitem ids are already packed, any of present incoming arguments
+    // will carry all required fields.
+    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+      IncomingArgX ? *IncomingArgX :
+      IncomingArgY ? *IncomingArgY :
+      *IncomingArgZ, ~0u);
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+  }
+
+  if (OutgoingArg->isRegister()) {
+    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+  } else {
+    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                            SpecialArgOffset);
+    MemOpChains.push_back(ArgStore);
+  }
 }
 
 static bool canGuaranteeTCO(CallingConv::ID CC) {
```
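
The sketch below is not part of the commit; it is a small standalone C++ illustration of the packed layout implied by the masks above (X in bits 0-9, Y in bits 10-19, Z in bits 20-29) and of the shift-and-mask unpacking that `loadInputValue` now performs. The helper names `packWorkItemIDs` and `unpackWorkItemID` are hypothetical, and `__builtin_ctz` stands in for LLVM's `countTrailingZeros`.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helper: packs the three 10-bit workitem IDs into one 32-bit
// value, matching the 0x3ff, 0x3ff << 10 and 0x3ff << 20 masks used in
// allocateSpecialInputVGPRs above.
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
}

// Hypothetical helper: mirrors the unpacking added to loadInputValue.
// Shift right by the number of trailing zeros of the mask, then AND with
// the shifted-down mask. Mask must be non-zero.
static uint32_t unpackWorkItemID(uint32_t Packed, uint32_t Mask) {
  unsigned Shift = static_cast<unsigned>(__builtin_ctz(Mask));
  return (Packed >> Shift) & (Mask >> Shift);
}

int main() {
  uint32_t V = packWorkItemIDs(5, 7, 9);
  assert(unpackWorkItemID(V, 0x3ffu) == 5);        // WORKITEM_ID_X
  assert(unpackWorkItemID(V, 0x3ffu << 10) == 7);  // WORKITEM_ID_Y
  assert(unpackWorkItemID(V, 0x3ffu << 20) == 9);  // WORKITEM_ID_Z
  return 0;
}
```

When all three IDs share one VGPR this way, a callee that only needs, say, WORKITEM_ID_Y still receives the full packed register and extracts its field using the mask recorded in its ArgDescriptor.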

