author     Tom Stellard <thomas.stellard@amd.com>   2014-07-21 15:45:01 +0000
committer  Tom Stellard <thomas.stellard@amd.com>   2014-07-21 15:45:01 +0000
commit     b02094e115fcc94c01b4d274ea24645fd3ea0c58
tree       83d43c7b53db71fb76e4727e5f04665851821a2e /llvm/lib/Target/R600/SIISelLowering.cpp
parent     42639a57de19504aaae3c1aad13699e8c4005432
R600/SI: Use scratch memory for large private arrays
llvm-svn: 213551
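
For orientation, here is a minimal sketch of the kind of input this change targets; the function and all names in it are illustrative, not taken from the commit or its tests. A per-thread ("private") array that is large or dynamically indexed cannot be promoted to registers, so after this patch the SI backend lowers its accesses to buffer loads and stores in scratch memory, addressed through the resource descriptor whose pointer is now preloaded into SGPR2-SGPR3.

// Illustrative only: a kernel-style function with a large private array.
// The dynamic index defeats promotion to registers, so the array must
// live in per-thread scratch memory.
void example_kernel(int *out, int idx) {
  int priv[256];                 // large private (per-thread) array
  for (int i = 0; i < 256; ++i)
    priv[i] = i * i;             // lowered to buffer stores into scratch
  out[0] = priv[idx & 255];      // dynamic index: buffer load from scratch
}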
Diffstat (limited to 'llvm/lib/Target/R600/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/R600/SIISelLowering.cpp | 94
1 file changed, 86 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp
index b2a8f1a9cb4..540b3c74d33 100644
--- a/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -391,10 +391,15 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   // The pointer to the list of arguments is stored in SGPR0, SGPR1
+  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
+    Info->NumUserSGPRs = 4;
     CCInfo.AllocateReg(AMDGPU::SGPR0);
     CCInfo.AllocateReg(AMDGPU::SGPR1);
+    CCInfo.AllocateReg(AMDGPU::SGPR2);
+    CCInfo.AllocateReg(AMDGPU::SGPR3);
     MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
   }
 
   if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -509,6 +514,36 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
+  case AMDGPU::SI_BUFFER_RSRC: {
+    unsigned SuperReg = MI->getOperand(0).getReg();
+    unsigned Args[4];
+    for (unsigned i = 0, e = 4; i < e; ++i) {
+      MachineOperand &Arg = MI->getOperand(i + 1);
+
+      if (Arg.isReg()) {
+        Args[i] = Arg.getReg();
+        continue;
+      }
+
+      assert(Arg.isImm());
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
+              .addImm(Arg.getImm());
+      Args[i] = Reg;
+    }
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+            SuperReg)
+            .addReg(Args[0])
+            .addImm(AMDGPU::sub0)
+            .addReg(Args[1])
+            .addImm(AMDGPU::sub1)
+            .addReg(Args[2])
+            .addImm(AMDGPU::sub2)
+            .addReg(Args[3])
+            .addImm(AMDGPU::sub3);
+    MI->eraseFromParent();
+    break;
+  }
   case AMDGPU::V_SUB_F64: {
     unsigned DestReg = MI->getOperand(0).getReg();
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@@ -620,6 +655,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
     LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
@@ -658,8 +694,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     EVT VT = Op.getValueType();
     SDLoc DL(Op);
-    //XXX: Hardcoded we only use two to store the pointer to the parameters.
-    unsigned NumUserSGPRs = 2;
     switch (IntrinsicID) {
     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     case Intrinsic::r600_read_ngroups_x:
@@ -682,13 +716,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
       return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
     case Intrinsic::r600_read_tgid_x:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+        AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
     case Intrinsic::r600_read_tgid_y:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+        AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
     case Intrinsic::r600_read_tgid_z:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+        AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
     case Intrinsic::r600_read_tidig_x:
       return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                   AMDGPU::VGPR0, VT);
@@ -782,6 +816,21 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
   return nullptr;
 }
 
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+    TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
+
+  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arise
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -891,6 +940,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *Load = cast<LoadSDNode>(Op);
+  // Vector private memory loads have already been split, and
+  // all the rest of private memory loads are legal.
+  if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
   SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
   if (Lowered.getNode())
     return Lowered;
@@ -1081,6 +1135,12 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
       VT.getVectorElementType() == MVT::i32)
     return SDValue();
 
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (VT.isVector() && VT.getVectorNumElements() > 4)
+      return SplitVectorStore(Op, DAG);
+    return SDValue();
+  }
+
   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   if (Ret.getNode())
     return Ret;
@@ -1495,9 +1555,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
 
   // This is a conservative aproach. It is possible that we can't determine the
   // correct register class and copy too often, but better safe than sorry.
-  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
-  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
-                                    Operand.getValueType(), Operand, RC);
+
+  SDNode *Node;
+  // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
+  if (isa<FrameIndexSDNode>(Operand)) {
+    unsigned Opcode = Operand.getValueType() == MVT::i32 ?
+                      AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
+                              Operand);
+  } else {
+    SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+    Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+                              Operand.getValueType(), Operand, RC);
+  }
   Operand = SDValue(Node, 0);
 }
 
@@ -1591,6 +1661,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
         ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
       }
       continue;
+    } else {
+      // If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
+      // These will be lowered to immediates, so we will need to insert a MOV.
+      if (isa<GlobalAddressSDNode>(Ops[i])) {
+        SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
+                                          Operand.getValueType(), Operand);
+        Ops[i] = SDValue(Node, 0);
+      }
     }
 
     if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
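
A note on the SI_BUFFER_RSRC pseudo-instruction handled above: it assembles four 32-bit values into one 128-bit buffer resource descriptor via REG_SEQUENCE (sub0..sub3). The sketch below summarizes the word layout from the public Southern Islands ISA documentation, not from this commit; the struct name and comments are illustrative, and exact bit positions are omitted.

#include <cstdint>

// Approximate layout of the 128-bit SI buffer resource descriptor that
// SI_BUFFER_RSRC builds, one 32-bit word per REG_SEQUENCE subregister.
struct SIBufferResourceWords {
  uint32_t Word0; // base address, low 32 bits
  uint32_t Word1; // base address high bits, plus stride and swizzle control
  uint32_t Word2; // num_records: extent of the buffer
  uint32_t Word3; // data/num format, dst_sel, and other access flags
};

This layout also suggests why LowerSTORE splits private vector stores wider than four elements: the widest MUBUF buffer access on SI moves four dwords (e.g. buffer_store_dwordx4), so anything larger must be broken into multiple buffer operations.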