Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp   279
1 file changed, 267 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 217b3996996..73486b969f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -245,10 +246,6 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
   }
 }
 
-static int64_t getConstant(const MachineInstr *MI) {
-  return MI->getOperand(1).getCImm()->getSExtValue();
-}
-
 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
   switch (Opc) {
   case AMDGPU::G_AND:
@@ -737,6 +734,260 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
     .addImm(Enabled);
 }
 
+static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
+  int64_t C;
+  if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
+    return true;
+
+  // FIXME: matcher should ignore copies
+  return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
+}
+
+static unsigned extractGLC(unsigned CachePolicy) {
+  return CachePolicy & 1;
+}
+
+static unsigned extractSLC(unsigned CachePolicy) {
+  return (CachePolicy >> 1) & 1;
+}
+
+static unsigned extractDLC(unsigned CachePolicy) {
+  return (CachePolicy >> 2) & 1;
+}
+
+// Returns Base register, constant offset, and offset def point.
+static std::tuple<Register, unsigned, MachineInstr *>
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  if (!Def)
+    return {Reg, 0, nullptr};
+
+  if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
+    unsigned Offset;
+    const MachineOperand &Op = Def->getOperand(1);
+    if (Op.isImm())
+      Offset = Op.getImm();
+    else
+      Offset = Op.getCImm()->getZExtValue();
+
+    return {Register(), Offset, Def};
+  }
+
+  int64_t Offset;
+  if (Def->getOpcode() == AMDGPU::G_ADD) {
+    // TODO: Handle G_OR used for add case
+    if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset)))
+      return {Def->getOperand(1).getReg(), Offset, Def};
+
+    // FIXME: matcher should ignore copies
+    if (mi_match(Def->getOperand(2).getReg(), MRI, m_Copy(m_ICst(Offset))))
+      return {Def->getOperand(1).getReg(), Offset, Def};
+  }
+
+  return {Reg, 0, Def};
+}
+
+static unsigned getBufferStoreOpcode(LLT Ty,
+                                     const unsigned MemSize,
+                                     const bool Offen) {
+  const int Size = Ty.getSizeInBits();
+  switch (8 * MemSize) {
+  case 8:
+    return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
+                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
+  case 16:
+    return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
+                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
+  default:
+    unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
+                           AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
+    if (Size > 32)
+      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
+    return Opc;
+  }
+}
+
+static unsigned getBufferStoreFormatOpcode(LLT Ty,
+                                           const unsigned MemSize,
+                                           const bool Offen) {
+  bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
+  bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
+  int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+
+  if (IsD16Packed) {
+    switch (NumElts) {
+    case 1:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
+    case 2:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
+    case 3:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
+    case 4:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
+    default:
+      return -1;
+    }
+  }
+
+  if (IsD16Unpacked) {
+    switch (NumElts) {
+    case 1:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
+    case 2:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
+    case 3:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
+    case 4:
+      return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
+                     AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
+    default:
+      return -1;
+    }
+  }
+
+  switch (NumElts) {
+  case 1:
+    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
+                   AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
+  case 2:
+    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
+                   AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
+  case 3:
+    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
+                   AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
+  case 4:
+    return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
+                   AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
+  default:
+    return -1;
+  }
+
+  llvm_unreachable("unhandled buffer store");
+}
+
+// TODO: Move this to combiner
+// Returns base register, imm offset, total constant offset.
+std::tuple<Register, unsigned, unsigned>
+AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
+                                              Register OrigOffset) const {
+  const unsigned MaxImm = 4095;
+  Register BaseReg;
+  unsigned TotalConstOffset;
+  MachineInstr *OffsetDef;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  std::tie(BaseReg, TotalConstOffset, OffsetDef)
+    = getBaseWithConstantOffset(MRI, OrigOffset);
+
+  unsigned ImmOffset = TotalConstOffset;
+
+  // If the immediate value is too big for the immoffset field, put the value
+  // and -4096 into the immoffset field so that the value that is copied/added
+  // for the voffset field is a multiple of 4096, and it stands more chance
+  // of being CSEd with the copy/add for another similar load/store.
+  // However, do not do that rounding down to a multiple of 4096 if that is a
+  // negative number, as it appears to be illegal to have a negative offset
+  // in the vgpr, even if adding the immediate offset makes it positive.
+  unsigned Overflow = ImmOffset & ~MaxImm;
+  ImmOffset -= Overflow;
+  if ((int32_t)Overflow < 0) {
+    Overflow += ImmOffset;
+    ImmOffset = 0;
+  }
+
+  if (Overflow != 0) {
+    // In case this is in a waterfall loop, insert offset code at the def point
+    // of the offset, not inside the loop.
+    MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
+    MachineBasicBlock &OldMBB = B.getMBB();
+    B.setInstr(*OffsetDef);
+
+    if (!BaseReg) {
+      BaseReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      B.buildInstr(AMDGPU::V_MOV_B32_e32)
+        .addDef(BaseReg)
+        .addImm(Overflow);
+    } else {
+      Register OverflowVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      B.buildInstr(AMDGPU::V_MOV_B32_e32)
+        .addDef(OverflowVal)
+        .addImm(Overflow);
+
+      Register NewBaseReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
+        .addReg(BaseReg)
+        .addReg(OverflowVal, RegState::Kill)
+        .addImm(0);
+      BaseReg = NewBaseReg;
+    }
+
+    B.setInsertPt(OldMBB, OldInsPt);
+  }
+
+  return {BaseReg, ImmOffset, TotalConstOffset};
+}
+
+bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
+                                                     bool IsFormat) const {
+  MachineIRBuilder B(MI);
+  MachineRegisterInfo &MRI = *B.getMRI();
+  MachineFunction &MF = B.getMF();
+  Register VData = MI.getOperand(1).getReg();
+  LLT Ty = MRI.getType(VData);
+
+  int Size = Ty.getSizeInBits();
+  if (Size % 32 != 0)
+    return false;
+
+  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  const int MemSize = MMO->getSize();
+
+  Register RSrc = MI.getOperand(2).getReg();
+  Register VOffset = MI.getOperand(3).getReg();
+  Register SOffset = MI.getOperand(4).getReg();
+  unsigned CachePolicy = MI.getOperand(5).getImm();
+  unsigned ImmOffset;
+  unsigned TotalOffset;
+
+  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+  if (TotalOffset != 0)
+    MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+  const bool Offen = !isZero(VOffset, MRI);
+
+  int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
+                       getBufferStoreOpcode(Ty, MemSize, Offen);
+  if (Opc == -1)
+    return false;
+
+  MachineInstrBuilder MIB = B.buildInstr(Opc)
+    .addUse(VData);
+
+  if (Offen)
+    MIB.addUse(VOffset);
+
+  MIB.addUse(RSrc)
+     .addUse(SOffset)
+     .addImm(ImmOffset)
+     .addImm(extractGLC(CachePolicy))
+     .addImm(extractSLC(CachePolicy))
+     .addImm(0) // tfe: FIXME: Remove from inst
+     .addImm(extractDLC(CachePolicy))
+     .addMemOperand(MMO);
+
+  MI.eraseFromParent();
+
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     MachineInstr &I) const {
   MachineBasicBlock *BB = I.getParent();
@@ -746,10 +997,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   unsigned IntrinsicID = I.getIntrinsicID();
   switch (IntrinsicID) {
   case Intrinsic::amdgcn_exp: {
-    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
-    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
-    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
-    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));
+    int64_t Tgt = I.getOperand(1).getImm();
+    int64_t Enabled = I.getOperand(2).getImm();
+    int64_t Done = I.getOperand(7).getImm();
+    int64_t VM = I.getOperand(8).getImm();
 
     MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                  I.getOperand(4).getReg(),
@@ -762,13 +1013,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   }
   case Intrinsic::amdgcn_exp_compr: {
     const DebugLoc &DL = I.getDebugLoc();
-    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
-    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
+    int64_t Tgt = I.getOperand(1).getImm();
+    int64_t Enabled = I.getOperand(2).getImm();
     Register Reg0 = I.getOperand(3).getReg();
     Register Reg1 = I.getOperand(4).getReg();
     Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
-    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));
+    int64_t Done = I.getOperand(5).getImm();
+    int64_t VM = I.getOperand(6).getImm();
 
     BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
     MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
@@ -791,6 +1042,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
     return true;
   }
+  case Intrinsic::amdgcn_raw_buffer_store:
+    return selectStoreIntrinsic(I, false);
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+    return selectStoreIntrinsic(I, true);
   default:
     return selectImpl(I, *CoverageInfo);
   }
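
A note on the cachepolicy operand: the extractGLC/extractSLC/extractDLC helpers in this patch assume glc lives in bit 0, slc in bit 1, and dlc in bit 2 of the immediate. A minimal standalone sketch of that decomposition, lifted out of the MIR context (plain C++, encoding nothing beyond what the patch itself does):

#include <cstdio>

// Bit layout assumed by the patch's helpers: glc=bit 0, slc=bit 1, dlc=bit 2.
static unsigned extractGLC(unsigned CachePolicy) { return CachePolicy & 1; }
static unsigned extractSLC(unsigned CachePolicy) { return (CachePolicy >> 1) & 1; }
static unsigned extractDLC(unsigned CachePolicy) { return (CachePolicy >> 2) & 1; }

int main() {
  // E.g. a cachepolicy immediate of 5 (0b101) requests glc and dlc but not slc.
  unsigned CachePolicy = 5;
  printf("glc=%u slc=%u dlc=%u\n", extractGLC(CachePolicy),
         extractSLC(CachePolicy), extractDLC(CachePolicy));
  return 0;
}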
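The overflow arithmetic in splitBufferOffsets is easier to check on bare integers. Below is a sketch under the patch's assumptions: the MUBUF immediate offset field holds at most 4095, the remainder is rounded down to a multiple of 4096 and moved into the voffset register, unless that multiple would be negative. The splitOffset helper is hypothetical; only the arithmetic mirrors the patch:

#include <cassert>
#include <cstdint>
#include <utility>

// Hypothetical helper mirroring splitBufferOffsets' arithmetic: returns
// {Overflow, ImmOffset}, where Overflow goes into the voffset register and
// ImmOffset into the 12-bit immediate field.
static std::pair<uint32_t, uint32_t> splitOffset(uint32_t TotalConstOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = TotalConstOffset;
  // Round the register part down to a multiple of 4096 so it has a better
  // chance of being CSEd with the copy/add for a similar load/store.
  uint32_t Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  // A negative value may not live in the vgpr, so put the entire offset in
  // the register instead.
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}

int main() {
  assert(splitOffset(100) == std::make_pair(0u, 100u));    // fits in the field
  assert(splitOffset(4100) == std::make_pair(4096u, 4u));  // 4096 into the vgpr
  assert(splitOffset(uint32_t(-16)) ==
         std::make_pair(uint32_t(-16), 0u));               // negative: all vgpr
  return 0;
}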
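getBufferStoreFormatOpcode picks among three opcode families from the register type and the memory size: packed d16 (16-bit scalar elements stored 1:1), unpacked d16 (the gfx80-style case where the register type is wider than the memory access, each 16-bit value occupying a full dword), and the plain 32-bit formats. A sketch of just that classification, with plain integers standing in for LLT (the struct and field names are hypothetical):

#include <cstdio>

// Stand-in for the LLT queries used by getBufferStoreFormatOpcode.
struct TyInfo {
  int ScalarBits; // Ty.getScalarSizeInBits()
  int TotalBits;  // Ty.getSizeInBits()
};

static const char *classify(TyInfo Ty, int MemSizeBytes) {
  bool IsD16Packed = Ty.ScalarBits == 16;
  bool IsD16Unpacked = 8 * MemSizeBytes < Ty.TotalBits;
  if (IsD16Packed)
    return "packed d16";   // e.g. <4 x s16> stored as 8 bytes
  if (IsD16Unpacked)
    return "unpacked d16"; // register type wider than the memory access
  return "32-bit format";
}

int main() {
  printf("%s\n", classify({16, 64}, 8)); // packed d16: <4 x s16>, 8-byte store
  printf("%s\n", classify({32, 64}, 4)); // unpacked d16: 2 dwords, 4-byte store
  printf("%s\n", classify({32, 64}, 8)); // 32-bit format: <2 x s32>, 8-byte store
  return 0;
}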
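For plain (non-format) stores wider than a dword, getBufferStoreOpcode scales the DWORD opcode by the number of 32-bit components via AMDGPU::getMUBUFOpcode. A toy stand-in for that mapping (the string table is illustrative, not the real opcode enum):

#include <cstdio>

// Hypothetical stand-in for AMDGPU::getMUBUFOpcode: choose the DWORD/X2/X3/X4
// variant from the number of 32-bit components in the value being stored.
static const char *mubufStoreName(int SizeInBits) {
  switch (SizeInBits / 32) {
  case 1: return "BUFFER_STORE_DWORD";
  case 2: return "BUFFER_STORE_DWORDX2";
  case 3: return "BUFFER_STORE_DWORDX3";
  case 4: return "BUFFER_STORE_DWORDX4";
  default: return "unsupported";
  }
}

int main() {
  printf("96-bit store -> %s\n", mubufStoreName(96)); // BUFFER_STORE_DWORDX3
  return 0;
}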