diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/MIMGInstructions.td | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp | 181 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 343 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 |
11 files changed, 546 insertions, 54 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 07ae2bee49b..ea64b125b3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -42,6 +42,7 @@ FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIFixupVectorISelPass(); +FunctionPass *createSIAddIMGInitPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); @@ -153,6 +154,9 @@ extern char &AMDGPUSimplifyLibCallsID; void initializeAMDGPUUseNativeCallsPass(PassRegistry &); extern char &AMDGPUUseNativeCallsID; +void initializeSIAddIMGInitPass(PassRegistry &); +extern char &SIAddIMGInitID; + void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); extern char &AMDGPUPerfHintAnalysisID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 23470c7a4d2..8f20c407ab8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -367,6 +367,16 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128", "Use ds_{read|write}_b128" >; +// Sparse texture support requires that all result registers are zeroed when +// PRTStrictNull is set to true. This feature is turned on for all architectures +// but is enabled as a feature in case there are situations where PRTStrictNull +// is disabled by the driver. +def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null", + "EnablePRTStrictNull", + "true", + "Enable zeroing of result registers for sparse texture fetches" +>; + // Unless +-flat-for-global is specified, turn on FlatForGlobal for // all OS-es on VI and newer hardware to avoid assertion failures due // to missing ADDR64 variants of MUBUF instructions. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index f1acd72b03a..1754ead2538 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -74,6 +74,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // We want to be able to turn these off, but making this a subtarget feature // for SI has the unhelpful behavior that it unsets everything else if you // disable it. + // + // Similarly we want enable-prt-strict-null to be on by default and not to + // unset everything else if it is disabled SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); @@ -89,6 +92,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "-fp32-denormals,"; } + FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -175,6 +180,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), EnableDS128(false), + EnablePRTStrictNull(false), DumpCode(false), FP64(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 886aca42b6c..f6b176fe604 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -326,6 +326,7 @@ protected: bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; bool EnableDS128; + bool EnablePRTStrictNull; bool DumpCode; // Subtarget statically properties set by tablegen @@ -576,6 +577,12 @@ public: return getGeneration() < AMDGPUSubtarget::GFX9; } + /// \returns If target requires PRT Strict NULL support (zero result registers + /// for sparse texture support). 
+ bool usePRTStrictNull() const { + return EnablePRTStrictNull; + } + bool hasAutoWaitcntBeforeBarrier() const { return AutoWaitcntBeforeBarrier; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2198ba8d6c0..c2e2129f5de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -815,6 +815,7 @@ bool GCNPassConfig::addInstSelector() { addPass(&SIFixSGPRCopiesID); addPass(createSILowerI1CopiesPass()); addPass(createSIFixupVectorISelPass()); + addPass(createSIAddIMGInitPass()); return false; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index bb1096bc1de..dd1b8532aae 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -93,6 +93,7 @@ add_llvm_target(AMDGPUCodeGen R600OptimizeVectorRegisters.cpp R600Packetizer.cpp R600RegisterInfo.cpp + SIAddIMGInit.cpp SIAnnotateControlFlow.cpp SIDebuggerInsertNops.cpp SIFixSGPRCopies.cpp diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 1462682e761..1c68dbd78e7 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -29,6 +29,7 @@ class MIMGBaseOpcode { bit Atomic = 0; bit AtomicX2 = 0; // (f)cmpswap bit Sampler = 0; + bit Gather4 = 0; bits<8> NumExtraArgs = 0; bit Gradients = 0; bit Coordinates = 1; @@ -43,7 +44,7 @@ def MIMGBaseOpcode : GenericEnum { def MIMGBaseOpcodesTable : GenericTable { let FilterClass = "MIMGBaseOpcode"; let CppTypeName = "MIMGBaseOpcodeInfo"; - let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", "HasD16"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; @@ -179,6 +180,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit 
has_d16, bit mip = 0, defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; let VDataDwords = 4 in defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; + let VDataDwords = 8 in + defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>; } } @@ -411,6 +414,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; + let VDataDwords = 8 in + defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; } } @@ -421,6 +426,7 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, string asm = "image_gather4"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode<sample> { let HasD16 = 1; + let Gather4 = 1; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, @@ -429,6 +435,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>; + let VDataDwords = 8 in + defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; } } diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp new file mode 100644 index 00000000000..69cafef4a35 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -0,0 +1,181 @@ +//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// Any MIMG instructions that use tfe or lwe require an initialization of the +/// result register that will be written in the case of a memory access failure +/// The required code is also added to tie this init code to the result of the +/// img instruction +/// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-img-init" + +using namespace llvm; + +namespace { + +class SIAddIMGInit : public MachineFunctionPass { +public: + static char ID; + +public: + SIAddIMGInit() : MachineFunctionPass(ID) { + initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false) + +char SIAddIMGInit::ID = 0; + +char &llvm::SIAddIMGInitID = SIAddIMGInit::ID; + +FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); } + +bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); + bool Changed = false; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + auto Opcode = MI.getOpcode(); + if (TII->isMIMG(Opcode) && !MI.mayStore()) { + MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); + MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); + MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); + + // Check for instructions that don't have tfe or lwe fields + // There shouldn't be any at this point. + assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction"); + + unsigned TFEVal = TFE->getImm(); + unsigned LWEVal = LWE->getImm(); + unsigned D16Val = D16 ? D16->getImm() : 0; + + if (TFEVal || LWEVal) { + // At least one of TFE or LWE are non-zero + // We have to insert a suitable initialization of the result value and + // tie this to the dest of the image instruction. + + const DebugLoc &DL = MI.getDebugLoc(); + + int DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); + + // Calculate which dword we have to initialize to 0. + MachineOperand *MO_Dmask = + TII->getNamedOperand(MI, AMDGPU::OpName::dmask); + + // check that dmask operand is found. 
+ assert(MO_Dmask && "Expected dmask operand in instruction"); + + unsigned dmask = MO_Dmask->getImm(); + // Determine the number of active lanes taking into account the + // Gather4 special case + unsigned ActiveLanes = + TII->isGather4(Opcode) ? 4 : countPopulation(dmask); + + // Subreg indices are counted from 1 + // When D16 then we want next whole VGPR after write data. + static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected"); + + bool Packed = !ST.hasUnpackedD16VMem(); + + unsigned InitIdx = + D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; + + // Abandon attempt if the dst size isn't large enough + // - this is in fact an error but this is picked up elsewhere and + // reported correctly. + uint32_t DstSize = + RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + if (DstSize < InitIdx) + continue; + + // Create a register for the initialization value. + unsigned PrevDst = + MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + unsigned NewDst = 0; // Final initialized value will be in here + + // If PRTStrictNull feature is enabled (the default) then initialize + // all the result registers to 0, otherwise just the error indication + // register (VGPRn+1) + unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1; + unsigned CurrIdx = ST.usePRTStrictNull() ? 
1 : InitIdx; + + if (DstSize == 1) { + // In this case we can just initialize the result directly + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst) + .addImm(0); + NewDst = PrevDst; + } else { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); + for (; SizeLeft; SizeLeft--, CurrIdx++) { + NewDst = + MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + // Initialize dword + unsigned SubReg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) + .addImm(0); + // Insert into the super-reg + BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) + .addReg(PrevDst) + .addReg(SubReg) + .addImm(CurrIdx); + + PrevDst = NewDst; + } + } + + // Add as an implicit operand + MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit); + + // Tie the just added implicit operand to the dst + MI.tieOperands(DstIdx, MI.getNumOperands() - 1); + + Changed = true; + } + } + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9f5198042e4..e3d6b7941ee 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -216,6 +216,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); @@ -813,6 +814,48 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } +static MVT memVTFromAggregate(Type *Ty) { + // Only limited forms of aggregate type currently expected. 
+ assert(Ty->isStructTy() && "Expected struct type"); + + + Type *ElementType = nullptr; + unsigned NumElts; + if (Ty->getContainedType(0)->isVectorTy()) { + VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0)); + ElementType = VecComponent->getElementType(); + NumElts = VecComponent->getNumElements(); + } else { + ElementType = Ty->getContainedType(0); + NumElts = 1; + } + + Type *FlagComponent = Ty->getContainedType(1); + assert(FlagComponent->isIntegerTy(32) && "Expected int32 type"); + + // Calculate the size of the memVT type from the aggregate + unsigned Pow2Elts = 0; + unsigned ElementSize; + switch (ElementType->getTypeID()) { + default: + llvm_unreachable("Unknown type!"); + case Type::IntegerTyID: + ElementSize = cast<IntegerType>(ElementType)->getBitWidth(); + break; + case Type::HalfTyID: + ElementSize = 16; + break; + case Type::FloatTyID: + ElementSize = 32; + break; + } + unsigned AdditionalElts = ElementSize == 16 ? 2 : 1; + Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts); + + return MVT::getVectorVT(MVT::getVT(ElementType, false), + Pow2Elts); +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -840,7 +883,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MODereferenceable; if (Attr.hasFnAttribute(Attribute::ReadOnly)) { Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); + Info.memVT = MVT::getVT(CI.getType(), true); + if (Info.memVT == MVT::Other) { + // Some intrinsics return an aggregate type - special case to work out + // the correct memVT + Info.memVT = memVTFromAggregate(CI.getType()); + } Info.flags |= MachineMemOperand::MOLoad; } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { Info.opc = ISD::INTRINSIC_VOID; @@ -4613,6 +4661,109 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, return Value == 0; } +// Re-construct the required return value for an image 
load intrinsic. +// This is more complicated due to the optional use of TexFailCtrl which means the required +// return type is an aggregate +static SDValue constructRetValue(SelectionDAG &DAG, + MachineSDNode *Result, + ArrayRef<EVT> ResultTypes, + bool IsTexFail, bool Unpacked, bool IsD16, + int DMaskPop, int NumVDataDwords, + const SDLoc &DL, LLVMContext &Context) { + // Determine the required return type. This is the same regardless of IsTexFail flag + EVT ReqRetVT = ResultTypes[0]; + EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT; + int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; + EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT; + EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts) + : AdjEltVT + : ReqRetVT; + + // Extract data part of the result + // Bitcast the result to the same type as the required return type + int NumElts; + if (IsD16 && !Unpacked) + NumElts = NumVDataDwords << 1; + else + NumElts = NumVDataDwords; + + EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) + : AdjEltVT; + + // Special case for v8f16. 
Rather than add support for this, use v4i32 to + // extract the data elements + bool V8F16Special = false; + if (CastVT == MVT::v8f16) { + CastVT = MVT::v4i32; + DMaskPop >>= 1; + ReqRetNumElts >>= 1; + V8F16Special = true; + AdjVT = MVT::v2i32; + } + + SDValue N = SDValue(Result, 0); + SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N); + + // Iterate over the result + SmallVector<SDValue, 4> BVElts; + + if (CastVT.isVector()) { + DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop); + } else { + BVElts.push_back(CastRes); + } + int ExtraElts = ReqRetNumElts - DMaskPop; + while(ExtraElts--) + BVElts.push_back(DAG.getUNDEF(AdjEltVT)); + + SDValue PreTFCRes; + if (ReqRetNumElts > 1) { + SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts); + if (IsD16 && Unpacked) + PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked); + else + PreTFCRes = NewVec; + } else { + PreTFCRes = BVElts[0]; + } + + if (V8F16Special) + PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); + + if (!IsTexFail) { + if (Result->getNumValues() > 1) + return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL); + else + return PreTFCRes; + } + + // Extract the TexFail result and insert into aggregate return + SmallVector<SDValue, 1> TFCElt; + DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1); + SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]); + return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL); +} + +static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, + SDValue *LWE, bool &IsTexFail) { + auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode()); + if (!TexFailCtrlConst) + return false; + + uint64_t Value = TexFailCtrlConst->getZExtValue(); + if (Value) { + IsTexFail = true; + } + + SDLoc DL(TexFailCtrlConst); + *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x1; + *LWE = DAG.getTargetConstant((Value & 0x2) ? 
1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x2; + + return Value == 0; +} + SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const { @@ -4626,13 +4777,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; - SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end()); + SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); + SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; bool IsA16 = false; SDValue VData; int NumVDataDwords; + bool AdjustRetType = false; + unsigned AddrIdx; // Index of first address argument unsigned DMask; + unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { VData = Op.getOperand(2); @@ -4655,7 +4810,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, AddrIdx = 3; } } else { - unsigned DMaskIdx; + unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1; + auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); + if (!DMaskConst) + return Op; + DMask = DMaskConst->getZExtValue(); + DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); if (BaseOpcode->Store) { VData = Op.getOperand(2); @@ -4671,37 +4831,32 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; - DMaskIdx = 3; } else { - MVT LoadVT = Op.getSimpleValueType(); + // Work out the num dwords based on the dmask popcount and underlying type + // and whether packing is supported. + MVT LoadVT = ResultTypes[0].getSimpleVT(); if (LoadVT.getScalarType() == MVT::f16) { if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; - if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem()) - ResultTypes[0] = (LoadVT == MVT::v2f16) ? 
MVT::v2i32 : MVT::v4i32; } - NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32; - DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1; - } + // Confirm that the return type is large enough for the dmask specified + if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) || + (!LoadVT.isVector() && DMaskLanes > 1)) + return Op; - auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); - if (!DMaskConst) - return Op; + if (IsD16 && !Subtarget->hasUnpackedD16VMem()) + NumVDataDwords = (DMaskLanes + 1) / 2; + else + NumVDataDwords = DMaskLanes; - AddrIdx = DMaskIdx + 1; - DMask = DMaskConst->getZExtValue(); - if (!DMask && !BaseOpcode->Store) { - // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they - // store the channels' default values. - SDValue Undef = DAG.getUNDEF(Op.getValueType()); - if (isa<MemSDNode>(Op)) - return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); - return Undef; + AdjustRetType = true; } + + AddrIdx = DMaskIdx + 1; } unsigned NumGradients = BaseOpcode->Gradients ? 
DimInfo->NumGradients : 0; @@ -4780,11 +4935,53 @@ SDValue SITargetLowering::lowerImage(SDValue Op, CtrlIdx = AddrIdx + NumVAddrs + 3; } + SDValue TFE; + SDValue LWE; SDValue TexFail = Op.getOperand(CtrlIdx); - auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode()); - if (!TexFailConst || TexFailConst->getZExtValue() != 0) + bool IsTexFail = false; + if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail)) return Op; + if (IsTexFail) { + if (!NumVDataDwords) { + // Expecting to get an error flag since TFC is on - and dmask is 0 + // Force dmask to be at least 1 otherwise the instruction will fail + DMask = 0x1; + DMaskLanes = 1; + NumVDataDwords = 1; + } + NumVDataDwords += 1; + AdjustRetType = true; + } + + // Has something earlier tagged that the return type needs adjusting + // This happens if the instruction is a load or has set TexFailCtrl flags + if (AdjustRetType) { + // NumVDataDwords reflects the true number of dwords required in the return type + if (NumVDataDwords == 0 && !BaseOpcode->Store) { + // This is a no-op load. This can be eliminated + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + if (isa<MemSDNode>(Op)) + return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); + return Undef; + } + + // Have to use a power of 2 number of dwords + NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords); + + EVT NewVT = NumVDataDwords > 1 ? + EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) + : MVT::f32; + + ResultTypes[0] = NewVT; + if (ResultTypes.size() == 3) { + // Original result was aggregate type used for TexFailCtrl results + // The actual instruction returns as a vector type which has now been + // created. Remove the aggregate result. + ResultTypes.erase(&ResultTypes[1]); + } + } + SDValue GLC; SDValue SLC; if (BaseOpcode->Atomic) { @@ -4809,8 +5006,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Ops.push_back(SLC); Ops.push_back(IsA16 && // a16 or r128 ST->hasFeature(AMDGPU::FeatureR128A16) ? 
True : False); - Ops.push_back(False); // tfe - Ops.push_back(False); // lwe + Ops.push_back(TFE); // tfe + Ops.push_back(LWE); // lwe Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) Ops.push_back(IsD16 ? True : False); @@ -4838,11 +5035,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); - } else if (IsD16 && !BaseOpcode->Store) { - MVT LoadVT = Op.getSimpleValueType(); - SDValue Adjusted = adjustLoadValueTypeImpl( - SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem()); - return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL); + } else if (!BaseOpcode->Store) { + return constructRetValue(DAG, NewNode, + OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, + DMaskLanes, NumVDataDwords, DL, + *DAG.getContext()); } return SDValue(NewNode, 0); @@ -8772,6 +8970,7 @@ static unsigned SubIdx2Lane(unsigned Idx) { case AMDGPU::sub1: return 1; case AMDGPU::sub2: return 2; case AMDGPU::sub3: return 3; + case AMDGPU::sub4: return 4; // Possible with TFE/LWE } } @@ -8785,11 +8984,16 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) return Node; // not implemented for D16 - SDNode *Users[4] = { nullptr }; + SDNode *Users[5] = { nullptr }; unsigned Lane = 0; unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; + unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; + unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; + bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) || + Node->getConstantOperandVal(LWEIdx)) ? 
1 : 0; + unsigned TFCLane = 0; bool HasChain = Node->getNumValues() > 1; if (OldDmask == 0) { @@ -8797,6 +9001,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, return Node; } + unsigned OldBitsSet = countPopulation(OldDmask); + // Work out which is the TFE/LWE lane if that is enabled. + if (UsesTFC) { + TFCLane = OldBitsSet; + } + // Try to figure out the used register components for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E; ++I) { @@ -8816,28 +9026,49 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, // set, etc. Lane = SubIdx2Lane(I->getConstantOperandVal(1)); - // Set which texture component corresponds to the lane. - unsigned Comp; - for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { - Comp = countTrailingZeros(Dmask); - Dmask &= ~(1 << Comp); - } + // Check if the use is for the TFE/LWE generated result at VGPRn+1. + if (UsesTFC && Lane == TFCLane) { + Users[Lane] = *I; + } else { + // Set which texture component corresponds to the lane. + unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } - // Abort if we have more than one user per component - if (Users[Lane]) - return Node; + // Abort if we have more than one user per component. + if (Users[Lane]) + return Node; - Users[Lane] = *I; - NewDmask |= 1 << Comp; + Users[Lane] = *I; + NewDmask |= 1 << Comp; + } } + // Don't allow 0 dmask, as hardware assumes one channel enabled. 
+ bool NoChannels = !NewDmask; + if (NoChannels) { + // If the original dmask has one channel - then nothing to do + if (OldBitsSet == 1) + return Node; + // Use an arbitrary dmask - required for the instruction to work + NewDmask = 1; + } // Abort if there's no change if (NewDmask == OldDmask) return Node; unsigned BitsSet = countPopulation(NewDmask); - int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet); + // Check for TFE or LWE - increase the number of channels by one to account + // for the extra return value + // This will need adjustment for D16 if this is also included in + // adjustWriteMask (this function) but at present D16 are excluded. + unsigned NewChannels = BitsSet + UsesTFC; + + int NewOpcode = + AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels); assert(NewOpcode != -1 && NewOpcode != static_cast<int>(Node->getMachineOpcode()) && "failed to find equivalent MIMG op"); @@ -8850,8 +9081,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); - MVT ResultVT = BitsSet == 1 ? - SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); + MVT ResultVT = NewChannels == 1 ? + SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 : + NewChannels == 5 ? 8 : NewChannels); SDVTList NewVTList = HasChain ? 
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); @@ -8865,7 +9097,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); } - if (BitsSet == 1) { + if (NewChannels == 1) { assert(Node->hasNUsesOfValue(1, 0)); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node), Users[Lane]->getValueType(0), @@ -8875,19 +9107,24 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, } // Update the users of the node with the new indices - for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) { SDNode *User = Users[i]; - if (!User) - continue; - - SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + if (!User) { + // Handle the special case of NoChannels. We set NewDmask to 1 above, but + // Users[0] is still nullptr because channel 0 doesn't really have a use. + if (i || !NoChannels) + continue; + } else { + SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); + DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + } switch (Idx) { default: break; case AMDGPU::sub0: Idx = AMDGPU::sub1; break; case AMDGPU::sub1: Idx = AMDGPU::sub2; break; case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + case AMDGPU::sub3: Idx = AMDGPU::sub4; break; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 34d092f8500..978677ba7b0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2968,6 +2968,42 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Verify MIMG + if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { + // Ensure that the return type used is large enough for all the options + // being used TFE/LWE require an extra result register. 
+ const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); + if (DMask) { + uint64_t DMaskImm = DMask->getImm(); + uint32_t RegCount = + isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); + const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); + const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); + const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); + + // Adjust for packed 16 bit values, rounding up for an odd lane count + if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) + RegCount = (RegCount + 1) >> 1; + + // Adjust if using LWE or TFE + if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) + RegCount += 1; + + const uint32_t DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (Dst.isReg()) { + const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); + uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; + if (RegCount > DstSize) { + ErrInfo = "MIMG instruction returns too many registers for dst " + "register class"; + return false; + } + } + } + } + // Verify VOP*. Ignore multiple sgpr operands on writelane. if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 4bf16f59621..2a0416e45cf 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -184,6 +184,7 @@ struct MIMGBaseOpcodeInfo { bool Atomic; bool AtomicX2; bool Sampler; + bool Gather4; uint8_t NumExtraArgs; bool Gradients; |