diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 123 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 53 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h | 14 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 63 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIIntrinsics.td | 9 |
8 files changed, 227 insertions, 49 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 7e59710a427..d4784b5463d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,6 +20,7 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class GCNTargetMachine; struct MachineSchedContext; class MCAsmInfo; class raw_ostream; @@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); -FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr); +FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 3b415774df4..0627708485c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/Passes.h" @@ -30,15 +32,28 @@ using namespace llvm; namespace { class AMDGPUCodeGenPrepare : public FunctionPass, - public InstVisitor<AMDGPUCodeGenPrepare> { + public InstVisitor<AMDGPUCodeGenPrepare, bool> { + const GCNTargetMachine *TM; + const SISubtarget *ST; DivergenceAnalysis *DA; - const TargetMachine *TM; + Module *Mod; + bool HasUnsafeFPMath; public: static char ID; AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID), - TM(TM) { } + TM(static_cast<const GCNTargetMachine *>(TM)), + ST(nullptr), + DA(nullptr), + Mod(nullptr), + HasUnsafeFPMath(false) { } + + bool visitFDiv(BinaryOperator &I); + + bool visitInstruction(Instruction &I) { + return false; + } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -55,7 +70,92 @@ public: } // End anonymous namespace +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { + const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); + if (!CNum) + return false; + + // Reciprocal f32 is handled separately without denormals. + return UnsafeDiv && CNum->isExactlyValue(+1.0); +} + +// Insert an intrinsic for fast fdiv for safe math situations where we can +// reduce precision. Leave fdiv for situations where the generic node is +// expected to be optimized. +bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { + Type *Ty = FDiv.getType(); + + // TODO: Handle half + if (!Ty->getScalarType()->isFloatTy()) + return false; + + MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); + if (!FPMath) + return false; + + const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); + float ULP = FPOp->getFPAccuracy(); + if (ULP < 2.5f) + return false; + + FastMathFlags FMF = FPOp->getFastMathFlags(); + bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + FMF.allowReciprocal(); + if (ST->hasFP32Denormals() && !UnsafeDiv) + return false; + + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); + + const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); + Function *Decl + = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *NewFDiv = nullptr; + + if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + NewFDiv = UndefValue::get(VT); + + // FIXME: Doesn't do the right thing for cases where the vector is partially + // constant. This works when the scalarizer pass is run first. + for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { + Value *NumEltI = Builder.CreateExtractElement(Num, I); + Value *DenEltI = Builder.CreateExtractElement(Den, I); + Value *NewElt; + + if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + } else { + NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); + } + + NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); + } + } else { + if (!shouldKeepFDivF32(Num, UnsafeDiv)) + NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } + + if (NewFDiv) { + FDiv.replaceAllUsesWith(NewFDiv); + NewFDiv->takeName(&FDiv); + FDiv.eraseFromParent(); + } + + return true; +} + +static bool hasUnsafeFPMath(const Function &F) { + Attribute Attr = F.getFnAttribute("unsafe-fp-math"); + return Attr.getValueAsString() == "true"; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { + Mod = &M; return false; } @@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TM || skipFunction(F)) return false; + ST = &TM->getSubtarget<SISubtarget>(F); DA = &getAnalysis<DivergenceAnalysis>(); - visit(F); + HasUnsafeFPMath = hasUnsafeFPMath(F); - return true; + bool MadeChange = false; + + for (BasicBlock &BB : F) { + BasicBlock::iterator Next; + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + Next = std::next(I); + MadeChange |= visit(*I); + } + } + + return MadeChange; } INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, @@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, char AMDGPUCodeGenPrepare::ID = 0; -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) { +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { return new AMDGPUCodeGenPrepare(TM); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 791872a9db4..8e3471bd208 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = { #undef GET_INTRINSIC_NAME_TABLE }; -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } +namespace { +#define GET_INTRINSIC_ATTRIBUTES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES +} + +StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, + ArrayRef<Type *> Tys) const { + if (IntrID < Intrinsic::num_intrinsics) + return StringRef(); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); - return Result; + return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; +} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned NumTys) const { + return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); +} + +FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, + ArrayRef<Type*> Tys) const { + // FIXME: Re-use Intrinsic::getType machinery + switch (ID) { + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + Type *F32Ty = Type::getFloatTy(Context); + return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); + } + default: + llvm_unreachable("unhandled intrinsic"); + } } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { } Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + ArrayRef<Type *> Tys) const { + FunctionType *FTy = getType(M->getContext(), IntrID, Tys); + Function *F + = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); + + AttributeSet AS = getAttributes(M->getContext(), + static_cast<AMDGPUIntrinsic::ID>(IntrID)); + F->setAttributes(AS); + return F; +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); + unsigned NumTys) const { + return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index f4173929259..6cb8b964464 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -34,13 +34,23 @@ enum ID { class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); + + StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const; + std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; bool isOverloaded(unsigned IID) const override; Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + + Function *getDeclaration(Module *M, unsigned ID, + ArrayRef<Type *> = None) const; + + FunctionType *getType(LLVMContext &Context, unsigned ID, + ArrayRef<Type*> Tys = None) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3e53f52c689..b2d4e1144c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -309,6 +309,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; + void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; bool addInstSelector() override; @@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); } +void GCNPassConfig::addIRPasses() { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); + + AMDGPUPassConfig::addIRPasses(); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a2c4e6a4056..e412d0750bc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2113,6 +2113,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + return lowerFDIV_FAST(Op, DAG); + } case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -2427,7 +2430,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Catch division cases where we can use shortcuts with rcp and rsq // instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -2468,47 +2472,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) - return FastLowered; - +// Faster 2.5 ULP division that does not support denormals. +SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); - // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag - if (EnableAMDGPUFastFDIV) { - // This does not support denormals. - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - // TODO: Should this propagate fast-math-flags? + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - // rcp does not support denormals. - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); - } +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); @@ -2538,7 +2543,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); + return lowerFastUnsafeFDIV(Op, DAG); SDLoc SL(Op); SDValue X = Op.getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 8e055eea58c..1d349faa592 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/llvm/lib/Target/AMDGPU/SIIntrinsics.td index a9b7c39096e..9d06ccfc6c7 100644 --- a/llvm/lib/Target/AMDGPU/SIIntrinsics.td +++ b/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// SI Intrinsic Definitions +// Backend internal SI Intrinsic Definitions. User code should not +// directly use these. // //===----------------------------------------------------------------------===// @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in { } // End TargetPrefix = "SI", isTarget = 1 let TargetPrefix = "amdgcn", isTarget = 1 in { + // Emit 2.5 ulp, no denormal division. Should only be inserted by + // pass based on !fpmath metadata. + def int_amdgcn_fdiv_fast : Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + >; + /* Control flow Intrinsics */ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; |