Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                  |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp  | 123
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp   |  53
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h     |  14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp   |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp        |  63
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h          |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIIntrinsics.td           |   9
8 files changed, 227 insertions, 49 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 7e59710a427..d4784b5463d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -20,6 +20,7 @@ class AMDGPUInstrPrinter;
class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
+class GCNTargetMachine;
struct MachineSchedContext;
class MCAsmInfo;
class raw_ostream;
@@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 3b415774df4..0627708485c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,7 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
@@ -30,15 +32,28 @@ using namespace llvm;
namespace {
class AMDGPUCodeGenPrepare : public FunctionPass,
- public InstVisitor<AMDGPUCodeGenPrepare> {
+ public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+ const GCNTargetMachine *TM;
+ const SISubtarget *ST;
DivergenceAnalysis *DA;
- const TargetMachine *TM;
+ Module *Mod;
+ bool HasUnsafeFPMath;
public:
static char ID;
AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
FunctionPass(ID),
- TM(TM) { }
+ TM(static_cast<const GCNTargetMachine *>(TM)),
+ ST(nullptr),
+ DA(nullptr),
+ Mod(nullptr),
+ HasUnsafeFPMath(false) { }
+
+ bool visitFDiv(BinaryOperator &I);
+
+ bool visitInstruction(Instruction &I) {
+ return false;
+ }
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -55,7 +70,92 @@ public:
} // End anonymous namespace
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+ const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
+ if (!CNum)
+ return false;
+
+ // Reciprocal f32 is handled separately without denormals.
+ return UnsafeDiv && CNum->isExactlyValue(+1.0);
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+ Type *Ty = FDiv.getType();
+
+ // TODO: Handle half
+ if (!Ty->getScalarType()->isFloatTy())
+ return false;
+
+ MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+ if (!FPMath)
+ return false;
+
+ const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+ float ULP = FPOp->getFPAccuracy();
+ if (ULP < 2.5f)
+ return false;
+
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+ FMF.allowReciprocal();
+ if (ST->hasFP32Denormals() && !UnsafeDiv)
+ return false;
+
+ IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+ Builder.setFastMathFlags(FMF);
+ Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+ const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
+ Function *Decl
+ = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
+
+ Value *NewFDiv = nullptr;
+
+ if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ NewFDiv = UndefValue::get(VT);
+
+ // FIXME: Doesn't do the right thing for cases where the vector is partially
+ // constant. This works when the scalarizer pass is run first.
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+ Value *NumEltI = Builder.CreateExtractElement(Num, I);
+ Value *DenEltI = Builder.CreateExtractElement(Den, I);
+ Value *NewElt;
+
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+ NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+ } else {
+ NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+ }
+
+ NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+ }
+ } else {
+ if (!shouldKeepFDivF32(Num, UnsafeDiv))
+ NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+ }
+
+ if (NewFDiv) {
+ FDiv.replaceAllUsesWith(NewFDiv);
+ NewFDiv->takeName(&FDiv);
+ FDiv.eraseFromParent();
+ }
+
+ return true;
+}
+
+static bool hasUnsafeFPMath(const Function &F) {
+ Attribute Attr = F.getFnAttribute("unsafe-fp-math");
+ return Attr.getValueAsString() == "true";
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+ Mod = &M;
return false;
}
@@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (!TM || skipFunction(F))
return false;
+ ST = &TM->getSubtarget<SISubtarget>(F);
DA = &getAnalysis<DivergenceAnalysis>();
- visit(F);
+ HasUnsafeFPMath = hasUnsafeFPMath(F);
- return true;
+ bool MadeChange = false;
+
+ for (BasicBlock &BB : F) {
+ BasicBlock::iterator Next;
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+ Next = std::next(I);
+ MadeChange |= visit(*I);
+ }
+ }
+
+ return MadeChange;
}
INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
@@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
char AMDGPUCodeGenPrepare::ID = 0;
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
return new AMDGPUCodeGenPrepare(TM);
}
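
Taken together, the AMDGPUCodeGenPrepare changes implement a small IR-level rewrite: an f32 fdiv whose !fpmath metadata permits 2.5 ulp or more of error, and which is not constrained by f32 denormal support, is replaced with a call to the new backend intrinsic. A minimal before/after sketch of that rewrite, assuming the generated name for AMDGPUIntrinsic::amdgcn_fdiv_fast is llvm.amdgcn.fdiv.fast:

  ; Before the pass: a divide carrying relaxed-precision metadata.
  define float @div(float %num, float %den) {
    %q = fdiv float %num, %den, !fpmath !0
    ret float %q
  }
  !0 = !{float 2.500000e+00}

  ; After the pass (sketch; the intrinsic name is assumed from the generated name table).
  define float @div(float %num, float %den) {
    %q = call float @llvm.amdgcn.fdiv.fast(float %num, float %den)
    ret float %q
  }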
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 791872a9db4..8e3471bd208 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = {
#undef GET_INTRINSIC_NAME_TABLE
};
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
- unsigned numTys) const {
- if (IntrID < Intrinsic::num_intrinsics) {
- return nullptr;
- }
+namespace {
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+}
+
+StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
+ ArrayRef<Type *> Tys) const {
+ if (IntrID < Intrinsic::num_intrinsics)
+ return StringRef();
+
assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
"Invalid intrinsic ID");
- std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
- return Result;
+ return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
+}
+
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+ unsigned NumTys) const {
+ return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
+}
+
+FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
+ ArrayRef<Type*> Tys) const {
+ // FIXME: Re-use Intrinsic::getType machinery
+ switch (ID) {
+ case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+ Type *F32Ty = Type::getFloatTy(Context);
+ return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
+ }
+ default:
+ llvm_unreachable("unhandled intrinsic");
+ }
}
unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
}
Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+ ArrayRef<Type *> Tys) const {
+ FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
+ Function *F
+ = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
+
+ AttributeSet AS = getAttributes(M->getContext(),
+ static_cast<AMDGPUIntrinsic::ID>(IntrID));
+ F->setAttributes(AS);
+ return F;
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
Type **Tys,
- unsigned numTys) const {
- llvm_unreachable("Not implemented");
+ unsigned NumTys) const {
+ return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
}
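
For a backend-internal intrinsic like this one, getType and the new ArrayRef-based getDeclaration synthesize the declaration directly through getOrInsertFunction instead of going through Intrinsic::getDeclaration. Under the same naming assumption as the sketch above, the declaration that getDeclaration(M, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}) materializes in the module would look roughly like:

  ; Sketch only: the name and attribute group are assumptions, not taken from this patch.
  declare float @llvm.amdgcn.fdiv.fast(float, float) #0
  attributes #0 = { nounwind readnone }

where #0 stands for the attribute set produced by the generated getAttributes() for IntrNoMem.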
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index f4173929259..6cb8b964464 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -34,13 +34,23 @@ enum ID {
class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo();
+
+ StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
+
std::string getName(unsigned IntrId, Type **Tys = nullptr,
- unsigned numTys = 0) const override;
+ unsigned NumTys = 0) const override;
+
unsigned lookupName(const char *Name, unsigned Len) const override;
bool isOverloaded(unsigned IID) const override;
Function *getDeclaration(Module *M, unsigned ID,
Type **Tys = nullptr,
- unsigned numTys = 0) const override;
+ unsigned NumTys = 0) const override;
+
+ Function *getDeclaration(Module *M, unsigned ID,
+ ArrayRef<Type *> = None) const;
+
+ FunctionType *getType(LLVMContext &Context, unsigned ID,
+ ArrayRef<Type*> Tys = None) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3e53f52c689..b2d4e1144c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -309,6 +309,7 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
+ void addIRPasses() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
bool addInstSelector() override;
@@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&DeadMachineInstructionElimID);
}
+void GCNPassConfig::addIRPasses() {
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+
+ AMDGPUPassConfig::addIRPasses();
+}
+
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2c4e6a4056..e412d0750bc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2113,6 +2113,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+ return lowerFDIV_FAST(Op, DAG);
+ }
case AMDGPUIntrinsic::SI_vs_load_input:
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
Op.getOperand(1),
@@ -2427,7 +2430,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
-SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -2468,47 +2472,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
- if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
- return FastLowered;
-
+// Faster 2.5 ULP division that does not support denormals.
+SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
- // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
- if (EnableAMDGPUFastFDIV) {
- // This does not support denormals.
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+ SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
- const APFloat K0Val(BitsToFloat(0x6f800000));
- const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+ const APFloat K0Val(BitsToFloat(0x6f800000));
+ const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
- const APFloat K1Val(BitsToFloat(0x2f800000));
- const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+ const APFloat K1Val(BitsToFloat(0x2f800000));
+ const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
- const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
- SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+ SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
- SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
- // TODO: Should this propagate fast-math-flags?
+ // TODO: Should this propagate fast-math-flags?
+ r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
- r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+ // rcp does not support denormals.
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
- // rcp does not support denormals.
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
- return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
- }
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+ return FastLowered;
+
+ SDLoc SL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
- // Generates more precise fpdiv32.
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
@@ -2538,7 +2543,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getTarget().Options.UnsafeFPMath)
- return LowerFastFDIV(Op, DAG);
+ return lowerFastUnsafeFDIV(Op, DAG);
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
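
The new lowerFDIV_FAST hoists the old -amdgpu-fast-fdiv expansion out from behind the flag and ties it to the intrinsic: when the denominator's magnitude exceeds 2^96 it is pre-scaled by 2^-32 so the hardware reciprocal stays in range, and the quotient is rescaled afterwards. A scalar C++ sketch of that arithmetic, with 1.0f / x standing in for AMDGPUISD::RCP (which does not handle denormals); this is a hypothetical model, not the DAG-building code above:

  #include <cmath>

  // Hypothetical scalar model of the 2.5 ulp expansion in lowerFDIV_FAST.
  static float fdiv_fast_model(float lhs, float rhs) {
    const float K0 = 0x1.0p+96f;  // BitsToFloat(0x6f800000)
    const float K1 = 0x1.0p-32f;  // BitsToFloat(0x2f800000)
    float scale = (std::fabs(rhs) > K0) ? K1 : 1.0f; // pre-scale huge denominators
    float rcp = 1.0f / (rhs * scale);                // models AMDGPUISD::RCP
    return scale * (lhs * rcp);                      // undo the scaling on the result
  }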
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 8e055eea58c..1d349faa592 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/llvm/lib/Target/AMDGPU/SIIntrinsics.td
index a9b7c39096e..9d06ccfc6c7 100644
--- a/llvm/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/llvm/lib/Target/AMDGPU/SIIntrinsics.td
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// SI Intrinsic Definitions
+// Backend internal SI Intrinsic Definitions. User code should not
+// directly use these.
//
//===----------------------------------------------------------------------===//
@@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in {
} // End TargetPrefix = "SI", isTarget = 1
let TargetPrefix = "amdgcn", isTarget = 1 in {
+ // Emit 2.5 ulp, no denormal division. Should only be inserted by
+ // pass based on !fpmath metadata.
+ def int_amdgcn_fdiv_fast : Intrinsic<
+ [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
+ >;
+
/* Control flow Intrinsics */
def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;