Diffstat (limited to 'polly/lib/CodeGen/PPCGCodeGeneration.cpp')
-rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 110
1 file changed, 98 insertions, 12 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 2189df3b9e4..80c9f41711f 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -31,6 +31,8 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
@@ -102,6 +104,11 @@ static cl::opt<bool>
         cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

+static cl::opt<std::string> CUDALibDevice(
+    "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden,
+    cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"),
+    cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<std::string>
     CudaVersion("polly-acc-cuda-version",
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
@@ -605,6 +612,12 @@ private:
   /// @param F The function to remove references to.
   void clearLoops(Function *F);

+  /// Check if the scop requires linking against CUDA's libdevice.
+  bool requiresCUDALibDevice();
+
+  /// Link with the NVIDIA libdevice library (if needed and available).
+  void addCUDALibDevice();
+
   /// Finalize the generation of the kernel function.
   ///
   /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
@@ -1324,13 +1337,32 @@ isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
   return isl_bool_true;
 }

+/// The set of functions for which NVIDIA's libdevice provides an
+/// implementation.
+const std::set<std::string> CUDALibDeviceFunctions = {
+    "exp",  "expf",  "expl",     "cos",       "cosf",
+    "sqrt", "sqrtf", "copysign", "copysignf", "copysignl"};
+
+/// Return the corresponding CUDA libdevice function name for @p F.
+///
+/// Return "" if @p F has no libdevice counterpart.
+std::string getCUDALibDeviceFunction(Function *F) {
+  if (CUDALibDeviceFunctions.count(F->getName()))
+    return std::string("__nv_") + std::string(F->getName());
+
+  return "";
+}
+
 /// Check if F is a function that we can code-generate in a GPU kernel.
-static bool isValidFunctionInKernel(llvm::Function *F) {
+static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
   assert(F && "F is an invalid pointer");
   // We string compare against the name of the function to allow
   // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and
   // "llvm.copysign".
   const StringRef Name = F->getName();
+
+  if (AllowLibDevice && getCUDALibDeviceFunction(F).length() > 0)
+    return true;
+
   return F->isIntrinsic() &&
          (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
           Name.startswith("llvm.copysign"));
@@ -1346,14 +1378,16 @@ static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }

 /// Return `Function`s from `RawSubtreeValues`.
 static SetVector<Function *>
-getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
+                                 bool AllowCUDALibDevice) {
   SetVector<Function *> SubtreeFunctions;
   for (Value *It : RawSubtreeValues) {
     Function *F = dyn_cast<Function>(It);
     if (F) {
-      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
-                                           "this point if an invalid function "
-                                           "were present in a kernel.");
+      assert(isValidFunctionInKernel(F, AllowCUDALibDevice) &&
+             "Code should have bailed out by "
+             "this point if an invalid function "
+             "were present in a kernel.");
       SubtreeFunctions.insert(F);
     }
   }
@@ -1407,8 +1441,11 @@ GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
       make_filter_range(SubtreeValues, isValidSubtreeValue);
   SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
                                         ValidSubtreeValuesIt.end());
+
+  bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64;
+
   SetVector<Function *> ValidSubtreeFunctions(
-      getFunctionsFromRawSubtreeValues(SubtreeValues));
+      getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice));

   // @see IslNodeBuilder::getReferencesInSubtree
   SetVector<Value *> ReplacedValues;
@@ -2232,6 +2269,49 @@ std::string GPUNodeBuilder::createKernelASM() {
   return ASMStream.str();
 }

+bool GPUNodeBuilder::requiresCUDALibDevice() {
+  for (Function &F : GPUModule->functions()) {
+    if (!F.isDeclaration())
+      continue;
+
+    std::string CUDALibDeviceFunc = getCUDALibDeviceFunction(&F);
+    if (CUDALibDeviceFunc.length() != 0) {
+      F.setName(CUDALibDeviceFunc);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void GPUNodeBuilder::addCUDALibDevice() {
+  if (Arch != GPUArch::NVPTX64)
+    return;
+
+  if (requiresCUDALibDevice()) {
+    SMDiagnostic Error;
+
+    errs() << CUDALibDevice << "\n";
+    auto LibDeviceModule =
+        parseIRFile(CUDALibDevice, Error, GPUModule->getContext());
+
+    if (!LibDeviceModule) {
+      BuildSuccessful = false;
+      report_fatal_error("Could not find or load libdevice. Skipping GPU "
+                         "kernel generation. Please set -polly-acc-libdevice "
+                         "accordingly.\n");
+      return;
+    }
+
+    Linker L(*GPUModule);
+
+    // Set an nvptx64 target triple to avoid linker warnings. The original
+    // triple of the libdevice files is nvptx-unknown-unknown.
+    LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+    L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded);
+  }
+}
+
 std::string GPUNodeBuilder::finalizeKernelFunction() {

   if (verifyModule(*GPUModule)) {
@@ -2247,6 +2327,8 @@ std::string GPUNodeBuilder::finalizeKernelFunction() {
     return "";
   }

+  addCUDALibDevice();
+
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";

@@ -3116,10 +3198,12 @@ public:
   ///
   /// If this basic block does something with a `Function` other than calling
   /// a function that we support in a kernel, return true.
-  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
+  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB,
+                                            bool AllowCUDALibDevice) {
     for (const Instruction &Inst : *BB) {
       const CallInst *Call = dyn_cast<CallInst>(&Inst);
-      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
+      if (Call && isValidFunctionInKernel(Call->getCalledFunction(),
+                                          AllowCUDALibDevice)) {
         continue;
       }
@@ -3135,16 +3219,17 @@ public:
   }

   /// Return whether the Scop S uses functions in a way that we do not support.
-  bool containsInvalidKernelFunction(const Scop &S) {
+  bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
+        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(),
+                                                 AllowCUDALibDevice))
           return true;
       } else {
         assert(Stmt.isRegionStmt() &&
                "Stmt was neither block nor region statement");
         for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (containsInvalidKernelFunctionInBlock(BB))
+          if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice))
             return true;
       }
     }
@@ -3232,7 +3317,8 @@ public:
     // kernel. This may lead to a kernel trying to call a function on the host.
     // This also allows us to prevent codegen from trying to take the
     // address of an intrinsic function to send to the kernel.
-    if (containsInvalidKernelFunction(CurrentScop)) {
+    if (containsInvalidKernelFunction(CurrentScop,
+                                      Architecture == GPUArch::NVPTX64)) {
       DEBUG(
           dbgs()
           << "Scop contains function which cannot be materialised in a GPU "
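The heart of the patch is addCUDALibDevice(): rename the kernel module's unresolved math declarations to their __nv_ libdevice names, then lazily link NVIDIA's libdevice into the kernel module. The following standalone sketch (not part of the patch; linkLibDevice and its parameters are illustrative names, and it assumes an LLVM tree of the same vintage as this commit) shows the same parseIRFile/Linker sequence:

  #include "llvm/ADT/Triple.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IRReader/IRReader.h"
  #include "llvm/Linker/Linker.h"
  #include "llvm/Support/SourceMgr.h"
  #include <memory>

  using namespace llvm;

  // Sketch: resolve unresolved __nv_* declarations in a GPU kernel module
  // from a libdevice .ll file. Returns true on success.
  static bool linkLibDevice(Module &KernelModule, StringRef LibDevicePath) {
    SMDiagnostic Err;
    std::unique_ptr<Module> LibDevice =
        parseIRFile(LibDevicePath, Err, KernelModule.getContext());
    if (!LibDevice)
      return false; // Missing or unparsable file; the caller reports it.

    // libdevice ships with an nvptx-unknown-unknown triple; overriding it
    // avoids a linker warning about mismatched targets.
    LibDevice->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));

    // LinkOnlyNeeded copies just the definitions that resolve declarations
    // in KernelModule (e.g. __nv_expf), not the whole library.
    Linker L(KernelModule);
    return !L.linkInModule(std::move(LibDevice), Linker::LinkOnlyNeeded);
  }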
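Usage and design note: with this change, a scop that calls one of the whitelisted math functions (e.g. expf) is no longer rejected on NVPTX64; requiresCUDALibDevice() renames the declaration to __nv_expf, and addCUDALibDevice() resolves it when the kernel is finalized. Performing the rename before linking with Linker::LinkOnlyNeeded pulls in only the referenced definitions rather than all of libdevice, keeping the kernel module small. If CUDA is not installed at the default path baked into the new option, -polly-acc-libdevice must point at a matching libdevice .ll file.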