diff options
| -rw-r--r-- | polly/include/polly/CodeGen/PTXGenerator.h | 184 | ||||
| -rw-r--r-- | polly/lib/CMakeLists.txt | 3 | ||||
| -rw-r--r-- | polly/lib/CodeGen/PTXGenerator.cpp | 711 | ||||
| -rw-r--r-- | polly/lib/Makefile | 2 |
4 files changed, 2 insertions, 898 deletions
diff --git a/polly/include/polly/CodeGen/PTXGenerator.h b/polly/include/polly/CodeGen/PTXGenerator.h deleted file mode 100644 index f4c50384242..00000000000 --- a/polly/include/polly/CodeGen/PTXGenerator.h +++ /dev/null @@ -1,184 +0,0 @@ -//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains functions to create GPGPU parallel loops as LLVM-IR. -// -//===----------------------------------------------------------------------===// -#ifndef POLLY_CODEGEN_PTXGENERATOR_H -#define POLLY_CODEGEN_PTXGENERATOR_H - -#include "polly/Config/config.h" - -#ifdef GPU_CODEGEN -#include "polly/CodeGen/IRBuilder.h" -#include "llvm/ADT/SetVector.h" - -#include <map> - -namespace llvm { -class Value; -class Pass; -class BasicBlock; -} - -namespace polly { -using namespace llvm; - -class PTXGenerator { -public: - typedef std::map<Value *, Value *> ValueToValueMapTy; - - PTXGenerator(PollyIRBuilder &Builder, Pass *P, const std::string &Triple); - - /// @brief Create a GPGPU parallel loop. - /// - /// @param UsedValues A set of LLVM-IR Values that should be available to - /// the new loop body. - /// @param OriginalIVS The new values of the original induction variables. - /// @param VMap This map is filled by createParallelLoop(). It - /// maps the values in UsedValues to Values through which - /// their content is available within the loop body. - /// @param LoopBody A pointer to an iterator that is set to point to the - /// body of the created loop. It should be used to insert - /// instructions that form the actual loop body. - void startGeneration(SetVector<Value *> &UsedValues, - SetVector<Value *> &OriginalIVS, ValueToValueMapTy &VMap, - BasicBlock::iterator *LoopBody); - - /// @brief Execute the post-operations to build a GPGPU parallel loop. - /// - void finishGeneration(Function *SubFunction); - - /// @brief Set the parameters for launching PTX kernel. - /// - /// @param GridW A value of the width of a GPU grid. - /// @param GridH A value of the height of a GPU grid. - /// @param BlockW A value of the width of a GPU block. - /// @param BlockH A value of the height of a GPU block. - void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) { - GridWidth = GridW; - GridHeight = GridH; - BlockWidth = BlockW; - BlockHeight = BlockH; - } - - /// @brief Set the size of the output array. - /// - /// This size is used to allocate memory on the device and the host. - /// - /// @param Bytes Output array size in bytes. - void setOutputBytes(unsigned Bytes) { OutputBytes = Bytes; } - -private: - PollyIRBuilder &Builder; - Pass *P; - - /// @brief The target triple of the device. - const std::string &GPUTriple; - - ///@brief Parameters used for launching PTX kernel. - int GridWidth, GridHeight, BlockWidth, BlockHeight; - - /// @brief Size of the output array in bytes. - unsigned OutputBytes; - - /// @brief Polly's GPU data types. - StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy; - - void InitializeGPUDataTypes(); - IntegerType *getInt64Type(); // i64 - PointerType *getI8PtrType(); // char * - PointerType *getPtrI8PtrType(); // char ** - PointerType *getFloatPtrType(); // float * - PointerType *getGPUContextPtrType(); // %struct.PollyGPUContextT * - PointerType *getGPUModulePtrType(); // %struct.PollyGPUModuleT * - PointerType *getGPUDevicePtrType(); // %struct.PollyGPUDeviceT * - PointerType *getPtrGPUDevicePtrType(); // %struct.PollyGPUDevicePtrT * - PointerType *getGPUFunctionPtrType(); // %struct.PollyGPUFunctionT * - PointerType *getGPUEventPtrType(); // %struct.PollyGPUEventT * - - Module *getModule(); - - /// @brief Create the kernel string containing LLVM IR. - /// - /// @param SubFunction A pointer to the device code function. - /// @return A global string variable containing the LLVM IR codes - // of the SubFunction. - Value *createPTXKernelFunction(Function *SubFunction); - - /// @brief Get the entry name of the device kernel function. - /// - /// @param SubFunction A pointer to the device code function. - /// @return A global string variable containing the entry name of - /// the SubFunction. - Value *getPTXKernelEntryName(Function *SubFunction); - - void createCallInitDevice(Value *Context, Value *Device); - void createCallGetPTXModule(Value *Buffer, Value *Module); - void createCallGetPTXKernelEntry(Value *Entry, Value *Module, Value *Kernel); - void createCallAllocateMemoryForHostAndDevice(Value *HostData, - Value *DeviceData, Value *Size); - void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData, - Value *Size); - void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData, - Value *Size); - void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth, - Value *BlockHeight, Value *DeviceData); - void createCallLaunchKernel(Value *Kernel, Value *GridWidth, - Value *GridHeight); - void createCallStartTimerByCudaEvent(Value *StartEvent, Value *StopEvent); - void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent, - Value *Timer); - void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData, - Value *Module, Value *Context, - Value *Kernel); - - /// @brief Create the CUDA subfunction. - /// - /// @param UsedValues A set of LLVM-IR Values that should be available to - /// the new loop body. - /// @param VMap This map that is filled by createSubfunction(). It - /// maps the values in UsedValues to Values through which - /// their content is available within the loop body. - /// @param OriginalIVS The new values of the original induction variables. - /// @param SubFunction The newly created SubFunction is returned here. - void createSubfunction(SetVector<Value *> &UsedValues, - SetVector<Value *> &OriginalIVS, - ValueToValueMapTy &VMap, Function **SubFunction); - - /// @brief Create the definition of the CUDA subfunction. - /// - /// @param NumArgs The number of parameters of this subfunction. This is - /// usually set to the number of memory accesses which - /// will be copied from host to device. - Function *createSubfunctionDefinition(int NumArgs); - - /// @brief Get the Value of CUDA block width. - Value *getCUDABlockWidth(); - - /// @brief Get the Value of CUDA block height. - Value *getCUDABlockHeight(); - - /// @brief Get the Value of CUDA Gird width. - Value *getCUDAGridWidth(); - - /// @brief Get the Value of CUDA grid height. - Value *getCUDAGridHeight(); - - /// @brief Get the Value of the bytes of the output array. - Value *getOutputArraySizeInBytes(); - - /// @brief Erase the ptx-related subfunctions and declarations. - /// - /// @param SubFunction A pointer to the device code function. - void eraseUnusedFunctions(Function *SubFunction); -}; -} // end namespace polly -#endif /* GPU_CODEGEN */ -#endif /* POLLY_CODEGEN_PTXGENERATOR_H */ diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt index 1f88ba2f58a..7eaf0709719 100644 --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -13,8 +13,7 @@ set(ISL_CODEGEN_FILES CodeGen/CodeGeneration.cpp) if (GPU_CODEGEN) - set (GPGPU_CODEGEN_FILES - CodeGen/PTXGenerator.cpp) + set (GPGPU_CODEGEN_FILES) endif (GPU_CODEGEN) set (ISL_FILES diff --git a/polly/lib/CodeGen/PTXGenerator.cpp b/polly/lib/CodeGen/PTXGenerator.cpp deleted file mode 100644 index dc1339a3a2a..00000000000 --- a/polly/lib/CodeGen/PTXGenerator.cpp +++ /dev/null @@ -1,711 +0,0 @@ -//===------ PTXGenerator.cpp - IR helper to create loops -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains functions to create GPU parallel codes as LLVM-IR. -// -//===----------------------------------------------------------------------===// - -#include "polly/CodeGen/PTXGenerator.h" - -#ifdef GPU_CODEGEN -#include "polly/ScopDetection.h" -#include "polly/ScopInfo.h" - -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" - -using namespace llvm; -using namespace polly; - -PTXGenerator::PTXGenerator(PollyIRBuilder &Builder, Pass *P, - const std::string &Triple) - : Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1), - BlockWidth(1), BlockHeight(1), OutputBytes(0) { - InitializeGPUDataTypes(); -} - -Module *PTXGenerator::getModule() { - return Builder.GetInsertBlock()->getParent()->getParent(); -} - -Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) { - assert(NumArgs == 1 && "we support only one array access now."); - - Module *M = getModule(); - Function *F = Builder.GetInsertBlock()->getParent(); - std::vector<Type *> Arguments; - for (int i = 0; i < NumArgs; i++) - Arguments.push_back(Builder.getInt8PtrTy()); - FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); - Function *FN = Function::Create(FT, Function::InternalLinkage, - F->getName() + "_ptx_subfn", M); - FN->setCallingConv(CallingConv::PTX_Kernel); - - // Do not run any optimization pass on the new function. - P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(FN); - - for (Function::arg_iterator AI = FN->arg_begin(); AI != FN->arg_end(); ++AI) - AI->setName("ptx.Array"); - - return FN; -} - -void PTXGenerator::createSubfunction(SetVector<Value *> &UsedValues, - SetVector<Value *> &OriginalIVS, - PTXGenerator::ValueToValueMapTy &VMap, - Function **SubFunction) { - Function *FN = createSubfunctionDefinition(UsedValues.size()); - Module *M = getModule(); - LLVMContext &Context = FN->getContext(); - IntegerType *Ty = Builder.getInt64Ty(); - - // Store the previous basic block. - BasicBlock *PrevBB = Builder.GetInsertBlock(); - - // Create basic blocks. - BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN); - BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN); - BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN); - - DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DT.addNewBlock(HeaderBB, PrevBB); - DT.addNewBlock(ExitBB, HeaderBB); - DT.addNewBlock(BodyBB, HeaderBB); - - Builder.SetInsertPoint(HeaderBB); - - // Insert VMap items with maps of array base address on the host to base - // address on the device. - Function::arg_iterator AI = FN->arg_begin(); - for (unsigned j = 0; j < UsedValues.size(); j++) { - Value *BaseAddr = UsedValues[j]; - Type *ArrayTy = BaseAddr->getType(); - Value *Param = Builder.CreateBitCast(AI, ArrayTy); - VMap.insert(std::make_pair(BaseAddr, Param)); - AI++; - } - - // FIXME: These intrinsics should be inserted on-demand. However, we insert - // them all currently for simplicity. - Function *GetNctaidX = - Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x); - Function *GetNctaidY = - Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y); - Function *GetCtaidX = - Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x); - Function *GetCtaidY = - Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y); - Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x); - Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y); - Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x); - Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y); - - Value *GridWidth = Builder.CreateCall(GetNctaidX); - GridWidth = Builder.CreateIntCast(GridWidth, Ty, false); - Value *GridHeight = Builder.CreateCall(GetNctaidY); - GridHeight = Builder.CreateIntCast(GridHeight, Ty, false); - Value *BlockWidth = Builder.CreateCall(GetNtidX); - BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false); - Value *BlockHeight = Builder.CreateCall(GetNtidY); - BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false); - Value *BIDx = Builder.CreateCall(GetCtaidX); - BIDx = Builder.CreateIntCast(BIDx, Ty, false); - Value *BIDy = Builder.CreateCall(GetCtaidY); - BIDy = Builder.CreateIntCast(BIDy, Ty, false); - Value *TIDx = Builder.CreateCall(GetTidX); - TIDx = Builder.CreateIntCast(TIDx, Ty, false); - Value *TIDy = Builder.CreateCall(GetTidY); - TIDy = Builder.CreateIntCast(TIDy, Ty, false); - - Builder.CreateBr(BodyBB); - Builder.SetInsertPoint(BodyBB); - - unsigned NumDims = OriginalIVS.size(); - std::vector<Value *> Substitutions; - Value *BlockID, *ThreadID; - switch (NumDims) { - case 1: { - Value *BlockSize = - Builder.CreateMul(BlockWidth, BlockHeight, "p_gpu_blocksize"); - BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); - BlockID = Builder.CreateAdd(BlockID, BIDx); - BlockID = Builder.CreateMul(BlockID, BlockSize); - ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); - ThreadID = Builder.CreateAdd(ThreadID, TIDx); - ThreadID = Builder.CreateAdd(ThreadID, BlockID); - Substitutions.push_back(ThreadID); - break; - } - case 2: { - BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); - BlockID = Builder.CreateAdd(BlockID, BIDx); - Substitutions.push_back(BlockID); - ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); - ThreadID = Builder.CreateAdd(ThreadID, TIDx); - Substitutions.push_back(ThreadID); - break; - } - case 3: { - BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); - BlockID = Builder.CreateAdd(BlockID, BIDx); - Substitutions.push_back(BlockID); - Substitutions.push_back(TIDy); - Substitutions.push_back(TIDx); - break; - } - case 4: { - Substitutions.push_back(BIDy); - Substitutions.push_back(BIDx); - Substitutions.push_back(TIDy); - Substitutions.push_back(TIDx); - break; - } - default: - assert(true && - "We cannot transform parallel loops whose depth is larger than 4."); - return; - } - - assert(OriginalIVS.size() == Substitutions.size() && - "The size of IVS should be equal to the size of substitutions."); - for (unsigned i = 0; i < OriginalIVS.size(); ++i) { - VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i])); - } - - Builder.CreateBr(ExitBB); - Builder.SetInsertPoint(--Builder.GetInsertPoint()); - BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); - - // Add the termination of the ptx-device subfunction. - Builder.SetInsertPoint(ExitBB); - Builder.CreateRetVoid(); - - Builder.SetInsertPoint(LoopBody); - *SubFunction = FN; -} - -void PTXGenerator::startGeneration(SetVector<Value *> &UsedValues, - SetVector<Value *> &OriginalIVS, - ValueToValueMapTy &VMap, - BasicBlock::iterator *LoopBody) { - Function *SubFunction; - BasicBlock::iterator PrevInsertPoint = Builder.GetInsertPoint(); - createSubfunction(UsedValues, OriginalIVS, VMap, &SubFunction); - *LoopBody = Builder.GetInsertPoint(); - Builder.SetInsertPoint(PrevInsertPoint); -} - -IntegerType *PTXGenerator::getInt64Type() { return Builder.getInt64Ty(); } - -PointerType *PTXGenerator::getI8PtrType() { - return PointerType::getUnqual(Builder.getInt8Ty()); -} - -PointerType *PTXGenerator::getPtrI8PtrType() { - return PointerType::getUnqual(getI8PtrType()); -} - -PointerType *PTXGenerator::getFloatPtrType() { - return llvm::Type::getFloatPtrTy(getModule()->getContext()); -} - -PointerType *PTXGenerator::getGPUContextPtrType() { - return PointerType::getUnqual(ContextTy); -} - -PointerType *PTXGenerator::getGPUModulePtrType() { - return PointerType::getUnqual(ModuleTy); -} - -PointerType *PTXGenerator::getGPUDevicePtrType() { - return PointerType::getUnqual(DeviceTy); -} - -PointerType *PTXGenerator::getPtrGPUDevicePtrType() { - return PointerType::getUnqual(DevDataTy); -} - -PointerType *PTXGenerator::getGPUFunctionPtrType() { - return PointerType::getUnqual(KernelTy); -} - -PointerType *PTXGenerator::getGPUEventPtrType() { - return PointerType::getUnqual(EventTy); -} - -void PTXGenerator::InitializeGPUDataTypes() { - LLVMContext &Context = getModule()->getContext(); - - ContextTy = StructType::create(Context, "struct.PollyGPUContextT"); - ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT"); - KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT"); - DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT"); - DevDataTy = StructType::create(Context, "struct.PollyGPUDevicePtrT"); - EventTy = StructType::create(Context, "struct.PollyGPUEventT"); -} - -void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) { - const char *Name = "polly_initDevice"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(PointerType::getUnqual(getGPUContextPtrType())); - Args.push_back(PointerType::getUnqual(getGPUDevicePtrType())); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall2(F, Context, Device); -} - -void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) { - const char *Name = "polly_getPTXModule"; - llvm::Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getI8PtrType()); - Args.push_back(PointerType::getUnqual(getGPUModulePtrType())); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall2(F, Buffer, Module); -} - -void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module, - Value *Kernel) { - const char *Name = "polly_getPTXKernelEntry"; - llvm::Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getI8PtrType()); - Args.push_back(getGPUModulePtrType()); - Args.push_back(PointerType::getUnqual(getGPUFunctionPtrType())); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall3(F, Entry, Module, Kernel); -} - -void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData, - Value *DeviceData, - Value *Size) { - const char *Name = "polly_allocateMemoryForHostAndDevice"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getPtrI8PtrType()); - Args.push_back(PointerType::getUnqual(getPtrGPUDevicePtrType())); - Args.push_back(getInt64Type()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall3(F, HostData, DeviceData, Size); -} - -void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData, - Value *HostData, - Value *Size) { - const char *Name = "polly_copyFromHostToDevice"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getPtrGPUDevicePtrType()); - Args.push_back(getI8PtrType()); - Args.push_back(getInt64Type()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall3(F, DeviceData, HostData, Size); -} - -void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData, - Value *DeviceData, - Value *Size) { - const char *Name = "polly_copyFromDeviceToHost"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getI8PtrType()); - Args.push_back(getPtrGPUDevicePtrType()); - Args.push_back(getInt64Type()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall3(F, HostData, DeviceData, Size); -} - -void PTXGenerator::createCallSetKernelParameters(Value *Kernel, - Value *BlockWidth, - Value *BlockHeight, - Value *DeviceData) { - const char *Name = "polly_setKernelParameters"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getGPUFunctionPtrType()); - Args.push_back(getInt64Type()); - Args.push_back(getInt64Type()); - Args.push_back(getPtrGPUDevicePtrType()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall4(F, Kernel, BlockWidth, BlockHeight, DeviceData); -} - -void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth, - Value *GridHeight) { - const char *Name = "polly_launchKernel"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getGPUFunctionPtrType()); - Args.push_back(getInt64Type()); - Args.push_back(getInt64Type()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall3(F, Kernel, GridWidth, GridHeight); -} - -void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent, - Value *StopEvent) { - const char *Name = "polly_startTimerByCudaEvent"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(PointerType::getUnqual(getGPUEventPtrType())); - Args.push_back(PointerType::getUnqual(getGPUEventPtrType())); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall2(F, StartEvent, StopEvent); -} - -void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent, - Value *StopEvent, - Value *Timer) { - const char *Name = "polly_stopTimerByCudaEvent"; - Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getGPUEventPtrType()); - Args.push_back(getGPUEventPtrType()); - Args.push_back(getFloatPtrType()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall3(F, StartEvent, StopEvent, Timer); -} - -void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData, - Value *DeviceData, - Value *Module, - Value *Context, - Value *Kernel) { - const char *Name = "polly_cleanupGPGPUResources"; - llvm::Module *M = getModule(); - Function *F = M->getFunction(Name); - - // If F is not available, declare it. - if (!F) { - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - std::vector<Type *> Args; - Args.push_back(getI8PtrType()); - Args.push_back(getPtrGPUDevicePtrType()); - Args.push_back(getGPUModulePtrType()); - Args.push_back(getGPUContextPtrType()); - Args.push_back(getGPUFunctionPtrType()); - FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); - F = Function::Create(Ty, Linkage, Name, M); - } - - Builder.CreateCall5(F, HostData, DeviceData, Module, Context, Kernel); -} - -Value *PTXGenerator::getCUDAGridWidth() { - return ConstantInt::get(getInt64Type(), GridWidth); -} - -Value *PTXGenerator::getCUDAGridHeight() { - return ConstantInt::get(getInt64Type(), GridHeight); -} - -Value *PTXGenerator::getCUDABlockWidth() { - return ConstantInt::get(getInt64Type(), BlockWidth); -} - -Value *PTXGenerator::getCUDABlockHeight() { - return ConstantInt::get(getInt64Type(), BlockHeight); -} - -Value *PTXGenerator::getOutputArraySizeInBytes() { - return ConstantInt::get(getInt64Type(), OutputBytes); -} - -static Module *extractPTXFunctionsFromModule(const Module *M, - const StringRef &Triple) { - llvm::ValueToValueMapTy VMap; - Module *New = new Module("TempGPUModule", M->getContext()); - New->setTargetTriple(Triple::normalize(Triple)); - - // Loop over the functions in the module, making external functions as before - for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - if (!I->isDeclaration() && - (I->getCallingConv() == CallingConv::PTX_Device || - I->getCallingConv() == CallingConv::PTX_Kernel)) { - Function *NF = - Function::Create(cast<FunctionType>(I->getType()->getElementType()), - I->getLinkage(), I->getName(), New); - NF->copyAttributesFrom(I); - VMap[I] = NF; - - Function::arg_iterator DestI = NF->arg_begin(); - for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); - ++J) { - DestI->setName(J->getName()); - VMap[J] = DestI++; - } - SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(NF, I, VMap, /*ModuleLevelChanges=*/true, Returns); - } - } - - return New; -} - -static bool createASMAsString(Module *New, const StringRef &Triple, - const StringRef &MCPU, const StringRef &Features, - std::string &ASM) { - llvm::Triple TheTriple(Triple::normalize(Triple)); - std::string ErrMsg; - const Target *TheTarget = - TargetRegistry::lookupTarget(TheTriple.getTriple(), ErrMsg); - if (!TheTarget) { - errs() << ErrMsg << "\n"; - return false; - } - - TargetOptions Options; - std::unique_ptr<TargetMachine> target(TheTarget->createTargetMachine( - TheTriple.getTriple(), MCPU, Features, Options)); - assert(target.get() && "Could not allocate target machine!"); - TargetMachine &Target = *target.get(); - - // Build up all of the passes that we want to do to the module. - llvm::legacy::PassManager PM; - - PM.add(new TargetLibraryInfoWrapperPass(TheTriple)); - PM.add(createTargetTransformInfoWrapperPass(Target.getTargetIRAnalysis())); - - { - SmallString<100> ASMSmall; - raw_svector_ostream NameROSSmall(ASMSmall); - - // Ask the target to add backend passes as necessary. - int UseVerifier = true; - if (Target.addPassesToEmitFile( - PM, NameROSSmall, TargetMachine::CGFT_AssemblyFile, UseVerifier)) { - errs() << "The target does not support generation of this file type!\n"; - return false; - } - - ASM = ASMSmall.c_str(); - PM.run(*New); - } - - return true; -} - -Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) { - Module *M = getModule(); - Module *GPUModule = extractPTXFunctionsFromModule(M, GPUTriple); - std::string LLVMKernelStr; - if (!createASMAsString(GPUModule, GPUTriple, "sm_20" /*MCPU*/, - "" /*Features*/, LLVMKernelStr)) { - errs() << "Generate ptx string failed!\n"; - return NULL; - } - - Value *LLVMKernel = - Builder.CreateGlobalStringPtr(LLVMKernelStr, "llvm_kernel"); - - delete GPUModule; - return LLVMKernel; -} - -Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) { - StringRef Entry = SubFunction->getName(); - return Builder.CreateGlobalStringPtr(Entry, "ptx_entry"); -} - -void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) { - Module *M = getModule(); - SubFunction->eraseFromParent(); - - if (Function *FuncPTXReadNCtaidX = M->getFunction("llvm.ptx.read.nctaid.x")) { - FuncPTXReadNCtaidX->eraseFromParent(); - } - - if (Function *FuncPTXReadNCtaidY = M->getFunction("llvm.ptx.read.nctaid.y")) { - FuncPTXReadNCtaidY->eraseFromParent(); - } - - if (Function *FuncPTXReadCtaidX = M->getFunction("llvm.ptx.read.ctaid.x")) { - FuncPTXReadCtaidX->eraseFromParent(); - } - - if (Function *FuncPTXReadCtaidY = M->getFunction("llvm.ptx.read.ctaid.y")) { - FuncPTXReadCtaidY->eraseFromParent(); - } - - if (Function *FuncPTXReadNTidX = M->getFunction("llvm.ptx.read.ntid.x")) { - FuncPTXReadNTidX->eraseFromParent(); - } - - if (Function *FuncPTXReadNTidY = M->getFunction("llvm.ptx.read.ntid.y")) { - FuncPTXReadNTidY->eraseFromParent(); - } - - if (Function *FuncPTXReadTidX = M->getFunction("llvm.ptx.read.tid.x")) { - FuncPTXReadTidX->eraseFromParent(); - } - - if (Function *FuncPTXReadTidY = M->getFunction("llvm.ptx.read.tid.y")) { - FuncPTXReadTidY->eraseFromParent(); - } -} - -void PTXGenerator::finishGeneration(Function *F) { - // Define data used by the GPURuntime library. - AllocaInst *PtrCUContext = - Builder.CreateAlloca(getGPUContextPtrType(), 0, "phcontext"); - AllocaInst *PtrCUDevice = - Builder.CreateAlloca(getGPUDevicePtrType(), 0, "phdevice"); - AllocaInst *PtrCUModule = - Builder.CreateAlloca(getGPUModulePtrType(), 0, "phmodule"); - AllocaInst *PtrCUKernel = - Builder.CreateAlloca(getGPUFunctionPtrType(), 0, "phkernel"); - AllocaInst *PtrCUStartEvent = - Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstart_timer"); - AllocaInst *PtrCUStopEvent = - Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstop_timer"); - AllocaInst *PtrDevData = - Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0, "pdevice_data"); - AllocaInst *PtrHostData = - Builder.CreateAlloca(getI8PtrType(), 0, "phost_data"); - Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext()); - AllocaInst *PtrElapsedTimes = Builder.CreateAlloca(FloatTy, 0, "ptimer"); - - // Initialize the GPU device. - createCallInitDevice(PtrCUContext, PtrCUDevice); - - // Create the GPU kernel module and entry function. - Value *PTXString = createPTXKernelFunction(F); - Value *PTXEntry = getPTXKernelEntryName(F); - createCallGetPTXModule(PTXString, PtrCUModule); - LoadInst *CUModule = Builder.CreateLoad(PtrCUModule, "cumodule"); - createCallGetPTXKernelEntry(PTXEntry, CUModule, PtrCUKernel); - - // Allocate device memory and its corresponding host memory. - createCallAllocateMemoryForHostAndDevice(PtrHostData, PtrDevData, - getOutputArraySizeInBytes()); - - // Get the pointer to the device memory and set the GPU execution parameters. - LoadInst *DData = Builder.CreateLoad(PtrDevData, "device_data"); - LoadInst *CUKernel = Builder.CreateLoad(PtrCUKernel, "cukernel"); - createCallSetKernelParameters(CUKernel, getCUDABlockWidth(), - getCUDABlockHeight(), DData); - - // Create the start and end timer and record the start time. - createCallStartTimerByCudaEvent(PtrCUStartEvent, PtrCUStopEvent); - - // Launch the GPU kernel. - createCallLaunchKernel(CUKernel, getCUDAGridWidth(), getCUDAGridHeight()); - - // Copy the results back from the GPU to the host. - LoadInst *HData = Builder.CreateLoad(PtrHostData, "host_data"); - createCallCopyFromDeviceToHost(HData, DData, getOutputArraySizeInBytes()); - - // Record the end time. - LoadInst *CUStartEvent = Builder.CreateLoad(PtrCUStartEvent, "start_timer"); - LoadInst *CUStopEvent = Builder.CreateLoad(PtrCUStopEvent, "stop_timer"); - createCallStopTimerByCudaEvent(CUStartEvent, CUStopEvent, PtrElapsedTimes); - - // Cleanup all the resources used. - LoadInst *CUContext = Builder.CreateLoad(PtrCUContext, "cucontext"); - createCallCleanupGPGPUResources(HData, DData, CUModule, CUContext, CUKernel); - - // Erase the ptx kernel and device subfunctions and ptx intrinsics from - // current module. - eraseUnusedFunctions(F); -} -#endif /* GPU_CODEGEN */ diff --git a/polly/lib/Makefile b/polly/lib/Makefile index e19938e5c23..06e9f3d8db9 100644 --- a/polly/lib/Makefile +++ b/polly/lib/Makefile @@ -18,7 +18,7 @@ include $(LEVEL)/Makefile.config # Enable optional source files ifeq ($(GPU_CODEGEN), yes) -GPGPU_CODEGEN_FILES= CodeGen/PTXGenerator.cpp +GPGPU_CODEGEN_FILES="" endif ISL_CODEGEN_FILES= CodeGen/IslAst.cpp \ |

