diff options
author | Arpith Chacko Jacob <acjacob@us.ibm.com> | 2016-03-15 21:04:57 +0000 |
---|---|---|
committer | Arpith Chacko Jacob <acjacob@us.ibm.com> | 2016-03-15 21:04:57 +0000 |
commit | 5e1493b560da95a64d9fb445f5e54664122fa6b9 (patch) | |
tree | 79aeae4b2971aa1cfcc6078155e70965854f4dc6 /clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | |
parent | 4240055ac8b4547439d5b8db9590619773727574 (diff) | |
download | bcm5719-llvm-5e1493b560da95a64d9fb445f5e54664122fa6b9.tar.gz bcm5719-llvm-5e1493b560da95a64d9fb445f5e54664122fa6b9.zip |
[OpenMP] Base support for target directive codegen on NVPTX device.
Summary:
This patch adds base support for codegen of the target directive on the NVPTX device.
Reviewers: ABataev
Differential Revision: http://reviews.llvm.org/D17877
llvm-svn: 263587
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 323 |
1 files changed, 322 insertions, 1 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 680ed578134..e362f900a42 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -18,5 +18,326 @@ using namespace clang; using namespace CodeGen; +/// \brief Get the GPU warp size. +llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateCall( + llvm::Intrinsic::getDeclaration( + &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), + llvm::None, "nvptx_warp_size"); +} + +/// \brief Get the id of the current thread on the GPU. +llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateCall( + llvm::Intrinsic::getDeclaration( + &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), + llvm::None, "nvptx_tid"); +} + +// \brief Get the maximum number of threads in a block of the GPU. +llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateCall( + llvm::Intrinsic::getDeclaration( + &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), + llvm::None, "nvptx_num_threads"); +} + +/// \brief Get barrier to synchronize all threads in a block. +void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + Bld.CreateCall(llvm::Intrinsic::getDeclaration( + &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); +} + +// \brief Synchronize all GPU threads in a block. +void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { + getNVPTXCTABarrier(CGF); +} + +/// \brief Get the thread id of the OMP master thread. +/// The master thread id is the first thread (lane) of the last warp in the +/// GPU block. Warp size is assumed to be some power of 2. +/// Thread id is 0 indexed. +/// E.g: If NumThreads is 33, master id is 32. +/// If NumThreads is 64, master id is 32. +/// If NumThreads is 1024, master id is 992. +llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Value *NumThreads = getNVPTXNumThreads(CGF); + + // We assume that the warp size is a power of 2. + llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); + + return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)), + Bld.CreateNot(Mask), "master_tid"); +} + +namespace { +enum OpenMPRTLFunctionNVPTX { + /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle, + /// kmp_int32 thread_limit); + OMPRTL_NVPTX__kmpc_kernel_init, +}; + +// NVPTX Address space +enum ADDRESS_SPACE { + ADDRESS_SPACE_SHARED = 3, +}; +} // namespace + +CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( + CodeGenModule &CGM) + : WorkerFn(nullptr), CGFI(nullptr) { + createWorkerFunction(CGM); +}; + +void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( + CodeGenModule &CGM) { + // Create an worker function with no arguments. + CGFI = &CGM.getTypes().arrangeNullaryFunction(); + + WorkerFn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, + /* placeholder */ "_worker", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); + WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage); + WorkerFn->addFnAttr(llvm::Attribute::NoInline); +} + +void CGOpenMPRuntimeNVPTX::initializeEnvironment() { + // + // Initialize master-worker control state in shared memory. + // + + auto DL = CGM.getDataLayout(); + ActiveWorkers = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false, + llvm::GlobalValue::CommonLinkage, + llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0, + llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED); + ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty)); + + WorkID = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false, + llvm::GlobalValue::CommonLinkage, + llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0, + llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED); + WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty)); +} + +void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { + auto &Ctx = CGM.getContext(); + + CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); + emitWorkerLoop(CGF, WST); + CGF.FinishFunction(); +} + +void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, + WorkerFunctionState &WST) { + // + // The workers enter this loop and wait for parallel work from the master. + // When the master encounters a parallel region it sets up the work + variable + // arguments, and wakes up the workers. The workers first check to see if + // they are required for the parallel region, i.e., within the # of requested + // parallel threads. The activated workers load the variable arguments and + // execute the parallel work. + // + + CGBuilderTy &Bld = CGF.Builder; + + llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); + llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); + llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); + llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); + llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); + llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); + + CGF.EmitBranch(AwaitBB); + + // Workers wait for work from master. + CGF.EmitBlock(AwaitBB); + // Wait for parallel work + syncCTAThreads(CGF); + // On termination condition (workid == 0), exit loop. + llvm::Value *ShouldTerminate = Bld.CreateICmpEQ( + Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()), + llvm::Constant::getNullValue(WorkID->getType()->getElementType()), + "should_terminate"); + Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); + + // Activate requested workers. + CGF.EmitBlock(SelectWorkersBB); + llvm::Value *ThreadID = getNVPTXThreadID(CGF); + llvm::Value *ActiveThread = Bld.CreateICmpSLT( + ThreadID, + Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()), + "active_thread"); + Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB); + + // Signal start of parallel region. + CGF.EmitBlock(ExecuteBB); + // TODO: Add parallel work. + + // Signal end of parallel region. + CGF.EmitBlock(TerminateBB); + CGF.EmitBranch(BarrierBB); + + // All active and inactive workers wait at a barrier after parallel region. + CGF.EmitBlock(BarrierBB); + // Barrier after parallel region. + syncCTAThreads(CGF); + CGF.EmitBranch(AwaitBB); + + // Exit target region. + CGF.EmitBlock(ExitBB); +} + +// Setup NVPTX threads for master-worker OpenMP scheme. +void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF, + EntryFunctionState &EST, + WorkerFunctionState &WST) { + CGBuilderTy &Bld = CGF.Builder; + + // Get the master thread id. + llvm::Value *MasterID = getMasterThreadID(CGF); + // Current thread's identifier. + llvm::Value *ThreadID = getNVPTXThreadID(CGF); + + // Setup BBs in entry function. + llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker"); + llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); + llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); + EST.ExitBB = CGF.createBasicBlock(".exit"); + + // The head (master thread) marches on while its body of companion threads in + // the warp go to sleep. + llvm::Value *ShouldDie = + Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp"); + Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB); + + // Select worker threads... + CGF.EmitBlock(WorkerCheckBB); + llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker"); + Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB); + + // ... and send to worker loop, awaiting parallel invocation. + CGF.EmitBlock(WorkerBB); + CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); + CGF.EmitBranch(EST.ExitBB); + + // Only master thread executes subsequent serial code. + CGF.EmitBlock(MasterBB); + + // First action in sequential region: + // Initialize the state of the OpenMP runtime library on the GPU. + llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)}; + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), + Args); +} + +void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF, + EntryFunctionState &EST) { + CGBuilderTy &Bld = CGF.Builder; + llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); + CGF.EmitBranch(TerminateBB); + + CGF.EmitBlock(TerminateBB); + // Signal termination condition. + Bld.CreateAlignedStore( + llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID, + WorkID->getAlignment()); + // Barrier to terminate worker threads. + syncCTAThreads(CGF); + // Master thread jumps to exit point. + CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(EST.ExitBB); +} + +/// \brief Returns specified OpenMP runtime function for the current OpenMP +/// implementation. Specialized for the NVPTX device. +/// \param Function OpenMP runtime function. +/// \return Specified function. +llvm::Constant * +CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { + llvm::Constant *RTLFn = nullptr; + switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { + case OMPRTL_NVPTX__kmpc_kernel_init: { + // Build void __kmpc_kernel_init(kmp_int32 omp_handle, + // kmp_int32 thread_limit); + llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); + break; + } + } + return RTLFn; +} + +void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, + llvm::Constant *Addr, + uint64_t Size) { + auto *F = dyn_cast<llvm::Function>(Addr); + // TODO: Add support for global variables on the device after declare target + // support. + if (!F) + return; + llvm::Module *M = F->getParent(); + llvm::LLVMContext &Ctx = M->getContext(); + + // Get "nvvm.annotations" metadata node + llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); + + llvm::Metadata *MDVals[] = { + llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"), + llvm::ConstantAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; + // Append metadata to nvvm.annotations + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); +} + +void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( + const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry) { + if (!IsOffloadEntry) // Nothing to do. + return; + + assert(!ParentName.empty() && "Invalid target region parent name!"); + + const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt()); + + EntryFunctionState EST; + WorkerFunctionState WST(CGM); + + // Emit target region as a standalone region. + auto &&CodeGen = [&EST, &WST, &CS, this](CodeGenFunction &CGF) { + emitEntryHeader(CGF, EST, WST); + CGF.EmitStmt(CS.getCapturedStmt()); + emitEntryFooter(CGF, EST); + }; + emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, + IsOffloadEntry, CodeGen); + + // Create the worker function + emitWorkerFunction(WST); + + // Now change the name of the worker function to correspond to this target + // region's entry function. + WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); +} + CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM) {} + : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) { + if (!CGM.getLangOpts().OpenMPIsDevice) + llvm_unreachable("OpenMP NVPTX can only handle device code."); + + // Called once per module during initialization. + initializeEnvironment(); +} |