author     Alexey Bataev <a.bataev@hotmail.com>  2018-05-07 14:50:05 +0000
committer  Alexey Bataev <a.bataev@hotmail.com>  2018-05-07 14:50:05 +0000
commit     d7ff6d647f765ee8ce26d446a0cdd1f03d43e6dd
tree       a9c06047a5ac728d12872113e9cda85606a4621a /clang/lib
parent     4a0f2c5047f59e8420dd04b2c453cc74a3d8963c
[OPENMP, NVPTX] Added support for L2 parallelism.
Added initial codegen for level 2, 3, etc. parallelism. Currently, all
second-level, third-level, etc. parallel regions run sequentially.
llvm-svn: 331642
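
For context, the snippet below is a minimal, hypothetical OpenMP example (not taken from this patch) of the nested parallelism this change targets: the outer directive opens a level-1 parallel region and the inner one a level-2 region, which is now code-generated for NVPTX but still executes sequentially.

    /* Illustrative only; not part of the patch. The outer 'parallel' is a
     * level-1 region, the inner one is level-2. After this commit the inner
     * region is code-generated for NVPTX, but it still runs sequentially. */
    #include <omp.h>

    void nested_parallel(void) {
    #pragma omp target parallel /* level-1 parallel region */
      {
    #pragma omp parallel /* level-2 parallel region: emitted, runs serialized */
        {
          int tid = omp_get_thread_num(); /* 0 while level 2 runs sequentially */
          (void)tid;
        }
      }
    }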
Diffstat (limited to 'clang/lib')
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntime.cpp      |   7
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 443
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h   |  48
-rw-r--r--  clang/lib/CodeGen/CodeGenModule.cpp        |  13
4 files changed, 384 insertions, 127 deletions
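
The bulk of the change is in CGOpenMPRuntimeNVPTX.cpp: emitNonSPMDParallelCall now picks between handing the outlined region to the worker threads, serializing it, or calling it directly, depending on where the 'parallel' directive sits. The sketch below reconstructs that decision logic from the comments in the patch; apart from __kmpc_parallel_level and __kmpc_serialized_parallel / __kmpc_end_serialized_parallel, which the patch does call, every name is an illustrative stand-in, and the two Is* flags are compile-time codegen state shown as plain variables for readability.

    /* Sketch of the decision logic emitNonSPMDParallelCall now emits,
     * reconstructed from the comments in the patch. Helper names and the
     * two flags are illustrative stand-ins; only the __kmpc_* declarations
     * correspond to runtime entry points referenced by the diff. */
    #include <stdint.h>

    typedef struct ident ident_t; /* opaque source-location descriptor */

    /* NVPTX runtime entry points referenced by the patch. */
    extern uint16_t __kmpc_parallel_level(ident_t *loc, int32_t gtid);
    extern void __kmpc_serialized_parallel(ident_t *loc, int32_t gtid);
    extern void __kmpc_end_serialized_parallel(ident_t *loc, int32_t gtid);

    /* Illustrative stand-ins (codegen state and emitted helpers). */
    extern int IsInParallelRegion;           /* flag added by the patch */
    extern int IsInTargetMasterThreadRegion; /* flag added by the patch */
    extern int is_master_thread(void);
    extern void worker_dispatch(ident_t *loc, int32_t gtid); /* wake workers */
    extern void outlined_call(ident_t *loc, int32_t gtid);   /* direct call  */

    static void serialized_region(ident_t *loc, int32_t gtid) {
      __kmpc_serialized_parallel(loc, gtid);
      outlined_call(loc, gtid); /* one thread runs the region body */
      __kmpc_end_serialized_parallel(loc, gtid);
    }

    static void parallel_call_sketch(ident_t *loc, int32_t gtid) {
      if (IsInParallelRegion) {
        /* Nested (level 2 and deeper) parallel region: always serialized. */
        serialized_region(loc, gtid);
      } else if (IsInTargetMasterThreadRegion) {
        /* Level-1 region started from the target master thread. */
        worker_dispatch(loc, gtid);
      } else {
        /* Not known at compile time, e.g. an orphaned 'parallel' inside a
         * declare target function: decide at run time. */
        if (is_master_thread())
          worker_dispatch(loc, gtid);
        else if (__kmpc_parallel_level(loc, gtid))
          serialized_region(loc, gtid); /* already inside a parallel region */
        else
          outlined_call(loc, gtid);     /* no enclosing parallelism */
      }
    }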
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 18b51ea0bcc..9e14738dd8c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2764,13 +2764,6 @@ Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF, CGM.getPointerAlign()); } -/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen -/// function. Here is the logic: -/// if (Cond) { -/// ThenGen(); -/// } else { -/// ElseGen(); -/// } void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 82be31f0f80..9e6f2b4b9a3 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -93,6 +93,9 @@ enum OpenMPRTLFunctionNVPTX { OMPRTL_NVPTX__kmpc_end_sharing_variables, /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs) OMPRTL_NVPTX__kmpc_get_shared_variables, + /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 + /// global_tid); + OMPRTL_NVPTX__kmpc_parallel_level, }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -131,19 +134,17 @@ public: } }; -// A class to track the execution mode when codegening directives within -// a target region. The appropriate mode (generic/spmd) is set on entry -// to the target region and used by containing directives such as 'parallel' -// to emit optimized code. +/// A class to track the execution mode when codegening directives within +/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry +/// to the target region and used by containing directives such as 'parallel' +/// to emit optimized code. class ExecutionModeRAII { private: - CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode; - CGOpenMPRuntimeNVPTX::ExecutionMode &Mode; + bool SavedMode; + bool &Mode; public: - ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, - CGOpenMPRuntimeNVPTX::ExecutionMode NewMode) - : Mode(Mode) { + ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) { SavedMode = Mode; Mode = NewMode; } @@ -579,24 +580,171 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( } bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const { - return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd; + return IsInSPMDExecutionMode; +} + +static CGOpenMPRuntimeNVPTX::DataSharingMode +getDataSharingMode(CodeGenModule &CGM) { + return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA + : CGOpenMPRuntimeNVPTX::Generic; } -static CGOpenMPRuntimeNVPTX::ExecutionMode -getExecutionMode(CodeGenModule &CGM) { - return CGM.getLangOpts().OpenMPCUDAMode - ? 
CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd - : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic; +/// Check for inner (nested) SPMD construct, if any +static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) { + const auto *CS = D.getCapturedStmt(OMPD_target); + const auto *Body = CS->getCapturedStmt()->IgnoreContainers(); + const Stmt *ChildStmt = nullptr; + if (const auto *C = dyn_cast<CompoundStmt>(Body)) + if (C->size() == 1) + ChildStmt = C->body_front(); + if (!ChildStmt) + return false; + + if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) { + OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); + // TODO: add further analysis for inner teams|distribute directives, if any. + switch (D.getDirectiveKind()) { + case OMPD_target: + return (isOpenMPParallelDirective(DKind) && + !isOpenMPTeamsDirective(DKind) && + !isOpenMPDistributeDirective(DKind)) || + isOpenMPSimdDirective(DKind) || + DKind == OMPD_teams_distribute_parallel_for; + case OMPD_target_teams: + return (isOpenMPParallelDirective(DKind) && + !isOpenMPDistributeDirective(DKind)) || + isOpenMPSimdDirective(DKind) || + DKind == OMPD_distribute_parallel_for; + case OMPD_target_teams_distribute: + return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind); + case OMPD_target_simd: + case OMPD_target_parallel: + case OMPD_target_parallel_for: + case OMPD_target_parallel_for_simd: + case OMPD_target_teams_distribute_simd: + case OMPD_target_teams_distribute_parallel_for: + case OMPD_target_teams_distribute_parallel_for_simd: + case OMPD_parallel: + case OMPD_for: + case OMPD_parallel_for: + case OMPD_parallel_sections: + case OMPD_for_simd: + case OMPD_parallel_for_simd: + case OMPD_cancel: + case OMPD_cancellation_point: + case OMPD_ordered: + case OMPD_threadprivate: + case OMPD_task: + case OMPD_simd: + case OMPD_sections: + case OMPD_section: + case OMPD_single: + case OMPD_master: + case OMPD_critical: + case OMPD_taskyield: + case OMPD_barrier: + case OMPD_taskwait: + case OMPD_taskgroup: + case OMPD_atomic: + case OMPD_flush: + case OMPD_teams: + case OMPD_target_data: + case OMPD_target_exit_data: + case OMPD_target_enter_data: + case OMPD_distribute: + case OMPD_distribute_simd: + case OMPD_distribute_parallel_for: + case OMPD_distribute_parallel_for_simd: + case OMPD_teams_distribute: + case OMPD_teams_distribute_simd: + case OMPD_teams_distribute_parallel_for: + case OMPD_teams_distribute_parallel_for_simd: + case OMPD_target_update: + case OMPD_declare_simd: + case OMPD_declare_target: + case OMPD_end_declare_target: + case OMPD_declare_reduction: + case OMPD_taskloop: + case OMPD_taskloop_simd: + case OMPD_unknown: + llvm_unreachable("Unexpected directive."); + } + } + + return false; +} + +static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) { + OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); + switch (DirectiveKind) { + case OMPD_target: + case OMPD_target_teams: + case OMPD_target_teams_distribute: + return hasNestedSPMDDirective(D); + case OMPD_target_simd: + case OMPD_target_parallel: + case OMPD_target_parallel_for: + case OMPD_target_parallel_for_simd: + case OMPD_target_teams_distribute_simd: + case OMPD_target_teams_distribute_parallel_for: + case OMPD_target_teams_distribute_parallel_for_simd: + return true; + case OMPD_parallel: + case OMPD_for: + case OMPD_parallel_for: + case OMPD_parallel_sections: + case OMPD_for_simd: + case OMPD_parallel_for_simd: + case OMPD_cancel: + case OMPD_cancellation_point: + case OMPD_ordered: + case 
OMPD_threadprivate: + case OMPD_task: + case OMPD_simd: + case OMPD_sections: + case OMPD_section: + case OMPD_single: + case OMPD_master: + case OMPD_critical: + case OMPD_taskyield: + case OMPD_barrier: + case OMPD_taskwait: + case OMPD_taskgroup: + case OMPD_atomic: + case OMPD_flush: + case OMPD_teams: + case OMPD_target_data: + case OMPD_target_exit_data: + case OMPD_target_enter_data: + case OMPD_distribute: + case OMPD_distribute_simd: + case OMPD_distribute_parallel_for: + case OMPD_distribute_parallel_for_simd: + case OMPD_teams_distribute: + case OMPD_teams_distribute_simd: + case OMPD_teams_distribute_parallel_for: + case OMPD_teams_distribute_parallel_for_simd: + case OMPD_target_update: + case OMPD_declare_simd: + case OMPD_declare_target: + case OMPD_end_declare_target: + case OMPD_declare_reduction: + case OMPD_taskloop: + case OMPD_taskloop_simd: + case OMPD_unknown: + break; + } + llvm_unreachable( + "Unknown programming model for OpenMP directive on NVPTX target."); } -void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, +void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { - ExecutionModeRAII ModeRAII(CurrentExecutionMode, - CGOpenMPRuntimeNVPTX::ExecutionMode::Generic); + ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false); EntryFunctionState EST; WorkerFunctionState WST(CGM, D.getLocStart()); Work.clear(); @@ -613,11 +761,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, : EST(EST), WST(WST) {} void Enter(CodeGenFunction &CGF) override { static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) - .emitGenericEntryHeader(CGF, EST, WST); + .emitNonSPMDEntryHeader(CGF, EST, WST); } void Exit(CodeGenFunction &CGF) override { static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) - .emitGenericEntryFooter(CGF, EST); + .emitNonSPMDEntryFooter(CGF, EST); } } Action(EST, WST); CodeGen.setAction(Action); @@ -633,7 +781,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, } // Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, +void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, WorkerFunctionState &WST) { CGBuilderTy &Bld = CGF.Builder; @@ -657,6 +805,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); CGF.EmitBlock(MasterBB); + IsInTargetMasterThreadRegion = true; // SEQUENTIAL (MASTER) REGION START // First action in sequential region: // Initialize the state of the OpenMP runtime library on the GPU. 
@@ -674,12 +823,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, emitGenericVarsProlog(CGF, WST.Loc); } -void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, +void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { - emitGenericVarsEpilog(CGF); + IsInTargetMasterThreadRegion = false; if (!CGF.HaveInsertPoint()) return; + emitGenericVarsEpilog(CGF); + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -707,8 +858,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { - ExecutionModeRAII ModeRAII(CurrentExecutionMode, - CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd); + ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true); EntryFunctionState EST; // Emit target region as a standalone region. @@ -754,10 +904,17 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader( CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); + + emitGenericVarsProlog(CGF, D.getLocStart()); } void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { + if (!CGF.HaveInsertPoint()) + return; + + emitGenericVarsEpilog(CGF); + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -781,11 +938,12 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, // 'generic', the runtime reserves one warp for the master, otherwise, all // warps participate in parallel work. static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, - CGOpenMPRuntimeNVPTX::ExecutionMode Mode) { - auto *GVMode = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, - llvm::ConstantInt::get(CGM.Int8Ty, Mode), Twine(Name, "_exec_mode")); + bool Mode) { + auto *GVMode = + new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, + llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), + Twine(Name, "_exec_mode")); CGM.addCompilerUsedGlobal(GVMode); } @@ -846,8 +1004,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); // On termination condition (workid == 0), exit loop. - llvm::Value *ShouldTerminate = - Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); + llvm::Value *WorkID = Bld.CreateLoad(WorkFn); + llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. @@ -886,6 +1044,22 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(CheckNextBB); } + // Default case: call to outlined function through pointer if the target + // region makes a declare target call that may contain an orphaned parallel + // directive. + auto *ParallelFnTy = + llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, + /*isVarArg=*/false) + ->getPointerTo(); + llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy); + // Insert call to work function via shared wrapper. The shared + // wrapper takes two arguments: + // - the parallelism level; + // - the thread ID; + emitCall(CGF, WST.Loc, WorkFnCast, + {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); + // Go to end of parallel region. + CGF.EmitBranch(TerminateBB); // Signal end of parallel region. 
CGF.EmitBlock(TerminateBB); @@ -1163,6 +1337,14 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); break; } + case OMPRTL_NVPTX__kmpc_parallel_level: { + // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level"); + break; + } } return RTLFn; } @@ -1198,27 +1380,19 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); - CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM); - switch (Mode) { - case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic: - emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); - break; - case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd: + bool Mode = supportsSPMDExecutionMode(D); + if (Mode) emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); - break; - case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown: - llvm_unreachable( - "Unknown programming model for OpenMP directive on NVPTX target."); - } + else + emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + CodeGen); setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM, "_", "$"), - CurrentExecutionMode(ExecutionMode::Unknown) { + : CGOpenMPRuntime(CGM, "_", "$") { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); } @@ -1258,23 +1432,32 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( // Emit target region as a standalone region. 
class NVPTXPrePostActionTy : public PrePostActionTy { SourceLocation &Loc; + bool &IsInParallelRegion; + bool PrevIsInParallelRegion; public: - NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {} + NVPTXPrePostActionTy(SourceLocation &Loc, bool &IsInParallelRegion) + : Loc(Loc), IsInParallelRegion(IsInParallelRegion) {} void Enter(CodeGenFunction &CGF) override { static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) .emitGenericVarsProlog(CGF, Loc); + PrevIsInParallelRegion = IsInParallelRegion; + IsInParallelRegion = true; } void Exit(CodeGenFunction &CGF) override { + IsInParallelRegion = PrevIsInParallelRegion; static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) .emitGenericVarsEpilog(CGF); } - } Action(Loc); + } Action(Loc, IsInParallelRegion); CodeGen.setAction(Action); + bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion; + IsInTargetMasterThreadRegion = false; auto *OutlinedFun = cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen)); - if (!isInSpmdExecutionMode()) { + IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion; + if (!isInSpmdExecutionMode() && !IsInParallelRegion) { llvm::Function *WrapperFun = createParallelDataSharingWrapper(OutlinedFun, D); WrapperFunctionsMap[OutlinedFun] = WrapperFun; @@ -1316,6 +1499,9 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc) { + if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic) + return; + CGBuilderTy &Bld = CGF.Builder; const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); @@ -1402,6 +1588,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, } void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) { + if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic) + return; + const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I != FunctionGlobalizedDecls.end()) { I->getSecond().MappedParams->restore(CGF); @@ -1449,31 +1638,61 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall( if (isInSpmdExecutionMode()) emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); else - emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); + emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); } -void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( +void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall( CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { llvm::Function *Fn = cast<llvm::Function>(OutlinedFn); - llvm::Function *WFn = WrapperFunctionsMap[Fn]; - - assert(WFn && "Wrapper function does not exist!"); // Force inline this outlined function at its call site. 
Fn->setLinkage(llvm::GlobalValue::InternalLinkage); - auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF, - PrePostActionTy &) { - CGBuilderTy &Bld = CGF.Builder; + Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth( + /*DestWidth=*/32, /*Signed=*/1), + ".zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); + Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc); + auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, ThreadIDAddr]( + CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + + llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; + OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); + OutlinedFnArgs.push_back(ZeroAddr.getPointer()); + OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); + emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs); + }; + auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF, + PrePostActionTy &) { + + RegionCodeGenTy RCG(CodeGen); + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *ThreadID = getThreadID(CGF, Loc); + llvm::Value *Args[] = {RTLoc, ThreadID}; + + NVPTXActionTy Action( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), + Args, + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), + Args); + RCG.setAction(Action); + RCG(CGF); + }; + auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF, + PrePostActionTy &Action) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *WFn = WrapperFunctionsMap[Fn]; + assert(WFn && "Wrapper function does not exist!"); llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); // Prepare for parallel region. Indicate the outlined function. llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), + Args); // Create a private scope that will globalize the arguments // passed from the outside of the target region. @@ -1496,13 +1715,13 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( // Store variable address in a list of references to pass to workers. 
unsigned Idx = 0; ASTContext &Ctx = CGF.getContext(); - Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs, - Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy)) - .castAs<PointerType>()); + Address SharedArgListAddress = CGF.EmitLoadOfPointer( + SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy)) + .castAs<PointerType>()); for (llvm::Value *V : CapturedVars) { - Address Dst = Bld.CreateConstInBoundsGEP( - SharedArgListAddress, Idx, CGF.getPointerSize()); - llvm::Value * PtrV; + Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx, + CGF.getPointerSize()); + llvm::Value *PtrV; if (V->getType()->isIntegerTy()) PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); else @@ -1533,43 +1752,67 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( Work.emplace_back(WFn); }; - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *Args[] = {RTLoc, ThreadID}; - - auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF, - PrePostActionTy &) { - auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF, - PrePostActionTy &Action) { - Action.Enter(CGF); - - llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; - Address ZeroAddr = - CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth( - /*DestWidth=*/32, /*Signed=*/1), - ".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); - OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); - OutlinedFnArgs.push_back(ZeroAddr.getPointer()); - OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs); - }; - + auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen]( + CodeGenFunction &CGF, PrePostActionTy &Action) { RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), - Args, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), - Args); - RCG.setAction(Action); - RCG(CGF); + if (IsInParallelRegion) { + SeqGen(CGF, Action); + } else if (IsInTargetMasterThreadRegion) { + L0ParallelGen(CGF, Action); + } else { + // Check for master and then parallelism: + // if (is_master) { + // Worker call. + // } else if (__kmpc_parallel_level(loc, gtid)) { + // Serialized execution. + // } else { + // Outlined function call. + // } + CGBuilderTy &Bld = CGF.Builder; + llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); + if (!isInSpmdExecutionMode()) { + llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); + llvm::BasicBlock *ParallelCheckBB = + CGF.createBasicBlock(".parallelcheck"); + llvm::Value *IsMaster = + Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); + Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB); + CGF.EmitBlock(MasterCheckBB); + L0ParallelGen(CGF, Action); + CGF.EmitBranch(ExitBB); + // There is no need to emit line number for unconditional branch. 
+ (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(ParallelCheckBB); + } + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *ThreadID = getThreadID(CGF, Loc); + llvm::Value *PL = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level), + {RTLoc, ThreadID}); + llvm::Value *Res = Bld.CreateIsNotNull(PL); + llvm::BasicBlock *ThenBlock = CGF.createBasicBlock("omp_if.then"); + llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else"); + Bld.CreateCondBr(Res, ThenBlock, ElseBlock); + // Emit the 'then' code. + CGF.EmitBlock(ThenBlock); + SeqGen(CGF, Action); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + // Emit the 'else' code. + CGF.EmitBlock(ElseBlock); + RCG(CGF); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + // Emit the continuation block for code after the if. + CGF.EmitBlock(ExitBB, /*IsFinished=*/true); + } }; if (IfCond) { - emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen); + emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen); } else { CodeGenFunction::RunCleanupsScope Scope(CGF); - RegionCodeGenTy ThenRCG(L0ParallelGen); + RegionCodeGenTy ThenRCG(LNParallelGen); ThenRCG(CGF); } } @@ -3090,6 +3333,9 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) { + if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic) + return; + assert(D && "Expected function or captured|block decl."); assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && "Function is registered already."); @@ -3143,6 +3389,9 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) { + if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic) + return Address::invalid(); + VD = VD->getCanonicalDecl(); auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index 365d4f52aa6..a5f39c28a7a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -25,7 +25,7 @@ namespace CodeGen { class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime { private: - // Parallel outlined function work for workers to execute. + /// Parallel outlined function work for workers to execute. llvm::SmallVector<llvm::Function *, 16> Work; struct EntryFunctionState { @@ -52,14 +52,14 @@ private: /// \brief Helper for worker function. Emit body of worker loop. void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST); - /// \brief Helper for generic target entry function. Guide the master and + /// \brief Helper for non-SPMD target entry function. Guide the master and /// worker threads to their respective locations. - void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, + void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, WorkerFunctionState &WST); - /// \brief Signal termination of OMP execution for generic target entry + /// \brief Signal termination of OMP execution for non-SPMD target entry /// function. 
- void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); /// Helper for generic variables globalization prolog. void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc); @@ -93,7 +93,7 @@ private: /// \param IsOffloadEntry True if the outlined function is an offload entry. /// An outlined function may not be an entry if, e.g. the if clause always /// evaluates to false. - void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, + void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen); @@ -133,14 +133,14 @@ private: /// \brief Emits code for parallel or serial call of the \a OutlinedFn with /// variables captured in a record which address is stored in \a /// CapturedStruct. - /// This call is for the Generic Execution Mode. + /// This call is for the Non-SPMD Execution Mode. /// \param OutlinedFn Outlined function to be run in parallel threads. Type of /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*). /// \param CapturedVars A pointer to the record with the references to /// variables used in \a OutlinedFn function. /// \param IfCond Condition in the associated 'if' clause, if it was /// specified, nullptr otherwise. - void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc, + void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond); @@ -304,15 +304,15 @@ public: Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override; - /// Target codegen is specialized based on two programming models: the - /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd' - /// model for constructs like 'target parallel' that support it. - enum ExecutionMode { - /// Single Program Multiple Data. - Spmd, - /// Generic codegen to support fork-join model. + /// Target codegen is specialized based on two data-sharing modes: CUDA, in + /// which the local variables are actually global threadlocal, and Generic, in + /// which the local variables are placed in global memory if they may escape + /// their declaration context. + enum DataSharingMode { + /// CUDA data sharing mode. + CUDA, + /// Generic data-sharing mode. Generic, - Unknown, }; /// Cleans up references to the objects in finished function. @@ -320,11 +320,17 @@ public: void functionFinished(CodeGenFunction &CGF) override; private: - // Track the execution mode when codegening directives within a target - // region. The appropriate mode (generic/spmd) is set on entry to the - // target region and used by containing directives such as 'parallel' - // to emit optimized code. - ExecutionMode CurrentExecutionMode; + /// Track the execution mode when codegening directives within a target + /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the + /// target region and used by containing directives such as 'parallel' + /// to emit optimized code. + bool IsInSPMDExecutionMode = false; + + /// true if we're emitting the code for the target region and next parallel + /// region is L0 for sure. + bool IsInTargetMasterThreadRegion = false; + /// true if we're definitely in the parallel region. + bool IsInParallelRegion = false; /// Map between an outlined function and its wrapper. 
llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 6e3747a188b..4bb3c7b0d3b 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2399,8 +2399,17 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( // For the device mark the function as one that should be emitted. if (getLangOpts().OpenMPIsDevice && OpenMPRuntime && !OpenMPRuntime->markAsGlobalTarget(GD) && FD->isDefined() && - !DontDefer && !IsForDefinition) - addDeferredDeclToEmit(GD); + !DontDefer && !IsForDefinition) { + const FunctionDecl *FDDef = FD->getDefinition(); + GlobalDecl GDDef; + if (const auto *CD = dyn_cast<CXXConstructorDecl>(FDDef)) + GDDef = GlobalDecl(CD, GD.getCtorType()); + else if (const auto *DD = dyn_cast<CXXDestructorDecl>(FDDef)) + GDDef = GlobalDecl(DD, GD.getDtorType()); + else + GDDef = GlobalDecl(FDDef); + addDeferredDeclToEmit(GDDef); + } if (FD->isMultiVersion() && FD->getAttr<TargetAttr>()->isDefaultVersion()) { UpdateMultiVersionNames(GD, FD); |