summaryrefslogtreecommitdiffstats
path: root/clang/lib
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2018-05-07 14:50:05 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2018-05-07 14:50:05 +0000
commitd7ff6d647f765ee8ce26d446a0cdd1f03d43e6dd (patch)
treea9c06047a5ac728d12872113e9cda85606a4621a /clang/lib
parent4a0f2c5047f59e8420dd04b2c453cc74a3d8963c (diff)
downloadbcm5719-llvm-d7ff6d647f765ee8ce26d446a0cdd1f03d43e6dd.tar.gz
bcm5719-llvm-d7ff6d647f765ee8ce26d446a0cdd1f03d43e6dd.zip
[OPENMP, NVPTX] Added support for L2 parallelism.
Added initial codegen for level 2, 3, etc. parallelism. Currently, all second-level, third-level, etc. parallel regions run sequentially. llvm-svn: 331642
Diffstat (limited to 'clang/lib')
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntime.cpp7
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp443
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h48
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp13
4 files changed, 384 insertions, 127 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 18b51ea0bcc..9e14738dd8c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2764,13 +2764,6 @@ Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF,
CGM.getPointerAlign());
}
-/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
-/// function. Here is the logic:
-/// if (Cond) {
-/// ThenGen();
-/// } else {
-/// ElseGen();
-/// }
void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
const RegionCodeGenTy &ThenGen,
const RegionCodeGenTy &ElseGen) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 82be31f0f80..9e6f2b4b9a3 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -93,6 +93,9 @@ enum OpenMPRTLFunctionNVPTX {
OMPRTL_NVPTX__kmpc_end_sharing_variables,
/// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
OMPRTL_NVPTX__kmpc_get_shared_variables,
+ /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
+ /// global_tid);
+ OMPRTL_NVPTX__kmpc_parallel_level,
};
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -131,19 +134,17 @@ public:
}
};
-// A class to track the execution mode when codegening directives within
-// a target region. The appropriate mode (generic/spmd) is set on entry
-// to the target region and used by containing directives such as 'parallel'
-// to emit optimized code.
+/// A class to track the execution mode when codegening directives within
+/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
+/// to the target region and used by containing directives such as 'parallel'
+/// to emit optimized code.
class ExecutionModeRAII {
private:
- CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
- CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+ bool SavedMode;
+ bool &Mode;
public:
- ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
- CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
- : Mode(Mode) {
+ ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) {
SavedMode = Mode;
Mode = NewMode;
}
@@ -579,24 +580,171 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
}
bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
- return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+ return IsInSPMDExecutionMode;
+}
+
+static CGOpenMPRuntimeNVPTX::DataSharingMode
+getDataSharingMode(CodeGenModule &CGM) { // Maps the OpenMPCUDAMode language option to a data-sharing mode.
+ return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
+ : CGOpenMPRuntimeNVPTX::Generic; // Generic is chosen whenever OpenMPCUDAMode is off.
}
-static CGOpenMPRuntimeNVPTX::ExecutionMode
-getExecutionMode(CodeGenModule &CGM) {
- return CGM.getLangOpts().OpenMPCUDAMode
- ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
- : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+/// Check for inner (nested) SPMD construct, if any: returns true only when the 'target' captured body of \a D is a compound statement holding exactly one statement, and that statement is a directive accepted for D's kind below.
+static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
+ const auto *CS = D.getCapturedStmt(OMPD_target);
+ const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+ const Stmt *ChildStmt = nullptr;
+ if (const auto *C = dyn_cast<CompoundStmt>(Body))
+ if (C->size() == 1) // Only a compound body with a single statement is analyzed.
+ ChildStmt = C->body_front();
+ if (!ChildStmt) // No single child found (this includes a non-compound body): no nested SPMD construct.
+ return false;
+
+ if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+ OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
+ // TODO: add further analysis for inner teams|distribute directives, if any.
+ switch (D.getDirectiveKind()) {
+ case OMPD_target: // Accepts nested parallel (without teams/distribute), any simd, or 'teams distribute parallel for'.
+ return (isOpenMPParallelDirective(DKind) &&
+ !isOpenMPTeamsDirective(DKind) &&
+ !isOpenMPDistributeDirective(DKind)) ||
+ isOpenMPSimdDirective(DKind) ||
+ DKind == OMPD_teams_distribute_parallel_for;
+ case OMPD_target_teams: // Accepts nested parallel (without distribute), any simd, or 'distribute parallel for'.
+ return (isOpenMPParallelDirective(DKind) &&
+ !isOpenMPDistributeDirective(DKind)) ||
+ isOpenMPSimdDirective(DKind) ||
+ DKind == OMPD_distribute_parallel_for;
+ case OMPD_target_teams_distribute: // Accepts any nested parallel or simd directive.
+ return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind);
+ case OMPD_target_simd: // Every remaining kind of D reaches the llvm_unreachable below.
+ case OMPD_target_parallel:
+ case OMPD_target_parallel_for:
+ case OMPD_target_parallel_for_simd:
+ case OMPD_target_teams_distribute_simd:
+ case OMPD_target_teams_distribute_parallel_for:
+ case OMPD_target_teams_distribute_parallel_for_simd:
+ case OMPD_parallel:
+ case OMPD_for:
+ case OMPD_parallel_for:
+ case OMPD_parallel_sections:
+ case OMPD_for_simd:
+ case OMPD_parallel_for_simd:
+ case OMPD_cancel:
+ case OMPD_cancellation_point:
+ case OMPD_ordered:
+ case OMPD_threadprivate:
+ case OMPD_task:
+ case OMPD_simd:
+ case OMPD_sections:
+ case OMPD_section:
+ case OMPD_single:
+ case OMPD_master:
+ case OMPD_critical:
+ case OMPD_taskyield:
+ case OMPD_barrier:
+ case OMPD_taskwait:
+ case OMPD_taskgroup:
+ case OMPD_atomic:
+ case OMPD_flush:
+ case OMPD_teams:
+ case OMPD_target_data:
+ case OMPD_target_exit_data:
+ case OMPD_target_enter_data:
+ case OMPD_distribute:
+ case OMPD_distribute_simd:
+ case OMPD_distribute_parallel_for:
+ case OMPD_distribute_parallel_for_simd:
+ case OMPD_teams_distribute:
+ case OMPD_teams_distribute_simd:
+ case OMPD_teams_distribute_parallel_for:
+ case OMPD_teams_distribute_parallel_for_simd:
+ case OMPD_target_update:
+ case OMPD_declare_simd:
+ case OMPD_declare_target:
+ case OMPD_end_declare_target:
+ case OMPD_declare_reduction:
+ case OMPD_taskloop:
+ case OMPD_taskloop_simd:
+ case OMPD_unknown:
+ llvm_unreachable("Unexpected directive.");
+ }
+ }
+
+ return false; // The single child is not an OpenMP executable directive.
+}
+
+static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) { // Returns true if the target directive \a D may use SPMD execution mode.
+ OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+ switch (DirectiveKind) {
+ case OMPD_target: // For these three kinds the answer depends on the directive nested inside the region.
+ case OMPD_target_teams:
+ case OMPD_target_teams_distribute:
+ return hasNestedSPMDDirective(D);
+ case OMPD_target_simd: // These combined target directives always qualify.
+ case OMPD_target_parallel:
+ case OMPD_target_parallel_for:
+ case OMPD_target_parallel_for_simd:
+ case OMPD_target_teams_distribute_simd:
+ case OMPD_target_teams_distribute_parallel_for:
+ case OMPD_target_teams_distribute_parallel_for_simd:
+ return true;
+ case OMPD_parallel: // All remaining kinds break out of the switch and reach the llvm_unreachable below.
+ case OMPD_for:
+ case OMPD_parallel_for:
+ case OMPD_parallel_sections:
+ case OMPD_for_simd:
+ case OMPD_parallel_for_simd:
+ case OMPD_cancel:
+ case OMPD_cancellation_point:
+ case OMPD_ordered:
+ case OMPD_threadprivate:
+ case OMPD_task:
+ case OMPD_simd:
+ case OMPD_sections:
+ case OMPD_section:
+ case OMPD_single:
+ case OMPD_master:
+ case OMPD_critical:
+ case OMPD_taskyield:
+ case OMPD_barrier:
+ case OMPD_taskwait:
+ case OMPD_taskgroup:
+ case OMPD_atomic:
+ case OMPD_flush:
+ case OMPD_teams:
+ case OMPD_target_data:
+ case OMPD_target_exit_data:
+ case OMPD_target_enter_data:
+ case OMPD_distribute:
+ case OMPD_distribute_simd:
+ case OMPD_distribute_parallel_for:
+ case OMPD_distribute_parallel_for_simd:
+ case OMPD_teams_distribute:
+ case OMPD_teams_distribute_simd:
+ case OMPD_teams_distribute_parallel_for:
+ case OMPD_teams_distribute_parallel_for_simd:
+ case OMPD_target_update:
+ case OMPD_declare_simd:
+ case OMPD_declare_target:
+ case OMPD_end_declare_target:
+ case OMPD_declare_reduction:
+ case OMPD_taskloop:
+ case OMPD_taskloop_simd:
+ case OMPD_unknown:
+ break;
+ }
+ llvm_unreachable(
+ "Unknown programming model for OpenMP directive on NVPTX target.");
}
-void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
- ExecutionModeRAII ModeRAII(CurrentExecutionMode,
- CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
+ ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false);
EntryFunctionState EST;
WorkerFunctionState WST(CGM, D.getLocStart());
Work.clear();
@@ -613,11 +761,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
: EST(EST), WST(WST) {}
void Enter(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
- .emitGenericEntryHeader(CGF, EST, WST);
+ .emitNonSPMDEntryHeader(CGF, EST, WST);
}
void Exit(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
- .emitGenericEntryFooter(CGF, EST);
+ .emitNonSPMDEntryFooter(CGF, EST);
}
} Action(EST, WST);
CodeGen.setAction(Action);
@@ -633,7 +781,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
}
// Setup NVPTX threads for master-worker OpenMP scheme.
-void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
EntryFunctionState &EST,
WorkerFunctionState &WST) {
CGBuilderTy &Bld = CGF.Builder;
@@ -657,6 +805,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
CGF.EmitBlock(MasterBB);
+ IsInTargetMasterThreadRegion = true;
// SEQUENTIAL (MASTER) REGION START
// First action in sequential region:
// Initialize the state of the OpenMP runtime library on the GPU.
@@ -674,12 +823,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
emitGenericVarsProlog(CGF, WST.Loc);
}
-void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
EntryFunctionState &EST) {
- emitGenericVarsEpilog(CGF);
+ IsInTargetMasterThreadRegion = false;
if (!CGF.HaveInsertPoint())
return;
+ emitGenericVarsEpilog(CGF);
+
if (!EST.ExitBB)
EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -707,8 +858,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
- ExecutionModeRAII ModeRAII(CurrentExecutionMode,
- CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
+ ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true);
EntryFunctionState EST;
// Emit target region as a standalone region.
@@ -754,10 +904,17 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
CGF.EmitBranch(ExecuteBB);
CGF.EmitBlock(ExecuteBB);
+
+ emitGenericVarsProlog(CGF, D.getLocStart());
}
void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
EntryFunctionState &EST) {
+ if (!CGF.HaveInsertPoint())
+ return;
+
+ emitGenericVarsEpilog(CGF);
+
if (!EST.ExitBB)
EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -781,11 +938,12 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
- CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
- auto *GVMode = new llvm::GlobalVariable(
- CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage,
- llvm::ConstantInt::get(CGM.Int8Ty, Mode), Twine(Name, "_exec_mode"));
+ bool Mode) {
+ auto *GVMode =
+ new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+ llvm::GlobalValue::WeakAnyLinkage,
+ llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
+ Twine(Name, "_exec_mode"));
CGM.addCompilerUsedGlobal(GVMode);
}
@@ -846,8 +1004,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
// On termination condition (workid == 0), exit loop.
- llvm::Value *ShouldTerminate =
- Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
+ llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
+ llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
// Activate requested workers.
@@ -886,6 +1044,22 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
CGF.EmitBlock(CheckNextBB);
}
+ // Default case: call to outlined function through pointer if the target
+ // region makes a declare target call that may contain an orphaned parallel
+ // directive.
+ auto *ParallelFnTy =
+ llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
+ /*isVarArg=*/false)
+ ->getPointerTo();
+ llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
+ // Insert call to work function via shared wrapper. The shared
+ // wrapper takes two arguments:
+ // - the parallelism level;
+ // - the thread ID;
+ emitCall(CGF, WST.Loc, WorkFnCast,
+ {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
+ // Go to end of parallel region.
+ CGF.EmitBranch(TerminateBB);
// Signal end of parallel region.
CGF.EmitBlock(TerminateBB);
@@ -1163,6 +1337,14 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
break;
}
+ case OMPRTL_NVPTX__kmpc_parallel_level: {
+ // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
+ llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
+ break;
+ }
}
return RTLFn;
}
@@ -1198,27 +1380,19 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
assert(!ParentName.empty() && "Invalid target region parent name!");
- CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM);
- switch (Mode) {
- case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
- emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
- break;
- case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
+ bool Mode = supportsSPMDExecutionMode(D);
+ if (Mode)
emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
- break;
- case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
- llvm_unreachable(
- "Unknown programming model for OpenMP directive on NVPTX target.");
- }
+ else
+ emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
- : CGOpenMPRuntime(CGM, "_", "$"),
- CurrentExecutionMode(ExecutionMode::Unknown) {
+ : CGOpenMPRuntime(CGM, "_", "$") {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
@@ -1258,23 +1432,32 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
SourceLocation &Loc;
+ bool &IsInParallelRegion;
+ bool PrevIsInParallelRegion;
public:
- NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
+ NVPTXPrePostActionTy(SourceLocation &Loc, bool &IsInParallelRegion)
+ : Loc(Loc), IsInParallelRegion(IsInParallelRegion) {}
void Enter(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsProlog(CGF, Loc);
+ PrevIsInParallelRegion = IsInParallelRegion;
+ IsInParallelRegion = true;
}
void Exit(CodeGenFunction &CGF) override {
+ IsInParallelRegion = PrevIsInParallelRegion;
static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsEpilog(CGF);
}
- } Action(Loc);
+ } Action(Loc, IsInParallelRegion);
CodeGen.setAction(Action);
+ bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
+ IsInTargetMasterThreadRegion = false;
auto *OutlinedFun =
cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen));
- if (!isInSpmdExecutionMode()) {
+ IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+ if (!isInSpmdExecutionMode() && !IsInParallelRegion) {
llvm::Function *WrapperFun =
createParallelDataSharingWrapper(OutlinedFun, D);
WrapperFunctionsMap[OutlinedFun] = WrapperFun;
@@ -1316,6 +1499,9 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
SourceLocation Loc) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return;
+
CGBuilderTy &Bld = CGF.Builder;
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1402,6 +1588,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
}
void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return;
+
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I != FunctionGlobalizedDecls.end()) {
I->getSecond().MappedParams->restore(CGF);
@@ -1449,31 +1638,61 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
if (isInSpmdExecutionMode())
emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
else
- emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+ emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}
-void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
+void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
- llvm::Function *WFn = WrapperFunctionsMap[Fn];
-
- assert(WFn && "Wrapper function does not exist!");
// Force inline this outlined function at its call site.
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
- auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF,
- PrePostActionTy &) {
- CGBuilderTy &Bld = CGF.Builder;
+ Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+ /*DestWidth=*/32, /*Signed=*/1),
+ ".zero.addr");
+ CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+ Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+ auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, ThreadIDAddr](
+ CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+
+ llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+ OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
+ OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+ OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+ emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
+ };
+ auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
+ PrePostActionTy &) {
+
+ RegionCodeGenTy RCG(CodeGen);
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+ llvm::Value *ThreadID = getThreadID(CGF, Loc);
+ llvm::Value *Args[] = {RTLoc, ThreadID};
+
+ NVPTXActionTy Action(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+ Args,
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+ Args);
+ RCG.setAction(Action);
+ RCG(CGF);
+ };
+ auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Function *WFn = WrapperFunctionsMap[Fn];
+ assert(WFn && "Wrapper function does not exist!");
llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
// Prepare for parallel region. Indicate the outlined function.
llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
- CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
- OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
- Args);
+ CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+ Args);
// Create a private scope that will globalize the arguments
// passed from the outside of the target region.
@@ -1496,13 +1715,13 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
// Store variable address in a list of references to pass to workers.
unsigned Idx = 0;
ASTContext &Ctx = CGF.getContext();
- Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
- Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
- .castAs<PointerType>());
+ Address SharedArgListAddress = CGF.EmitLoadOfPointer(
+ SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
+ .castAs<PointerType>());
for (llvm::Value *V : CapturedVars) {
- Address Dst = Bld.CreateConstInBoundsGEP(
- SharedArgListAddress, Idx, CGF.getPointerSize());
- llvm::Value * PtrV;
+ Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+ CGF.getPointerSize());
+ llvm::Value *PtrV;
if (V->getType()->isIntegerTy())
PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
else
@@ -1533,43 +1752,67 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
Work.emplace_back(WFn);
};
- llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::Value *ThreadID = getThreadID(CGF, Loc);
- llvm::Value *Args[] = {RTLoc, ThreadID};
-
- auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF,
- PrePostActionTy &) {
- auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF,
- PrePostActionTy &Action) {
- Action.Enter(CGF);
-
- llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
- Address ZeroAddr =
- CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
- /*DestWidth=*/32, /*Signed=*/1),
- ".zero.addr");
- CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
- OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
- OutlinedFnArgs.push_back(ZeroAddr.getPointer());
- OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
- emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
- };
-
+ auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen](
+ CodeGenFunction &CGF, PrePostActionTy &Action) {
RegionCodeGenTy RCG(CodeGen);
- NVPTXActionTy Action(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
- Args,
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
- Args);
- RCG.setAction(Action);
- RCG(CGF);
+ if (IsInParallelRegion) {
+ SeqGen(CGF, Action);
+ } else if (IsInTargetMasterThreadRegion) {
+ L0ParallelGen(CGF, Action);
+ } else {
+ // Check for master and then parallelism:
+ // if (is_master) {
+ // Worker call.
+ // } else if (__kmpc_parallel_level(loc, gtid)) {
+ // Serialized execution.
+ // } else {
+ // Outlined function call.
+ // }
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+ if (!isInSpmdExecutionMode()) {
+ llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
+ llvm::BasicBlock *ParallelCheckBB =
+ CGF.createBasicBlock(".parallelcheck");
+ llvm::Value *IsMaster =
+ Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+ Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
+ CGF.EmitBlock(MasterCheckBB);
+ L0ParallelGen(CGF, Action);
+ CGF.EmitBranch(ExitBB);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(ParallelCheckBB);
+ }
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+ llvm::Value *ThreadID = getThreadID(CGF, Loc);
+ llvm::Value *PL = CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
+ {RTLoc, ThreadID});
+ llvm::Value *Res = Bld.CreateIsNotNull(PL);
+ llvm::BasicBlock *ThenBlock = CGF.createBasicBlock("omp_if.then");
+ llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
+ Bld.CreateCondBr(Res, ThenBlock, ElseBlock);
+ // Emit the 'then' code.
+ CGF.EmitBlock(ThenBlock);
+ SeqGen(CGF, Action);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ // Emit the 'else' code.
+ CGF.EmitBlock(ElseBlock);
+ RCG(CGF);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ // Emit the continuation block for code after the if.
+ CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+ }
};
if (IfCond) {
- emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
+ emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
} else {
CodeGenFunction::RunCleanupsScope Scope(CGF);
- RegionCodeGenTy ThenRCG(L0ParallelGen);
+ RegionCodeGenTy ThenRCG(LNParallelGen);
ThenRCG(CGF);
}
}
@@ -3090,6 +3333,9 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
const Decl *D) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return;
+
assert(D && "Expected function or captured|block decl.");
assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
"Function is registered already.");
@@ -3143,6 +3389,9 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) {
+ if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+ return Address::invalid();
+
VD = VD->getCanonicalDecl();
auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I == FunctionGlobalizedDecls.end())
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 365d4f52aa6..a5f39c28a7a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -25,7 +25,7 @@ namespace CodeGen {
class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
private:
- // Parallel outlined function work for workers to execute.
+ /// Parallel outlined function work for workers to execute.
llvm::SmallVector<llvm::Function *, 16> Work;
struct EntryFunctionState {
@@ -52,14 +52,14 @@ private:
/// \brief Helper for worker function. Emit body of worker loop.
void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
- /// \brief Helper for generic target entry function. Guide the master and
+ /// \brief Helper for non-SPMD target entry function. Guide the master and
/// worker threads to their respective locations.
- void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+ void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
WorkerFunctionState &WST);
- /// \brief Signal termination of OMP execution for generic target entry
+ /// \brief Signal termination of OMP execution for non-SPMD target entry
/// function.
- void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+ void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
/// Helper for generic variables globalization prolog.
void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc);
@@ -93,7 +93,7 @@ private:
/// \param IsOffloadEntry True if the outlined function is an offload entry.
/// An outlined function may not be an entry if, e.g. the if clause always
/// evaluates to false.
- void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName,
+ void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen);
@@ -133,14 +133,14 @@ private:
/// \brief Emits code for parallel or serial call of the \a OutlinedFn with
/// variables captured in a record which address is stored in \a
/// CapturedStruct.
- /// This call is for the Generic Execution Mode.
+ /// This call is for the Non-SPMD Execution Mode.
/// \param OutlinedFn Outlined function to be run in parallel threads. Type of
/// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
/// \param CapturedVars A pointer to the record with the references to
/// variables used in \a OutlinedFn function.
/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise.
- void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+ void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond);
@@ -304,15 +304,15 @@ public:
Address getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) override;
- /// Target codegen is specialized based on two programming models: the
- /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
- /// model for constructs like 'target parallel' that support it.
- enum ExecutionMode {
- /// Single Program Multiple Data.
- Spmd,
- /// Generic codegen to support fork-join model.
+ /// Target codegen is specialized based on two data-sharing modes: CUDA, in
+ /// which the local variables are actually global threadlocal, and Generic, in
+ /// which the local variables are placed in global memory if they may escape
+ /// their declaration context.
+ enum DataSharingMode {
+ /// CUDA data sharing mode.
+ CUDA,
+ /// Generic data-sharing mode.
Generic,
- Unknown,
};
/// Cleans up references to the objects in finished function.
@@ -320,11 +320,17 @@ public:
void functionFinished(CodeGenFunction &CGF) override;
private:
- // Track the execution mode when codegening directives within a target
- // region. The appropriate mode (generic/spmd) is set on entry to the
- // target region and used by containing directives such as 'parallel'
- // to emit optimized code.
- ExecutionMode CurrentExecutionMode;
+ /// Track the execution mode when codegening directives within a target
+ /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
+ /// target region and used by containing directives such as 'parallel'
+ /// to emit optimized code.
+ bool IsInSPMDExecutionMode = false;
+
+ /// True if we're emitting the code for the target region and the next parallel
+ /// region is L0 for sure.
+ bool IsInTargetMasterThreadRegion = false;
+ /// true if we're definitely in the parallel region.
+ bool IsInParallelRegion = false;
/// Map between an outlined function and its wrapper.
llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6e3747a188b..4bb3c7b0d3b 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2399,8 +2399,17 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
// For the device mark the function as one that should be emitted.
if (getLangOpts().OpenMPIsDevice && OpenMPRuntime &&
!OpenMPRuntime->markAsGlobalTarget(GD) && FD->isDefined() &&
- !DontDefer && !IsForDefinition)
- addDeferredDeclToEmit(GD);
+ !DontDefer && !IsForDefinition) {
+ const FunctionDecl *FDDef = FD->getDefinition();
+ GlobalDecl GDDef;
+ if (const auto *CD = dyn_cast<CXXConstructorDecl>(FDDef))
+ GDDef = GlobalDecl(CD, GD.getCtorType());
+ else if (const auto *DD = dyn_cast<CXXDestructorDecl>(FDDef))
+ GDDef = GlobalDecl(DD, GD.getDtorType());
+ else
+ GDDef = GlobalDecl(FDDef);
+ addDeferredDeclToEmit(GDDef);
+ }
if (FD->isMultiVersion() && FD->getAttr<TargetAttr>()->isDefaultVersion()) {
UpdateMultiVersionNames(GD, FD);
OpenPOWER on IntegriCloud