diff options
author | Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com> | 2018-10-29 15:45:47 +0000 |
---|---|---|
committer | Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com> | 2018-10-29 15:45:47 +0000 |
commit | e92567601b4b457d7f68a31ee21d2c2769c8de3b (patch) | |
tree | c5618d3b4991e48ae9e50d218a9a2d720437e4f0 /clang/lib/CodeGen/CGStmtOpenMP.cpp | |
parent | 98640f8456bc9b66f983d00d02e54ba012581b8b (diff) | |
download | bcm5719-llvm-e92567601b4b457d7f68a31ee21d2c2769c8de3b.tar.gz bcm5719-llvm-e92567601b4b457d7f68a31ee21d2c2769c8de3b.zip |
[OpenMP][NVPTX] Use single loops when generating code for distribute parallel for
Summary: This patch adds a new code generation path for bound sharing directives containing distribute parallel for. The new code generation scheme applies to chunked schedules on distribute and parallel for directives. The scheme simplifies the code that is being generated by eliminating the need for an outer for loop over chunks for both distribute and parallel for directives. In the case of distribute it applies to any sized chunk while in the parallel for case it only applies when chunk size is 1.
Reviewers: ABataev, caomhin
Reviewed By: ABataev
Subscribers: jholewinski, guansong, cfe-commits
Differential Revision: https://reviews.llvm.org/D53448
llvm-svn: 345509
Diffstat (limited to 'clang/lib/CodeGen/CGStmtOpenMP.cpp')
-rw-r--r-- | clang/lib/CodeGen/CGStmtOpenMP.cpp | 116 |
1 files changed, 88 insertions, 28 deletions
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index c8937956f45..718b8c326dc 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2006,7 +2006,7 @@ void CodeGenFunction::EmitOMPDistributeOuterLoop( RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, StaticInit); // for combined 'distribute' and 'for' the increment expression of distribute - // is store in DistInc. For 'distribute' alone, it is in Inc. + // is stored in DistInc. For 'distribute' alone, it is in Inc. Expr *IncExpr; if (isOpenMPLoopBoundSharingDirective(S.getDirectiveKind())) IncExpr = S.getDistInc(); @@ -2298,22 +2298,28 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( (void)LoopScope.Privatize(); // Detect the loop schedule kind and chunk. - llvm::Value *Chunk = nullptr; + const Expr *ChunkExpr = nullptr; OpenMPScheduleTy ScheduleKind; if (const auto *C = S.getSingleClause<OMPScheduleClause>()) { ScheduleKind.Schedule = C->getScheduleKind(); ScheduleKind.M1 = C->getFirstScheduleModifier(); ScheduleKind.M2 = C->getSecondScheduleModifier(); - if (const Expr *Ch = C->getChunkSize()) { - Chunk = EmitScalarExpr(Ch); - Chunk = EmitScalarConversion(Chunk, Ch->getType(), - S.getIterationVariable()->getType(), - S.getBeginLoc()); - } + ChunkExpr = C->getChunkSize(); } else { // Default behaviour for schedule clause. CGM.getOpenMPRuntime().getDefaultScheduleAndChunk( - *this, S, ScheduleKind.Schedule, Chunk); + *this, S, ScheduleKind.Schedule, ChunkExpr); + } + bool HasChunkSizeOne = false; + llvm::Value *Chunk = nullptr; + if (ChunkExpr) { + Chunk = EmitScalarExpr(ChunkExpr); + Chunk = EmitScalarConversion(Chunk, ChunkExpr->getType(), + S.getIterationVariable()->getType(), + S.getBeginLoc()); + llvm::APSInt EvaluatedChunk; + if (ChunkExpr->EvaluateAsInt(EvaluatedChunk, getContext())) + HasChunkSizeOne = (EvaluatedChunk.getLimitedValue() == 1); } const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); @@ -2321,8 +2327,12 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( // If the static schedule kind is specified or if the ordered clause is // specified, and if no monotonic modifier is specified, the effect will // be as if the monotonic modifier was specified. - if (RT.isStaticNonchunked(ScheduleKind.Schedule, - /* Chunked */ Chunk != nullptr) && + bool StaticChunkedOne = RT.isStaticChunked(ScheduleKind.Schedule, + /* Chunked */ Chunk != nullptr) && HasChunkSizeOne && + isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); + if ((RT.isStaticNonchunked(ScheduleKind.Schedule, + /* Chunked */ Chunk != nullptr) || + StaticChunkedOne) && !Ordered) { if (isOpenMPSimdDirective(S.getDirectiveKind())) EmitOMPSimdInit(S, /*IsMonotonic=*/true); @@ -2333,23 +2343,38 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( // unspecified in this case. CGOpenMPRuntime::StaticRTInput StaticInit( IVSize, IVSigned, Ordered, IL.getAddress(), LB.getAddress(), - UB.getAddress(), ST.getAddress()); + UB.getAddress(), ST.getAddress(), + StaticChunkedOne ? Chunk : nullptr); RT.emitForStaticInit(*this, S.getBeginLoc(), S.getDirectiveKind(), ScheduleKind, StaticInit); JumpDest LoopExit = getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); // UB = min(UB, GlobalUB); - EmitIgnoredExpr(S.getEnsureUpperBound()); + if (!StaticChunkedOne) + EmitIgnoredExpr(S.getEnsureUpperBound()); // IV = LB; EmitIgnoredExpr(S.getInit()); - // while (idx <= UB) { BODY; ++idx; } - EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), - S.getInc(), - [&S, LoopExit](CodeGenFunction &CGF) { - CGF.EmitOMPLoopBody(S, LoopExit); - CGF.EmitStopPoint(&S); - }, - [](CodeGenFunction &) {}); + // For unchunked static schedule generate: + // + // while (idx <= UB) { + // BODY; + // ++idx; + // } + // + // For static schedule with chunk one: + // + // while (IV <= PrevUB) { + // BODY; + // IV += ST; + // } + EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), + StaticChunkedOne ? S.getCombinedParForInDistCond() : S.getCond(), + StaticChunkedOne ? S.getDistInc() : S.getInc(), + [&S, LoopExit](CodeGenFunction &CGF) { + CGF.EmitOMPLoopBody(S, LoopExit); + CGF.EmitStopPoint(&S); + }, + [](CodeGenFunction &) {}); EmitBlock(LoopExit.getBlock()); // Tell the runtime we are done. auto &&CodeGen = [&S](CodeGenFunction &CGF) { @@ -3345,13 +3370,18 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, // iteration space is divided into chunks that are approximately equal // in size, and at most one chunk is distributed to each team of the // league. The size of the chunks is unspecified in this case. + bool StaticChunked = RT.isStaticChunked( + ScheduleKind, /* Chunked */ Chunk != nullptr) && + isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); if (RT.isStaticNonchunked(ScheduleKind, - /* Chunked */ Chunk != nullptr)) { + /* Chunked */ Chunk != nullptr) || + StaticChunked) { if (isOpenMPSimdDirective(S.getDirectiveKind())) EmitOMPSimdInit(S, /*IsMonotonic=*/true); CGOpenMPRuntime::StaticRTInput StaticInit( IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(), - LB.getAddress(), UB.getAddress(), ST.getAddress()); + LB.getAddress(), UB.getAddress(), ST.getAddress(), + StaticChunked ? Chunk : nullptr); RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, StaticInit); JumpDest LoopExit = @@ -3370,15 +3400,45 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, ? S.getCombinedCond() : S.getCond(); - // for distribute alone, codegen - // while (idx <= UB) { BODY; ++idx; } - // when combined with 'for' (e.g. as in 'distribute parallel for') - // while (idx <= UB) { <CodeGen rest of pragma>; idx += ST; } + if (StaticChunked) + Cond = S.getCombinedDistCond(); + + // For static unchunked schedules generate: + // + // 1. For distribute alone, codegen + // while (idx <= UB) { + // BODY; + // ++idx; + // } + // + // 2. When combined with 'for' (e.g. as in 'distribute parallel for') + // while (idx <= UB) { + // <CodeGen rest of pragma>(LB, UB); + // idx += ST; + // } + // + // For static chunk one schedule generate: + // + // while (IV <= GlobalUB) { + // <CodeGen rest of pragma>(LB, UB); + // LB += ST; + // UB += ST; + // UB = min(UB, GlobalUB); + // IV = LB; + // } + // EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), Cond, IncExpr, [&S, LoopExit, &CodeGenLoop](CodeGenFunction &CGF) { CodeGenLoop(CGF, S, LoopExit); }, - [](CodeGenFunction &) {}); + [&S, StaticChunked](CodeGenFunction &CGF) { + if (StaticChunked) { + CGF.EmitIgnoredExpr(S.getCombinedNextLowerBound()); + CGF.EmitIgnoredExpr(S.getCombinedNextUpperBound()); + CGF.EmitIgnoredExpr(S.getCombinedEnsureUpperBound()); + CGF.EmitIgnoredExpr(S.getCombinedInit()); + } + }); EmitBlock(LoopExit.getBlock()); // Tell the runtime we are done. RT.emitForStaticFinish(*this, S.getBeginLoc(), S.getDirectiveKind()); |