path: root/clang/lib/CodeGen
author     Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>   2018-10-29 15:45:47 +0000
committer  Gheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>   2018-10-29 15:45:47 +0000
commit     e92567601b4b457d7f68a31ee21d2c2769c8de3b (patch)
tree       c5618d3b4991e48ae9e50d218a9a2d720437e4f0 /clang/lib/CodeGen
parent     98640f8456bc9b66f983d00d02e54ba012581b8b (diff)
[OpenMP][NVPTX] Use single loops when generating code for distribute parallel for
Summary:
This patch adds a new code generation path for bound sharing directives containing distribute parallel for. The new code generation scheme applies to chunked schedules on the distribute and parallel for directives. The scheme simplifies the generated code by eliminating the need for an outer loop over chunks for both the distribute and parallel for directives. For distribute it applies to any chunk size; for parallel for it applies only when the chunk size is 1.

Reviewers: ABataev, caomhin

Reviewed By: ABataev

Subscribers: jholewinski, guansong, cfe-commits

Differential Revision: https://reviews.llvm.org/D53448

llvm-svn: 345509
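As an illustration only (not part of the patch), an OpenMP construct that exercises the new path; the chunk size of 128 and the map clauses are arbitrary choices for this example:

    // Chunked 'dist_schedule' (any chunk size) and 'schedule(static, 1)' on the
    // inner worksharing loop are the cases this patch lowers to single loops
    // instead of an outer loop over chunks.
    void saxpy(int n, float a, const float *x, float *y) {
      #pragma omp target teams distribute parallel for \
          map(to: x[0:n]) map(tofrom: y[0:n]) \
          dist_schedule(static, 128) schedule(static, 1)
      for (int i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
    }

On the NVPTX device, omitting the schedule clause has the same effect, since the patch makes schedule(static, 1) the default there (see getDefaultScheduleAndChunk below).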
Diffstat (limited to 'clang/lib/CodeGen')
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntime.cpp      |  12
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntime.h        |  16
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp |   9
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h   |   2
-rw-r--r--  clang/lib/CodeGen/CGStmtOpenMP.cpp         | 116
5 files changed, 122 insertions, 33 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index cd3bc3a8606..47828d189c8 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3292,6 +3292,18 @@ bool CGOpenMPRuntime::isStaticNonchunked(
return Schedule == OMP_dist_sch_static;
}
+bool CGOpenMPRuntime::isStaticChunked(OpenMPScheduleClauseKind ScheduleKind,
+ bool Chunked) const {
+ OpenMPSchedType Schedule =
+ getRuntimeSchedule(ScheduleKind, Chunked, /*Ordered=*/false);
+ return Schedule == OMP_sch_static_chunked;
+}
+
+bool CGOpenMPRuntime::isStaticChunked(
+ OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
+ OpenMPSchedType Schedule = getRuntimeSchedule(ScheduleKind, Chunked);
+ return Schedule == OMP_dist_sch_static_chunked;
+}
bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
OpenMPSchedType Schedule =
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 3fb1a1538fa..0cc1056ffab 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -890,6 +890,20 @@ public:
virtual bool isStaticNonchunked(OpenMPDistScheduleClauseKind ScheduleKind,
bool Chunked) const;
+ /// Check if the specified \a ScheduleKind is static chunked.
+ /// \param ScheduleKind Schedule kind specified in the 'schedule' clause.
+ /// \param Chunked True if chunk is specified in the clause.
+ ///
+ virtual bool isStaticChunked(OpenMPScheduleClauseKind ScheduleKind,
+ bool Chunked) const;
+
+ /// Check if the specified \a ScheduleKind is static chunked.
+ /// \param ScheduleKind Schedule kind specified in the 'dist_schedule' clause.
+ /// \param Chunked True if chunk is specified in the clause.
+ ///
+ virtual bool isStaticChunked(OpenMPDistScheduleClauseKind ScheduleKind,
+ bool Chunked) const;
+
/// Check if the specified \a ScheduleKind is dynamic.
/// This kind of worksharing directive is emitted without outer loop.
/// \param ScheduleKind Schedule Kind specified in the 'schedule' clause.
@@ -1506,7 +1520,7 @@ public:
/// schedule clause.
virtual void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind,
- llvm::Value *&Chunk) const {}
+ const Expr *&ChunkExpr) const {}
/// Emits call of the outlined function with the provided arguments,
/// translating these arguments to correct target-specific arguments.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index c8dd1a6f7b6..101ab54d585 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4247,8 +4247,11 @@ void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
CodeGenFunction &CGF, const OMPLoopDirective &S,
OpenMPScheduleClauseKind &ScheduleKind,
- llvm::Value *&Chunk) const {
+ const Expr *&ChunkExpr) const {
ScheduleKind = OMPC_SCHEDULE_static;
- Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize(
- S.getIterationVariable()->getType()), 1);
+ // Chunk size is 1 in this case.
+ llvm::APInt ChunkSize(32, 1);
+ ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
+ CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+ SourceLocation());
}
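As a minimal, self-contained sketch of the AST construction used in the hunk above to express the default chunk size of 1 (the helper name makeDefaultChunkExpr is hypothetical, not part of the patch):

    #include "clang/AST/ASTContext.h"
    #include "clang/AST/Expr.h"
    #include "clang/Basic/SourceLocation.h"
    #include "llvm/ADT/APInt.h"

    // Hypothetical helper: build the constant-1 chunk expression as an unsigned
    // 32-bit integer literal, mirroring getDefaultScheduleAndChunk above.
    static const clang::Expr *makeDefaultChunkExpr(clang::ASTContext &Ctx) {
      llvm::APInt ChunkSize(/*numBits=*/32, /*val=*/1);
      return clang::IntegerLiteral::Create(
          Ctx, ChunkSize,
          Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0),
          clang::SourceLocation());
    }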
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index d40e6cfbdd1..1ce8a175dae 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -348,7 +348,7 @@ public:
/// Choose a default value for the schedule clause.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind,
- llvm::Value *&Chunk) const override;
+ const Expr *&ChunkExpr) const override;
private:
/// Track the execution mode when codegening directives within a target
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index c8937956f45..718b8c326dc 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2006,7 +2006,7 @@ void CodeGenFunction::EmitOMPDistributeOuterLoop(
RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, StaticInit);
// for combined 'distribute' and 'for' the increment expression of distribute
- // is store in DistInc. For 'distribute' alone, it is in Inc.
+ // is stored in DistInc. For 'distribute' alone, it is in Inc.
Expr *IncExpr;
if (isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()))
IncExpr = S.getDistInc();
@@ -2298,22 +2298,28 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
(void)LoopScope.Privatize();
// Detect the loop schedule kind and chunk.
- llvm::Value *Chunk = nullptr;
+ const Expr *ChunkExpr = nullptr;
OpenMPScheduleTy ScheduleKind;
if (const auto *C = S.getSingleClause<OMPScheduleClause>()) {
ScheduleKind.Schedule = C->getScheduleKind();
ScheduleKind.M1 = C->getFirstScheduleModifier();
ScheduleKind.M2 = C->getSecondScheduleModifier();
- if (const Expr *Ch = C->getChunkSize()) {
- Chunk = EmitScalarExpr(Ch);
- Chunk = EmitScalarConversion(Chunk, Ch->getType(),
- S.getIterationVariable()->getType(),
- S.getBeginLoc());
- }
+ ChunkExpr = C->getChunkSize();
} else {
// Default behaviour for schedule clause.
CGM.getOpenMPRuntime().getDefaultScheduleAndChunk(
- *this, S, ScheduleKind.Schedule, Chunk);
+ *this, S, ScheduleKind.Schedule, ChunkExpr);
+ }
+ bool HasChunkSizeOne = false;
+ llvm::Value *Chunk = nullptr;
+ if (ChunkExpr) {
+ Chunk = EmitScalarExpr(ChunkExpr);
+ Chunk = EmitScalarConversion(Chunk, ChunkExpr->getType(),
+ S.getIterationVariable()->getType(),
+ S.getBeginLoc());
+ llvm::APSInt EvaluatedChunk;
+ if (ChunkExpr->EvaluateAsInt(EvaluatedChunk, getContext()))
+ HasChunkSizeOne = (EvaluatedChunk.getLimitedValue() == 1);
}
const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
@@ -2321,8 +2327,12 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
// If the static schedule kind is specified or if the ordered clause is
// specified, and if no monotonic modifier is specified, the effect will
// be as if the monotonic modifier was specified.
- if (RT.isStaticNonchunked(ScheduleKind.Schedule,
- /* Chunked */ Chunk != nullptr) &&
+ bool StaticChunkedOne = RT.isStaticChunked(ScheduleKind.Schedule,
+ /* Chunked */ Chunk != nullptr) && HasChunkSizeOne &&
+ isOpenMPLoopBoundSharingDirective(S.getDirectiveKind());
+ if ((RT.isStaticNonchunked(ScheduleKind.Schedule,
+ /* Chunked */ Chunk != nullptr) ||
+ StaticChunkedOne) &&
!Ordered) {
if (isOpenMPSimdDirective(S.getDirectiveKind()))
EmitOMPSimdInit(S, /*IsMonotonic=*/true);
@@ -2333,23 +2343,38 @@ bool CodeGenFunction::EmitOMPWorksharingLoop(
// unspecified in this case.
CGOpenMPRuntime::StaticRTInput StaticInit(
IVSize, IVSigned, Ordered, IL.getAddress(), LB.getAddress(),
- UB.getAddress(), ST.getAddress());
+ UB.getAddress(), ST.getAddress(),
+ StaticChunkedOne ? Chunk : nullptr);
RT.emitForStaticInit(*this, S.getBeginLoc(), S.getDirectiveKind(),
ScheduleKind, StaticInit);
JumpDest LoopExit =
getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit"));
// UB = min(UB, GlobalUB);
- EmitIgnoredExpr(S.getEnsureUpperBound());
+ if (!StaticChunkedOne)
+ EmitIgnoredExpr(S.getEnsureUpperBound());
// IV = LB;
EmitIgnoredExpr(S.getInit());
- // while (idx <= UB) { BODY; ++idx; }
- EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(),
- S.getInc(),
- [&S, LoopExit](CodeGenFunction &CGF) {
- CGF.EmitOMPLoopBody(S, LoopExit);
- CGF.EmitStopPoint(&S);
- },
- [](CodeGenFunction &) {});
+ // For unchunked static schedule generate:
+ //
+ // while (idx <= UB) {
+ // BODY;
+ // ++idx;
+ // }
+ //
+ // For static schedule with chunk one:
+ //
+ // while (IV <= PrevUB) {
+ // BODY;
+ // IV += ST;
+ // }
+ EmitOMPInnerLoop(S, LoopScope.requiresCleanups(),
+ StaticChunkedOne ? S.getCombinedParForInDistCond() : S.getCond(),
+ StaticChunkedOne ? S.getDistInc() : S.getInc(),
+ [&S, LoopExit](CodeGenFunction &CGF) {
+ CGF.EmitOMPLoopBody(S, LoopExit);
+ CGF.EmitStopPoint(&S);
+ },
+ [](CodeGenFunction &) {});
EmitBlock(LoopExit.getBlock());
// Tell the runtime we are done.
auto &&CodeGen = [&S](CodeGenFunction &CGF) {
@@ -3345,13 +3370,18 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S,
// iteration space is divided into chunks that are approximately equal
// in size, and at most one chunk is distributed to each team of the
// league. The size of the chunks is unspecified in this case.
+ bool StaticChunked = RT.isStaticChunked(
+ ScheduleKind, /* Chunked */ Chunk != nullptr) &&
+ isOpenMPLoopBoundSharingDirective(S.getDirectiveKind());
if (RT.isStaticNonchunked(ScheduleKind,
- /* Chunked */ Chunk != nullptr)) {
+ /* Chunked */ Chunk != nullptr) ||
+ StaticChunked) {
if (isOpenMPSimdDirective(S.getDirectiveKind()))
EmitOMPSimdInit(S, /*IsMonotonic=*/true);
CGOpenMPRuntime::StaticRTInput StaticInit(
IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(),
- LB.getAddress(), UB.getAddress(), ST.getAddress());
+ LB.getAddress(), UB.getAddress(), ST.getAddress(),
+ StaticChunked ? Chunk : nullptr);
RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind,
StaticInit);
JumpDest LoopExit =
@@ -3370,15 +3400,45 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S,
? S.getCombinedCond()
: S.getCond();
- // for distribute alone, codegen
- // while (idx <= UB) { BODY; ++idx; }
- // when combined with 'for' (e.g. as in 'distribute parallel for')
- // while (idx <= UB) { <CodeGen rest of pragma>; idx += ST; }
+ if (StaticChunked)
+ Cond = S.getCombinedDistCond();
+
+ // For static unchunked schedules generate:
+ //
+ // 1. For distribute alone, codegen
+ // while (idx <= UB) {
+ // BODY;
+ // ++idx;
+ // }
+ //
+ // 2. When combined with 'for' (e.g. as in 'distribute parallel for')
+ // while (idx <= UB) {
+ // <CodeGen rest of pragma>(LB, UB);
+ // idx += ST;
+ // }
+ //
+ // For static chunk one schedule generate:
+ //
+ // while (IV <= GlobalUB) {
+ // <CodeGen rest of pragma>(LB, UB);
+ // LB += ST;
+ // UB += ST;
+ // UB = min(UB, GlobalUB);
+ // IV = LB;
+ // }
+ //
EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), Cond, IncExpr,
[&S, LoopExit, &CodeGenLoop](CodeGenFunction &CGF) {
CodeGenLoop(CGF, S, LoopExit);
},
- [](CodeGenFunction &) {});
+ [&S, StaticChunked](CodeGenFunction &CGF) {
+ if (StaticChunked) {
+ CGF.EmitIgnoredExpr(S.getCombinedNextLowerBound());
+ CGF.EmitIgnoredExpr(S.getCombinedNextUpperBound());
+ CGF.EmitIgnoredExpr(S.getCombinedEnsureUpperBound());
+ CGF.EmitIgnoredExpr(S.getCombinedInit());
+ }
+ });
EmitBlock(LoopExit.getBlock());
// Tell the runtime we are done.
RT.emitForStaticFinish(*this, S.getBeginLoc(), S.getDirectiveKind());
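Putting the two comment blocks above together, the following is a rough sketch (not the emitted IR) of the loop nest this scheme produces for a chunked distribute combined with a chunk-one parallel for. The helper name combinedLoopSketch and its parameters are illustrative only; LB, UB, IV, and GlobalUB follow the names used in the patch comments, and DistST and ParST stand for the strides the runtime hands back for the distribute and parallel-for schedules (with one team and one thread they reduce to the dist_schedule chunk size and 1, respectively).

    #include <algorithm>
    #include <functional>

    // Sketch only: follows a single thread's share of each distribute chunk.
    static void combinedLoopSketch(long LB, long UB, long GlobalUB, long DistST,
                                   long ParST,
                                   const std::function<void(long)> &Body) {
      long IV = LB;
      while (IV <= GlobalUB) {        // one distribute chunk per iteration
        long PrevUB = UB;
        while (IV <= PrevUB) {        // 'parallel for' with chunk size one
          Body(IV);
          IV += ParST;
        }
        LB += DistST;                 // slide the chunk window
        UB += DistST;
        UB = std::min(UB, GlobalUB);
        IV = LB;
      }
    }

Neither level needs an extra loop over chunks: the distribute loop advances its bounds in the continuation of the single loop, and the chunk-one parallel for simply strides by ParST.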