[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device.

This patch adds codegen for the 'target parallel' directive on the NVPTX device.

We refer to offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme.

This patch adds SPMD codegen support for 'target parallel' on the NVPTX device; the same support can be reused for other SPMD constructs.

Reviewers: ABataev
Differential Revision: https://reviews.llvm.org/D28755
llvm-svn: 292428
author: Arpith Chacko Jacob <acjacob@us.ibm.com> 2017-01-18 19:35:00 +0000
committer: Arpith Chacko Jacob <acjacob@us.ibm.com> 2017-01-18 19:35:00 +0000
commit: 44a87c9f1b330d243eae87c60151711c5b0b486a (patch)
tree: 66af7f701bfa080c252bcea9549369c3533c4088 /clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
parent: 11255ec7659cec933e0b0415c14d50ca80cb16cd (diff)
download: bcm5719-llvm-44a87c9f1b330d243eae87c60151711c5b0b486a.tar.gz
bcm5719-llvm-44a87c9f1b330d243eae87c60151711c5b0b486a.zip
1 file changed, 198 insertions, 9 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index f03c0d9c380..f87649c7d6b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -26,6 +26,11 @@ enum OpenMPRTLFunctionNVPTX {
   OMPRTL_NVPTX__kmpc_kernel_init,
   /// \brief Call to void __kmpc_kernel_deinit();
   OMPRTL_NVPTX__kmpc_kernel_deinit,
+  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+  /// short RequiresOMPRuntime, short RequiresDataSharing);
+  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
+  /// \brief Call to void __kmpc_spmd_kernel_deinit();
+  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
   /// \brief Call to void __kmpc_kernel_prepare_parallel(void
   /// *outlined_function);
   OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
@@ -76,6 +81,25 @@ public:
     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
   }
 };
+
+// Tracks the execution mode while generating code for directives within
+// a target region. The appropriate mode (generic/spmd) is set on entry
+// to the target region and used by enclosed directives such as 'parallel'
+// to emit optimized code. The previous mode is restored on destruction.
+class ExecutionModeRAII {
+private:
+  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode; // Mode in effect before us.
+  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode; // Mode slot being overridden.
+
+public:
+  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
+                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
+      : Mode(Mode) {
+    SavedMode = Mode; // Remember the caller's mode for the destructor.
+    Mode = NewMode;
+  }
+  ~ExecutionModeRAII() { Mode = SavedMode; } // Put the saved mode back.
+};
 } // anonymous namespace
 
 /// Get the GPU warp size.
@@ -116,12 +140,17 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
 
 /// Get the value of the thread_limit clause in the teams directive.
-/// The runtime encodes thread_limit in the launch parameter, always starting
-/// thread_limit+warpSize threads per team.
-static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
+/// For the 'generic' execution mode, the runtime encodes thread_limit in
+/// the launch parameters, always starting thread_limit+warpSize threads per
+/// CTA. The threads in the last warp are reserved for master execution.
+/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
+static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
+                                   bool IsInSpmdExecutionMode = false) {
   CGBuilderTy &Bld = CGF.Builder;
-  return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
-                       "thread_limit");
+  return IsInSpmdExecutionMode
+             ? getNVPTXNumThreads(CGF)
+             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
+                             "thread_limit");
 }
 
 /// Get the thread id of the OMP master thread.
@@ -159,12 +188,33 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
 }
 
+bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const { // Spmd active?
+  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+}
+
+static CGOpenMPRuntimeNVPTX::ExecutionMode // Mode chosen from D's kind.
+getExecutionModeForDirective(CodeGenModule &CGM, // CGM is unused here.
+                             const OMPExecutableDirective &D) {
+  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+  switch (DirectiveKind) {
+  case OMPD_target: // Plain 'target' maps to the Generic mode.
+    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+  case OMPD_target_parallel: // 'target parallel' maps to the Spmd mode.
+    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+  default: // No other directive kind is handled here.
+    llvm_unreachable("Unsupported directive on NVPTX device.");
+  }
+  llvm_unreachable("Unsupported directive on NVPTX device.");
+}
+
 void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                              StringRef ParentName,
                                              llvm::Function *&OutlinedFn,
                                              llvm::Constant *&OutlinedFnID,
                                              bool IsOffloadEntry,
                                              const RegionCodeGenTy &CodeGen) {
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM);
   Work.clear();
@@ -252,6 +302,94 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
   EST.ExitBB = nullptr;
 }
 
+void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
+                                          StringRef ParentName,
+                                          llvm::Function *&OutlinedFn,
+                                          llvm::Constant *&OutlinedFnID,
+                                          bool IsOffloadEntry,
+                                          const RegionCodeGenTy &CodeGen) {
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
+  EntryFunctionState EST; // Passed to the SPMD entry header and footer.
+
+  // Emit the target region standalone, wrapped by the SPMD header/footer.
+  class NVPTXPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeNVPTX &RT;
+    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+    const OMPExecutableDirective &D;
+
+  public:
+    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
+                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+                         const OMPExecutableDirective &D)
+        : RT(RT), EST(EST), D(D) {}
+    void Enter(CodeGenFunction &CGF) override { // Emits the entry header.
+      RT.emitSpmdEntryHeader(CGF, EST, D);
+    }
+    void Exit(CodeGenFunction &CGF) override { // Emits the entry footer.
+      RT.emitSpmdEntryFooter(CGF, EST);
+    }
+  } Action(*this, EST, D); // The only instance of the local class.
+  CodeGen.setAction(Action);
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   IsOffloadEntry, CodeGen);
+  return; // Redundant at the end of a void function.
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
+    CodeGenFunction &CGF, EntryFunctionState &EST,
+    const OMPExecutableDirective &D) { // D is not used in the body.
+  auto &Bld = CGF.Builder;
+
+  // Create the entry function's blocks; '.exit' is only created here.
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+  EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  // Initialize the OMP state in the runtime; called by all active threads.
+  // TODO: Set RequiresOMPRuntime and RequiresDataSharing (both fixed to 1
+  // for now) based on code analysis of the target region.
+  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
+                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
+                         /*RequiresDataSharing=*/Bld.getInt16(1)};
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
+  CGF.EmitBranch(ExecuteBB);
+
+  CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
+                                               EntryFunctionState &EST) {
+  if (!EST.ExitBB) // Create '.exit' if it does not exist yet.
+    EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+  CGF.EmitBranch(OMPDeInitBB);
+
+  CGF.EmitBlock(OMPDeInitBB);
+  // Deinitialize the OMP state in the runtime; called by all active threads.
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
+  CGF.EmitBranch(EST.ExitBB);
+
+  CGF.EmitBlock(EST.ExitBB);
+  EST.ExitBB = nullptr; // Clear the pointer once the block is emitted.
+}
+
+// Create a global variable named '<Name>_exec_mode' that records the execution
+// mode of this target region: a constant i8 with weak linkage initialized to
+// Mode ('generic' or 'spmd', depending on the target directive). The offload
+// library picks this variable up to set up the device before kernel launch.
+// If the execution mode is 'generic', the runtime reserves one warp for the
+// master; otherwise, all warps participate in parallel work.
+static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
+                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
+  (void)new llvm::GlobalVariable( // Pointer intentionally discarded.
+      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
+}
+
 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
   auto &Ctx = CGM.getContext();
 
@@ -385,6 +523,22 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
+    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+    // short RequiresOMPRuntime, short RequiresDataSharing);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
+    // Build void __kmpc_spmd_kernel_deinit();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
     /// Build void __kmpc_kernel_prepare_parallel(
     /// void *outlined_function);
@@ -463,12 +617,27 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
 
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
-  emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                    CodeGen);
+  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
+      getExecutionModeForDirective(CGM, D);
+  switch (Mode) {
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
+    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                      CodeGen);
+    break;
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
+    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                   CodeGen);
+    break;
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
+    llvm_unreachable(
+        "Unknown programming model for OpenMP directive on NVPTX target.");
+  }
+
+  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
-    : CGOpenMPRuntime(CGM) {
+    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
 }
@@ -523,7 +692,10 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
   if (!CGF.HaveInsertPoint())
     return;
 
-  emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+  if (isInSpmdExecutionMode())
+    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+  else
+    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
@@ -593,3 +765,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
     ThenRCG(CGF);
   }
 }
+
+void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  // Just call the outlined function to execute the parallel region:
+  //   OutlinedFn(null, null, CapturedVars...);
+  // Both leading arguments are null i32 pointers. Loc and IfCond are unused.
+  // TODO: Do something with IfCond when support for the 'if' clause
+  // is added on Spmd target directives.
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  OutlinedFnArgs.push_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}
author	Arpith Chacko Jacob <acjacob@us.ibm.com>	2017-01-18 19:35:00 +0000
committer	Arpith Chacko Jacob <acjacob@us.ibm.com>	2017-01-18 19:35:00 +0000
commit	44a87c9f1b330d243eae87c60151711c5b0b486a (patch)
tree	66af7f701bfa080c252bcea9549369c3533c4088 /clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
parent	11255ec7659cec933e0b0415c14d50ca80cb16cd (diff)
download	bcm5719-llvm-44a87c9f1b330d243eae87c60151711c5b0b486a.tar.gz bcm5719-llvm-44a87c9f1b330d243eae87c60151711c5b0b486a.zip