summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2018-02-07 18:21:55 +0000
committerGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2018-02-07 18:21:55 +0000
commitaaeab8d4efa59b62e47e062a34d567e3ca4ae567 (patch)
tree6a1905948bb46ba784332ea05d6e0c89a773a184
parentf4e3f3e31cacd412bcef0906da3c6a4bd503d0a2 (diff)
downloadbcm5719-llvm-aaeab8d4efa59b62e47e062a34d567e3ca4ae567.tar.gz
bcm5719-llvm-aaeab8d4efa59b62e47e062a34d567e3ca4ae567.zip
[OpenMP][libomptarget] Add data sharing support in libomptarget
Summary: This patch extends the libomptarget functionality in patch D14254 with support for the data sharing scheme for supporting implicitly shared variables. The runtime therefore maintains a list of references to shared variables. Reviewers: carlo.bertolli, ABataev, Hahnfeld, grokos, caomhin, hfinkel Reviewed By: Hahnfeld, grokos Subscribers: guansong, llvm-commits, openmp-commits Differential Revision: https://reviews.llvm.org/D41485 llvm-svn: 324495
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/interface.h3
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu5
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu6
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h40
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/option.h4
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu8
6 files changed, 65 insertions, 1 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
index f314443a8f2..4005353b17c 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -470,8 +470,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing);
EXTERN void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+ void ***SharedArgs, int32_t nArgs,
int16_t IsOMPRuntimeInitialized);
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn, void ***SharedArgs,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index adedc0b3aa5..8ff8f7b4723 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -46,3 +46,8 @@ __device__ __shared__ DataSharingStateTy DataSharingState;
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing related variables.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index 5c5c88b2cb6..fca2aa0e4ac 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -54,6 +54,9 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
+ // init parallel work arguments
+ omptarget_nvptx_sharedArgs.Init();
+
if (!RequiresOMPRuntime) {
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Generic, RuntimeUninitialized);
@@ -107,6 +110,9 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
}
// Done with work. Kill the workers.
omptarget_nvptx_workFn = 0;
+
+ // Deinit parallel work arguments
+ omptarget_nvptx_sharedArgs.DeInit();
}
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 3ee32f96480..43b8dd27bf5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -62,6 +62,46 @@
#define __ACTIVEMASK() __ballot(1)
#endif
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+ // All these methods must be called by the master thread only.
+ INLINE void Init() {
+ args = buffer;
+ nArgs = MAX_SHARED_ARGS;
+ }
+ INLINE void DeInit() {
+ // Free any memory allocated for outlined parallel function with a large
+ // number of arguments.
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ Init();
+ }
+ }
+ INLINE void EnsureSize(int size) {
+ if (size > nArgs) {
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ }
+ args = (void **) SafeMalloc(size * sizeof(void *),
+ (char *)"new extended args");
+ nArgs = size;
+ }
+ }
+ // Called by all threads.
+ INLINE void **GetArgs() { return args; };
+private:
+ // buffer of pre-allocated arguments.
+ void *buffer[MAX_SHARED_ARGS];
+ // pointer to arguments buffer.
+ // starts off as a pointer to 'buffer' but can be dynamically allocated.
+ void **args;
+ // starts off as MAX_SHARED_ARGS but can increase in size.
+ uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
+
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The maximum number of workers in a kernel.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
index c5f5d9c04fb..43172ad45d0 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -28,6 +28,10 @@
// region to synchronize with each other.
#define L1_BARRIER (1)
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index d4546284fac..0b9ebd401f8 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -214,10 +214,16 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
//
// This routine is always called by the team master..
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+ void ***SharedArgs, int32_t nArgs,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
omptarget_nvptx_workFn = WorkFn;
+ if (nArgs > 0) {
+ omptarget_nvptx_sharedArgs.EnsureSize(nArgs);
+ *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
+ }
+
if (!IsOMPRuntimeInitialized)
return;
@@ -317,11 +323,13 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+ void ***SharedArgs,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
// Work function and arguments for L1 parallel region.
*WorkFn = omptarget_nvptx_workFn;
+ *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
if (!IsOMPRuntimeInitialized)
return true;
OpenPOWER on IntegriCloud