diff options
6 files changed, 65 insertions, 1 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h index f314443a8f2..4005353b17c 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -470,8 +470,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,                                      int16_t RequiresDataSharing);  EXTERN void __kmpc_spmd_kernel_deinit();  EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, +                                           void ***SharedArgs, int32_t nArgs,                                             int16_t IsOMPRuntimeInitialized); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, void ***SharedArgs,                                     int16_t IsOMPRuntimeInitialized);  EXTERN void __kmpc_kernel_end_parallel();  EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu index adedc0b3aa5..8ff8f7b4723 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -46,3 +46,8 @@ __device__ __shared__ DataSharingStateTy DataSharingState;  // Scratchpad for teams reduction.  ////////////////////////////////////////////////////////////////////////////////  __device__ __shared__ void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing related variables. +//////////////////////////////////////////////////////////////////////////////// +__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 5c5c88b2cb6..fca2aa0e4ac 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -54,6 +54,9 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {    PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",          OMPTARGET_NVPTX_VERSION); +  // init parallel work arguments +  omptarget_nvptx_sharedArgs.Init(); +    if (!RequiresOMPRuntime) {      // If OMP runtime is not required don't initialize OMP state.      setExecutionParameters(Generic, RuntimeUninitialized); @@ -107,6 +110,9 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {    }    // Done with work.  Kill the workers.    omptarget_nvptx_workFn = 0; + +  // Deinit parallel work arguments +  omptarget_nvptx_sharedArgs.DeInit();  }  EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 3ee32f96480..43b8dd27bf5 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -62,6 +62,46 @@  #define __ACTIVEMASK() __ballot(1)  #endif +// arguments needed for L0 parallelism only. +class omptarget_nvptx_SharedArgs { +public: +  // All these methods must be called by the master thread only. +  INLINE void Init() { +    args  = buffer; +    nArgs = MAX_SHARED_ARGS; +  } +  INLINE void DeInit() { +    // Free any memory allocated for outlined parallel function with a large +    // number of arguments. +    if (nArgs > MAX_SHARED_ARGS) { +      SafeFree(args, (char *)"new extended args"); +      Init(); +    } +  } +  INLINE void EnsureSize(int size) { +    if (size > nArgs) { +      if (nArgs > MAX_SHARED_ARGS) { +        SafeFree(args, (char *)"new extended args"); +      } +      args = (void **) SafeMalloc(size * sizeof(void *), +                                  (char *)"new extended args"); +      nArgs = size; +    } +  } +  // Called by all threads. +  INLINE void **GetArgs() { return args; }; +private: +  // buffer of pre-allocated arguments. +  void *buffer[MAX_SHARED_ARGS]; +  // pointer to arguments buffer. +  // starts off as a pointer to 'buffer' but can be dynamically allocated. +  void **args; +  // starts off as MAX_SHARED_ARGS but can increase in size. +  uint32_t nArgs; +}; + +extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; +  // Data sharing related quantities, need to match what is used in the compiler.  enum DATA_SHARING_SIZES {    // The maximum number of workers in a kernel. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h index c5f5d9c04fb..43172ad45d0 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h @@ -28,6 +28,10 @@  // region to synchronize with each other.  #define L1_BARRIER (1) +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 +  // Maximum number of omp state objects per SM allocated statically in global  // memory.  #if __CUDA_ARCH__ >= 600 diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d4546284fac..0b9ebd401f8 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -214,10 +214,16 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {  //  // This routine is always called by the team master..  EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, +                                           void ***SharedArgs, int32_t nArgs,                                             int16_t IsOMPRuntimeInitialized) {    PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");    omptarget_nvptx_workFn = WorkFn; +  if (nArgs > 0) { +    omptarget_nvptx_sharedArgs.EnsureSize(nArgs); +    *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); +  } +    if (!IsOMPRuntimeInitialized)      return; @@ -317,11 +323,13 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,  //  // Only the worker threads call this routine.  EXTERN bool __kmpc_kernel_parallel(void **WorkFn, +                                   void ***SharedArgs,                                     int16_t IsOMPRuntimeInitialized) {    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");    // Work function and arguments for L1 parallel region.    *WorkFn = omptarget_nvptx_workFn; +  *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();    if (!IsOMPRuntimeInitialized)      return true;  | 

