summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu159
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/interface.h9
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu5
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h45
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/option.h4
5 files changed, 222 insertions, 0 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 65ba1f53b34..e2a38e35e0b 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -322,3 +322,162 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization).
+EXTERN void __kmpc_data_sharing_init_stack() {
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ unsigned WID = getWarpId();
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+ __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID);
+
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+
+ // We initialize the list of references to arguments here.
+ omptarget_nvptx_globalArgs.Init();
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+//
+// TODO: allow more than one push per slot to save on calls to malloc.
+// Currently there is only one slot for each push so the data size in the slot
+// is the same size as the size being requested.
+//
+// Called by: master, TODO: call by workers
+EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
+ int16_t UseSharedMemory) {
+ // TODO: Add shared memory support. For now, use global memory only for
+ // storing the data sharing slots so ignore the pre-allocated
+ // shared memory slot.
+
+ // Use global memory for storing the stack.
+ if (IsMasterThread()) {
+ unsigned WID = getWarpId();
+
+ // SlotP will point to either the shared memory slot or an existing
+ // global memory slot.
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
+
+ // The slot for holding the data we are pushing.
+ __kmpc_data_sharing_slot *NewSlot = 0;
+ size_t NewSize = size;
+
+ // Check if there is a next slot.
+ if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+ // Attempt to re-use an existing slot provided the data fits in the slot.
+ // The leftover data space will not be used.
+ ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+ (uintptr_t)(&ExistingSlot->Data[0]);
+ if (ExistingSlotSize >= NewSize)
+ NewSlot = ExistingSlot;
+ else
+ free(ExistingSlot);
+ }
+
+ if (!NewSlot) {
+ NewSlot = (__kmpc_data_sharing_slot *)malloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize);
+ NewSlot->Next = 0;
+ NewSlot->Prev = SlotP;
+
+ // This is the last slot, save it.
+ TailSlotP = NewSlot;
+ }
+
+ NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+ SlotP->Next = NewSlot;
+ SlotP = NewSlot;
+
+ return (void*)&SlotP->Data[0];
+ }
+
+ // TODO: add memory fence here when this function can be called by
+ // worker threads also. For now, this function is only called by the
+ // master thread of each team.
+
+ // TODO: implement sharing across workers.
+ return 0;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *a) {
+ if (IsMasterThread()) {
+ unsigned WID = getWarpId();
+
+ __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID];
+
+ if (S->Prev)
+ S = S->Prev;
+
+ // If this will "pop" the last global memory node then it is likely
+ // that we are at the end of the data sharing region and we can
+ // de-allocate any existing global memory slots.
+ if (!S->Prev) {
+ __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
+
+ while(Tail && Tail->Prev) {
+ Tail = Tail->Prev;
+ free(Tail->Next);
+ Tail->Next=0;
+ }
+ }
+
+ return;
+ }
+
+ // TODO: add memory fence here when this function can be called by
+ // worker threads also. For now, this function is only called by the
+ // master thread of each team.
+
+ // TODO: implement sharing across workers.
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
+ omptarget_nvptx_globalArgs.EnsureSize(nArgs);
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
+
+// End a data sharing context. There is no need to have a list of refs
+// to shared variables because the context in which those variables were
+// shared has now ended. This should clean-up the list of references only
+// without affecting the actual global storage of the variables.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_end_sharing_variables() {
+ omptarget_nvptx_globalArgs.DeInit();
+}
+
+// This function will return a list of references to global variables. This
+// is how the workers will get a reference to the globalized variable. The
+// members of this list will be passed to the outlined parallel function
+// preserving the order.
+// Called by all workers.
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
index f314443a8f2..34e33d104b5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -483,11 +483,20 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
int32_t *LaneId, int32_t *NumLanes);
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
+
+EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
+EXTERN void __kmpc_data_sharing_pop_stack(void *a);
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN void __kmpc_end_sharing_variables();
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+
// The slot used for data sharing by the master and worker threads. We use a
// complete (default size version and an incomplete one so that we allow sizes
// greater than the default).
struct __kmpc_data_sharing_slot {
__kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
void *DataEnd;
char Data[];
};
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index adedc0b3aa5..33303e75fd9 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -46,3 +46,8 @@ __device__ __shared__ DataSharingStateTy DataSharingState;
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing related variables.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 9ceebfc4f20..4276f02215d 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -62,6 +62,46 @@
#define __ACTIVEMASK() __ballot(1)
#endif
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+ // All these methods must be called by the master thread only.
+ INLINE void Init() {
+ args = buffer;
+ nArgs = MAX_SHARED_ARGS;
+ }
+ INLINE void DeInit() {
+ // Free any memory allocated for outlined parallel function with a large
+ // number of arguments.
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ Init();
+ }
+ }
+ INLINE void EnsureSize(size_t size) {
+ if (size > nArgs) {
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ }
+ args = (void **) SafeMalloc(size * sizeof(void *),
+ (char *)"new extended args");
+ nArgs = size;
+ }
+ }
+ // Called by all threads.
+ INLINE void **GetArgs() { return args; };
+private:
+ // buffer of pre-allocated arguments.
+ void *buffer[MAX_SHARED_ARGS];
+ // pointer to arguments buffer.
+ // starts off as a pointer to 'buffer' but can be dynamically allocated.
+ void **args;
+ // starts off as MAX_SHARED_ARGS but can increase in size.
+ uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The maximum number of workers in a kernel.
@@ -80,6 +120,7 @@ enum DATA_SHARING_SIZES {
struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];
+ __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number];
void *FramePtr[DS_Max_Warp_Number];
int32_t ActiveThreads[DS_Max_Warp_Number];
};
@@ -87,6 +128,7 @@ struct DataSharingStateTy {
// size of 4*32 bytes.
struct __kmpc_data_sharing_worker_slot_static {
__kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];
};
@@ -94,6 +136,7 @@ struct __kmpc_data_sharing_worker_slot_static {
// size of 4 bytes.
struct __kmpc_data_sharing_master_slot_static {
__kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
void *DataEnd;
char Data[DS_Slot_Size];
};
@@ -223,6 +266,7 @@ public:
master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
// We currently do not have a next slot.
master_rootS[0].Next = 0;
+ master_rootS[0].Prev = 0;
return (__kmpc_data_sharing_slot *)&master_rootS[0];
}
// Initialize the pointer to the end of the slot given the size of the data
@@ -231,6 +275,7 @@ public:
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
// We currently do not have a next slot.
worker_rootS[wid].Next = 0;
+ worker_rootS[wid].Prev = 0;
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
index c5f5d9c04fb..43172ad45d0 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -28,6 +28,10 @@
// region to synchronize with each other.
#define L1_BARRIER (1)
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
OpenPOWER on IntegriCloud