summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu159
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/interface.h9
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu5
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h45
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/option.h4
5 files changed, 222 insertions, 0 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 65ba1f53b34..e2a38e35e0b 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -322,3 +322,162 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization).
+EXTERN void __kmpc_data_sharing_init_stack() {
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ unsigned WID = getWarpId();
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+ __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID);
+
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+
+ // We initialize the list of references to arguments here.
+ omptarget_nvptx_globalArgs.Init();
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+//
+// TODO: allow more than one push per slot to save on calls to malloc.
+// Currently there is only one slot for each push so the data size in the slot
+// is the same size as the size being requested.
+//
+// Called by: master, TODO: call by workers
+EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
+ int16_t UseSharedMemory) {
+ // TODO: Add shared memory support. For now, use global memory only for
+ // storing the data sharing slots so ignore the pre-allocated
+ // shared memory slot.
+
+ // Use global memory for storing the stack.
+ if (IsMasterThread()) {
+ unsigned WID = getWarpId();
+
+ // SlotP will point to either the shared memory slot or an existing
+ // global memory slot.
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
+
+ // The slot for holding the data we are pushing.
+ __kmpc_data_sharing_slot *NewSlot = 0;
+ size_t NewSize = size;
+
+ // Check if there is a next slot.
+ if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+ // Attempt to re-use an existing slot provided the data fits in the slot.
+ // The leftover data space will not be used.
+ ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+ (uintptr_t)(&ExistingSlot->Data[0]);
+ if (ExistingSlotSize >= NewSize)
+ NewSlot = ExistingSlot;
+ else
+ free(ExistingSlot);
+ }
+
+ if (!NewSlot) {
+ NewSlot = (__kmpc_data_sharing_slot *)malloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize);
+ NewSlot->Next = 0;
+ NewSlot->Prev = SlotP;
+
+ // This is the last slot, save it.
+ TailSlotP = NewSlot;
+ }
+
+ NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+ SlotP->Next = NewSlot;
+ SlotP = NewSlot;
+
+ return (void*)&SlotP->Data[0];
+ }
+
+ // TODO: add memory fence here when this function can be called by
+ // worker threads also. For now, this function is only called by the
+ // master thread of each team.
+
+ // TODO: implement sharing across workers.
+ return 0;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *a) {
+ if (IsMasterThread()) {
+ unsigned WID = getWarpId();
+
+ __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID];
+
+ if (S->Prev)
+ S = S->Prev;
+
+ // If this will "pop" the last global memory node then it is likely
+ // that we are at the end of the data sharing region and we can
+ // de-allocate any existing global memory slots.
+ if (!S->Prev) {
+ __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
+
+ while(Tail && Tail->Prev) {
+ Tail = Tail->Prev;
+ free(Tail->Next);
+ Tail->Next=0;
+ }
+ }
+
+ return;
+ }
+
+ // TODO: add memory fence here when this function can be called by
+ // worker threads also. For now, this function is only called by the
+ // master thread of each team.
+
+ // TODO: implement sharing across workers.
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
+ omptarget_nvptx_globalArgs.EnsureSize(nArgs);
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
+
+// End a data sharing context. There is no need to have a list of refs
+// to shared variables because the context in which those variables were
+// shared has now ended. This should clean-up the list of references only
+// without affecting the actual global storage of the variables.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_end_sharing_variables() {
+ omptarget_nvptx_globalArgs.DeInit();
+}
+
+// This function will return a list of references to global variables. This
+// is how the workers will get a reference to the globalized variable. The
+// members of this list will be passed to the outlined parallel function
+// preserving the order.
+// Called by all workers.
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
index f314443a8f2..34e33d104b5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -483,11 +483,20 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
int32_t *LaneId, int32_t *NumLanes);
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
+
+EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
+EXTERN void __kmpc_data_sharing_pop_stack(void *a);
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN void __kmpc_end_sharing_variables();
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+
// The slot used for data sharing by the master and worker threads. We use a
// complete (default size version and an incomplete one so that we allow sizes
// greater than the default).
struct __kmpc_data_sharing_slot {
__kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
void *DataEnd;
char Data[];
};
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index adedc0b3aa5..33303e75fd9 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -46,3 +46,8 @@ __device__ __shared__ DataSharingStateTy DataSharingState;
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing related variables.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 9ceebfc4f20..4276f02215d 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -62,6 +62,46 @@
#define __ACTIVEMASK() __ballot(1)
#endif
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+ // All these methods must be called by the master thread only.
+ INLINE void Init() {
+ args = buffer;
+ nArgs = MAX_SHARED_ARGS;
+ }
+ INLINE void DeInit() {
+ // Free any memory allocated for outlined parallel function with a large
+ // number of arguments.
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ Init();
+ }
+ }
+ INLINE void EnsureSize(size_t size) {
+ if (size > nArgs) {
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ }
+ args = (void **) SafeMalloc(size * sizeof(void *),
+ (char *)"new extended args");
+ nArgs = size;
+ }
+ }
+ // Called by all threads.
+ INLINE void **GetArgs() { return args; };
+private:
+ // buffer of pre-allocated arguments.
+ void *buffer[MAX_SHARED_ARGS];
+ // pointer to arguments buffer.
+ // starts off as a pointer to 'buffer' but can be dynamically allocated.
+ void **args;
+ // starts off as MAX_SHARED_ARGS but can increase in size.
+ uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The maximum number of workers in a kernel.
@@ -80,6 +120,7 @@ enum DATA_SHARING_SIZES {
struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];
+ __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number];
void *FramePtr[DS_Max_Warp_Number];
int32_t ActiveThreads[DS_Max_Warp_Number];
};
@@ -87,6 +128,7 @@ struct DataSharingStateTy {
// size of 4*32 bytes.
struct __kmpc_data_sharing_worker_slot_static {
__kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];
};
@@ -94,6 +136,7 @@ struct __kmpc_data_sharing_worker_slot_static {
// size of 4 bytes.
struct __kmpc_data_sharing_master_slot_static {
__kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
void *DataEnd;
char Data[DS_Slot_Size];
};
@@ -223,6 +266,7 @@ public:
master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
// We currently do not have a next slot.
master_rootS[0].Next = 0;
+ master_rootS[0].Prev = 0;
return (__kmpc_data_sharing_slot *)&master_rootS[0];
}
// Initialize the pointer to the end of the slot given the size of the data
@@ -231,6 +275,7 @@ public:
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
// We currently do not have a next slot.
worker_rootS[wid].Next = 0;
+ worker_rootS[wid].Prev = 0;
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
index c5f5d9c04fb..43172ad45d0 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -28,6 +28,10 @@
// region to synchronize with each other.
#define L1_BARRIER (1)
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
OpenPOWER on IntegriCloud