summary | refs | log | tree | commit | diff | stats
path: root/openmp/libomptarget
diff options
context:
space:
mode:
Diffstat (limited to 'openmp/libomptarget')
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu 73
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/interface.h 2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu 2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h 2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h 1
5 files changed, 45 insertions, 35 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index c7b9bdf9a9b..4db9f31a55d 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -129,7 +129,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
@@ -283,7 +283,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void * volatile &FrameP = DataSharingState.FramePtr[WID];
SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
@@ -321,7 +321,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID);
- void *P = DataSharingState.FramePtr[SourceWID];
+ void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
@@ -369,47 +369,31 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
__threadfence_block();
}
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
- int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+ return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
}
+ // Only warp active master threads manage the stack.
+ bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
// Add worst-case padding to DataSize so that future stack allocations are
// correctly aligned.
const size_t Alignment = 8;
- if (DataSize % Alignment != 0) {
- DataSize += (Alignment - DataSize % Alignment);
- }
+ PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
// Frame pointer must be visible to all workers in the same warp.
unsigned WID = getWarpId();
- void *&FrameP = DataSharingState.FramePtr[WID];
+ void *volatile &FrameP = DataSharingState.FramePtr[WID];
- // Only warp active master threads manage the stack.
- if (getThreadId() % WARPSIZE == 0) {
+ if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing
// global memory slot.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
- // Compute the total memory footprint of the requested data.
- // The master thread requires a stack only for itself. A worker
- // thread (which at this point is a warp master) will require
- // space for the variables of each thread in the warp,
- // i.e. one DataSize chunk per warp lane.
- // TODO: change WARPSIZE to the number of active threads in the warp.
- size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
-
// Check if we have room for the data in the current slot.
const uintptr_t StartAddress = (uintptr_t)StackP;
const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
@@ -453,12 +437,39 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
// Reset stack pointer to the requested address.
StackP = (void *)RequestedEndAddress;
}
+ } else {
+ while (!FrameP);
}
- __threadfence_block();
+ return FrameP;
+}
+
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ // Compute the total memory footprint of the requested data.
+ // The master thread requires a stack only for itself. A worker
+ // thread (which at this point is a warp master) will require
+ // space for the variables of each thread in the warp,
+ // i.e. one DataSize chunk per warp lane.
+ // TODO: change WARPSIZE to the number of active threads in the warp.
+ size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+ DataSize : WARPSIZE * DataSize;
// Compute the start address of the frame of each thread in the warp.
- uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+ uintptr_t FrameStartAddress =
+ (uintptr_t) data_sharing_push_stack_common(PushSize);
FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
return (void *)FrameStartAddress;
}
@@ -475,6 +486,8 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
}
+ __threadfence_block();
+
if (getThreadId() % WARPSIZE == 0) {
unsigned WID = getWarpId();
@@ -501,8 +514,6 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
SlotP->Next = 0;
}
}
-
- __threadfence_block();
}
// Begin a data sharing context. Maintain a list of references to shared
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
index aca8fbe7e88..bf36a5a3e6a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@ EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+ int16_t UseSharedMemory);
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
EXTERN void __kmpc_data_sharing_pop_stack(void *a);
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index f23679ca7b0..8b70faef04b 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@ INLINE unsigned nsmid() {
INLINE unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
- ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
- "Expected number of SMs is less than reported.");
return id;
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index e0d4c1679cd..5b621ea5b79 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -123,7 +123,7 @@ enum DATA_SHARING_SIZES {
struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];
- void *FramePtr[DS_Max_Warp_Number];
+ void * volatile FramePtr[DS_Max_Warp_Number];
int32_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker slot
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 9cdcc162dd4..c93657e45e1 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@ INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
{
void *ptr = malloc(size);
PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
- ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
return ptr;
}
OpenPOWER on IntegriCloud