summaryrefslogtreecommitdiffstats
path: root/openmp/libomptarget/src
diff options
context:
space:
mode:
authorGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2019-08-07 17:29:45 +0000
committerGheorghe-Teodor Bercea <gheorghe-teod.bercea@ibm.com>2019-08-07 17:29:45 +0000
commita1d20506e771a376e293a61e26842a906487d7ef (patch)
treefe8beb028f7eafb3255520f80620cff775a6c213 /openmp/libomptarget/src
parentd8c3c173945e7653b76e0326868464bad2130482 (diff)
downloadbcm5719-llvm-a1d20506e771a376e293a61e26842a906487d7ef.tar.gz
bcm5719-llvm-a1d20506e771a376e293a61e26842a906487d7ef.zip
[OpenMP][libomptarget] Add support for unified memory for regular maps
Summary: This patch adds support for using unified memory in the case of regular maps that happen when a target region is offloaded to the device. In cases where only a single version of the data is required, the host address can be used. When variables need to be privatized in any way or globalized, the copy to the device is still required for correctness. Reviewers: ABataev, jdoerfert, Hahnfeld, AlexEichenberger, caomhin, grokos Reviewed By: Hahnfeld Subscribers: mgorny, guansong, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D65001 llvm-svn: 368192
Diffstat (limited to 'openmp/libomptarget/src')
-rw-r--r--openmp/libomptarget/src/api.cpp10
-rw-r--r--openmp/libomptarget/src/device.cpp54
-rw-r--r--openmp/libomptarget/src/device.h4
-rw-r--r--openmp/libomptarget/src/omptarget.cpp82
4 files changed, 104 insertions, 46 deletions
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 430425a62f5..9eeef4e4872 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -113,7 +113,15 @@ EXTERN int omp_target_is_present(void *ptr, int device_num) {
DeviceTy& Device = Devices[device_num];
bool IsLast; // not used
- int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
+ bool IsHostPtr;
+ void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr);
+ int rc = (TgtPtr != NULL);
+ // Under unified memory the host pointer can be returned by the
+ // getTgtPtrBegin() function, which means that there is no corresponding
+ // device pointer for ptr. This function should return false
+ // in that situation.
+ if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
+ rc = !IsHostPtr;
DP("Call to omp_target_is_present returns %d\n", rc);
return rc;
}
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 5ecba5759eb..0bdcc504c06 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -157,12 +157,17 @@ LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
// If NULL is returned, then either data allocation failed or the user tried
// to do an illegal mapping.
void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
- int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
+ int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit,
+ bool UpdateRefCount) {
void *rc = NULL;
+ IsHostPtr = false;
DataMapMtx.lock();
LookupResult lr = lookupMapping(HstPtrBegin, Size);
// Check if the pointer is contained.
+ // If a variable is mapped to the device manually by the user - which would
+ // cause the IsContained flag to be true - then we must ensure that the
+ // device address is returned even under unified memory conditions.
if (lr.Flags.IsContained ||
((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
auto &HT = *lr.Entry;
@@ -183,15 +188,28 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
// Explicit extension of mapped data - not allowed.
DP("Explicit extension of mapping is not allowed.\n");
} else if (Size) {
- // If it is not contained and Size > 0 we should create a new entry for it.
- IsNew = true;
- uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
- DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
- "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
- DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
- HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
- (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
- rc = (void *)tp;
+ // If unified shared memory is active, implicitly mapped variables that are not
+ // privatized use host address. Any explicitly mapped variables also use
+ // host address where correctness is not impeded. In all other cases
+ // maps are respected.
+ // TODO: In addition to the mapping rules above, when the close map
+ // modifier is implemented, force the mapping of the variable to the device.
+ if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+ DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
+ DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
+ IsHostPtr = true;
+ rc = HstPtrBegin;
+ } else {
+ // If it is not contained and Size > 0 we should create a new entry for it.
+ IsNew = true;
+ uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
+ DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
+ "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
+ DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
+ HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
+ (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
+ rc = (void *)tp;
+ }
}
DataMapMtx.unlock();
@@ -202,8 +220,10 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
// Return the target pointer begin (where the data will be moved).
// Decrement the reference counter if called from target_data_end.
void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
- bool UpdateRefCount) {
+ bool UpdateRefCount, bool &IsHostPtr) {
void *rc = NULL;
+ IsHostPtr = false;
+ IsLast = false;
DataMapMtx.lock();
LookupResult lr = lookupMapping(HstPtrBegin, Size);
@@ -221,8 +241,14 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
(CONSIDERED_INF(HT.RefCount)) ? "INF" :
std::to_string(HT.RefCount).c_str());
rc = (void *)tp;
- } else {
- IsLast = false;
+ } else if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+ // If the value isn't found in the mapping and unified shared memory
+ // is on then it means we have stumbled upon a value which we need to
+ // use directly from the host.
+ DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
+ DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
+ IsHostPtr = true;
+ rc = HstPtrBegin;
}
DataMapMtx.unlock();
@@ -244,6 +270,8 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
}
int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
+ if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
+ return OFFLOAD_SUCCESS;
// Check if the pointer is contained in any sub-nodes.
int rc;
DataMapMtx.lock();
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
index ded84e300ee..211507685a6 100644
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -137,10 +137,10 @@ struct DeviceTy {
long getMapEntryRefCnt(void *HstPtrBegin);
LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
- bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
+ bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true);
void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
- bool UpdateRefCount);
+ bool UpdateRefCount, bool &IsHostPtr);
int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
int disassociatePtr(void *HstPtrBegin);
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index c41bf3167cc..2813f28573b 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -242,6 +242,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
// Address of pointer on the host and device, respectively.
void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
bool IsNew, Pointer_IsNew;
+ bool IsHostPtr = false;
bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
// UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
// have reached this point via __tgt_target_data_begin and not __tgt_target
@@ -253,7 +254,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
DP("Has a pointer entry: \n");
// base is address of pointer.
Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
- sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
+ sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef);
if (!Pointer_TgtPtrBegin) {
DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
"illegal mapping).\n");
@@ -269,7 +270,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
}
void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
- data_size, IsNew, IsImplicit, UpdateRef);
+ data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef);
if (!TgtPtrBegin && data_size) {
// If data_size==0, then the argument could be a zero-length pointer to
// NULL, so getOrAlloc() returning NULL is not an error.
@@ -289,19 +290,21 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
bool copy = false;
- if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
- copy = true;
- } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
- // Copy data only if the "parent" struct has RefCount==1.
- int32_t parent_idx = member_of(arg_types[i]);
- long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
- assert(parent_rc > 0 && "parent struct not found");
- if (parent_rc == 1) {
+ if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+ if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
copy = true;
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
+ // Copy data only if the "parent" struct has RefCount==1.
+ int32_t parent_idx = member_of(arg_types[i]);
+ long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+ assert(parent_rc > 0 && "parent struct not found");
+ if (parent_rc == 1) {
+ copy = true;
+ }
}
}
- if (copy) {
+ if (copy && !IsHostPtr) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
@@ -312,7 +315,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
}
}
- if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
@@ -363,14 +366,14 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
}
}
- bool IsLast;
+ bool IsLast, IsHostPtr;
bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
// If PTR_AND_OBJ, HstPtrBegin is address of pointee
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
- UpdateRef);
+ UpdateRef, IsHostPtr);
DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
" - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
(IsLast ? "" : " not"));
@@ -387,18 +390,22 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
bool CopyMember = false;
- if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
- !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
- // Copy data only if the "parent" struct has RefCount==1.
- int32_t parent_idx = member_of(arg_types[i]);
- long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
- assert(parent_rc > 0 && "parent struct not found");
- if (parent_rc == 1) {
- CopyMember = true;
+ if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+ !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+ // Copy data only if the "parent" struct has RefCount==1.
+ int32_t parent_idx = member_of(arg_types[i]);
+ long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+ assert(parent_rc > 0 && "parent struct not found");
+ if (parent_rc == 1) {
+ CopyMember = true;
+ }
}
}
- if (DelEntry || Always || CopyMember) {
+ if ((DelEntry || Always || CopyMember) &&
+ !(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+ TgtPtrBegin == HstPtrBegin)) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
@@ -471,14 +478,21 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
void *HstPtrBegin = args[i];
int64_t MapSize = arg_sizes[i];
- bool IsLast;
+ bool IsLast, IsHostPtr;
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
- false);
+ false, IsHostPtr);
if (!TgtPtrBegin) {
DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
continue;
}
+ if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+ TgtPtrBegin == HstPtrBegin) {
+ DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
+ DPxPTR(HstPtrBegin));
+ continue;
+ }
+
if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
@@ -514,6 +528,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
DP("Copying data to device failed.\n");
return OFFLOAD_FAIL;
}
+
uintptr_t lb = (uintptr_t) HstPtrBegin;
uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
Device.ShadowMtx.lock();
@@ -640,19 +655,26 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
void *HstPtrVal = args[i];
void *HstPtrBegin = args_base[i];
void *HstPtrBase = args[idx];
- bool IsLast; // unused.
+ bool IsLast, IsHostPtr; // unused.
void *TgtPtrBase =
(void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]);
DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
void *Pointer_TgtPtrBegin =
- Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false);
+ Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false,
+ IsHostPtr);
if (!Pointer_TgtPtrBegin) {
DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
DPxPTR(HstPtrVal));
continue;
}
+ if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+ TgtPtrBegin == HstPtrBegin) {
+ DP("Unified memory is active, no need to map lambda captured"
+ "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal));
+ continue;
+ }
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
@@ -668,7 +690,7 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
void *HstPtrBase = args_base[i];
void *TgtPtrBegin;
ptrdiff_t TgtBaseOffset;
- bool IsLast; // unused.
+ bool IsLast, IsHostPtr; // unused.
if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
DP("Forwarding first-private value " DPxMOD " to the target construct\n",
DPxPTR(HstPtrBase));
@@ -705,14 +727,14 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
}
} else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
- false);
+ false, IsHostPtr);
TgtBaseOffset = 0; // no offset for ptrs.
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
"object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
DPxPTR(HstPtrBase));
} else {
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
- false);
+ false, IsHostPtr);
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
OpenPOWER on IntegriCloud