author    Jonathan Peyton <jonathan.l.peyton@intel.com>    2015-11-30 20:02:59 +0000
committer Jonathan Peyton <jonathan.l.peyton@intel.com>    2015-11-30 20:02:59 +0000
commit    01dcf36bd54be75dcf6c3cc9cc8d1f20e783d86a (patch)
tree      538111ad7545036c7e9d75ab83e37da63b4ddaa9 /openmp/runtime/src/kmp_affinity.cpp
parent    7a096596b2eead02405329a5504b0d71dd5b4a8d (diff)
Adding Hwloc library option for affinity mechanism
These changes allow libhwloc to be used as the topology discovery/affinity mechanism for libomp. It is supported on Unices. The code additions:

* Canonicalize the KMP_CPU_* interface macros so bitmask operations are implementation independent and work with both hwloc bitmaps and libomp bitmaps. So there are new KMP_CPU_ALLOC_* and KMP_CPU_ITERATE() macros and the like. These are all in kmp.h and appropriately placed.
* Hwloc topology discovery code in kmp_affinity.cpp. This uses the hwloc interface to create a libomp address2os object, which the rest of libomp already knows how to handle.
* To build, use -DLIBOMP_USE_HWLOC=on and -DLIBOMP_HWLOC_INSTALL_DIR=/path/to/install/dir [default: /usr/local]. If CMake cannot find the library or hwloc.h, it will say so and exit.

Differential Revision: http://reviews.llvm.org/D13991

llvm-svn: 254320
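To illustrate the canonicalization idea, here is a minimal sketch (illustrative SKETCH_* names only, not the actual kmp.h definitions) of how one macro surface can target either hwloc bitmaps or a plain fixed-size mask, so call sites never need to know which backend is in use:

    // Sketch only: one mask API compiled against two backends.
    #if KMP_USE_HWLOC
    # include <hwloc.h>
    typedef hwloc_bitmap_t sketch_mask_t;
    # define SKETCH_CPU_ALLOC(m)    ((m) = hwloc_bitmap_alloc())
    # define SKETCH_CPU_FREE(m)     hwloc_bitmap_free(m)
    # define SKETCH_CPU_ZERO(m)     hwloc_bitmap_zero(m)
    # define SKETCH_CPU_SET(i, m)   hwloc_bitmap_set((m), (i))
    # define SKETCH_CPU_ISSET(i, m) hwloc_bitmap_isset((m), (i))
    #else
    # define _GNU_SOURCE            // for CPU_* macros on glibc
    # include <sched.h>
    typedef cpu_set_t *sketch_mask_t;
    # define SKETCH_CPU_ALLOC(m)    ((m) = new cpu_set_t)
    # define SKETCH_CPU_FREE(m)     delete (m)
    # define SKETCH_CPU_ZERO(m)     CPU_ZERO(m)
    # define SKETCH_CPU_SET(i, m)   CPU_SET((i), (m))
    # define SKETCH_CPU_ISSET(i, m) CPU_ISSET((i), (m))
    #endif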
Diffstat (limited to 'openmp/runtime/src/kmp_affinity.cpp')
-rw-r--r--    openmp/runtime/src/kmp_affinity.cpp    | 639
1 file changed, 517 insertions(+), 122 deletions(-)
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 0dc0d4829ea..4e6699ff214 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -50,6 +50,50 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
//
// Print the affinity mask to the character array in a pretty format.
//
+#if KMP_USE_HWLOC
+char *
+__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
+{
+ int num_chars_to_write, num_chars_written;
+ char* scan;
+ KMP_ASSERT(buf_len >= 40);
+
+ // bufsize of 0 just retrieves the needed buffer size.
+ num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
+
+ // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
+ // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
+ // take into account the '\0' character.
+ if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
+ KMP_SNPRINTF(buf, buf_len, "{<empty>}");
+ } else if(num_chars_to_write < buf_len - 3) {
+ // the mask fits within buf_len characters, no truncation needed
+ buf[0] = '{';
+ // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
+ num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
+ buf[num_chars_written+1] = '}';
+ buf[num_chars_written+2] = '\0';
+ } else {
+ // Need to truncate the affinity mask string and add ellipsis.
+ // To do this, we first write out the '{' + str(mask)
+ buf[0] = '{';
+ hwloc_bitmap_list_snprintf(buf+1, buf_len-7, (hwloc_bitmap_t)mask);
+ // Then move to the 7th-to-last character and walk backwards until we are NOT
+ // on a digit, and write "...}\0" there. This keeps the ellipsis from cutting
+ // an affinity number in half: we avoid something like { 45, 67, 8...} and
+ // get { 45, 67,...} instead.
+ scan = buf + buf_len - 7;
+ while(scan >= buf && *scan >= '0' && *scan <= '9') // check bounds before dereferencing
+ scan--;
+ *(scan+1) = '.';
+ *(scan+2) = '.';
+ *(scan+3) = '.';
+ *(scan+4) = '}';
+ *(scan+5) = '\0';
+ }
+ return buf;
+}
+#else
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
@@ -102,6 +146,7 @@ __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
KMP_ASSERT(scan <= end);
return buf;
}
+#endif // KMP_USE_HWLOC
void
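The new hwloc printer above relies on the snprintf-style sizing convention: called with a buffer length of 0, hwloc_bitmap_list_snprintf() returns the number of characters the full list form would need, not counting the terminating '\0'. A standalone sketch of that idiom (assuming an already-populated bitmap):

    #include <hwloc.h>
    #include <stdio.h>
    #include <stdlib.h>

    void print_bitmap_list(hwloc_bitmap_t mask) {
        char probe;
        // Length query: buflen 0 writes nothing and returns the needed
        // character count, excluding the '\0' terminator.
        int needed = hwloc_bitmap_list_snprintf(&probe, 0, mask);
        char *str = (char *)malloc(needed + 1);
        hwloc_bitmap_list_snprintf(str, needed + 1, mask);
        printf("{%s}\n", str);
        free(str);
    }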
@@ -263,6 +308,291 @@ __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
}
}
+#if KMP_USE_HWLOC
+static int
+__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
+ kmp_i18n_id_t *const msg_id)
+{
+ *address2os = NULL;
+ *msg_id = kmp_i18n_null;
+
+ //
+ // Save the affinity mask for the current thread.
+ //
+ kmp_affin_mask_t *oldMask;
+ KMP_CPU_ALLOC(oldMask);
+ __kmp_get_system_affinity(oldMask, TRUE);
+
+ unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology);
+ int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU);
+ int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
+ int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
+ __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0;
+
+ //
+ // This makes an assumption about the topology being four levels:
+ // machines -> packages -> cores -> hardware threads
+ //
+ hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology);
+ hwloc_obj_t child_iterator;
+ for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+ child_iterator != NULL;
+ child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+ {
+ nPackages++;
+ }
+ current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0);
+ for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+ child_iterator != NULL;
+ child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+ {
+ nCoresPerPkg++;
+ }
+ current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0);
+ for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+ child_iterator != NULL;
+ child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+ {
+ __kmp_nThreadsPerCore++;
+ }
+
+ if (! KMP_AFFINITY_CAPABLE())
+ {
+ //
+ // Hack to try and infer the machine topology using only the data
+ // available from cpuid on the current thread, and __kmp_xproc.
+ //
+ KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+ __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+ nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+ KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+ if (__kmp_affinity_uniform_topology()) {
+ KMP_INFORM(Uniform, "KMP_AFFINITY");
+ } else {
+ KMP_INFORM(NonUniform, "KMP_AFFINITY");
+ }
+ KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+ __kmp_nThreadsPerCore, __kmp_ncores);
+ }
+ KMP_CPU_FREE(oldMask); // don't leak the saved mask on this early return
+ return 0;
+ }
+
+ //
+ // Allocate the data structure to be returned.
+ //
+ AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+
+ unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+ unsigned i;
+ hwloc_obj_t hardware_thread_iterator;
+ int nActiveThreads = 0;
+ for(i=0;i<num_hardware_threads;i++) {
+ hardware_thread_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, threadLevel, i);
+ Address addr(3);
+ if(! KMP_CPU_ISSET(i, fullMask)) continue;
+ addr.labels[0] = hardware_thread_iterator->parent->parent->logical_index;
+ addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg;
+ addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore;
+ retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index);
+ nActiveThreads++;
+ }
+
+ //
+ // If there's only one thread context to bind to, return now.
+ //
+ KMP_ASSERT(nActiveThreads > 0);
+ if (nActiveThreads == 1) {
+ __kmp_ncores = nPackages = 1;
+ __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+ if (__kmp_affinity_verbose) {
+ char buf[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+ KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+ if (__kmp_affinity_respect_mask) {
+ KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+ } else {
+ KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+ }
+ KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+ KMP_INFORM(Uniform, "KMP_AFFINITY");
+ KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+ __kmp_nThreadsPerCore, __kmp_ncores);
+ }
+
+ if (__kmp_affinity_type == affinity_none) {
+ __kmp_free(retval);
+ KMP_CPU_FREE(oldMask);
+ return 0;
+ }
+
+ //
+ // Form an Address object which only includes the package level.
+ //
+ Address addr(1);
+ addr.labels[0] = retval[0].first.labels[pkgLevel-1];
+ retval[0].first = addr;
+
+ if (__kmp_affinity_gran_levels < 0) {
+ __kmp_affinity_gran_levels = 0;
+ }
+
+ if (__kmp_affinity_verbose) {
+ __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+ }
+
+ *address2os = retval;
+ KMP_CPU_FREE(oldMask);
+ return 1;
+ }
+
+ //
+ // Sort the table by physical Id.
+ //
+ qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+
+ //
+ // When affinity is off, this routine will still be called to set
+ // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+ // nCoresPerPkg, & nPackages. Make sure all these vars are set
+ // correctly, and return if affinity is not enabled.
+ //
+ __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel);
+
+ //
+ // Check to see if the machine topology is uniform
+ //
+ unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel);
+ unsigned ncores = __kmp_ncores;
+ unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+ unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);
+
+ //
+ // Print the machine topology summary.
+ //
+ if (__kmp_affinity_verbose) {
+ char mask[KMP_AFFIN_MASK_PRINT_LEN];
+ __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+ KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+ if (__kmp_affinity_respect_mask) {
+ KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+ } else {
+ KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
+ }
+ KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+ if (uniform) {
+ KMP_INFORM(Uniform, "KMP_AFFINITY");
+ } else {
+ KMP_INFORM(NonUniform, "KMP_AFFINITY");
+ }
+
+ kmp_str_buf_t buf;
+ __kmp_str_buf_init(&buf);
+
+ __kmp_str_buf_print(&buf, "%d", npackages);
+ //for (level = 1; level <= pkgLevel; level++) {
+ // __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+ // }
+ KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+ __kmp_nThreadsPerCore, __kmp_ncores);
+
+ __kmp_str_buf_free(&buf);
+ }
+
+ if (__kmp_affinity_type == affinity_none) {
+ KMP_CPU_FREE(oldMask);
+ return 0;
+ }
+
+ //
+ // Find any levels with radix 1, and remove them from the map
+ // (except for the package level).
+ //
+ int new_depth = 0;
+ int level;
+ unsigned proc;
+ for (level = 1; level < (int)depth; level++) {
+ if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
+ continue;
+ }
+ new_depth++;
+ }
+
+ //
+ // If we are removing any levels, allocate a new vector to return,
+ // and copy the relevant information to it.
+ //
+ if (new_depth != depth-1) {
+ AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
+ sizeof(AddrUnsPair) * nActiveThreads);
+ for (proc = 0; (int)proc < nActiveThreads; proc++) {
+ Address addr(new_depth);
+ new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
+ }
+ int new_level = 0;
+ for (level = 1; level < (int)depth; level++) {
+ if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
+ if (level == threadLevel) {
+ threadLevel = -1;
+ }
+ else if ((threadLevel >= 0) && (level < threadLevel)) {
+ threadLevel--;
+ }
+ if (level == coreLevel) {
+ coreLevel = -1;
+ }
+ else if ((coreLevel >= 0) && (level < coreLevel)) {
+ coreLevel--;
+ }
+ if (level < pkgLevel) {
+ pkgLevel--;
+ }
+ continue;
+ }
+ for (proc = 0; (int)proc < nActiveThreads; proc++) {
+ new_retval[proc].first.labels[new_level]
+ = retval[proc].first.labels[level];
+ }
+ new_level++;
+ }
+
+ __kmp_free(retval);
+ retval = new_retval;
+ depth = new_depth;
+ }
+
+ if (__kmp_affinity_gran_levels < 0) {
+ //
+ // Set the granularity level based on what levels are modeled
+ // in the machine topology map.
+ //
+ __kmp_affinity_gran_levels = 0;
+ if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+ __kmp_affinity_gran_levels++;
+ }
+ if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+ __kmp_affinity_gran_levels++;
+ }
+ if (__kmp_affinity_gran > affinity_gran_package) {
+ __kmp_affinity_gran_levels++;
+ }
+ }
+
+ if (__kmp_affinity_verbose) {
+ __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1,
+ coreLevel-1, threadLevel-1);
+ }
+
+ KMP_CPU_FREE(oldMask);
+ *address2os = retval;
+ if(depth == 0) return 0;
+ else return depth-1;
+}
+#endif // KMP_USE_HWLOC
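For reference, a standalone sketch of the same discovery pattern, using the hwloc 1.x API that appears above. The parent->parent walk encodes the same four-level assumption as the code: on machines with extra levels (caches, NUMA nodes) between PU, core, and socket, the ancestors would have to be found by type instead:

    #include <hwloc.h>
    #include <stdio.h>

    int main(void) {
        hwloc_topology_t topo;
        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);

        int pu_depth = hwloc_get_type_depth(topo, HWLOC_OBJ_PU);
        unsigned n_pus = hwloc_get_nbobjs_by_depth(topo, pu_depth);
        for (unsigned i = 0; i < n_pus; i++) {
            hwloc_obj_t pu   = hwloc_get_obj_by_depth(topo, pu_depth, i);
            hwloc_obj_t core = pu->parent;                 // assumes PU's parent is a core
            hwloc_obj_t pkg  = core ? core->parent : NULL; // assumes core's parent is a package
            printf("OS proc %u -> pkg %u, core %u, thread %u\n",
                   pu->os_index,
                   pkg  ? pkg->logical_index  : 0u,
                   core ? core->logical_index : 0u,
                   pu->logical_index);
        }
        hwloc_topology_destroy(topo);
        return 0;
    }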
//
// If we don't know how to retrieve the machine's processor topology, or
@@ -329,7 +659,7 @@ __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
int avail_ct = 0;
unsigned int i;
- for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+ KMP_CPU_SET_ITERATE(i, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@@ -394,7 +724,7 @@ __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
int avail_ct = 0;
int i;
- for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+ KMP_CPU_SET_ITERATE(i, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@@ -656,7 +986,7 @@ __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
__kmp_avail_proc * sizeof(apicThreadInfo));
unsigned nApics = 0;
- for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+ KMP_CPU_SET_ITERATE(i, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@@ -1167,7 +1497,7 @@ __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
//
unsigned int proc;
int nApics = 0;
- for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
+ KMP_CPU_SET_ITERATE(proc, fullMask) {
//
// Skip this proc if it is not included in the machine model.
//
@@ -2282,8 +2612,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
maxOsId = osId;
}
}
- kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
- (maxOsId + 1) * __kmp_affin_mask_size);
+ kmp_affin_mask_t *osId2Mask;
+ KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
//
// Sort the address2os table according to physical order. Doing so
@@ -2314,8 +2644,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
unsigned j = 0; // index of 1st thread on core
unsigned leader = 0;
Address *leaderAddr = &(address2os[0].first);
- kmp_affin_mask_t *sum
- = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+ kmp_affin_mask_t *sum;
+ KMP_CPU_ALLOC_ON_STACK(sum);
KMP_CPU_ZERO(sum);
KMP_CPU_SET(address2os[0].second, sum);
for (i = 1; i < numAddrs; i++) {
@@ -2365,6 +2695,7 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
address2os[j].first.leader = (j == leader);
}
unique++;
+ KMP_CPU_FREE_FROM_STACK(sum);
*maxIndex = maxOsId;
*numUnique = unique;
@@ -2384,9 +2715,17 @@ static int nextNewMask;
#define ADD_MASK(_mask) \
{ \
if (nextNewMask >= numNewMasks) { \
+ int i; \
numNewMasks *= 2; \
- newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
- numNewMasks * __kmp_affin_mask_size); \
+ kmp_affin_mask_t* temp; \
+ KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
+ for(i=0;i<numNewMasks/2;i++) { \
+ kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \
+ kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \
+ KMP_CPU_COPY(dest, src); \
+ } \
+ KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \
+ newMasks = temp; \
} \
KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
nextNewMask++; \
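The reworked ADD_MASK can no longer grow the backing store with realloc(), because under hwloc each array slot refers to a separately allocated bitmap rather than raw bytes; it has to allocate a larger array and deep-copy slot by slot. A sketch of that pattern (hypothetical helper, hwloc backend only):

    #include <hwloc.h>
    #include <stdlib.h>

    // Grow an array of hwloc bitmaps from n to 2*n slots, deep-copying contents.
    hwloc_bitmap_t *grow_masks(hwloc_bitmap_t *old, int n) {
        hwloc_bitmap_t *bigger =
            (hwloc_bitmap_t *)malloc(2 * n * sizeof(*bigger));
        for (int i = 0; i < 2 * n; i++)
            bigger[i] = hwloc_bitmap_alloc();
        for (int i = 0; i < n; i++)
            hwloc_bitmap_copy(bigger[i], old[i]); // deep copy, like KMP_CPU_COPY
        for (int i = 0; i < n; i++)
            hwloc_bitmap_free(old[i]);
        free(old);
        return bigger;
    }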
@@ -2416,6 +2755,7 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
unsigned int *out_numMasks, const char *proclist,
kmp_affin_mask_t *osId2Mask, int maxOsId)
{
+ int i;
const char *scan = proclist;
const char *next = proclist;
@@ -2424,11 +2764,10 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
// so that we can use realloc() to extend it.
//
numNewMasks = 2;
- newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
- * __kmp_affin_mask_size);
+ KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
nextNewMask = 0;
- kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
- __kmp_affin_mask_size);
+ kmp_affin_mask_t *sumMask;
+ KMP_CPU_ALLOC(sumMask);
int setSize = 0;
for (;;) {
@@ -2632,14 +2971,17 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
*out_numMasks = nextNewMask;
if (nextNewMask == 0) {
*out_masks = NULL;
- KMP_INTERNAL_FREE(newMasks);
+ KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
return;
}
- *out_masks
- = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
- KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
- __kmp_free(sumMask);
- KMP_INTERNAL_FREE(newMasks);
+ KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+ for(i = 0; i < nextNewMask; i++) {
+ kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
+ kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
+ KMP_CPU_COPY(dest, src);
+ }
+ KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+ KMP_CPU_FREE(sumMask);
}
@@ -2834,7 +3176,7 @@ __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
else if (**scan == '!') {
(*scan)++; // skip '!'
__kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
- KMP_CPU_COMPLEMENT(tempMask);
+ KMP_CPU_COMPLEMENT(maxOsId, tempMask);
}
else if ((**scan >= '0') && (**scan <= '9')) {
next = *scan;
@@ -2866,17 +3208,23 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
unsigned int *out_numMasks, const char *placelist,
kmp_affin_mask_t *osId2Mask, int maxOsId)
{
+ int i,j,count,stride,sign;
const char *scan = placelist;
const char *next = placelist;
numNewMasks = 2;
- newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
- * __kmp_affin_mask_size);
+ KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
nextNewMask = 0;
- kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
- __kmp_affin_mask_size);
+ // tempMask is modified based on the previous or initial
+ // place to form the current place
+ // previousMask contains the previous place
+ kmp_affin_mask_t *tempMask;
+ kmp_affin_mask_t *previousMask;
+ KMP_CPU_ALLOC(tempMask);
KMP_CPU_ZERO(tempMask);
+ KMP_CPU_ALLOC(previousMask);
+ KMP_CPU_ZERO(previousMask);
int setSize = 0;
for (;;) {
@@ -2910,7 +3258,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
"bad explicit places list");
next = scan;
SKIP_DIGITS(next);
- int count = __kmp_str_to_int(scan, *next);
+ count = __kmp_str_to_int(scan, *next);
KMP_ASSERT(count >= 0);
scan = next;
@@ -2918,7 +3266,6 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
// valid follow sets are ',' ':' and EOL
//
SKIP_WS(scan);
- int stride;
if (*scan == '\0' || *scan == ',') {
stride = +1;
}
@@ -2929,7 +3276,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
//
// Read stride parameter
//
- int sign = +1;
+ sign = +1;
for (;;) {
SKIP_WS(scan);
if (*scan == '+') {
@@ -2954,66 +3301,30 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
stride *= sign;
}
- if (stride > 0) {
- int i;
- for (i = 0; i < count; i++) {
- int j;
- if (setSize == 0) {
- break;
- }
- ADD_MASK(tempMask);
- setSize = 0;
- for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
- if (! KMP_CPU_ISSET(j - stride, tempMask)) {
- KMP_CPU_CLR(j, tempMask);
- }
- else if ((j > maxOsId) ||
- (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
- if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
- && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
- KMP_WARNING(AffIgnoreInvalidProcID, j);
- }
- KMP_CPU_CLR(j, tempMask);
- }
- else {
- KMP_CPU_SET(j, tempMask);
- setSize++;
- }
- }
- for (; j >= 0; j--) {
- KMP_CPU_CLR(j, tempMask);
- }
+ // Add places determined by initial_place : count : stride
+ for (i = 0; i < count; i++) {
+ if (setSize == 0) {
+ break;
}
- }
- else {
- int i;
- for (i = 0; i < count; i++) {
- int j;
- if (setSize == 0) {
- break;
+ // Add the current place, then build the next place (tempMask) from that
+ KMP_CPU_COPY(previousMask, tempMask);
+ ADD_MASK(previousMask);
+ KMP_CPU_ZERO(tempMask);
+ setSize = 0;
+ KMP_CPU_SET_ITERATE(j, previousMask) {
+ if (! KMP_CPU_ISSET(j, previousMask)) {
+ continue;
}
- ADD_MASK(tempMask);
- setSize = 0;
- for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
- j++) {
- if (! KMP_CPU_ISSET(j - stride, tempMask)) {
- KMP_CPU_CLR(j, tempMask);
- }
- else if ((j > maxOsId) ||
- (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
- if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
- && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
- KMP_WARNING(AffIgnoreInvalidProcID, j);
- }
- KMP_CPU_CLR(j, tempMask);
- }
- else {
- KMP_CPU_SET(j, tempMask);
- setSize++;
+ else if ((j+stride > maxOsId) || (j+stride < 0) ||
+ (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
+ if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
+ && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
+ KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
}
}
- for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
- KMP_CPU_CLR(j, tempMask);
+ else {
+ KMP_CPU_SET(j+stride, tempMask);
+ setSize++;
}
}
}
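The single loop above replaces the old positive-stride and negative-stride branches: each OS proc id in the previous place is shifted by stride, and ids that land outside the known range are dropped (with a warning). A sketch of the same expansion on plain integer sets (hypothetical helper, simplified names):

    #include <set>
    #include <vector>

    // Expand "place : count : stride" into a list of places.
    std::vector<std::set<int>> expand_place(std::set<int> place, int count,
                                            int stride, int maxOsId) {
        std::vector<std::set<int>> places;
        for (int i = 0; i < count && !place.empty(); i++) {
            places.push_back(place);      // emit the current place
            std::set<int> next;
            for (int os : place) {        // build the next place by shifting
                int shifted = os + stride;
                if (shifted >= 0 && shifted <= maxOsId)
                    next.insert(shifted); // out-of-range ids are dropped
            }
            place = next;
        }
        return places;
    }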
@@ -3038,14 +3349,18 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
*out_numMasks = nextNewMask;
if (nextNewMask == 0) {
*out_masks = NULL;
- KMP_INTERNAL_FREE(newMasks);
+ KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
return;
}
- *out_masks
- = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
- KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
- __kmp_free(tempMask);
- KMP_INTERNAL_FREE(newMasks);
+ KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+ KMP_CPU_FREE(tempMask);
+ KMP_CPU_FREE(previousMask);
+ for(i = 0; i < nextNewMask; i++) {
+ kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
+ kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
+ KMP_CPU_COPY(dest, src);
+ }
+ KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
}
# endif /* OMP_40_ENABLED */
@@ -3140,7 +3455,7 @@ __kmp_aux_affinity_initialize(void)
// processors that we know about on the machine.
//
if (fullMask == NULL) {
- fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
+ KMP_CPU_ALLOC(fullMask);
}
if (KMP_AFFINITY_CAPABLE()) {
if (__kmp_affinity_respect_mask) {
@@ -3151,7 +3466,7 @@ __kmp_aux_affinity_initialize(void)
//
unsigned i;
__kmp_avail_proc = 0;
- for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+ KMP_CPU_SET_ITERATE(i, fullMask) {
if (! KMP_CPU_ISSET(i, fullMask)) {
continue;
}
@@ -3193,39 +3508,60 @@ __kmp_aux_affinity_initialize(void)
//
const char *file_name = NULL;
int line = 0;
-
-# if KMP_ARCH_X86 || KMP_ARCH_X86_64
-
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+# if KMP_USE_HWLOC
+ if (depth < 0) {
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+ }
+ if(!__kmp_hwloc_error) {
+ depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+ if (depth == 0) {
+ KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(address2os == NULL);
+ return;
+ } else if(depth < 0 && __kmp_affinity_verbose) {
+ KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+ }
+ } else if(__kmp_affinity_verbose) {
+ KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+ }
}
+# endif
- file_name = NULL;
- depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_ASSERT(__kmp_affinity_type == affinity_none);
- KMP_ASSERT(address2os == NULL);
- return;
- }
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (depth < 0) {
if (__kmp_affinity_verbose) {
- if (msg_id != kmp_i18n_null) {
- KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
- KMP_I18N_STR(DecodingLegacyAPIC));
- }
- else {
- KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
- }
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
file_name = NULL;
- depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+ depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_ASSERT(__kmp_affinity_type == affinity_none);
KMP_ASSERT(address2os == NULL);
return;
}
+
+ if (depth < 0) {
+ if (__kmp_affinity_verbose) {
+ if (msg_id != kmp_i18n_null) {
+ KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
+ KMP_I18N_STR(DecodingLegacyAPIC));
+ }
+ else {
+ KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
+ }
+ }
+
+ file_name = NULL;
+ depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+ if (depth == 0) {
+ KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(address2os == NULL);
+ return;
+ }
+ }
}
# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
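The net effect of the restructured block is a fallback chain, where a negative depth means "this method failed or is unavailable, try the next," zero means "affinity is off, stop," and a positive depth is the number of topology levels found. Schematically (hypothetical stub names standing in for the __kmp_affinity_create_*_map() calls):

    static int try_hwloc_map(void)    { return -1; }
    static int try_x2apicid_map(void) { return -1; }
    static int try_apicid_map(void)   { return -1; }

    static int pick_topology_depth(void) {
        int depth = try_hwloc_map();       // preferred when built with hwloc
        if (depth < 0) depth = try_x2apicid_map();
        if (depth < 0) depth = try_apicid_map();
        return depth;                      // 0: affinity off; >0: success
    }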
@@ -3430,6 +3766,50 @@ __kmp_aux_affinity_initialize(void)
KMP_ASSERT(address2os != NULL);
}
+# if KMP_USE_HWLOC
+ else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+ }
+ depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+ if (depth == 0) {
+ KMP_ASSERT(__kmp_affinity_type == affinity_none);
+ KMP_ASSERT(address2os == NULL);
+ return;
+ }
+# if KMP_DEBUG
+ AddrUnsPair *otheraddress2os = NULL;
+ int otherdepth = -1;
+# if KMP_MIC
+ otherdepth = __kmp_affinity_create_apicid_map(&otheraddress2os, &msg_id);
+# else
+ otherdepth = __kmp_affinity_create_x2apicid_map(&otheraddress2os, &msg_id);
+# endif
+ if(otheraddress2os != NULL && address2os != NULL) {
+ int i;
+ unsigned arent_equal_flag = 0;
+ for(i=0;i<__kmp_avail_proc;i++) {
+ if(otheraddress2os[i] != address2os[i]) arent_equal_flag = 1;
+ }
+ if(arent_equal_flag) {
+ KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are different from APICID\n"));
+ KA_TRACE(10, ("__kmp_aux_affinity_initialize: APICID Table:\n"));
+ for(i=0;i<__kmp_avail_proc;i++) {
+ otheraddress2os[i].print(); __kmp_printf("\n");
+ }
+ KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc Table:\n"));
+ for(i=0;i<__kmp_avail_proc;i++) {
+ address2os[i].print(); __kmp_printf("\n");
+ }
+ }
+ else {
+ KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are same as APICID\n"));
+ }
+ }
+# endif // KMP_DEBUG
+ }
+# endif // KMP_USE_HWLOC
+
if (address2os == NULL) {
if (KMP_AFFINITY_CAPABLE()
&& (__kmp_affinity_verbose || (__kmp_affinity_warnings
@@ -3608,8 +3988,7 @@ __kmp_aux_affinity_initialize(void)
}
# endif
- __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
- __kmp_affinity_num_masks * __kmp_affin_mask_size);
+ KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
//
// Sort the address2os table according to the current setting of
@@ -3679,7 +4058,7 @@ void
__kmp_affinity_uninitialize(void)
{
if (__kmp_affinity_masks != NULL) {
- __kmp_free(__kmp_affinity_masks);
+ KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
__kmp_affinity_masks = NULL;
}
if (fullMask != NULL) {
@@ -3909,7 +4288,7 @@ __kmp_aux_set_affinity(void **mask)
unsigned proc;
int num_procs = 0;
- for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
+ KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
continue;
}
@@ -4027,7 +4406,11 @@ __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
}
}
- if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+ if ((proc < 0)
+# if !KMP_USE_HWLOC
+ || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+ ) {
return -1;
}
if (! KMP_CPU_ISSET(proc, fullMask)) {
@@ -4063,7 +4446,11 @@ __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
}
}
- if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+ if ((proc < 0)
+# if !KMP_USE_HWLOC
+ || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+ ) {
return -1;
}
if (! KMP_CPU_ISSET(proc, fullMask)) {
@@ -4099,8 +4486,12 @@ __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
}
}
- if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
- return 0;
+ if ((proc < 0)
+# if !KMP_USE_HWLOC
+ || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+ ) {
+ return -1;
}
if (! KMP_CPU_ISSET(proc, fullMask)) {
return 0;
@@ -4137,7 +4528,8 @@ void __kmp_balanced_affinity( int tid, int nthreads )
KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal set affinity operation when not capable");
- kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+ kmp_affin_mask_t *mask;
+ KMP_CPU_ALLOC_ON_STACK(mask);
KMP_CPU_ZERO(mask);
// Granularity == thread
@@ -4158,9 +4550,11 @@ void __kmp_balanced_affinity( int tid, int nthreads )
tid, buf);
}
__kmp_set_system_affinity( mask, TRUE );
+ KMP_CPU_FREE_FROM_STACK(mask);
} else { // Non-uniform topology
- kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+ kmp_affin_mask_t *mask;
+ KMP_CPU_ALLOC_ON_STACK(mask);
KMP_CPU_ZERO(mask);
// Number of hyper threads per core in HT machine
@@ -4334,6 +4728,7 @@ void __kmp_balanced_affinity( int tid, int nthreads )
tid, buf);
}
__kmp_set_system_affinity( mask, TRUE );
+ KMP_CPU_FREE_FROM_STACK(mask);
}
}
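The KMP_CPU_ALLOC_ON_STACK / KMP_CPU_FREE_FROM_STACK pairing exists because the two backends plausibly differ here (the real definitions live in kmp.h; this is an assumed sketch with illustrative SK_* names): a fixed-size native mask can live in alloca() memory that needs no explicit free, while an hwloc bitmap is an opaque heap object that must always be freed:

    #include <alloca.h>
    enum { mask_size_in_bytes = 64 };  // hypothetical fixed mask size
    #if KMP_USE_HWLOC
    # include <hwloc.h>
    # define SK_ALLOC_ON_STACK(m)  ((m) = hwloc_bitmap_alloc()) // heap, despite the name
    # define SK_FREE_FROM_STACK(m) hwloc_bitmap_free(m)
    #else
    # define SK_ALLOC_ON_STACK(m)  ((m) = (char *)alloca(mask_size_in_bytes))
    # define SK_FREE_FROM_STACK(m) ((void)0) // stack memory: nothing to free
    #endif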