Diffstat (limited to 'openmp/runtime/src/kmp_affinity.cpp')
 openmp/runtime/src/kmp_affinity.cpp | 639 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 517 insertions(+), 122 deletions(-)
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 0dc0d4829ea..4e6699ff214 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -50,6 +50,50 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
 //
 // Print the affinity mask to the character array in a pretty format.
 //
+#if KMP_USE_HWLOC
+char *
+__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
+{
+    int num_chars_to_write, num_chars_written;
+    char* scan;
+    KMP_ASSERT(buf_len >= 40);
+
+    // bufsize of 0 just retrieves the needed buffer size.
+    num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
+
+    // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
+    // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
+    //   take into account the '\0' character.
+    if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
+        KMP_SNPRINTF(buf, buf_len, "{<empty>}");
+    } else if(num_chars_to_write < buf_len - 3) {
+        // no problem fitting the mask into buf_len number of characters
+        buf[0] = '{';
+        // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
+        num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
+        buf[num_chars_written+1] = '}';
+        buf[num_chars_written+2] = '\0';
+    } else {
+        // Need to truncate the affinity mask string and add ellipsis.
+        // To do this, we first write out the '{' + str(mask)
+        buf[0] = '{';
+        hwloc_bitmap_list_snprintf(buf+1, buf_len-7, (hwloc_bitmap_t)mask);
+        // then, what we do here is go to the 7th to last character, then go backwards until we are NOT
+        // on a digit then write "...}\0".  This way it is a clean ellipsis addition and we don't
+        // overwrite part of an affinity number. i.e., we avoid something like { 45, 67, 8...} and get
+        // { 45, 67,...} instead.
+        scan = buf + buf_len - 7;
+        while(*scan >= '0' && *scan <= '9' && scan >= buf)
+            scan--;
+        *(scan+1) = '.';
+        *(scan+2) = '.';
+        *(scan+3) = '.';
+        *(scan+4) = '}';
+        *(scan+5) = '\0';
+    }
+    return buf;
+}
+#else
 char *
 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
 {
@@ -102,6 +146,7 @@ __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
     KMP_ASSERT(scan <= end);
     return buf;
 }
+#endif // KMP_USE_HWLOC
 
 
 void
@@ -263,6 +308,291 @@ __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
     }
 }
 
+#if KMP_USE_HWLOC
+static int
+__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
+  kmp_i18n_id_t *const msg_id)
+{
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // Save the affinity mask for the current thread.
+    //
+    kmp_affin_mask_t *oldMask;
+    KMP_CPU_ALLOC(oldMask);
+    __kmp_get_system_affinity(oldMask, TRUE);
+
+    unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology);
+    int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU);
+    int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
+    int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
+    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0;
+
+    //
+    // This makes an assumption about the topology being four levels:
+    // machines -> packages -> cores -> hardware threads
+    //
+    hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology);
+    hwloc_obj_t child_iterator;
+    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+        child_iterator != NULL;
+        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+    {
+        nPackages++;
+    }
+    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0);
+    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+        child_iterator != NULL;
+        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+    {
+        nCoresPerPkg++;
+    }
+    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0);
+    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+        child_iterator != NULL;
+        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+    {
+        __kmp_nThreadsPerCore++;
+    }
+
+    if (! KMP_AFFINITY_CAPABLE())
+    {
+        //
+        // Hack to try and infer the machine topology using only the data
+        // available from cpuid on the current thread, and __kmp_xproc.
+        //
+        KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            if (__kmp_affinity_uniform_topology()) {
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            } else {
+                KMP_INFORM(NonUniform, "KMP_AFFINITY");
+            }
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+        return 0;
+    }
+
+    //
+    // Allocate the data structure to be returned.
+    //
+    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+
+    unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+    unsigned i;
+    hwloc_obj_t hardware_thread_iterator;
+    int nActiveThreads = 0;
+    for(i=0;i<num_hardware_threads;i++) {
+        hardware_thread_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, threadLevel, i);
+        Address addr(3);
+        if(! KMP_CPU_ISSET(i, fullMask)) continue;
+        addr.labels[0] = hardware_thread_iterator->parent->parent->logical_index;
+        addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg;
+        addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore;
+        retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index);
+        nActiveThreads++;
+    }
+
+    //
+    // If there's only one thread context to bind to, return now.
+    //
+    KMP_ASSERT(nActiveThreads > 0);
+    if (nActiveThreads == 1) {
+        __kmp_ncores = nPackages = 1;
+        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+        if (__kmp_affinity_verbose) {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+            if (__kmp_affinity_respect_mask) {
+                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+            } else {
+                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+            }
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+
+        if (__kmp_affinity_type == affinity_none) {
+            __kmp_free(retval);
+            KMP_CPU_FREE(oldMask);
+            return 0;
+        }
+
+        //
+        // Form an Address object which only includes the package level.
+        //
+        Address addr(1);
+        addr.labels[0] = retval[0].first.labels[pkgLevel-1];
+        retval[0].first = addr;
+
+        if (__kmp_affinity_gran_levels < 0) {
+            __kmp_affinity_gran_levels = 0;
+        }
+
+        if (__kmp_affinity_verbose) {
+            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+        }
+
+        *address2os = retval;
+        KMP_CPU_FREE(oldMask);
+        return 1;
+    }
+
+    //
+    // Sort the table by physical Id.
+    //
+    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+
+    //
+    // When affinity is off, this routine will still be called to set
+    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+    // correctly, and return if affinity is not enabled.
+    //
+    __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel);
+
+    //
+    // Check to see if the machine topology is uniform
+    //
+    unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel);
+    unsigned ncores = __kmp_ncores;
+    unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+    unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);
+
+    //
+    // Print the machine topology summary.
+    //
+    if (__kmp_affinity_verbose) {
+        char mask[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+        } else {
+            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
+        }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        if (uniform) {
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+        } else {
+            KMP_INFORM(NonUniform, "KMP_AFFINITY");
+        }
+
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+
+        __kmp_str_buf_print(&buf, "%d", npackages);
+        //for (level = 1; level <= pkgLevel; level++) {
+        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+        //}
+        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+          __kmp_nThreadsPerCore, __kmp_ncores);
+
+        __kmp_str_buf_free(&buf);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+        KMP_CPU_FREE(oldMask);
+        return 0;
+    }
+
+    //
+    // Find any levels with radiix 1, and remove them from the map
+    // (except for the package level).
+    //
+    int new_depth = 0;
+    int level;
+    unsigned proc;
+    for (level = 1; level < (int)depth; level++) {
+        if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
+            continue;
+        }
+        new_depth++;
+    }
+
+    //
+    // If we are removing any levels, allocate a new vector to return,
+    // and copy the relevant information to it.
+    //
+    if (new_depth != depth-1) {
+        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
+          sizeof(AddrUnsPair) * nActiveThreads);
+        for (proc = 0; (int)proc < nActiveThreads; proc++) {
+            Address addr(new_depth);
+            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
+        }
+        int new_level = 0;
+        for (level = 1; level < (int)depth; level++) {
+            if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
+                if (level == threadLevel) {
+                    threadLevel = -1;
+                }
+                else if ((threadLevel >= 0) && (level < threadLevel)) {
+                    threadLevel--;
+                }
+                if (level == coreLevel) {
+                    coreLevel = -1;
+                }
+                else if ((coreLevel >= 0) && (level < coreLevel)) {
+                    coreLevel--;
+                }
+                if (level < pkgLevel) {
+                    pkgLevel--;
+                }
+                continue;
+            }
+            for (proc = 0; (int)proc < nActiveThreads; proc++) {
+                new_retval[proc].first.labels[new_level]
+                  = retval[proc].first.labels[level];
+            }
+            new_level++;
+        }
+
+        __kmp_free(retval);
+        retval = new_retval;
+        depth = new_depth;
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        //
+        // Set the granularity level based on what levels are modeled
+        // in the machine topology map.
+        //
+        __kmp_affinity_gran_levels = 0;
+        if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if (__kmp_affinity_gran > affinity_gran_package) {
+            __kmp_affinity_gran_levels++;
+        }
+    }
+
+    if (__kmp_affinity_verbose) {
+        __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1,
+          coreLevel-1, threadLevel-1);
+    }
+
+    KMP_CPU_FREE(oldMask);
+    *address2os = retval;
+    if(depth == 0) return 0;
+    else return depth-1;
+}
+#endif // KMP_USE_HWLOC
 
 //
 // If we don't know how to retrieve the machine's processor topology, or
@@ -329,7 +659,7 @@ __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
     int avail_ct = 0;
     unsigned int i;
-    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+    KMP_CPU_SET_ITERATE(i, fullMask) {
         //
         // Skip this proc if it is not included in the machine model.
         //
@@ -394,7 +724,7 @@ __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
     int avail_ct = 0;
     int i;
-    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+    KMP_CPU_SET_ITERATE(i, fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
@@ -656,7 +986,7 @@ __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
       __kmp_avail_proc * sizeof(apicThreadInfo));
     unsigned nApics = 0;
-    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+    KMP_CPU_SET_ITERATE(i, fullMask) {
         //
         // Skip this proc if it is not included in the machine model.
         //
@@ -1167,7 +1497,7 @@ __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
     //
     unsigned int proc;
     int nApics = 0;
-    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
+    KMP_CPU_SET_ITERATE(proc, fullMask) {
         //
         // Skip this proc if it is not included in the machine model.
         //
@@ -2282,8 +2612,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
             maxOsId = osId;
         }
     }
-    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
-      (maxOsId + 1) * __kmp_affin_mask_size);
+    kmp_affin_mask_t *osId2Mask;
+    KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
 
     //
     // Sort the address2os table according to physical order.  Doing so
@@ -2314,8 +2644,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
     unsigned j = 0;     // index of 1st thread on core
     unsigned leader = 0;
     Address *leaderAddr = &(address2os[0].first);
-    kmp_affin_mask_t *sum
-      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+    kmp_affin_mask_t *sum;
+    KMP_CPU_ALLOC_ON_STACK(sum);
     KMP_CPU_ZERO(sum);
     KMP_CPU_SET(address2os[0].second, sum);
     for (i = 1; i < numAddrs; i++) {
@@ -2365,6 +2695,7 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
         address2os[j].first.leader = (j == leader);
     }
     unique++;
+    KMP_CPU_FREE_FROM_STACK(sum);
 
     *maxIndex = maxOsId;
     *numUnique = unique;
@@ -2384,9 +2715,17 @@ static int nextNewMask;
 #define ADD_MASK(_mask) \
     { \
         if (nextNewMask >= numNewMasks) { \
+            int i; \
             numNewMasks *= 2; \
-            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
-              numNewMasks * __kmp_affin_mask_size); \
+            kmp_affin_mask_t* temp; \
+            KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
+            for(i=0;i<numNewMasks/2;i++) { \
+                kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \
+                kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \
+                KMP_CPU_COPY(dest, src); \
+            } \
+            KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \
+            newMasks = temp; \
         } \
         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
         nextNewMask++; \
@@ -2416,6 +2755,7 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
   unsigned int *out_numMasks, const char *proclist,
   kmp_affin_mask_t *osId2Mask, int maxOsId)
 {
+    int i;
     const char *scan = proclist;
     const char *next = proclist;
@@ -2424,11 +2764,10 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
     // so that we can use realloc() to extend it.
     //
     numNewMasks = 2;
-    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
-      * __kmp_affin_mask_size);
+    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
     nextNewMask = 0;
-    kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
-      __kmp_affin_mask_size);
+    kmp_affin_mask_t *sumMask;
+    KMP_CPU_ALLOC(sumMask);
     int setSize = 0;
 
     for (;;) {
@@ -2632,14 +2971,17 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
     *out_numMasks = nextNewMask;
     if (nextNewMask == 0) {
         *out_masks = NULL;
-        KMP_INTERNAL_FREE(newMasks);
+        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
         return;
     }
-    *out_masks
-      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
-    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
-    __kmp_free(sumMask);
-    KMP_INTERNAL_FREE(newMasks);
+    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+    for(i = 0; i < nextNewMask; i++) {
+        kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
+        kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
+        KMP_CPU_COPY(dest, src);
+    }
+    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+    KMP_CPU_FREE(sumMask);
 }
@@ -2834,7 +3176,7 @@ __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
     else if (**scan == '!') {
         (*scan)++;      // skip '!'
         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
-        KMP_CPU_COMPLEMENT(tempMask);
+        KMP_CPU_COMPLEMENT(maxOsId, tempMask);
     }
     else if ((**scan >= '0') && (**scan <= '9')) {
         next = *scan;
@@ -2866,17 +3208,23 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
   unsigned int *out_numMasks, const char *placelist,
   kmp_affin_mask_t *osId2Mask, int maxOsId)
 {
+    int i,j,count,stride,sign;
     const char *scan = placelist;
     const char *next = placelist;
 
     numNewMasks = 2;
-    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
-      * __kmp_affin_mask_size);
+    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
     nextNewMask = 0;
-    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
-      __kmp_affin_mask_size);
+    // tempMask is modified based on the previous or initial
+    // place to form the current place
+    // previousMask contains the previous place
+    kmp_affin_mask_t *tempMask;
+    kmp_affin_mask_t *previousMask;
+    KMP_CPU_ALLOC(tempMask);
     KMP_CPU_ZERO(tempMask);
+    KMP_CPU_ALLOC(previousMask);
+    KMP_CPU_ZERO(previousMask);
     int setSize = 0;
 
     for (;;) {
@@ -2910,7 +3258,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
               "bad explicit places list");
             next = scan;
             SKIP_DIGITS(next);
-            int count = __kmp_str_to_int(scan, *next);
+            count = __kmp_str_to_int(scan, *next);
             KMP_ASSERT(count >= 0);
             scan = next;
@@ -2918,7 +3266,6 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
             // valid follow sets are ',' ':' and EOL
             //
             SKIP_WS(scan);
-            int stride;
             if (*scan == '\0' || *scan == ',') {
                 stride = +1;
             }
@@ -2929,7 +3276,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
                 //
                 // Read stride parameter
                 //
-                int sign = +1;
+                sign = +1;
                 for (;;) {
                     SKIP_WS(scan);
                     if (*scan == '+') {
@@ -2954,66 +3301,30 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
                 stride *= sign;
             }
 
-            if (stride > 0) {
-                int i;
-                for (i = 0; i < count; i++) {
-                    int j;
-                    if (setSize == 0) {
-                        break;
-                    }
-                    ADD_MASK(tempMask);
-                    setSize = 0;
-                    for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
-                        if (! KMP_CPU_ISSET(j - stride, tempMask)) {
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else if ((j > maxOsId) ||
-                          (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
-                            if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
-                              && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
-                                KMP_WARNING(AffIgnoreInvalidProcID, j);
-                            }
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else {
-                            KMP_CPU_SET(j, tempMask);
-                            setSize++;
-                        }
-                    }
-                    for (; j >= 0; j--) {
-                        KMP_CPU_CLR(j, tempMask);
-                    }
+            // Add places determined by initial_place : count : stride
+            for (i = 0; i < count; i++) {
+                if (setSize == 0) {
+                    break;
                 }
-            }
-            else {
-                int i;
-                for (i = 0; i < count; i++) {
-                    int j;
-                    if (setSize == 0) {
-                        break;
+                // Add the current place, then build the next place (tempMask) from that
+                KMP_CPU_COPY(previousMask, tempMask);
+                ADD_MASK(previousMask);
+                KMP_CPU_ZERO(tempMask);
+                setSize = 0;
+                KMP_CPU_SET_ITERATE(j, previousMask) {
+                    if (! KMP_CPU_ISSET(j, previousMask)) {
+                        continue;
                     }
-                    ADD_MASK(tempMask);
-                    setSize = 0;
-                    for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; j++) {
-                        if (! KMP_CPU_ISSET(j - stride, tempMask)) {
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else if ((j > maxOsId) ||
-                          (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
-                            if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
-                              && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
-                                KMP_WARNING(AffIgnoreInvalidProcID, j);
-                            }
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else {
-                            KMP_CPU_SET(j, tempMask);
-                            setSize++;
+                    else if ((j+stride > maxOsId) || (j+stride < 0) ||
+                      (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
+                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
+                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
+                            KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
                         }
                     }
-                    for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
-                        KMP_CPU_CLR(j, tempMask);
+                    else {
+                        KMP_CPU_SET(j+stride, tempMask);
+                        setSize++;
                     }
                 }
             }
@@ -3038,14 +3349,18 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
     *out_numMasks = nextNewMask;
     if (nextNewMask == 0) {
         *out_masks = NULL;
-        KMP_INTERNAL_FREE(newMasks);
+        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
         return;
     }
-    *out_masks
-      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
-    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
-    __kmp_free(tempMask);
-    KMP_INTERNAL_FREE(newMasks);
+    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+    KMP_CPU_FREE(tempMask);
+    KMP_CPU_FREE(previousMask);
+    for(i = 0; i < nextNewMask; i++) {
+        kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
+        kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
+        KMP_CPU_COPY(dest, src);
+    }
+    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
 }
 
 # endif /* OMP_40_ENABLED */
@@ -3140,7 +3455,7 @@ __kmp_aux_affinity_initialize(void)
     // processors that we know about on the machine.
     //
     if (fullMask == NULL) {
-        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
+        KMP_CPU_ALLOC(fullMask);
     }
     if (KMP_AFFINITY_CAPABLE()) {
         if (__kmp_affinity_respect_mask) {
@@ -3151,7 +3466,7 @@ __kmp_aux_affinity_initialize(void)
             //
             unsigned i;
             __kmp_avail_proc = 0;
-            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+            KMP_CPU_SET_ITERATE(i, fullMask) {
                 if (! KMP_CPU_ISSET(i, fullMask)) {
                     continue;
                 }
@@ -3193,39 +3508,60 @@ __kmp_aux_affinity_initialize(void)
     //
     const char *file_name = NULL;
    int line = 0;
-
-# if KMP_ARCH_X86 || KMP_ARCH_X86_64
-
-    if (__kmp_affinity_verbose) {
-        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+# if KMP_USE_HWLOC
+    if (depth < 0) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+        }
+        if(!__kmp_hwloc_error) {
+            depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+            if (depth == 0) {
+                KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                KMP_ASSERT(address2os == NULL);
+                return;
+            } else if(depth < 0 && __kmp_affinity_verbose) {
+                KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+            }
+        } else if(__kmp_affinity_verbose) {
+            KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+        }
     }
+# endif
 
-    file_name = NULL;
-    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
-    if (depth == 0) {
-        KMP_ASSERT(__kmp_affinity_type == affinity_none);
-        KMP_ASSERT(address2os == NULL);
-        return;
-    }
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
 
     if (depth < 0) {
         if (__kmp_affinity_verbose) {
-            if (msg_id != kmp_i18n_null) {
-                KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
-                  KMP_I18N_STR(DecodingLegacyAPIC));
-            }
-            else {
-                KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
-            }
+            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
         }
 
         file_name = NULL;
-        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
         if (depth == 0) {
             KMP_ASSERT(__kmp_affinity_type == affinity_none);
             KMP_ASSERT(address2os == NULL);
             return;
         }
+
+        if (depth < 0) {
+            if (__kmp_affinity_verbose) {
+                if (msg_id != kmp_i18n_null) {
+                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
+                      KMP_I18N_STR(DecodingLegacyAPIC));
+                }
+                else {
+                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
+                }
+            }
+
+            file_name = NULL;
+            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+            if (depth == 0) {
+                KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                KMP_ASSERT(address2os == NULL);
+                return;
+            }
+        }
     }
 
 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@@ -3430,6 +3766,50 @@ __kmp_aux_affinity_initialize(void)
         KMP_ASSERT(address2os != NULL);
     }
 
+# if KMP_USE_HWLOC
+    else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+        }
+        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+        if (depth == 0) {
+            KMP_ASSERT(__kmp_affinity_type == affinity_none);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+# if KMP_DEBUG
+        AddrUnsPair *otheraddress2os = NULL;
+        int otherdepth = -1;
+# if KMP_MIC
+        otherdepth = __kmp_affinity_create_apicid_map(&otheraddress2os, &msg_id);
+# else
+        otherdepth = __kmp_affinity_create_x2apicid_map(&otheraddress2os, &msg_id);
+# endif
+        if(otheraddress2os != NULL && address2os != NULL) {
+            int i;
+            unsigned arent_equal_flag = 0;
+            for(i=0;i<__kmp_avail_proc;i++) {
+                if(otheraddress2os[i] != address2os[i]) arent_equal_flag = 1;
+            }
+            if(arent_equal_flag) {
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are different from APICID\n"));
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: APICID Table:\n"));
+                for(i=0;i<__kmp_avail_proc;i++) {
+                    otheraddress2os[i].print(); __kmp_printf("\n");
+                }
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc Table:\n"));
+                for(i=0;i<__kmp_avail_proc;i++) {
+                    address2os[i].print(); __kmp_printf("\n");
+                }
+            }
+            else {
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are same as APICID\n"));
+            }
+        }
+# endif // KMP_DEBUG
+    }
+# endif // KMP_USE_HWLOC
+
     if (address2os == NULL) {
         if (KMP_AFFINITY_CAPABLE()
           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
@@ -3608,8 +3988,7 @@ __kmp_aux_affinity_initialize(void)
     }
 # endif
 
-    __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
-      __kmp_affinity_num_masks * __kmp_affin_mask_size);
+    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
 
     //
     // Sort the address2os table according to the current setting of
@@ -3679,7 +4058,7 @@ void
 __kmp_affinity_uninitialize(void)
 {
     if (__kmp_affinity_masks != NULL) {
-        __kmp_free(__kmp_affinity_masks);
+        KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
        __kmp_affinity_masks = NULL;
     }
     if (fullMask != NULL) {
@@ -3909,7 +4288,7 @@ __kmp_aux_set_affinity(void **mask)
 
         unsigned proc;
         int num_procs = 0;
-        for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
+        KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
            if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                continue;
            }
@@ -4027,7 +4406,11 @@ __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
         }
     }
 
-    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+    if ((proc < 0)
+# if !KMP_USE_HWLOC
+        || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+       ) {
         return -1;
     }
     if (! KMP_CPU_ISSET(proc, fullMask)) {
@@ -4063,7 +4446,11 @@ __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
         }
     }
 
-    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+    if ((proc < 0)
+# if !KMP_USE_HWLOC
+        || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+       ) {
         return -1;
     }
     if (! KMP_CPU_ISSET(proc, fullMask)) {
@@ -4099,8 +4486,12 @@ __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
         }
     }
 
-    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
-        return 0;
+    if ((proc < 0)
+# if !KMP_USE_HWLOC
+        || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+       ) {
+        return -1;
     }
     if (! KMP_CPU_ISSET(proc, fullMask)) {
         return 0;
@@ -4137,7 +4528,8 @@ void __kmp_balanced_affinity( int tid, int nthreads )
     KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
       "Illegal set affinity operation when not capable");
 
-    kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+    kmp_affin_mask_t *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
     KMP_CPU_ZERO(mask);
 
     // Granularity == thread
@@ -4158,9 +4550,11 @@ void __kmp_balanced_affinity( int tid, int nthreads )
               tid, buf);
         }
         __kmp_set_system_affinity( mask, TRUE );
+        KMP_CPU_FREE_FROM_STACK(mask);
     } else { // Non-uniform topology
 
-        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+        kmp_affin_mask_t *mask;
+        KMP_CPU_ALLOC_ON_STACK(mask);
         KMP_CPU_ZERO(mask);
 
         // Number of hyper threads per core in HT machine
@@ -4334,6 +4728,7 @@ void __kmp_balanced_affinity( int tid, int nthreads )
               tid, buf);
         }
         __kmp_set_system_affinity( mask, TRUE );
+        KMP_CPU_FREE_FROM_STACK(mask);
     }
 }
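For readers skimming the patch, the ellipsis truncation used by the new hwloc variant of __kmp_affinity_print_mask can be tried in isolation. The sketch below is illustrative only and not part of the patch: plain snprintf stands in for hwloc_bitmap_list_snprintf, the helper name print_mask_like is made up, and the back-scan here deliberately starts from the last character actually written rather than a fixed offset.

#include <stdio.h>
#include <string.h>

/* Write "{" + list + "}" into buf.  When the list does not fit, back up
 * over any partially printed processor number and append "...}" so the
 * result reads "{45,67,...}" rather than "{45,67,8...}". */
static void print_mask_like(char *buf, int buf_len, const char *list)
{
    /* like hwloc_bitmap_list_snprintf(buf, 0, ...): length without '\0' */
    int needed = snprintf(NULL, 0, "%s", list);
    if (needed < buf_len - 3) {                  /* room for '{', '}', '\0' */
        snprintf(buf, buf_len, "{%s}", list);
        return;
    }
    buf[0] = '{';
    snprintf(buf + 1, buf_len - 7, "%s", list);  /* reserve room for "...}" */
    char *scan = buf + strlen(buf) - 1;          /* last character written */
    while (scan > buf && *scan >= '0' && *scan <= '9')
        scan--;                                  /* step off a truncated number */
    memcpy(scan + 1, "...}", 5);                 /* five bytes: dots, '}', '\0' */
}

int main(void)
{
    char buf[16];
    print_mask_like(buf, sizeof(buf), "0,2,4,6,8,10,12,14");
    puts(buf);                                   /* prints {0,2,4,6,...} */
    return 0;
}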
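Similarly, the rewritten OMP_PLACES stride handling (the KMP_CPU_SET_ITERATE loop in the hunk at @@ -2954,66 +3301,30 @@) derives each place from the previous one instead of shifting a fixed-size bit array in place. A minimal stand-alone sketch of that idea, using a 64-bit integer as a hypothetical stand-in for kmp_affin_mask_t:

#include <stdio.h>

/* Build the next place from the previous one: move every set bit by
 * `stride`, dropping any target outside the valid range [0, max_os_id].
 * One loop handles positive and negative strides alike, which is why the
 * patch can collapse the old stride>0 / stride<0 branches. */
typedef unsigned long long mask_t;

static mask_t next_place(mask_t prev, int stride, int max_os_id)
{
    mask_t next = 0;
    for (int j = 0; j < 64; j++) {
        if (!(prev & (1ULL << j)))
            continue;                  /* j is not in the previous place */
        int t = j + stride;
        if (t >= 0 && t <= max_os_id)
            next |= 1ULL << t;         /* keep only valid OS proc ids */
    }
    return next;
}

int main(void)
{
    mask_t place = 0x3;                /* initial place {0,1} */
    for (int i = 0; i < 4; i++) {      /* count = 4, stride = 2 */
        printf("place %d: %#llx\n", i, place);
        place = next_place(place, 2, 7);
    }
    return 0;
}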