Diffstat (limited to 'openmp/runtime/src/kmp_affinity.cpp')
 openmp/runtime/src/kmp_affinity.cpp | 639 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 517 insertions(+), 122 deletions(-)
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 0dc0d4829ea..4e6699ff214 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -50,6 +50,50 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
 //
 // Print the affinity mask to the character array in a pretty format.
 //
+#if KMP_USE_HWLOC
+char *
+__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
+{
+    int num_chars_to_write, num_chars_written;
+    char* scan;
+    KMP_ASSERT(buf_len >= 40);
+
+    // bufsize of 0 just retrieves the needed buffer size.
+    num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);
+
+    // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
+    // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
+    //   take into account the '\0' character.
+    if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
+        KMP_SNPRINTF(buf, buf_len, "{<empty>}");
+    } else if(num_chars_to_write < buf_len - 3) {
+        // no problem fitting the mask into buf_len number of characters
+        buf[0] = '{';
+        // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer
+        num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
+        buf[num_chars_written+1] = '}';
+        buf[num_chars_written+2] = '\0';
+    } else {
+        // Need to truncate the affinity mask string and add ellipsis.
+        // To do this, we first write out the '{' + str(mask)
+        buf[0] = '{';
+        hwloc_bitmap_list_snprintf(buf+1, buf_len-7, (hwloc_bitmap_t)mask);
+        // then, what we do here is go to the 7th to last character, then go backwards until we are NOT
+        // on a digit then write "...}\0".  This way it is a clean ellipsis addition and we don't
+        // overwrite part of an affinity number. i.e., we avoid something like { 45, 67, 8...} and get
+        // { 45, 67,...} instead.
+        scan = buf + buf_len - 7;
+        while(*scan >= '0' && *scan <= '9' && scan >= buf)
+            scan--;
+        *(scan+1) = '.';
+        *(scan+2) = '.';
+        *(scan+3) = '.';
+        *(scan+4) = '}';
+        *(scan+5) = '\0';
+    }
+    return buf;
+}
+#else
 char *
 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
 {
@@ -102,6 +146,7 @@ __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
     KMP_ASSERT(scan <= end);
     return buf;
 }
+#endif // KMP_USE_HWLOC
 
 
 void
@@ -263,6 +308,291 @@ __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
     }
 }
 
+#if KMP_USE_HWLOC
+static int
+__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
+  kmp_i18n_id_t *const msg_id)
+{
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // Save the affinity mask for the current thread.
+    //
+    kmp_affin_mask_t *oldMask;
+    KMP_CPU_ALLOC(oldMask);
+    __kmp_get_system_affinity(oldMask, TRUE);
+
+    unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology);
+    int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU);
+    int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
+    int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
+    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0;
+
+    //
+    // This makes an assumption about the topology being four levels:
+    // machines -> packages -> cores -> hardware threads
+    //
+    hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology);
+    hwloc_obj_t child_iterator;
+    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+        child_iterator != NULL;
+        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+    {
+        nPackages++;
+    }
+    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0);
+    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+        child_iterator != NULL;
+        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+    {
+        nCoresPerPkg++;
+    }
+    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0);
+    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
+        child_iterator != NULL;
+        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
+    {
+        __kmp_nThreadsPerCore++;
+    }
+
+    if (! KMP_AFFINITY_CAPABLE())
+    {
+        //
+        // Hack to try and infer the machine topology using only the data
+        // available from cpuid on the current thread, and __kmp_xproc.
+        //
+        KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            if (__kmp_affinity_uniform_topology()) {
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            } else {
+                KMP_INFORM(NonUniform, "KMP_AFFINITY");
+            }
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+        return 0;
+    }
+
+    //
+    // Allocate the data structure to be returned.
+    //
+    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+
+    unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+    unsigned i;
+    hwloc_obj_t hardware_thread_iterator;
+    int nActiveThreads = 0;
+    for(i=0;i<num_hardware_threads;i++) {
+        hardware_thread_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, threadLevel, i);
+        Address addr(3);
+        if(! KMP_CPU_ISSET(i, fullMask)) continue;
+        addr.labels[0] = hardware_thread_iterator->parent->parent->logical_index;
+        addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg;
+        addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore;
+        retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index);
+        nActiveThreads++;
+    }
+
+    //
+    // If there's only one thread context to bind to, return now.
+    //
+    KMP_ASSERT(nActiveThreads > 0);
+    if (nActiveThreads == 1) {
+        __kmp_ncores = nPackages = 1;
+        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+        if (__kmp_affinity_verbose) {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+            if (__kmp_affinity_respect_mask) {
+                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+            } else {
+                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+            }
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+
+        if (__kmp_affinity_type == affinity_none) {
+            __kmp_free(retval);
+            KMP_CPU_FREE(oldMask);
+            return 0;
+        }
+
+        //
+        // Form an Address object which only includes the package level.
+        //
+        Address addr(1);
+        addr.labels[0] = retval[0].first.labels[pkgLevel-1];
+        retval[0].first = addr;
+
+        if (__kmp_affinity_gran_levels < 0) {
+            __kmp_affinity_gran_levels = 0;
+        }
+
+        if (__kmp_affinity_verbose) {
+            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+        }
+
+        *address2os = retval;
+        KMP_CPU_FREE(oldMask);
+        return 1;
+    }
+
+    //
+    // Sort the table by physical Id.
+    //
+    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+
+    //
+    // When affinity is off, this routine will still be called to set
+    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+    // correctly, and return if affinity is not enabled.
+    //
+    __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel);
+
+    //
+    // Check to see if the machine topology is uniform
+    //
+    unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel);
+    unsigned ncores = __kmp_ncores;
+    unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+    unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);
+
+    //
+    // Print the machine topology summary.
+    //
+    if (__kmp_affinity_verbose) {
+        char mask[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+        } else {
+            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
+        }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        if (uniform) {
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+        } else {
+            KMP_INFORM(NonUniform, "KMP_AFFINITY");
+        }
+
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+
+        __kmp_str_buf_print(&buf, "%d", npackages);
+        //for (level = 1; level <= pkgLevel; level++) {
+        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+        //}
+        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+          __kmp_nThreadsPerCore, __kmp_ncores);
+
+        __kmp_str_buf_free(&buf);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+        KMP_CPU_FREE(oldMask);
+        return 0;
+    }
+
+    //
+    // Find any levels with radiix 1, and remove them from the map
+    // (except for the package level).
+    //
+    int new_depth = 0;
+    int level;
+    unsigned proc;
+    for (level = 1; level < (int)depth; level++) {
+        if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
+            continue;
+        }
+        new_depth++;
+    }
+
+    //
+    // If we are removing any levels, allocate a new vector to return,
+    // and copy the relevant information to it.
+    //
+    if (new_depth != depth-1) {
+        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
+          sizeof(AddrUnsPair) * nActiveThreads);
+        for (proc = 0; (int)proc < nActiveThreads; proc++) {
+            Address addr(new_depth);
+            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
+        }
+        int new_level = 0;
+        for (level = 1; level < (int)depth; level++) {
+            if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
+                if (level == threadLevel) {
+                    threadLevel = -1;
+                }
+                else if ((threadLevel >= 0) && (level < threadLevel)) {
+                    threadLevel--;
+                }
+                if (level == coreLevel) {
+                    coreLevel = -1;
+                }
+                else if ((coreLevel >= 0) && (level < coreLevel)) {
+                    coreLevel--;
+                }
+                if (level < pkgLevel) {
+                    pkgLevel--;
+                }
+                continue;
+            }
+            for (proc = 0; (int)proc < nActiveThreads; proc++) {
+                new_retval[proc].first.labels[new_level]
+                  = retval[proc].first.labels[level];
+            }
+            new_level++;
+        }
+
+        __kmp_free(retval);
+        retval = new_retval;
+        depth = new_depth;
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        //
+        // Set the granularity level based on what levels are modeled
+        // in the machine topology map.
+        //
+        __kmp_affinity_gran_levels = 0;
+        if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if (__kmp_affinity_gran > affinity_gran_package) {
+            __kmp_affinity_gran_levels++;
+        }
+    }
+
+    if (__kmp_affinity_verbose) {
+        __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1,
+          coreLevel-1, threadLevel-1);
+    }
+
+    KMP_CPU_FREE(oldMask);
+    *address2os = retval;
+    if(depth == 0) return 0;
+    else return depth-1;
+}
+#endif // KMP_USE_HWLOC
 
 //
 // If we don't know how to retrieve the machine's processor topology, or
@@ -329,7 +659,7 @@ __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
     int avail_ct = 0;
     unsigned int i;
-    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+    KMP_CPU_SET_ITERATE(i, fullMask) {
         //
         // Skip this proc if it is not included in the machine model.
         //
@@ -394,7 +724,7 @@ __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
       __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
     int avail_ct = 0;
     int i;
-    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+    KMP_CPU_SET_ITERATE(i, fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
@@ -656,7 +986,7 @@ __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
     apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
       __kmp_avail_proc * sizeof(apicThreadInfo));
     unsigned nApics = 0;
-    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+    KMP_CPU_SET_ITERATE(i, fullMask) {
         //
         // Skip this proc if it is not included in the machine model.
         //
@@ -1167,7 +1497,7 @@ __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
     //
     unsigned int proc;
     int nApics = 0;
-    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
+    KMP_CPU_SET_ITERATE(proc, fullMask) {
         //
         // Skip this proc if it is not included in the machine model.
         //
@@ -2282,8 +2612,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
             maxOsId = osId;
         }
     }
-    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
-      (maxOsId + 1) * __kmp_affin_mask_size);
+    kmp_affin_mask_t *osId2Mask;
+    KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1));
 
     //
     // Sort the address2os table according to physical order.  Doing so
@@ -2314,8 +2644,8 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
     unsigned j = 0;     // index of 1st thread on core
     unsigned leader = 0;
     Address *leaderAddr = &(address2os[0].first);
-    kmp_affin_mask_t *sum
-      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+    kmp_affin_mask_t *sum;
+    KMP_CPU_ALLOC_ON_STACK(sum);
     KMP_CPU_ZERO(sum);
     KMP_CPU_SET(address2os[0].second, sum);
     for (i = 1; i < numAddrs; i++) {
@@ -2365,6 +2695,7 @@ __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
         address2os[j].first.leader = (j == leader);
     }
     unique++;
+    KMP_CPU_FREE_FROM_STACK(sum);
 
     *maxIndex = maxOsId;
     *numUnique = unique;
@@ -2384,9 +2715,17 @@ static int nextNewMask;
 #define ADD_MASK(_mask) \
     { \
         if (nextNewMask >= numNewMasks) { \
+            int i; \
             numNewMasks *= 2; \
-            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
-              numNewMasks * __kmp_affin_mask_size); \
+            kmp_affin_mask_t* temp; \
+            KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
+            for(i=0;i<numNewMasks/2;i++) { \
+                kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \
+                kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \
+                KMP_CPU_COPY(dest, src); \
+            } \
+            KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \
+            newMasks = temp; \
         } \
         KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
         nextNewMask++; \
@@ -2416,6 +2755,7 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
   unsigned int *out_numMasks, const char *proclist,
   kmp_affin_mask_t *osId2Mask, int maxOsId)
 {
+    int i;
     const char *scan = proclist;
     const char *next = proclist;
@@ -2424,11 +2764,10 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
     // so that we can use realloc() to extend it.
     //
     numNewMasks = 2;
-    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
-      * __kmp_affin_mask_size);
+    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
     nextNewMask = 0;
-    kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
-      __kmp_affin_mask_size);
+    kmp_affin_mask_t *sumMask;
+    KMP_CPU_ALLOC(sumMask);
     int setSize = 0;
 
     for (;;) {
@@ -2632,14 +2971,17 @@ __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
     *out_numMasks = nextNewMask;
     if (nextNewMask == 0) {
         *out_masks = NULL;
-        KMP_INTERNAL_FREE(newMasks);
+        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
         return;
     }
-    *out_masks
-      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
-    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
-    __kmp_free(sumMask);
-    KMP_INTERNAL_FREE(newMasks);
+    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+    for(i = 0; i < nextNewMask; i++) {
+        kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
+        kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
+        KMP_CPU_COPY(dest, src);
+    }
+    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+    KMP_CPU_FREE(sumMask);
 }
@@ -2834,7 +3176,7 @@ __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
     else if (**scan == '!') {
         (*scan)++;      // skip '!'
         __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
-        KMP_CPU_COMPLEMENT(tempMask);
+        KMP_CPU_COMPLEMENT(maxOsId, tempMask);
     }
     else if ((**scan >= '0') && (**scan <= '9')) {
         next = *scan;
@@ -2866,17 +3208,23 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
   unsigned int *out_numMasks, const char *placelist,
   kmp_affin_mask_t *osId2Mask, int maxOsId)
 {
+    int i,j,count,stride,sign;
     const char *scan = placelist;
     const char *next = placelist;
 
     numNewMasks = 2;
-    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
-      * __kmp_affin_mask_size);
+    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
     nextNewMask = 0;
-    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
-      __kmp_affin_mask_size);
+    // tempMask is modified based on the previous or initial
+    // place to form the current place
+    // previousMask contains the previous place
+    kmp_affin_mask_t *tempMask;
+    kmp_affin_mask_t *previousMask;
+    KMP_CPU_ALLOC(tempMask);
     KMP_CPU_ZERO(tempMask);
+    KMP_CPU_ALLOC(previousMask);
+    KMP_CPU_ZERO(previousMask);
     int setSize = 0;
 
     for (;;) {
@@ -2910,7 +3258,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
               "bad explicit places list");
             next = scan;
             SKIP_DIGITS(next);
-            int count = __kmp_str_to_int(scan, *next);
+            count = __kmp_str_to_int(scan, *next);
             KMP_ASSERT(count >= 0);
             scan = next;
@@ -2918,7 +3266,6 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
             // valid follow sets are ',' ':' and EOL
             //
             SKIP_WS(scan);
-            int stride;
             if (*scan == '\0' || *scan == ',') {
                 stride = +1;
             }
@@ -2929,7 +3276,7 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
                 //
                 // Read stride parameter
                 //
-                int sign = +1;
+                sign = +1;
                 for (;;) {
                     SKIP_WS(scan);
                     if (*scan == '+') {
@@ -2954,66 +3301,30 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
                 stride *= sign;
             }
 
-            if (stride > 0) {
-                int i;
-                for (i = 0; i < count; i++) {
-                    int j;
-                    if (setSize == 0) {
-                        break;
-                    }
-                    ADD_MASK(tempMask);
-                    setSize = 0;
-                    for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
-                        if (! KMP_CPU_ISSET(j - stride, tempMask)) {
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else if ((j > maxOsId) ||
-                          (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
-                            if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
-                              && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
-                                KMP_WARNING(AffIgnoreInvalidProcID, j);
-                            }
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else {
-                            KMP_CPU_SET(j, tempMask);
-                            setSize++;
-                        }
-                    }
-                    for (; j >= 0; j--) {
-                        KMP_CPU_CLR(j, tempMask);
-                    }
+            // Add places determined by initial_place : count : stride
+            for (i = 0; i < count; i++) {
+                if (setSize == 0) {
+                    break;
                 }
-            }
-            else {
-                int i;
-                for (i = 0; i < count; i++) {
-                    int j;
-                    if (setSize == 0) {
-                        break;
+                // Add the current place, then build the next place (tempMask) from that
+                KMP_CPU_COPY(previousMask, tempMask);
+                ADD_MASK(previousMask);
+                KMP_CPU_ZERO(tempMask);
+                setSize = 0;
+                KMP_CPU_SET_ITERATE(j, previousMask) {
+                    if (! KMP_CPU_ISSET(j, previousMask)) {
+                        continue;
                     }
-                    ADD_MASK(tempMask);
-                    setSize = 0;
-                    for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; j++) {
-                        if (! KMP_CPU_ISSET(j - stride, tempMask)) {
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else if ((j > maxOsId) ||
-                          (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
-                            if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
-                              && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
-                                KMP_WARNING(AffIgnoreInvalidProcID, j);
-                            }
-                            KMP_CPU_CLR(j, tempMask);
-                        }
-                        else {
-                            KMP_CPU_SET(j, tempMask);
-                            setSize++;
+                    else if ((j+stride > maxOsId) || (j+stride < 0) ||
+                      (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) {
+                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
+                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
+                            KMP_WARNING(AffIgnoreInvalidProcID, j+stride);
                         }
                     }
-                    for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
-                        KMP_CPU_CLR(j, tempMask);
+                    else {
+                        KMP_CPU_SET(j+stride, tempMask);
+                        setSize++;
                     }
                 }
             }
@@ -3038,14 +3349,18 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
     *out_numMasks = nextNewMask;
     if (nextNewMask == 0) {
         *out_masks = NULL;
-        KMP_INTERNAL_FREE(newMasks);
+        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
         return;
     }
-    *out_masks
-      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
-    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
-    __kmp_free(tempMask);
-    KMP_INTERNAL_FREE(newMasks);
+    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+    KMP_CPU_FREE(tempMask);
+    KMP_CPU_FREE(previousMask);
+    for(i = 0; i < nextNewMask; i++) {
+        kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i);
+        kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i);
+        KMP_CPU_COPY(dest, src);
+    }
+    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
 }
 
 # endif /* OMP_40_ENABLED */
@@ -3140,7 +3455,7 @@ __kmp_aux_affinity_initialize(void)
     // processors that we know about on the machine.
     //
     if (fullMask == NULL) {
-        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
+        KMP_CPU_ALLOC(fullMask);
     }
     if (KMP_AFFINITY_CAPABLE()) {
         if (__kmp_affinity_respect_mask) {
@@ -3151,7 +3466,7 @@ __kmp_aux_affinity_initialize(void)
             //
             unsigned i;
             __kmp_avail_proc = 0;
-            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+            KMP_CPU_SET_ITERATE(i, fullMask) {
                 if (! KMP_CPU_ISSET(i, fullMask)) {
                     continue;
                 }
@@ -3193,39 +3508,60 @@ __kmp_aux_affinity_initialize(void)
     //
     const char *file_name = NULL;
    int line = 0;
-
-# if KMP_ARCH_X86 || KMP_ARCH_X86_64
-
-    if (__kmp_affinity_verbose) {
-        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+# if KMP_USE_HWLOC
+    if (depth < 0) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+        }
+        if(!__kmp_hwloc_error) {
+            depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+            if (depth == 0) {
+                KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                KMP_ASSERT(address2os == NULL);
+                return;
+            } else if(depth < 0 && __kmp_affinity_verbose) {
+                KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+            }
+        } else if(__kmp_affinity_verbose) {
+            KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+        }
     }
+# endif
 
-    file_name = NULL;
-    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
-    if (depth == 0) {
-        KMP_ASSERT(__kmp_affinity_type == affinity_none);
-        KMP_ASSERT(address2os == NULL);
-        return;
-    }
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
 
     if (depth < 0) {
         if (__kmp_affinity_verbose) {
-            if (msg_id != kmp_i18n_null) {
-                KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
-                  KMP_I18N_STR(DecodingLegacyAPIC));
-            }
-            else {
-                KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
-            }
+            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
         }
 
         file_name = NULL;
-        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
         if (depth == 0) {
             KMP_ASSERT(__kmp_affinity_type == affinity_none);
             KMP_ASSERT(address2os == NULL);
             return;
         }
+
+        if (depth < 0) {
+            if (__kmp_affinity_verbose) {
+                if (msg_id != kmp_i18n_null) {
+                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
+                      KMP_I18N_STR(DecodingLegacyAPIC));
+                }
+                else {
+                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
+                }
+            }
+
+            file_name = NULL;
+            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+            if (depth == 0) {
+                KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                KMP_ASSERT(address2os == NULL);
+                return;
+            }
+        }
     }
 
 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@@ -3430,6 +3766,50 @@ __kmp_aux_affinity_initialize(void)
         KMP_ASSERT(address2os != NULL);
     }
 
+# if KMP_USE_HWLOC
+    else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+        }
+        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+        if (depth == 0) {
+            KMP_ASSERT(__kmp_affinity_type == affinity_none);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+# if KMP_DEBUG
+        AddrUnsPair *otheraddress2os = NULL;
+        int otherdepth = -1;
+# if KMP_MIC
+        otherdepth = __kmp_affinity_create_apicid_map(&otheraddress2os, &msg_id);
+# else
+        otherdepth = __kmp_affinity_create_x2apicid_map(&otheraddress2os, &msg_id);
+# endif
+        if(otheraddress2os != NULL && address2os != NULL) {
+            int i;
+            unsigned arent_equal_flag = 0;
+            for(i=0;i<__kmp_avail_proc;i++) {
+                if(otheraddress2os[i] != address2os[i]) arent_equal_flag = 1;
+            }
+            if(arent_equal_flag) {
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are different from APICID\n"));
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: APICID Table:\n"));
+                for(i=0;i<__kmp_avail_proc;i++) {
+                    otheraddress2os[i].print(); __kmp_printf("\n");
+                }
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc Table:\n"));
+                for(i=0;i<__kmp_avail_proc;i++) {
+                    address2os[i].print(); __kmp_printf("\n");
+                }
+            }
+            else {
+                KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are same as APICID\n"));
+            }
+        }
+# endif // KMP_DEBUG
+    }
+# endif // KMP_USE_HWLOC
+
     if (address2os == NULL) {
         if (KMP_AFFINITY_CAPABLE()
           && (__kmp_affinity_verbose || (__kmp_affinity_warnings
@@ -3608,8 +3988,7 @@ __kmp_aux_affinity_initialize(void)
     }
 # endif
 
-    __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
-      __kmp_affinity_num_masks * __kmp_affin_mask_size);
+    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
 
     //
     // Sort the address2os table according to the current setting of
@@ -3679,7 +4058,7 @@ void
 __kmp_affinity_uninitialize(void)
 {
     if (__kmp_affinity_masks != NULL) {
-        __kmp_free(__kmp_affinity_masks);
+        KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
        __kmp_affinity_masks = NULL;
     }
     if (fullMask != NULL) {
@@ -3909,7 +4288,7 @@ __kmp_aux_set_affinity(void **mask)
 
         unsigned proc;
         int num_procs = 0;
-        for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
+        KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
            if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                continue;
            }
@@ -4027,7 +4406,11 @@ __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
         }
     }
 
-    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+    if ((proc < 0)
+# if !KMP_USE_HWLOC
+        || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+       ) {
         return -1;
     }
     if (! KMP_CPU_ISSET(proc, fullMask)) {
@@ -4063,7 +4446,11 @@ __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
         }
     }
 
-    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+    if ((proc < 0)
+# if !KMP_USE_HWLOC
+        || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+       ) {
         return -1;
     }
     if (! KMP_CPU_ISSET(proc, fullMask)) {
@@ -4099,8 +4486,12 @@ __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
         }
     }
 
-    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
-        return 0;
+    if ((proc < 0)
+# if !KMP_USE_HWLOC
+        || ((unsigned)proc >= KMP_CPU_SETSIZE)
+# endif
+       ) {
+        return -1;
     }
     if (! KMP_CPU_ISSET(proc, fullMask)) {
         return 0;
@@ -4137,7 +4528,8 @@ void __kmp_balanced_affinity( int tid, int nthreads )
     KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
       "Illegal set affinity operation when not capable");
 
-    kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+    kmp_affin_mask_t *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
     KMP_CPU_ZERO(mask);
 
     // Granularity == thread
@@ -4158,9 +4550,11 @@ void __kmp_balanced_affinity( int tid, int nthreads )
               tid, buf);
         }
         __kmp_set_system_affinity( mask, TRUE );
+        KMP_CPU_FREE_FROM_STACK(mask);
     } else { // Non-uniform topology
 
-        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+        kmp_affin_mask_t *mask;
+        KMP_CPU_ALLOC_ON_STACK(mask);
         KMP_CPU_ZERO(mask);
 
         // Number of hyper threads per core in HT machine
@@ -4334,6 +4728,7 @@ void __kmp_balanced_affinity( int tid, int nthreads )
               tid, buf);
         }
         __kmp_set_system_affinity( mask, TRUE );
+        KMP_CPU_FREE_FROM_STACK(mask);
     }
 }
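For readers skimming the patch, the ellipsis truncation used by the new hwloc variant of __kmp_affinity_print_mask can be tried in isolation. The sketch below is illustrative only and not part of the patch: plain snprintf stands in for hwloc_bitmap_list_snprintf, the helper name print_mask_like is made up, and the back-scan here deliberately starts from the last character actually written rather than a fixed offset.

#include <stdio.h>
#include <string.h>

/* Write "{" + list + "}" into buf.  When the list does not fit, back up
 * over any partially printed processor number and append "...}" so the
 * result reads "{45,67,...}" rather than "{45,67,8...}". */
static void print_mask_like(char *buf, int buf_len, const char *list)
{
    /* like hwloc_bitmap_list_snprintf(buf, 0, ...): length without '\0' */
    int needed = snprintf(NULL, 0, "%s", list);
    if (needed < buf_len - 3) {                  /* room for '{', '}', '\0' */
        snprintf(buf, buf_len, "{%s}", list);
        return;
    }
    buf[0] = '{';
    snprintf(buf + 1, buf_len - 7, "%s", list);  /* reserve room for "...}" */
    char *scan = buf + strlen(buf) - 1;          /* last character written */
    while (scan > buf && *scan >= '0' && *scan <= '9')
        scan--;                                  /* step off a truncated number */
    memcpy(scan + 1, "...}", 5);                 /* five bytes: dots, '}', '\0' */
}

int main(void)
{
    char buf[16];
    print_mask_like(buf, sizeof(buf), "0,2,4,6,8,10,12,14");
    puts(buf);                                   /* prints {0,2,4,6,...} */
    return 0;
}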
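Similarly, the rewritten OMP_PLACES stride handling (the KMP_CPU_SET_ITERATE loop in the hunk at @@ -2954,66 +3301,30 @@) derives each place from the previous one instead of shifting a fixed-size bit array in place. A minimal stand-alone sketch of that idea, using a 64-bit integer as a hypothetical stand-in for kmp_affin_mask_t:

#include <stdio.h>

/* Build the next place from the previous one: move every set bit by
 * `stride`, dropping any target outside the valid range [0, max_os_id].
 * One loop handles positive and negative strides alike, which is why the
 * patch can collapse the old stride>0 / stride<0 branches. */
typedef unsigned long long mask_t;

static mask_t next_place(mask_t prev, int stride, int max_os_id)
{
    mask_t next = 0;
    for (int j = 0; j < 64; j++) {
        if (!(prev & (1ULL << j)))
            continue;                  /* j is not in the previous place */
        int t = j + stride;
        if (t >= 0 && t <= max_os_id)
            next |= 1ULL << t;         /* keep only valid OS proc ids */
    }
    return next;
}

int main(void)
{
    mask_t place = 0x3;                /* initial place {0,1} */
    for (int i = 0; i < 4; i++) {      /* count = 4, stride = 2 */
        printf("place %d: %#llx\n", i, place);
        place = next_place(place, 2, 7);
    }
    return 0;
}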