Diffstat (limited to 'openmp/runtime')
-rw-r--r--  openmp/runtime/src/kmp.h              | 8 +++++++-
-rw-r--r--  openmp/runtime/src/kmp_dispatch.cpp   | 6 ++++++
-rw-r--r--  openmp/runtime/src/kmp_wait_release.h | 4 +++-
3 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 82b607b0956..e5aab72fc75 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1706,6 +1706,12 @@ typedef struct dispatch_shared_info {
     volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
     kmp_int32 doacross_num_done; // count finished threads
 #endif
+#if KMP_USE_HWLOC
+    // When linking with libhwloc, the ORDERED EPCC test slows down on big
+    // machines (> 48 cores). Performance analysis showed that a cache thrash
+    // was occurring and this padding helps alleviate the problem.
+    char padding[64];
+#endif
 } dispatch_shared_info_t;
 
 typedef struct kmp_disp {
@@ -2567,7 +2573,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
     int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call
 
     // Read/write by workers as well -----------------------------------------------------------------------
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_HWLOC
     // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel'
     // and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel'
     // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid of this padding.
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 071035013ee..4b24b292ab3 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -180,6 +180,12 @@ struct dispatch_shared_info_template {
     kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
     kmp_int32 doacross_num_done; // count finished threads
 #endif
+#if KMP_USE_HWLOC
+    // When linking with libhwloc, the ORDERED EPCC test slows down on big
+    // machines (> 48 cores). Performance analysis showed that a cache thrash
+    // was occurring and this padding helps alleviate the problem.
+    char padding[64];
+#endif
 };
 
 /* ------------------------------------------------------------------------ */
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index f791ea6f8a9..c8f6b5c2be0 100644
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -97,6 +97,7 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
     kmp_uint32 hibernate;
     int th_gtid;
     int tasks_completed = FALSE;
+    int oversubscribed;
 
     KMP_FSYNC_SPIN_INIT(spin, NULL);
     if (flag->done_check()) {
@@ -166,6 +167,7 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
                       hibernate - __kmp_global.g.g_time.dt.t_value));
     }
 
+    oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
     KMP_MB();
 
     // Main wait spin loop
@@ -201,7 +203,7 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
         }
 
         // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput), then yield
-        KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
+        KMP_YIELD(oversubscribed);
        // TODO: Should it be number of cores instead of thread contexts? Like:
        // KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
        // Need performance improvement data to make the change...
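For readers unfamiliar with the failure mode the char padding[64] addresses, the sketch below is a minimal, self-contained illustration of false sharing: two threads increment counters that sit on the same cache line, then counters padded onto separate lines, analogous to padding dispatch_shared_info above. It is not part of this commit; the struct and function names are hypothetical, and the 64-byte line size is an assumption (it matches the CACHE_LINE value discussed in the kmp.h comment, but should be queried on real hardware).

/* False-sharing demo (illustrative only, not OpenMP runtime code).
 * Build: gcc -O2 -pthread false_share.c */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define CACHE_LINE 64          /* assumed line size */
#define ITERS 50000000UL

struct unpadded { volatile unsigned long a, b; }; /* a and b share a line */
struct padded {
    volatile unsigned long a;
    char pad[CACHE_LINE - sizeof(unsigned long)]; /* push b to its own line */
    volatile unsigned long b;
};

static void *bump(void *p) {
    volatile unsigned long *c = p;
    for (unsigned long i = 0; i < ITERS; ++i)
        ++*c;                  /* each write invalidates the line elsewhere */
    return NULL;
}

static double run_pair(volatile unsigned long *x, volatile unsigned long *y) {
    pthread_t t1, t2;
    struct timespec s, e;
    clock_gettime(CLOCK_MONOTONIC, &s);
    pthread_create(&t1, NULL, bump, (void *)x);
    pthread_create(&t2, NULL, bump, (void *)y);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    clock_gettime(CLOCK_MONOTONIC, &e);
    return (e.tv_sec - s.tv_sec) + (e.tv_nsec - s.tv_nsec) * 1e-9;
}

int main(void) {
    struct unpadded u = { 0, 0 };
    struct padded p = { 0, { 0 }, 0 };
    /* Shared line: the two cores ping-pong ownership of one cache line. */
    printf("unpadded: %.2fs\n", run_pair(&u.a, &u.b));
    /* Separate lines: no coherence traffic between the two counters. */
    printf("padded:   %.2fs\n", run_pair(&p.a, &p.b));
    return 0;
}

On a multicore machine the unpadded run is typically several times slower (results depend on where the scheduler places the threads). The kmp_wait_release.h change follows the same performance theme in a different way: the oversubscription test TCR_4(__kmp_nth) > __kmp_avail_proc is hoisted into the oversubscribed local before the spin loop, removing two volatile reads from every iteration, on the assumption that the thread count is effectively stable for the duration of the wait.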

