diff options
author | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2019-04-04 20:35:29 +0000 |
---|---|---|
committer | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2019-04-04 20:35:29 +0000 |
commit | b727d384a3ad0fe9ac1901e6da54c593b2c1a491 (patch) | |
tree | 984f0c9ee1942a6a1cd1d0bf68721ce1169734f7 /openmp/runtime/src | |
parent | 19d21854e92ed3acdeee17a6c2dd106f9e9dd058 (diff) | |
download | bcm5719-llvm-b727d384a3ad0fe9ac1901e6da54c593b2c1a491.tar.gz bcm5719-llvm-b727d384a3ad0fe9ac1901e6da54c593b2c1a491.zip |
[OpenMP] Fix hang on Windows
A debug dump from a large machine shows that when many OpenMP threads (401 in
total) sleep on a barrier, a thread at one of the innermost nesting levels sleeps
on a child's b_arrived flag whose value is 4, which is also the
checker value; i.e., (1) the sleep bit is 0, and (2) done_check() would
return true if called.
It is unclear how this can happen. It could be a bug in Windows Server 2016's
EnterCriticalSection / LeaveCriticalSection, or
in WaitForSingleObject / SetEvent / ResetEvent, or
a bug in the library that is very difficult to find.
As a workaround, change the INFINITE wait to a timed wait, so that each
thread wakes up every 5 seconds (the timeout was chosen arbitrarily so as not to
disturb other threads much), checks the flag condition under the lock, and
either goes back to sleep or stops sleeping as a result of the check.
Patch by Andrey Churbanov
Differential Revision: https://reviews.llvm.org/D59793
llvm-svn: 357722
Diffstat (limited to 'openmp/runtime/src')
-rw-r--r-- | openmp/runtime/src/z_Windows_NT_util.cpp | 67 |
1 files changed, 46 insertions, 21 deletions
diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp index 0049ca830a6..33b018174b6 100644 --- a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -193,8 +193,9 @@ void __kmp_win32_cond_destroy(kmp_win32_cond_t *cv) { /* TODO associate cv with a team instead of a thread so as to optimize the case where we wake up a whole team */ -void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx, - kmp_info_t *th, int need_decrease_load) { +template <class C> +static void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx, + kmp_info_t *th, C *flag) { int my_generation; int last_waiter; @@ -211,21 +212,46 @@ void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx, __kmp_win32_mutex_unlock(mx); for (;;) { - int wait_done; - + int wait_done = 0; + DWORD res, timeout = 5000; // just tried to quess an appropriate number /* Wait until the event is signaled */ - WaitForSingleObject(cv->event_, INFINITE); - - __kmp_win32_mutex_lock(&cv->waiters_count_lock_); - - /* Exit the loop when the <cv->event_> is signaled and there are still - waiting threads from this <wait_generation> that haven't been released - from this wait yet. */ - wait_done = (cv->release_count_ > 0) && - (cv->wait_generation_count_ != my_generation); - - __kmp_win32_mutex_unlock(&cv->waiters_count_lock_); - + res = WaitForSingleObject(cv->event_, timeout); + + if (res == WAIT_OBJECT_0) { + // event signaled + __kmp_win32_mutex_lock(&cv->waiters_count_lock_); + /* Exit the loop when the <cv->event_> is signaled and there are still + waiting threads from this <wait_generation> that haven't been released + from this wait yet. 
*/ + wait_done = (cv->release_count_ > 0) && + (cv->wait_generation_count_ != my_generation); + __kmp_win32_mutex_unlock(&cv->waiters_count_lock_); + } else if (res == WAIT_TIMEOUT || res == WAIT_FAILED) { + // check if the flag and cv counters are in consistent state + // as MS sent us debug dump whith inconsistent state of data + __kmp_win32_mutex_lock(mx); + typename C::flag_t old_f = flag->set_sleeping(); + if (!flag->done_check_val(old_f & ~KMP_BARRIER_SLEEP_STATE)) { + __kmp_win32_mutex_unlock(mx); + continue; + } + // condition fulfilled, exiting + old_f = flag->unset_sleeping(); + KMP_DEBUG_ASSERT(old_f & KMP_BARRIER_SLEEP_STATE); + TCW_PTR(th->th.th_sleep_loc, NULL); + KF_TRACE(50, ("__kmp_win32_cond_wait: exiting, condition " + "fulfilled: flag's loc(%p): %u => %u\n", + flag->get(), old_f, *(flag->get()))); + + __kmp_win32_mutex_lock(&cv->waiters_count_lock_); + KMP_DEBUG_ASSERT(cv->waiters_count_ > 0); + cv->release_count_ = cv->waiters_count_; + cv->wait_generation_count_++; + wait_done = 1; + __kmp_win32_mutex_unlock(&cv->waiters_count_lock_); + + __kmp_win32_mutex_unlock(mx); + } /* there used to be a semicolon after the if statement, it looked like a bug, so i removed it */ if (wait_done) @@ -377,12 +403,11 @@ static inline void __kmp_suspend_template(int th_gtid, C *flag) { KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); } deactivated = TRUE; - - __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, 0, - 0); + __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, th, + flag); } else { - __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, 0, - 0); + __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, th, + flag); } #ifdef KMP_DEBUG |