| author | Andrey Churbanov <Andrey.Churbanov@intel.com> | 2018-11-07 12:19:57 +0000 |
|---|---|---|
| committer | Andrey Churbanov <Andrey.Churbanov@intel.com> | 2018-11-07 12:19:57 +0000 |
| commit | c334434550d205e6ab90ebd80d0c4e05482c900b (patch) | |
| tree | f058ba700d1b92121ddd0dacf7808061903e0501 /openmp/runtime/src | |
| parent | 8cc5cf2ee9cd7c5eda5b4aab3560e9ffb6c90dde (diff) | |
Implementation of OpenMP 5.0 mutexinoutset task dependency type.
Differential Revision: https://reviews.llvm.org/D53380
llvm-svn: 346307
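For context on the feature itself: `mutexinoutset` (new in OpenMP 5.0) lets several tasks that update the same object run in any order but never concurrently, while still ordering normally against `in`/`out` dependences on that object. An illustrative user program, not taken from this commit and assuming an OpenMP 5.0-capable compiler (e.g. `clang++ -fopenmp`):

```cpp
#include <cstdio>

int main() {
  int x = 0, y = 2, z = 3;
#pragma omp parallel
#pragma omp single
  {
#pragma omp task depend(out : x) // producer: runs first
    x = 1;
#pragma omp task depend(mutexinoutset : x) // the two updates may run in
    x += y;                                // either order, but never
#pragma omp task depend(mutexinoutset : x) // at the same time
    x += z;
#pragma omp task depend(in : x) // consumer: waits for both updates
    std::printf("x = %d\n", x); // prints x = 6
  }
  return 0;
}
```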
Diffstat (limited to 'openmp/runtime/src')
| -rw-r--r-- | openmp/runtime/src/kmp.h | 20 |
| -rw-r--r-- | openmp/runtime/src/kmp_taskdeps.cpp | 233 |
| -rw-r--r-- | openmp/runtime/src/kmp_taskdeps.h | 5 |
| -rw-r--r-- | openmp/runtime/src/kmp_tasking.cpp | 316 |
4 files changed, 375 insertions, 199 deletions
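One detail of the runtime changes below is easiest to see in isolation before reading the patch: each dependence node gains an array of up to MAX_MTX_DEPS (4) lock pointers, and __kmp_process_deps keeps that array sorted by decreasing address so every task stores, and later acquires, shared locks in the same global order. A minimal sketch of that insertion step, with a hypothetical DepNode/add_mtx_lock standing in for the kmp_* types:

```cpp
#include <cassert>
#include <cstdint>

constexpr int MAX_MTX_DEPS = 4; // mirrors the constant added to kmp.h

struct DepNode {
  std::uintptr_t mtx_locks[MAX_MTX_DEPS] = {}; // lock addresses, largest first
  int mtx_num_locks = 0;
};

// Insert a lock address while keeping the array sorted in decreasing order,
// as the loop in __kmp_process_deps does. A common acquisition order across
// tasks is what rules out the livelock the original comment mentions.
void add_mtx_lock(DepNode &node, std::uintptr_t lock) {
  assert(lock != 0 && node.mtx_num_locks < MAX_MTX_DEPS);
  int m = 0;
  while (node.mtx_locks[m] >= lock)
    ++m; // unused slots are 0, so this stops within bounds
  for (int n = node.mtx_num_locks; n > m; --n)
    node.mtx_locks[n] = node.mtx_locks[n - 1]; // shift smaller entries right
  node.mtx_locks[m] = lock;
  ++node.mtx_num_locks;
}
```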
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index fbb5ab81bf1..9f462a2d85e 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2160,30 +2160,35 @@ typedef union kmp_depnode kmp_depnode_t; typedef struct kmp_depnode_list kmp_depnode_list_t; typedef struct kmp_dephash_entry kmp_dephash_entry_t; +// Compiler sends us this info: typedef struct kmp_depend_info { kmp_intptr_t base_addr; size_t len; struct { bool in : 1; bool out : 1; + bool mtx : 1; } flags; } kmp_depend_info_t; +// Internal structures to work with task dependencies: struct kmp_depnode_list { kmp_depnode_t *node; kmp_depnode_list_t *next; }; -typedef struct kmp_base_depnode { - kmp_depnode_list_t *successors; - kmp_task_t *task; - - kmp_lock_t lock; +// Max number of mutexinoutset dependencies per node +#define MAX_MTX_DEPS 4 +typedef struct kmp_base_depnode { + kmp_depnode_list_t *successors; /* used under lock */ + kmp_task_t *task; /* non-NULL if depnode is active, used under lock */ + kmp_lock_t *mtx_locks[MAX_MTX_DEPS]; /* lock mutexinoutset dependent tasks */ + kmp_int32 mtx_num_locks; /* number of locks in mtx_locks array */ + kmp_lock_t lock; /* guards shared fields: task, successors */ #if KMP_SUPPORT_GRAPH_OUTPUT kmp_uint32 id; #endif - std::atomic<kmp_int32> npredecessors; std::atomic<kmp_int32> nrefs; } kmp_base_depnode_t; @@ -2198,6 +2203,9 @@ struct kmp_dephash_entry { kmp_intptr_t addr; kmp_depnode_t *last_out; kmp_depnode_list_t *last_ins; + kmp_depnode_list_t *last_mtxs; + kmp_int32 last_flag; + kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */ kmp_dephash_entry_t *next_in_bucket; }; diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp index b5d53f1ae68..b48c5b633ce 100644 --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -37,12 +37,14 @@ static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0); #endif static void __kmp_init_node(kmp_depnode_t *node) { - node->dn.task = NULL; // set to null initially, it will point to the right - // task once dependences have been processed node->dn.successors = NULL; + node->dn.task = NULL; // will point to the rigth task + // once dependences have been processed + for (int i = 0; i < MAX_MTX_DEPS; ++i) + node->dn.mtx_locks[i] = NULL; + node->dn.mtx_num_locks = 0; __kmp_init_lock(&node->dn.lock); - KMP_ATOMIC_ST_RLX(&node->dn.nrefs, - 1); // init creates the first reference to the node + KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference #ifdef KMP_SUPPORT_GRAPH_OUTPUT node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed); #endif @@ -94,6 +96,9 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread, return h; } +#define ENTRY_LAST_INS 0 +#define ENTRY_LAST_MTXS 1 + static kmp_dephash_entry * __kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr) { kmp_int32 bucket = __kmp_dephash_hash(addr, h->size); @@ -115,6 +120,9 @@ __kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr) { entry->addr = addr; entry->last_out = NULL; entry->last_ins = NULL; + entry->last_mtxs = NULL; + entry->last_flag = ENTRY_LAST_INS; + entry->mtx_lock = NULL; entry->next_in_bucket = h->buckets[bucket]; h->buckets[bucket] = entry; #ifdef KMP_DEBUG @@ -173,6 +181,58 @@ static inline void __kmp_track_dependence(kmp_depnode_t *source, #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ } +static inline kmp_int32 +__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, + kmp_task_t *task, 
kmp_depnode_t *node, + kmp_depnode_list_t *plist) { + if (!plist) + return 0; + kmp_int32 npredecessors = 0; + // link node as successor of list elements + for (kmp_depnode_list_t *p = plist; p; p = p->next) { + kmp_depnode_t *dep = p->node; + if (dep->dn.task) { + KMP_ACQUIRE_DEPNODE(gtid, dep); + if (dep->dn.task) { + __kmp_track_dependence(dep, node, task); + dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + "%p\n", + gtid, KMP_TASK_TO_TASKDATA(dep->dn.task), + KMP_TASK_TO_TASKDATA(task))); + npredecessors++; + } + KMP_RELEASE_DEPNODE(gtid, dep); + } + } + return npredecessors; +} + +static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, + kmp_info_t *thread, + kmp_task_t *task, + kmp_depnode_t *source, + kmp_depnode_t *sink) { + if (!sink) + return 0; + kmp_int32 npredecessors = 0; + if (sink->dn.task) { + // synchronously add source to sink' list of successors + KMP_ACQUIRE_DEPNODE(gtid, sink); + if (sink->dn.task) { + __kmp_track_dependence(sink, source, task); + sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + "%p\n", + gtid, KMP_TASK_TO_TASKDATA(sink->dn.task), + KMP_TASK_TO_TASKDATA(task))); + npredecessors++; + } + KMP_RELEASE_DEPNODE(gtid, sink); + } + return npredecessors; +} + template <bool filter> static inline kmp_int32 __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, @@ -187,72 +247,106 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, for (kmp_int32 i = 0; i < ndeps; i++) { const kmp_depend_info_t *dep = &dep_list[i]; - KMP_DEBUG_ASSERT(dep->flags.in); - if (filter && dep->base_addr == 0) continue; // skip filtered entries kmp_dephash_entry_t *info = __kmp_dephash_find(thread, hash, dep->base_addr); kmp_depnode_t *last_out = info->last_out; - - if (dep->flags.out && info->last_ins) { - for (kmp_depnode_list_t *p = info->last_ins; p; p = p->next) { - kmp_depnode_t *indep = p->node; - if (indep->dn.task) { - KMP_ACQUIRE_DEPNODE(gtid, indep); - if (indep->dn.task) { - __kmp_track_dependence(indep, node, task); - indep->dn.successors = - __kmp_add_node(thread, indep->dn.successors, node); - KA_TRACE(40, ("__kmp_process_deps<%d>: T#%d adding dependence from " - "%p to %p\n", - filter, gtid, KMP_TASK_TO_TASKDATA(indep->dn.task), - KMP_TASK_TO_TASKDATA(task))); - npredecessors++; - } - KMP_RELEASE_DEPNODE(gtid, indep); + kmp_depnode_list_t *last_ins = info->last_ins; + kmp_depnode_list_t *last_mtxs = info->last_mtxs; + + if (dep->flags.out) { // out --> clean lists of ins and mtxs if any + if (last_ins || last_mtxs) { + if (info->last_flag == ENTRY_LAST_INS) { // INS were last + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_ins); + } else { // MTXS were last + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs); } + __kmp_depnode_list_free(thread, last_ins); + __kmp_depnode_list_free(thread, last_mtxs); + info->last_ins = NULL; + info->last_mtxs = NULL; + } else { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); } - - __kmp_depnode_list_free(thread, info->last_ins); - info->last_ins = NULL; - - } else if (last_out && last_out->dn.task) { - KMP_ACQUIRE_DEPNODE(gtid, last_out); - if (last_out->dn.task) { - __kmp_track_dependence(last_out, node, task); - last_out->dn.successors = - __kmp_add_node(thread, 
last_out->dn.successors, node); - KA_TRACE( - 40, - ("__kmp_process_deps<%d>: T#%d adding dependence from %p to %p\n", - filter, gtid, KMP_TASK_TO_TASKDATA(last_out->dn.task), - KMP_TASK_TO_TASKDATA(task))); - - npredecessors++; - } - KMP_RELEASE_DEPNODE(gtid, last_out); - } - - if (dep_barrier) { - // if this is a sync point in the serial sequence, then the previous - // outputs are guaranteed to be completed after - // the execution of this task so the previous output nodes can be cleared. __kmp_node_deref(thread, last_out); - info->last_out = NULL; + if (dep_barrier) { + // if this is a sync point in the serial sequence, then the previous + // outputs are guaranteed to be completed after the execution of this + // task so the previous output nodes can be cleared. + info->last_out = NULL; + } else { + info->last_out = __kmp_node_ref(node); + } + } else if (dep->flags.in) { + // in --> link node to either last_out or last_mtxs, clean earlier deps + if (last_mtxs) { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs); + __kmp_node_deref(thread, last_out); + info->last_out = NULL; + if (info->last_flag == ENTRY_LAST_MTXS && last_ins) { // MTXS were last + // clean old INS before creating new list + __kmp_depnode_list_free(thread, last_ins); + info->last_ins = NULL; + } + } else { + // link node as successor of the last_out if any + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + } + info->last_flag = ENTRY_LAST_INS; + info->last_ins = __kmp_add_node(thread, info->last_ins, node); } else { - if (dep->flags.out) { + KMP_DEBUG_ASSERT(dep->flags.mtx == 1); + // mtx --> link node to either last_out or last_ins, clean earlier deps + if (last_ins) { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_ins); __kmp_node_deref(thread, last_out); - info->last_out = __kmp_node_ref(node); - } else - info->last_ins = __kmp_add_node(thread, info->last_ins, node); + info->last_out = NULL; + if (info->last_flag == ENTRY_LAST_INS && last_mtxs) { // INS were last + // clean old MTXS before creating new list + __kmp_depnode_list_free(thread, last_mtxs); + info->last_mtxs = NULL; + } + } else { + // link node as successor of the last_out if any + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + } + info->last_flag = ENTRY_LAST_MTXS; + info->last_mtxs = __kmp_add_node(thread, info->last_mtxs, node); + if (info->mtx_lock == NULL) { + info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); + __kmp_init_lock(info->mtx_lock); + } + KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS); + kmp_int32 m; + // Save lock in node's array + for (m = 0; m < MAX_MTX_DEPS; ++m) { + // sort pointers in decreasing order to avoid potential livelock + if (node->dn.mtx_locks[m] < info->mtx_lock) { + KMP_DEBUG_ASSERT(node->dn.mtx_locks[node->dn.mtx_num_locks] == NULL); + for (int n = node->dn.mtx_num_locks; n > m; --n) { + // shift right all lesser non-NULL pointers + KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL); + node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1]; + } + node->dn.mtx_locks[m] = info->mtx_lock; + break; + } + } + KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop + node->dn.mtx_num_locks++; } } - KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter, gtid, npredecessors)); - return npredecessors; } @@ -266,8 +360,7 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_depend_info_t *dep_list, kmp_int32 
ndeps_noalias, kmp_depend_info_t *noalias_dep_list) { - int i; - + int i, n_mtxs = 0; #if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); #endif @@ -279,13 +372,31 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, // Filter deps in dep_list // TODO: Different algorithm for large dep_list ( > 10 ? ) for (i = 0; i < ndeps; i++) { - if (dep_list[i].base_addr != 0) - for (int j = i + 1; j < ndeps; j++) + if (dep_list[i].base_addr != 0) { + for (int j = i + 1; j < ndeps; j++) { if (dep_list[i].base_addr == dep_list[j].base_addr) { dep_list[i].flags.in |= dep_list[j].flags.in; - dep_list[i].flags.out |= dep_list[j].flags.out; + dep_list[i].flags.out |= + (dep_list[j].flags.out || + (dep_list[i].flags.in && dep_list[j].flags.mtx) || + (dep_list[i].flags.mtx && dep_list[j].flags.in)); + dep_list[i].flags.mtx = + dep_list[i].flags.mtx | dep_list[j].flags.mtx && + !dep_list[i].flags.out; dep_list[j].base_addr = 0; // Mark j element as void } + } + if (dep_list[i].flags.mtx) { + // limit number of mtx deps to MAX_MTX_DEPS per node + if (n_mtxs < MAX_MTX_DEPS && task != NULL) { + ++n_mtxs; + } else { + dep_list[i].flags.in = 1; // downgrade mutexinoutset to inout + dep_list[i].flags.out = 1; + dep_list[i].flags.mtx = 0; + } + } + } } // doesn't need to be atomic as no other thread is going to be accessing this @@ -469,7 +580,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking " "dependencies : " - "loc=%p task=%p, transferring to __kmpc_omp_task\n", + "loc=%p task=%p, transferring to __kmp_omp_task\n", gtid, loc_ref, new_taskdata)); kmp_int32 ret = __kmp_omp_task(gtid, new_task, true); diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h index 2e79b1cafc9..8496884d1c3 100644 --- a/openmp/runtime/src/kmp_taskdeps.h +++ b/openmp/runtime/src/kmp_taskdeps.h @@ -62,7 +62,12 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread, for (kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next) { next = entry->next_in_bucket; __kmp_depnode_list_free(thread, entry->last_ins); + __kmp_depnode_list_free(thread, entry->last_mtxs); __kmp_node_deref(thread, entry->last_out); + if (entry->mtx_lock) { + __kmp_destroy_lock(entry->mtx_lock); + __kmp_free(entry->mtx_lock); + } #if USE_FAST_MEMORY __kmp_fast_free(thread, entry); #else diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index d2b5ddeb2ff..2d7468659c5 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -32,7 +32,7 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread, static int __kmp_realloc_task_threads_data(kmp_info_t *thread, kmp_task_team_t *task_team); -#ifdef OMP_45_ENABLED +#if OMP_45_ENABLED static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); #endif @@ -251,6 +251,79 @@ static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, } #endif /* BUILD_TIED_TASK_STACK */ +// returns 1 if new task is allowed to execute, 0 otherwise +// checks Task Scheduling constraint (if requested) and +// mutexinoutset dependencies if any +static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, + const kmp_taskdata_t *tasknew, + const kmp_taskdata_t *taskcurr) { + if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) { + // Check if the candidate obeys the Task Scheduling Constraints (TSC) + // only descendant of all deferred tied tasks can be 
scheduled, checking + // the last one is enough, as it in turn is the descendant of all others + kmp_taskdata_t *current = taskcurr->td_last_tied; + KMP_DEBUG_ASSERT(current != NULL); + // check if the task is not suspended on barrier + if (current->td_flags.tasktype == TASK_EXPLICIT || + current->td_taskwait_thread > 0) { // <= 0 on barrier + kmp_int32 level = current->td_level; + kmp_taskdata_t *parent = tasknew->td_parent; + while (parent != current && parent->td_level > level) { + // check generation up to the level of the current task + parent = parent->td_parent; + KMP_DEBUG_ASSERT(parent != NULL); + } + if (parent != current) + return false; + } + } + // Check mutexinoutset dependencies, acquire locks + kmp_depnode_t *node = tasknew->td_depnode; + if (node && (node->dn.mtx_num_locks > 0)) { + for (int i = 0; i < node->dn.mtx_num_locks; ++i) { + KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); + if (__kmp_test_lock(node->dn.mtx_locks[i], gtid)) + continue; + // could not get the lock, release previous locks + for (int j = i - 1; j >= 0; --j) + __kmp_release_lock(node->dn.mtx_locks[j], gtid); + return false; + } + // negative num_locks means all locks acquired successfully + node->dn.mtx_num_locks = -node->dn.mtx_num_locks; + } + return true; +} + +// __kmp_realloc_task_deque: +// Re-allocates a task deque for a particular thread, copies the content from +// the old deque and adjusts the necessary data structures relating to the +// deque. This operation must be done with the deque_lock being held +static void __kmp_realloc_task_deque(kmp_info_t *thread, + kmp_thread_data_t *thread_data) { + kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); + kmp_int32 new_size = 2 * size; + + KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " + "%d] for thread_data %p\n", + __kmp_gtid_from_thread(thread), size, new_size, thread_data)); + + kmp_taskdata_t **new_deque = + (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); + + int i, j; + for (i = thread_data->td.td_deque_head, j = 0; j < size; + i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) + new_deque[j] = thread_data->td.td_deque[i]; + + __kmp_free(thread_data->td.td_deque); + + thread_data->td.td_deque_head = 0; + thread_data->td.td_deque_tail = size; + thread_data->td.td_deque = new_deque; + thread_data->td.td_deque_size = new_size; +} + // __kmp_push_task: Add a task to the thread's deque static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; @@ -298,33 +371,47 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { __kmp_alloc_task_deque(thread, thread_data); } + int locked = 0; // Check if deque is full if (TCR_4(thread_data->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td)) { - KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " - "TASK_NOT_PUSHED for task %p\n", - gtid, taskdata)); - return TASK_NOT_PUSHED; + if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + thread->th.th_current_task)) { + KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " + "TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } else { + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + locked = 1; + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } } - // Lock the deque for the task push operation - __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); - + if (!locked) { + 
__kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); #if OMP_45_ENABLED - // Need to recheck as we can get a proxy task from a thread outside of OpenMP - if (TCR_4(thread_data->td.td_deque_ntasks) >= - TASK_DEQUE_SIZE(thread_data->td)) { - __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); - KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning " - "TASK_NOT_PUSHED for task %p\n", - gtid, taskdata)); - return TASK_NOT_PUSHED; + // Need to recheck as we can get a proxy task from thread outside of OpenMP + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + thread->th.th_current_task)) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; " + "returning TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } else { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } + } +#endif } -#else // Must have room since no thread can add tasks but calling thread KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td)); -#endif thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; // Push taskdata @@ -678,11 +765,31 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid, taskdata = parent_taskdata; + if (team_serial) + return; // Stop checking ancestors at implicit task instead of walking up ancestor // tree to avoid premature deallocation of ancestors. - if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT) + if (taskdata->td_flags.tasktype == TASK_IMPLICIT) { + if (taskdata->td_dephash) { // do we need to cleanup dephash? 
+ int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks); + kmp_tasking_flags_t flags_old = taskdata->td_flags; + if (children == 0 && flags_old.complete == 1) { + kmp_tasking_flags_t flags_new = flags_old; + flags_new.complete = 0; + if (KMP_COMPARE_AND_STORE_ACQ32( + RCAST(kmp_int32 *, &taskdata->td_flags), + *RCAST(kmp_int32 *, &flags_old), + *RCAST(kmp_int32 *, &flags_new))) { + KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans " + "dephash of implicit task %p\n", + gtid, taskdata)); + // cleanup dephash of finished implicit task + __kmp_dephash_free_entries(thread, taskdata->td_dephash); + } + } + } return; - + } // Predecrement simulated by "- 1" calculation children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; KMP_DEBUG_ASSERT(children >= 0); @@ -750,6 +857,17 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, __ompt_task_finish(task, resumed_task); #endif + // Check mutexinoutset dependencies, release locks + kmp_depnode_t *node = taskdata->td_depnode; + if (node && (node->dn.mtx_num_locks < 0)) { + // negative num_locks means all locks were acquired + node->dn.mtx_num_locks = -node->dn.mtx_num_locks; + for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) { + KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); + __kmp_release_lock(node->dn.mtx_locks[i], gtid); + } + } + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); taskdata->td_flags.complete = 1; // mark the task as completed KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); @@ -976,8 +1094,24 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, // thread: thread data structure corresponding to implicit task void __kmp_finish_implicit_task(kmp_info_t *thread) { kmp_taskdata_t *task = thread->th.th_current_task; - if (task->td_dephash) - __kmp_dephash_free_entries(thread, task->td_dephash); + if (task->td_dephash) { + int children; + task->td_flags.complete = 1; + children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks); + kmp_tasking_flags_t flags_old = task->td_flags; + if (children == 0 && flags_old.complete == 1) { + kmp_tasking_flags_t flags_new = flags_old; + flags_new.complete = 0; + if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags), + *RCAST(kmp_int32 *, &flags_old), + *RCAST(kmp_int32 *, &flags_new))) { + KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans " + "dephash of implicit task %p\n", + thread->th.th_info.ds.ds_gtid, task)); + __kmp_dephash_free_entries(thread, task->td_dephash); + } + } + } } // __kmp_free_implicit_task: Release resources associated to implicit tasks @@ -2229,33 +2363,16 @@ static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, TASK_DEQUE_MASK(thread_data->td); // Wrap index. 
taskdata = thread_data->td.td_deque[tail]; - if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { - // we need to check if the candidate obeys task scheduling constraint (TSC) - // only descendant of all deferred tied tasks can be scheduled, checking - // the last one is enough, as it in turn is the descendant of all others - kmp_taskdata_t *current = thread->th.th_current_task->td_last_tied; - KMP_DEBUG_ASSERT(current != NULL); - // check if last tied task is not suspended on barrier - if (current->td_flags.tasktype == TASK_EXPLICIT || - current->td_taskwait_thread > 0) { // <= 0 on barrier - kmp_int32 level = current->td_level; - kmp_taskdata_t *parent = taskdata->td_parent; - while (parent != current && parent->td_level > level) { - parent = parent->td_parent; // check generation up to the level of the - // current task - KMP_DEBUG_ASSERT(parent != NULL); - } - if (parent != current) { - // The TSC does not allow to steal victim task - __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); - KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " - "ntasks=%d head=%u tail=%u\n", - gtid, thread_data->td.td_deque_ntasks, - thread_data->td.td_deque_head, - thread_data->td.td_deque_tail)); - return NULL; - } - } + if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, + thread->th.th_current_task)) { + // The TSC does not allow to steal victim task + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(10, + ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " + "ntasks=%d head=%u tail=%u\n", + gtid, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + return NULL; } thread_data->td.td_deque_tail = tail; @@ -2263,7 +2380,7 @@ static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); - KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: " + KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " "ntasks=%d head=%u tail=%u\n", gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); @@ -2284,7 +2401,7 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, kmp_taskdata_t *taskdata; kmp_taskdata_t *current; kmp_thread_data_t *victim_td, *threads_data; - kmp_int32 level, target; + kmp_int32 target; kmp_int32 victim_tid; KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); @@ -2324,69 +2441,33 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, } KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); - + current = __kmp_threads[gtid]->th.th_current_task; taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; - if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { - // we need to check if the candidate obeys task scheduling constraint (TSC) - // only descendant of all deferred tied tasks can be scheduled, checking - // the last one is enough, as it in turn is the descendant of all others - current = __kmp_threads[gtid]->th.th_current_task->td_last_tied; - KMP_DEBUG_ASSERT(current != NULL); - // check if last tied task is not suspended on barrier - if (current->td_flags.tasktype == TASK_EXPLICIT || - current->td_taskwait_thread > 0) { // <= 0 on barrier - level = current->td_level; - kmp_taskdata_t *parent = taskdata->td_parent; - while (parent != current && parent->td_level > level) { - parent = parent->td_parent; // check generation up to 
the level of the - // current task - KMP_DEBUG_ASSERT(parent != NULL); - } - if (parent != current) { - if (!task_team->tt.tt_untied_task_encountered) { - // The TSC does not allow to steal victim task - __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); - KA_TRACE(10, - ("__kmp_steal_task(exit #3): T#%d could not steal from " - "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", - gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, - victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); - return NULL; - } - taskdata = NULL; // will check other tasks in victim's deque - } - } - } - if (taskdata != NULL) { + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { // Bump head pointer and Wrap. victim_td->td.td_deque_head = (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); } else { + if (!task_team->tt.tt_untied_task_encountered) { + // The TSC does not allow to steal victim task + __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); + KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from " + "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, + victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); + return NULL; + } int i; // walk through victim's deque trying to steal any task target = victim_td->td.td_deque_head; + taskdata = NULL; for (i = 1; i < ntasks; ++i) { target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); taskdata = victim_td->td.td_deque[target]; - if (taskdata->td_flags.tiedness == TASK_TIED) { - // check if the candidate obeys the TSC - kmp_taskdata_t *parent = taskdata->td_parent; - // check generation up to the level of the current task - while (parent != current && parent->td_level > level) { - parent = parent->td_parent; - KMP_DEBUG_ASSERT(parent != NULL); - } - if (parent != current) { - // The TSC does not allow to steal the candidate - taskdata = NULL; - continue; - } else { - // found victim tied task - break; - } + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + break; // found victim task } else { - // found victim untied task - break; + taskdata = NULL; } } if (taskdata == NULL) { @@ -2834,35 +2915,6 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread, thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; } -// __kmp_realloc_task_deque: -// Re-allocates a task deque for a particular thread, copies the content from -// the old deque and adjusts the necessary data structures relating to the -// deque. This operation must be done with a the deque_lock being held -static void __kmp_realloc_task_deque(kmp_info_t *thread, - kmp_thread_data_t *thread_data) { - kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); - kmp_int32 new_size = 2 * size; - - KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " - "%d] for thread_data %p\n", - __kmp_gtid_from_thread(thread), size, new_size, thread_data)); - - kmp_taskdata_t **new_deque = - (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); - - int i, j; - for (i = thread_data->td.td_deque_head, j = 0; j < size; - i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) - new_deque[j] = thread_data->td.td_deque[i]; - - __kmp_free(thread_data->td.td_deque); - - thread_data->td.td_deque_head = 0; - thread_data->td.td_deque_tail = size; - thread_data->td.td_deque = new_deque; - thread_data->td.td_deque_size = new_size; -} - // __kmp_free_task_deque: // Deallocates a task deque for a particular thread. 
Happens at library // deallocation so don't need to reset all thread data fields. @@ -3422,7 +3474,7 @@ release_and_exit: /* The finish of the proxy tasks is divided in two pieces: - the top half is the one that can be done from a thread outside the team - - the bottom half must be run from a them within the team + - the bottom half must be run from a thread within the team In order to run the bottom half the task gets queued back into one of the threads of the team. Once the td_incomplete_child_task counter of the parent |
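The execution side pairs with that ordering: the new __kmp_task_is_allowed helper try-locks a node's mutexinoutset locks in their stored order and backs out completely if any is busy, then negates mtx_num_locks to record that the locks are held; __kmp_task_finish later restores the count and releases the locks in reverse. A simplified all-or-nothing sketch of that protocol, using std::mutex and hypothetical try_acquire_all/release_all names in place of the kmp lock API:

```cpp
#include <mutex>
#include <vector>

struct MtxNode {
  std::vector<std::mutex *> locks; // sorted, as in the previous sketch
  int num_locks = 0;               // negated once all locks are held
};

// Mirrors the mutexinoutset part of __kmp_task_is_allowed: either every lock
// is taken, or none is, so a task never starts half-locked.
bool try_acquire_all(MtxNode &node) {
  for (int i = 0; i < node.num_locks; ++i) {
    if (node.locks[i]->try_lock())
      continue;
    for (int j = i - 1; j >= 0; --j) // busy: release what we already hold
      node.locks[j]->unlock();
    return false; // the task is not allowed to run yet
  }
  node.num_locks = -node.num_locks; // negative count == "locks are held"
  return true;
}

// Mirrors the release in __kmp_task_finish, run when the task completes.
void release_all(MtxNode &node) {
  if (node.num_locks >= 0)
    return;                         // locks were never acquired
  node.num_locks = -node.num_locks; // restore the positive count
  for (int i = node.num_locks - 1; i >= 0; --i)
    node.locks[i]->unlock();        // release in reverse order
}
```

A task whose locks cannot be taken must not run immediately, which is why __kmp_push_task in this patch grows a full deque and queues such a task instead of returning TASK_NOT_PUSHED for it.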

