diff options
author | protze@itc.rwth-aachen.de <protze@itc.rwth-aachen.de> | 2019-12-27 14:39:50 +0100 |
---|---|---|
committer | protze@itc.rwth-aachen.de <protze@itc.rwth-aachen.de> | 2019-12-27 15:30:51 +0100 |
commit | 3356e268f6cdfd4815da70ae4fc6b8db63163375 (patch) | |
tree | a4fcca917560ce36a1d0e948db42c54416ead193 /openmp | |
parent | 69d85f805a3959d297970a4eaa0666b22c4d7e74 (diff) | |
download | bcm5719-llvm-3356e268f6cdfd4815da70ae4fc6b8db63163375.tar.gz bcm5719-llvm-3356e268f6cdfd4815da70ae4fc6b8db63163375.zip |
[OpenMP] Implementation of OMPT reduction callbacks
Including two tests
These callbacks were added late to the 5.0 specification, an implementation is missing.
Reviewed By: jdoerfert
Differential Review: https://reviews.llvm.org/D70395
Diffstat (limited to 'openmp')
-rw-r--r-- | openmp/runtime/src/kmp_barrier.cpp | 14 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_csupport.cpp | 23 | ||||
-rw-r--r-- | openmp/runtime/src/ompt-event-specific.h | 2 | ||||
-rw-r--r-- | openmp/runtime/src/ompt-specific.h | 26 | ||||
-rw-r--r-- | openmp/runtime/test/ompt/callback.h | 36 | ||||
-rw-r--r-- | openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c | 38 | ||||
-rw-r--r-- | openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c | 48 |
7 files changed, 181 insertions, 6 deletions
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index e17986b16a9..a6d87b5d7a2 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -15,9 +15,7 @@ #include "kmp_itt.h" #include "kmp_os.h" #include "kmp_stats.h" -#if OMPT_SUPPORT #include "ompt-specific.h" -#endif #if KMP_MIC #include <immintrin.h> @@ -128,8 +126,11 @@ static bool __kmp_linear_barrier_gather_template( gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, other_threads[i]->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -355,8 +356,11 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -600,8 +604,11 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -912,6 +919,8 @@ static void __kmp_hierarchical_barrier_gather( flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); if (reduce) { ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; ++child_tid) { KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " @@ -923,6 +932,7 @@ static void __kmp_hierarchical_barrier_gather( (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); } + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index d39bf9af433..ac9a93590ad 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -18,10 +18,7 @@ #include "kmp_itt.h" #include "kmp_lock.h" #include "kmp_stats.h" - -#if OMPT_SUPPORT #include "ompt-specific.h" -#endif #define MAX_MESSAGE 512 @@ -3429,13 +3426,18 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + OMPT_REDUCTION_DECL(th, global_tid); if (packed_reduction_method == critical_reduce_block) { + OMPT_REDUCTION_BEGIN; + __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); retval = 1; } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_BEGIN; + // usage: if team size == 1, no synchronization is required ( Intel // platforms only ) retval = 1; @@ -3536,15 +3538,20 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); + OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid); + if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; } else if (packed_reduction_method == empty_reduce_block) { // usage: if team size == 1, no synchronization is required ( on Intel // platforms only ) + OMPT_REDUCTION_END; + } else if (packed_reduction_method == atomic_reduce_block) { // neither master nor other workers should get here @@ -3556,6 +3563,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, tree_reduce_block)) { // only master gets here + // OMPT: tree reduction is annotated in the barrier code } else { @@ -3629,13 +3637,17 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + OMPT_REDUCTION_DECL(th, global_tid); + if (packed_reduction_method == critical_reduce_block) { + OMPT_REDUCTION_BEGIN; __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); retval = 1; } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_BEGIN; // usage: if team size == 1, no synchronization is required ( Intel // platforms only ) retval = 1; @@ -3723,10 +3735,13 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, // this barrier should be visible to a customer and to the threading profile // tool (it's a terminating barrier on constructs if NOWAIT not specified) + OMPT_REDUCTION_DECL(th, global_tid); if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; + // TODO: implicit barrier: should be exposed #if OMPT_SUPPORT ompt_frame_t *ompt_frame; @@ -3749,6 +3764,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_END; + // usage: if team size==1, no synchronization is required (Intel platforms only) // TODO: implicit barrier: should be exposed diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h index da6a0e42472..a5901b51114 100644 --- a/openmp/runtime/src/ompt-event-specific.h +++ b/openmp/runtime/src/ompt-event-specific.h @@ -99,7 +99,7 @@ #define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL -#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL #define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h index 47d8a166984..5ba240c1a95 100644 --- a/openmp/runtime/src/ompt-specific.h +++ b/openmp/runtime/src/ompt-specific.h @@ -15,6 +15,7 @@ #include "kmp.h" +#if OMPT_SUPPORT /***************************************************************************** * forward declarations ****************************************************************************/ @@ -101,5 +102,30 @@ inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) { inline const char *ompt_get_runtime_version() { return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; } +#endif // OMPT_SUPPRORT + +// macros providing the OMPT callbacks for reduction clause +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) \ + ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \ + ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \ + void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); +#define OMPT_REDUCTION_BEGIN \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \ + my_task_data, return_address); \ + } +#define OMPT_REDUCTION_END \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \ + my_task_data, return_address); \ + } +#else // OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) +#define OMPT_REDUCTION_BEGIN +#define OMPT_REDUCTION_END +#endif // ! OMPT_SUPPORT && OMPT_OPTIONAL #endif diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h index 64463ec83db..a5ba897c6ec 100644 --- a/openmp/runtime/test/ompt/callback.h +++ b/openmp/runtime/test/ompt/callback.h @@ -358,6 +358,9 @@ on_ompt_callback_sync_region( printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region\n"); + exit(-1); break; } break; @@ -377,6 +380,9 @@ on_ompt_callback_sync_region( printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region\n"); + exit(-1); break; } break; @@ -409,6 +415,9 @@ on_ompt_callback_sync_region_wait( printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region_wait\n"); + exit(-1); break; } break; @@ -428,12 +437,38 @@ on_ompt_callback_sync_region_wait( printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region_wait\n"); + exit(-1); break; } break; } } +static void on_ompt_callback_reduction(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + switch (endpoint) { + case ompt_scope_begin: + printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); + break; + case ompt_scope_end: + printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); + break; + } +} + static void on_ompt_callback_flush( ompt_data_t *thread_data, @@ -784,6 +819,7 @@ int ompt_initialize( register_callback(ompt_callback_nest_lock); register_callback(ompt_callback_sync_region); register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t); + register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t); register_callback(ompt_callback_control_tool); register_callback(ompt_callback_flush); register_callback(ompt_callback_cancel); diff --git a/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c new file mode 100644 index 00000000000..ca032984d50 --- /dev/null +++ b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c @@ -0,0 +1,38 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc +#include "callback.h" +#include <omp.h> + +#ifdef NOWAIT +#define FOR_CLAUSE nowait +#else +#define FOR_CLAUSE +#endif + +int main() { + int sum = 0; + int i; +#pragma omp parallel num_threads(1) +#pragma omp for reduction(+ : sum) FOR_CLAUSE + for (i = 0; i < 10000; i++) { + sum += i; + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], + // CHECK-SAME: codeptr_ra= + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], + // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra= + + return 0; +} diff --git a/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c new file mode 100644 index 00000000000..2c73fe13900 --- /dev/null +++ b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c @@ -0,0 +1,48 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc +#include "callback.h" +#include <omp.h> + +#ifdef NOWAIT +#define FOR_CLAUSE nowait +#else +#define FOR_CLAUSE +#endif + +int main() { + int sum = 0; + int i; +#pragma omp parallel num_threads(5) +#pragma omp for reduction(+ : sum) FOR_CLAUSE + for (i = 0; i < 10000; i++) { + sum += i; + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]] + + // order and distribution to threads not determined + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + + return 0; +} |