// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4

// Test of nested taskgroup task reductions driven directly through the libomp
// entry points (__kmpc_task_reduction_init / __kmpc_task_reduction_get_th_data),
// mimicking the code a compiler would generate for the pseudo-code below.
// Parallel results are compared against a serial reference (calc_serial).

// NOTE(review): the original header names were lost in this copy
// ("#include" with no target); the list below is reconstructed to cover every
// library name used here: printf (cstdio), malloc/free (cstdlib),
// fabs (cmath), omp_* (omp.h). Confirm against the upstream test.
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <omp.h>

// Total number of loop iterations, should be multiple of T for this test
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif

/*
// initial user's code that corresponds to pseudo code of the test
#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
{
  for( int l = 0; l < N; ++l )
  {
    #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
    {
      i += l;
      if( l%2 )
        x *= 1.0 / (l + 1);
      else
        x *= (l + 1);
    }
  }

  #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
  {
    for( int l = 0; l < N; ++l )
    {
      #pragma omp task firstprivate(l) in_reduction(+:j,y) \
          in_reduction(*:x) in_reduction(-:k)
      {
        j += l;
        k -= l;
        y += (double)l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
      #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
      {
        i -= l;
        k -= l;
        y += (double)l;
      }
      #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
      {
        j += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }
  } // inner reduction

  for( int l = 0; l < N; ++l )
  {
    #pragma omp task firstprivate(l) in_reduction(+:j)
      j += l;
  }
} // outer reduction
*/

//------------------------------------------------
// OpenMP runtime library routines
#ifdef __cplusplus
extern "C" {
#endif
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif

//------------------------------------------------
// Compiler-generated code
// Descriptor of one reduction item, matching the layout the runtime expects
// from __kmpc_task_reduction_init.
typedef struct _task_red_item {
  void *shar;     // shared reduction item
  size_t size;    // size of data item
  void *f_init;   // data initialization routine
  void *f_fini;   // data finalization routine
  void *f_comb;   // data combiner routine
  unsigned flags; // lazy/eager allocation request (FLG)
} _task_red_item_t;

// int:+  no need in init/fini callbacks, valid for subtraction
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{ *(int*)lhs += *(int*)rhs; }

// long long:+  no need in init/fini callbacks, valid for subtraction
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{ *(long long*)lhs += *(long long*)rhs; }

// double:*  no need in fini callback
void __red_dbl_mul_init(void *data) // initializer
{ *(double*)data = 1.0; }
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs *= *(double*)rhs; }

// double:+  no need in init/fini callbacks
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{ *(double*)lhs += *(double*)rhs; }

// ==============================

// Serial reference: performs exactly the same updates the parallel tasks do,
// so the final (i, j, x, k, y) values can be compared directly.
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
{
  for (int l = 0; l < N; ++l) {
    *pi += l;
    if (l % 2)
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for (int l = 0; l < N; ++l) {
    *pj += l;
    *pk -= l;
    *py += (double)l;
    if (l % 2)
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);

    *pi -= l;
    *pk -= l;
    *py += (double)l;

    *pj += l;
    if (l % 2)
      *px *= 1.0 / (l + 1);
    else
      *px *= (l + 1);
  }
  for (int l = 0; l < N; ++l) {
    *pj += l;
  }
}

//------------------------------------------------
// Test case
int main()
{
  int nthreads = omp_get_max_threads();
  int err = 0;
  // one thread-specific reduction-object address per thread, used below to
  // verify __kmpc_task_reduction_get_th_data lookups across taskgroups
  void **ptrs = (void**)malloc(nthreads * sizeof(void*));

  // user's code ======================================
  // variables for serial calculations:
  int is = 3;
  long long js = -9999999;
  double xs = 99999.0;
  long long ks = 99999999;
  double ys = -99999999.0;
  // variables for parallel calculations:
  int ip = 3;
  long long jp = -9999999;
  double xp = 99999.0;
  long long kp = 99999999;
  double yp = -99999999.0;

  calc_serial(&is, &js, &xs, &ks, &ys);
  // ==================================================
  for (int i = 0; i < nthreads; ++i)
    ptrs[i] = NULL;

  #pragma omp parallel
  {
    #pragma omp single nowait
    {
      // outer taskgroup reduces (i,j,x)
      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
      {
        _task_red_item_t red_data[3];
        red_data[0].shar = &ip;
        red_data[0].size = sizeof(ip);
        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
        red_data[0].f_fini = NULL; // no destructors needed
        red_data[0].f_comb = (void*)&__red_int_add_comb;
        red_data[0].flags = FLG;
        red_data[1].shar = &jp;
        red_data[1].size = sizeof(jp);
        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
        red_data[1].f_fini = NULL; // no destructors needed
        red_data[1].f_comb = (void*)&__red_llong_add_comb;
        red_data[1].flags = FLG;
        red_data[2].shar = &xp;
        red_data[2].size = sizeof(xp);
        red_data[2].f_init = (void*)&__red_dbl_mul_init;
        red_data[2].f_fini = NULL; // no destructors needed
        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
        red_data[2].flags = FLG;
        int gtid = __kmpc_global_thread_num(NULL);
        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
        for (int l = 0; l < N; l += 2) {
          // 2 iterations per task to get correct x value; actually any even
          // number of iters per task will work, otherwise x loses precision
          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                               gtid, tg1, &xp);
            if (!ptrs[gtid])
              ptrs[gtid] = p_xp; // remember this thread's private x object
            // user's pseudo-code ==============================
            *p_ip += l;
            *p_xp *= (l + 1);

            *p_ip += l + 1;
            *p_xp *= 1.0 / (l + 2);
            // ==================================================
          }
        }
        // inner taskgroup reduces (i,k,y), i is same object as in outer one
        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
        {
          _task_red_item_t red_data[3];
          red_data[0].shar = &ip;
          red_data[0].size = sizeof(ip);
          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
          red_data[0].f_fini = NULL; // no destructors needed
          red_data[0].f_comb = (void*)&__red_int_add_comb;
          red_data[0].flags = FLG;
          red_data[1].shar = &kp;
          red_data[1].size = sizeof(kp);
          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
          red_data[1].f_fini = NULL; // no destructors needed
          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
          red_data[1].flags = FLG;
          red_data[2].shar = &yp;
          red_data[2].size = sizeof(yp);
          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
          red_data[2].f_fini = NULL; // no destructors needed
          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
          red_data[2].flags = FLG;
          int gtid = __kmpc_global_thread_num(NULL);
          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
          for (int l = 0; l < N; l += 2) {
            #pragma omp task firstprivate(l) // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              // j and x live in the outer taskgroup (tg1), k and y in the
              // inner one (tg2)
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                    gtid, tg1, &jp);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                    gtid, tg2, &kp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                 gtid, tg1, &xp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                 gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_kp -= l;
              *p_yp += (double)l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              *p_xp *= 1.0 / (l + 2);
              // =================================================
              {
                // the following code is here just to check
                // __kmpc_task_reduction_get_th_data:
                int tid = omp_get_thread_num();
                void *addr1;
                void *addr2;
                addr1 = __kmpc_task_reduction_get_th_data(
                            gtid, tg1, &xp); // from shared
                addr2 = __kmpc_task_reduction_get_th_data(
                            gtid, tg1, addr1); // from private
                if (addr1 != addr2) {
                  #pragma omp atomic
                    ++err;
                  printf("Wrong thread-specific addresses %d s:%p p:%p\n",
                         tid, addr1, addr2);
                }
                // from neighbour w/o taskgroup (should start lookup from
                // current tg2)
                if (tid > 0) {
                  if (ptrs[tid-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(
                                gtid, NULL, ptrs[tid-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                        ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                } else {
                  if (ptrs[nthreads-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(
                                gtid, NULL, ptrs[nthreads-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                        ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                }
                // ----------------------------------------------
              }
            }
            #pragma omp task firstprivate(l) // in_reduction(+:y) in_reduction(-:i,k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              // i is found through the inner taskgroup (tg2) here even though
              // it was registered with the outer one — lookup chains upward
              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                              gtid, tg2, &ip);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                    gtid, tg2, &kp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                 gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_ip -= l;
              *p_kp -= l;
              *p_yp += (double)l;

              *p_ip -= l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              // =================================================
            }
            #pragma omp task firstprivate(l) // in_reduction(+:j) in_reduction(*:x)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                    gtid, tg1, &jp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                 gtid, tg1, &xp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_xp *= 1.0 / (l + 2);
              // =================================================
            }
          }
        } // inner reduction

        for (int l = 0; l < N; l += 2) {
          #pragma omp task firstprivate(l) // in_reduction(+:j)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                  gtid, tg1, &jp);
            // user's pseudo-code ==============================
            *p_jp += l;
            *p_jp += l + 1;
            // =================================================
          }
        }
      } // outer reduction
    } // end single
  } // end parallel

  // check results
#if _DEBUG
  printf("reduction flags = %u\n", FLG);
#endif
  // err counts thread-specific address-lookup failures; it must be part of
  // the pass condition, otherwise those failures were reported but ignored.
  if (err == 0 && ip == is && jp == js && ks == kp &&
      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
    printf("passed\n");
  else
    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
           is, js, xs, ks, ys,
           ip, jp, xp, kp, yp);
  free(ptrs); // was leaked before
  return 0;
}