-rw-r--r--  openmp/runtime/src/dllexports                |   1
-rw-r--r--  openmp/runtime/src/kmp.h                     |  10
-rw-r--r--  openmp/runtime/src/kmp_tasking.c             | 228
-rw-r--r--  openmp/runtime/test/tasking/kmp_taskloop.c   | 158
4 files changed, 391 insertions, 6 deletions
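
For context, the runtime entry point added here backs the OpenMP 4.5 taskloop construct. Below is a minimal user-level sketch of the kind of loop the compiler is expected to lower into a pattern task plus a __kmpc_taskloop call; the function and variable names in the sketch are illustrative and not taken from this patch.

    #include <omp.h>

    // Scale an array, letting the runtime split the loop into tasks of
    // roughly 64 consecutive iterations each.
    void scale(double *a, int n) {
        #pragma omp parallel
        #pragma omp single
        #pragma omp taskloop grainsize(64)
        for (int i = 0; i < n; ++i)
            a[i] *= 2.0;
    }
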
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 6ff52521e50..14cfaac2333 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -393,6 +393,7 @@ kmpc_set_defaults 224
__kmpc_doacross_wait 262
__kmpc_doacross_post 263
__kmpc_doacross_fini 264
+ __kmpc_taskloop 266
%endif
%endif
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 140bdd8caef..88179b659e0 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2205,11 +2205,7 @@ struct kmp_taskdata { /* aligned during dynamic
#endif
#if OMP_41_ENABLED
kmp_task_team_t * td_task_team;
-#endif
-#if KMP_HAVE_QUAD
- _Quad td_dummy; // Align structure 16-byte size since allocated just before kmp_task_t
-#else
- kmp_uint32 td_dummy[2];
+ kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
#endif
}; // struct kmp_taskdata
@@ -3478,7 +3474,9 @@ KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
-
+KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val,
+ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+ kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup );
#endif
#endif
diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c
index 3ed9a9da5cb..faf11930c38 100644
--- a/openmp/runtime/src/kmp_tasking.c
+++ b/openmp/runtime/src/kmp_tasking.c
@@ -1000,6 +1000,7 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
#if OMP_41_ENABLED
taskdata->td_flags.proxy = flags->proxy;
taskdata->td_task_team = thread->th.th_task_team;
+ taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -2877,4 +2878,231 @@ void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
}
+//---------------------------------------------------------------------------------
+// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
+//
+// thread: allocating thread
+// task_src: pointer to source task to be duplicated
+// returns: a pointer to the allocated kmp_task_t structure (task).
+kmp_task_t *
+__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
+{
+ kmp_task_t *task;
+ kmp_taskdata_t *taskdata;
+ kmp_taskdata_t *taskdata_src;
+ kmp_taskdata_t *parent_task = thread->th.th_current_task;
+ size_t shareds_offset;
+ size_t task_size;
+
+ KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
+ taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
+ KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it must not be a proxy task
+ KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
+ task_size = taskdata_src->td_size_alloc;
+
+ // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+ KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
+ #if USE_FAST_MEMORY
+ taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
+ #else
+ taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
+ #endif /* USE_FAST_MEMORY */
+ KMP_MEMCPY(taskdata, taskdata_src, task_size);
+
+ task = KMP_TASKDATA_TO_TASK(taskdata);
+
+ // Initialize new task (only specific fields not affected by memcpy)
+ taskdata->td_task_id = KMP_GEN_TASK_ID();
+ if( task->shareds != NULL ) { // need to set up the shareds pointer
+ shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
+ task->shareds = &((char*)taskdata)[shareds_offset];
+ KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
+ }
+ taskdata->td_alloc_thread = thread;
+ taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
+
+ // Only need to keep track of child task counts if team parallel and tasking not serialized
+ if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
+ KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
+ if ( parent_task->td_taskgroup )
+ KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
+ // Only need to keep track of allocated child tasks for explicit tasks since implicit tasks are not deallocated
+ if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
+ KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
+ }
+
+ KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
+ thread, taskdata, taskdata->td_parent) );
+#if OMPT_SUPPORT
+ __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
+#endif
+ return task;
+}
+
+// Routine optionally generated by the compiler for setting the lastprivate flag
+// and calling needed constructors for private/firstprivate objects
+// (used to form taskloop tasks from the pattern task)
+typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+//---------------------------------------------------------------------------------
+// __kmp_taskloop_linear: Start tasks of the taskloop linearly
+//
+// loc Source location information
+// gtid Global thread ID
+// task Task with whole loop iteration range
+// lb Pointer to loop lower bound
+// ub Pointer to loop upper bound
+// st Loop stride
+// sched Schedule specified 0/1/2 for none/grainsize/num_tasks
+// grainsize Schedule value if specified
+// task_dup Tasks duplication routine
+void
+__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
+ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+ int sched, kmp_uint64 grainsize, void *task_dup )
+{
+ p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+ kmp_uint64 tc;
+ kmp_uint64 lower = *lb; // compiler provides global bounds here
+ kmp_uint64 upper = *ub;
+ kmp_uint64 i, num_tasks, extras;
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_taskdata_t *current_task = thread->th.th_current_task;
+ kmp_task_t *next_task;
+ kmp_int32 lastpriv = 0;
+ size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
+ size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
+
+ // compute trip count
+ if ( st == 1 ) { // most common case
+ tc = upper - lower + 1;
+ } else if ( st < 0 ) {
+ tc = (lower - upper) / (-st) + 1;
+ } else { // st > 0
+ tc = (upper - lower) / st + 1;
+ }
+ if(tc == 0) {
+ // free the pattern task and exit
+ __kmp_task_start( gtid, task, current_task );
+ // do not execute anything for zero-trip loop
+ __kmp_task_finish( gtid, task, current_task );
+ return;
+ }
+
+ // compute num_tasks/grainsize based on the input provided
+ switch( sched ) {
+ case 0: // no schedule clause specified, we can choose the default
+ // let's try to schedule (team_size*10) tasks
+ grainsize = thread->th.th_team_nproc * 10; // no break: fall through and treat this value as num_tasks
+ case 2: // num_tasks provided
+ if( grainsize > tc ) {
+ num_tasks = tc; // too big num_tasks requested, adjust values
+ grainsize = 1;
+ extras = 0;
+ } else {
+ num_tasks = grainsize;
+ grainsize = tc / num_tasks;
+ extras = tc % num_tasks;
+ }
+ break;
+ case 1: // grainsize provided
+ if( grainsize > tc ) {
+ num_tasks = 1; // too big grainsize requested, adjust values
+ grainsize = tc;
+ extras = 0;
+ } else {
+ num_tasks = tc / grainsize;
+ grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
+ extras = tc % num_tasks;
+ }
+ break;
+ default:
+ KMP_ASSERT2(0, "unknown scheduling of taskloop");
+ }
+ KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+ KMP_DEBUG_ASSERT(num_tasks > extras);
+ KMP_DEBUG_ASSERT(num_tasks > 0);
+
+ // Main loop, launch num_tasks tasks, assign grainsize iterations each task
+ for( i = 0; i < num_tasks; ++i ) {
+ kmp_uint64 chunk_minus_1;
+ if( extras == 0 ) {
+ chunk_minus_1 = grainsize - 1;
+ } else {
+ chunk_minus_1 = grainsize;
+ --extras; // the first 'extras' tasks get a bigger chunk of grainsize+1 iterations
+ }
+ upper = lower + st * chunk_minus_1;
+ if( i == num_tasks - 1 ) {
+ // schedule the last task, set lastprivate flag
+ lastpriv = 1;
+#if KMP_DEBUG
+ if( st == 1 )
+ KMP_DEBUG_ASSERT(upper == *ub);
+ else if( st > 0 )
+ KMP_DEBUG_ASSERT(upper+st > *ub);
+ else
+ KMP_DEBUG_ASSERT(upper+st < *ub);
+#endif
+ }
+ next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+ *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
+ *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
+ if( ptask_dup != NULL )
+ ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
+ __kmp_omp_task(gtid, next_task, true); // schedule new task
+ lower = upper + st; // adjust lower bound for the next iteration
+ }
+ // free the pattern task and exit
+ __kmp_task_start( gtid, task, current_task );
+ // do not execute the pattern task, just do bookkeeping
+ __kmp_task_finish( gtid, task, current_task );
+}
+
+/*!
+@ingroup TASKING
+@param loc Source location information
+@param gtid Global thread ID
+@param task Task structure
+@param if_val Value of the if clause
+@param lb Pointer to loop lower bound
+@param ub Pointer to loop upper bound
+@param st Loop stride
+@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+ int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
+{
+ kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+ KMP_DEBUG_ASSERT( task != NULL );
+
+ KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
+ gtid, taskdata, *lb, *ub, st, grainsize, sched));
+
+ // check if clause value first
+ if( if_val == 0 ) { // if(0) specified, mark task as serial
+ taskdata->td_flags.task_serial = 1;
+ taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
+ }
+ if( nogroup == 0 ) {
+ __kmpc_taskgroup( loc, gtid );
+ }
+
+ if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
+ __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
+ }
+
+ if( nogroup == 0 ) {
+ __kmpc_end_taskgroup( loc, gtid );
+ }
+ KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+}
+
#endif
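
To make the scheduling arithmetic above easier to follow, here is a standalone sketch of the grainsize path (sched == 1) with made-up numbers; it mirrors the corresponding branch in __kmp_taskloop_linear and is not runtime code.

    #include <stdio.h>

    int main(void) {
        unsigned long long tc = 40, grainsize = 6; // trip count, grainsize clause value
        unsigned long long num_tasks, extras;
        if (grainsize > tc) {            // requested grainsize larger than the loop
            num_tasks = 1; grainsize = tc; extras = 0;
        } else {
            num_tasks = tc / grainsize;  // 40 / 6 = 6 tasks
            grainsize = tc / num_tasks;  // rebalanced grainsize stays 6 here
            extras = tc % num_tasks;     // 4 tasks receive one extra iteration
        }
        // prints 4 tasks of 7 iterations followed by 2 tasks of 6 (total 40)
        for (unsigned long long i = 0; i < num_tasks; ++i)
            printf("task %llu: %llu iterations\n", i, grainsize + (i < extras ? 1ULL : 0ULL));
        return 0;
    }
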
diff --git a/openmp/runtime/test/tasking/kmp_taskloop.c b/openmp/runtime/test/tasking/kmp_taskloop.c
new file mode 100644
index 00000000000..3b318bd26c4
--- /dev/null
+++ b/openmp/runtime/test/tasking/kmp_taskloop.c
@@ -0,0 +1,158 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+#define N 4
+#define GRAIN 10
+#define STRIDE 3
+
+// globals
+int th_counter[N];
+int counter;
+
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+ void* dummy;
+} ident_t;
+
+typedef struct shar {
+ int(*pth_counter)[N];
+ int *pcounter;
+ int *pj;
+} *pshareds;
+
+typedef struct task {
+ pshareds shareds;
+ int(* routine)(int,struct task*);
+ int part_id;
+// privates:
+ unsigned long long lb; // the library always uses unsigned 64-bit loop bounds
+ unsigned long long ub;
+ int st;
+ int last;
+ int i;
+ int j;
+ int th;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+void
+__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
+{
+// setup lastprivate flag
+ task_dst->last = lastpriv;
+// could be constructor calls here...
+}
+
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+ int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
+ptask
+__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ task_entry_t task_entry );
+void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
+int __kmpc_global_thread_num(void *id_ref);
+#ifdef __cplusplus
+}
+#endif
+
+
+// User's code
+int task_entry(int gtid, ptask task)
+{
+ pshareds pshar = task->shareds;
+ for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
+ task->th = omp_get_thread_num();
+ __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1);
+ __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
+ task->j = task->i;
+ }
+ my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
+ if( task->last ) {
+ *(pshar->pj) = task->j; // lastprivate
+ }
+ return 0;
+}
+
+int main()
+{
+ int i, j, gtid = __kmpc_global_thread_num(NULL);
+ ptask task;
+ pshareds psh;
+ omp_set_dynamic(0);
+ counter = 0;
+ for( i=0; i<N; ++i )
+ th_counter[i] = 0;
+ #pragma omp parallel num_threads(N)
+ {
+ #pragma omp master
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+/*
+ * This is the OpenMP construct that the runtime calls below correspond to:
+ #pragma omp taskloop num_tasks(N) lastprivate(j)
+ for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE )
+ {
+ int th = omp_get_thread_num();
+ #pragma omp atomic
+ counter++;
+ #pragma omp atomic
+ th_counter[th]++;
+ j = i;
+ }
+*/
+ task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry);
+ psh = task->shareds;
+ psh->pth_counter = &th_counter;
+ psh->pcounter = &counter;
+ psh->pj = &j;
+ task->lb = 0;
+ task->ub = N*GRAIN*STRIDE-2;
+ task->st = STRIDE;
+
+ __kmpc_taskloop(
+ NULL, // location
+ gtid, // gtid
+ task, // task structure
+ 1, // if clause value
+ &task->lb, // lower bound
+ &task->ub, // upper bound
+ STRIDE, // loop increment
+ 0, // 1 if nogroup specified
+ 2, // schedule type: 0-none, 1-grainsize, 2-num_tasks
+ N, // schedule value (ignored for type 0)
+ (void*)&__task_dup_entry // tasks duplication routine
+ );
+ } // end master
+ } // end parallel
+// check results
+ if( j != N*GRAIN*STRIDE-STRIDE ) {
+ printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
+ return 1;
+ }
+ if( counter != N*GRAIN ) {
+ printf("Error, counter %d != %d\n",counter,N*GRAIN);
+ return 1;
+ }
+ for( i=0; i<N; ++i ) {
+ if( th_counter[i] % GRAIN ) {
+ printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
+ return 1;
+ }
+ }
+ printf("passed\n");
+ return 0;
+}