5.1. task and taskwait Constructs

The following example shows how to traverse a tree-like structure using explicit tasks. Note that the traverse function should be called from within a parallel region for the tasks it creates to be executed in parallel. Also note that the tasks are executed in no specified order because there are no synchronization directives; thus, it is wrong to assume that the traversal is done in postorder, as it is in the sequential code.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.1
* type: C
* version: omp_3.0
*/

struct node {
  struct node *left;
  struct node *right;
};

extern void process(struct node *);

void traverse( struct node *p )
{
  if (p->left)
#pragma omp task   // p is firstprivate by default
      traverse(p->left);
  if (p->right)
#pragma omp task    // p is firstprivate by default
      traverse(p->right);
  process(p);
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.1
! type: F-free
! version:    omp_3.0

       RECURSIVE SUBROUTINE traverse ( P )
          TYPE Node
             TYPE(Node), POINTER :: left, right
          END TYPE Node
          TYPE(Node) :: P

          IF (associated(P%left)) THEN
             !$OMP TASK     ! P is firstprivate by default
                 CALL traverse(P%left)
             !$OMP END TASK
          ENDIF
          IF (associated(P%right)) THEN
             !$OMP TASK     ! P is firstprivate by default
                 CALL traverse(P%right)
             !$OMP END TASK
          ENDIF
          CALL process ( P )

       END SUBROUTINE
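
The traverse routine itself does not create the team; it relies on being called from a parallel region, as noted above. The following minimal driver is our own sketch (start_traversal is a hypothetical wrapper, not part of the original example) showing one common way to start the traversal with a single initial call:

// relies on struct node and traverse() from tasking.1 above
void start_traversal(struct node *root)
{
   #pragma omp parallel
   {
      #pragma omp single   // one thread starts the traversal; the tasks
                           // it creates are executed by the whole team
      traverse(root);
   }
}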

In the next example, we force a postorder traversal of the tree by adding a taskwait directive. Now, we can safely assume that the tasks traversing the left and right subtrees have completed before we process the current node.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.2
* type: C
* version: omp_3.0
*/
struct node {
  struct node *left;
  struct node *right;
};
extern void process(struct node *);
void postorder_traverse( struct node *p ) {
    if (p->left)
       #pragma omp task    // p is firstprivate by default
           postorder_traverse(p->left);
    if (p->right)
       #pragma omp task   // p is firstprivate by default
           postorder_traverse(p->right);
    #pragma omp taskwait
    process(p);
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.2
! type: F-free
! version:    omp_3.0
      RECURSIVE SUBROUTINE traverse ( P )
         TYPE Node
             TYPE(Node), POINTER :: left, right
         END TYPE Node
         TYPE(Node) :: P
         IF (associated(P%left)) THEN
             !$OMP TASK    ! P is firstprivate by default
                 CALL traverse(P%left)
             !$OMP END TASK
         ENDIF
         IF (associated(P%right)) THEN
             !$OMP TASK    ! P is firstprivate by default
                 CALL traverse(P%right)
             !$OMP END TASK
         ENDIF
         !$OMP TASKWAIT
         CALL process ( P )
      END SUBROUTINE

The following example demonstrates how to use the task construct to process elements of a linked list in parallel. The thread executing the single region generates all of the explicit tasks, which are then executed by the threads in the current team. The pointer p is firstprivate by default on the task construct, so it is not necessary to specify it in a firstprivate clause.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.3
* type: C
* version: omp_3.0
*/

typedef struct node node;
struct node {
      int data;
      node * next;
};

void process(node * p)
{
    /* do work here */
}

void increment_list_items(node * head)
{
    #pragma omp parallel
    {
        #pragma omp single
        {
            node * p = head;
            while (p) {
                #pragma omp task   // p is firstprivate by default
                    process(p);
                p = p->next;
            }
        }
    }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.3
! type: F-free
! version:    omp_3.0

      MODULE LIST
          TYPE NODE
             INTEGER :: PAYLOAD
             TYPE (NODE), POINTER :: NEXT
          END TYPE NODE
      CONTAINS

          SUBROUTINE PROCESS(p)
             TYPE (NODE), POINTER :: P
                 ! do work here
          END SUBROUTINE

          SUBROUTINE INCREMENT_LIST_ITEMS (HEAD)

             TYPE (NODE), POINTER :: HEAD
             TYPE (NODE), POINTER :: P
             !$OMP PARALLEL PRIVATE(P)
                 !$OMP SINGLE
                     P => HEAD
                     DO
                        !$OMP TASK
                            ! P is firstprivate by default
                            CALL PROCESS(P)
                        !$OMP END TASK
                        P => P%NEXT
                        IF ( .NOT. ASSOCIATED (P) ) EXIT
                     END DO
                 !$OMP END SINGLE
             !$OMP END PARALLEL

          END SUBROUTINE

      END MODULE
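
A short driver for this example might look as follows. This is our own sketch, not part of the original example: main and the stack-allocated three-node list are illustrative assumptions, and the snippet relies on the node type and increment_list_items defined above.

#include <stddef.h>

int main(void)
{
   node c = { 3, NULL };         // list is 1 -> 2 -> 3
   node b = { 2, &c };
   node a = { 1, &b };
   increment_list_items(&a);     // each node is processed by some task
   return 0;
}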

The fib() function should be called from within a parallel region for the tasks it creates to be executed in parallel. Also, only one thread of the parallel region should call fib(), unless multiple concurrent Fibonacci computations are desired.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.4
* type: C
* version: omp_3.0
*/
int fib(int n) {
   int i, j;
   if (n < 2)
      return n;
   else {
      #pragma omp task shared(i)
         i = fib(n-1);
      #pragma omp task shared(j)
         j = fib(n-2);
      #pragma omp taskwait
      return i+j;
   }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.4
! type: F-fixed
! version: omp_3.0
      RECURSIVE INTEGER FUNCTION fib(n) RESULT(res)
      INTEGER n, i, j
      IF ( n .LT. 2) THEN
        res = n
      ELSE
!$OMP TASK SHARED(i)
        i = fib( n-1 )
!$OMP END TASK
!$OMP TASK SHARED(j)
        j = fib( n-2 )
!$OMP END TASK
!$OMP TASKWAIT
        res = i+j
      END IF
      END FUNCTION
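
As with the tree traversal, a driver has to supply the parallel region and restrict the initial call to one thread. The following is a hedged sketch (main and the argument value 30 are our choices, not from the original example):

#include <stdio.h>

// relies on fib() defined above
int main(void)
{
   int result;
   #pragma omp parallel
   {
      #pragma omp single   // a single thread calls fib(); the recursive
                           // tasks are executed by the whole team
      result = fib(30);
   }
   printf("fib(30) = %d\n", result);
   return 0;
}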

Note: There are more efficient algorithms for computing Fibonacci numbers. This classic recursive algorithm is used for illustrative purposes.

The following example demonstrates a way to generate a large number of tasks with one thread and execute them with the threads in the team. While generating these tasks, the implementation may reach its limit on unassigned tasks. If it does, the implementation is allowed to cause the thread executing the task generating loop to suspend its task at the task scheduling point in the task directive, and start executing unassigned tasks. Once the number of unassigned tasks is sufficiently low, the thread may resume execution of the task generating loop.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.5
* type: C
* version: omp_3.0
*/
#define LARGE_NUMBER 10000000
double item[LARGE_NUMBER];
extern void process(double);

int main()
{
#pragma omp parallel
  {
    #pragma omp single
    {
      int i;
      for (i=0; i<LARGE_NUMBER; i++)
             #pragma omp task    // i is firstprivate, item is shared
                  process(item[i]);
    }
  }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.5
! type: F-fixed
! version: omp_3.0
       real*8 item(10000000)
       integer i

!$omp parallel
!$omp single ! loop iteration variable i is private
       do i=1,10000000
!$omp task
         ! i is firstprivate, item is shared
          call process(item(i))
!$omp end task
       end do
!$omp end single
!$omp end parallel

       end

The following example is the same as the previous one, except that the tasks are generated in an untied task. While generating the tasks, the implementation may reach its limit on unassigned tasks. If it does, the implementation is allowed to cause the thread executing the task generating loop to suspend its task at the task scheduling point in the task directive, and start executing unassigned tasks. If that thread begins execution of a task that takes a long time to complete, the other threads may complete all the other tasks before it is finished.

In this case, since the loop is in an untied task, any other thread is eligible to resume the task generating loop. In the previous examples, the other threads would be forced to idle until the generating thread finishes its long task, since the task generating loop was in a tied task.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.6
* type: C
* version: omp_3.0
*/
#define LARGE_NUMBER 10000000
double item[LARGE_NUMBER];
extern void process(double);
int main() {
#pragma omp parallel
  {
    #pragma omp single
    {
      int i;
      #pragma omp task untied
      // i is firstprivate, item is shared
      {
         for (i=0; i<LARGE_NUMBER; i++)
             #pragma omp task
                  process(item[i]);
      }
    }
  }
  return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.6
! type: F-fixed
! version: omp_3.0
       real*8 item(10000000)
!$omp parallel
!$omp single
!$omp task untied
       ! loop iteration variable i is private
       do i=1,10000000
!$omp task ! i is firstprivate, item is shared
          call process(item(i))
!$omp end task
       end do
!$omp end task
!$omp end single
!$omp end parallel
       end

The following two examples demonstrate how the scheduling rules illustrated in Section 2.11.3 of the OpenMP 4.0 specification affect the usage of threadprivate variables in tasks. A threadprivate variable can be modified by another task that is executed by the same thread. Thus, the value of a threadprivate variable cannot be assumed to be unchanged across a task scheduling point. In untied tasks, the implementation may insert task scheduling points anywhere.

A task switch may occur at a task scheduling point. A single thread may execute both of the task regions that modify tp. The parts of these task regions in which tp is modified may be executed in any order, so the resulting value of var can be either 1 or 2.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.7
* type: C
* version: omp_3.0
*/

int tp;
#pragma omp threadprivate(tp)
int var;
void work()
{
#pragma omp task
    {
        /* do work here */
#pragma omp task
        {
            tp = 1;
            /* do work here */
#pragma omp task
            {
                /* no modification of tp */
            }
            var = tp; // value of var can be 1 or 2
        }
        tp = 2;
    }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.7
! type: F-fixed
! version: omp_3.0
      module example
      integer tp
!$omp threadprivate(tp)
      integer var
      contains
      subroutine work
!$omp task
         ! do work here
!$omp task
         tp = 1
         ! do work here
!$omp task
           ! no modification of tp
!$omp end task
         var = tp    ! value of var can be 1 or 2
!$omp end task
        tp = 2
!$omp end task
      end subroutine
      end module

In this example, scheduling constraints prohibit a thread in the team from executing a new task that modifies tp while another such task region tied to the same thread is suspended. Therefore, the value written will persist across the task scheduling point.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.8
* type: C
* version: omp_3.0
*/

int tp;
#pragma omp threadprivate(tp)
int var;
void work()
{
#pragma omp parallel
    {
        /* do work here */
#pragma omp task
        {
            tp++;
            /* do work here */
#pragma omp task
            {
                /* do work here but don't modify tp */
            }
            var = tp; // value does not change after the write above
        }
    }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.8
! type: F-fixed
! version: omp_3.0
      module example
      integer tp
!$omp threadprivate(tp)
      integer var
      contains
      subroutine work
!$omp parallel
         ! do work here
!$omp task
         tp = tp + 1
         ! do work here
!$omp task
           ! do work here but don't modify tp
!$omp end task
         var = tp    ! value does not change after write above
!$omp end task
!$omp end parallel
      end subroutine
      end module

The following two examples demonstrate how the scheduling rules illustrated in Section 2.11.3 of the OpenMP 4.0 specification affect the usage of locks and critical sections in tasks. If a lock is held across a task scheduling point, no attempt should be made to acquire the same lock in any code that may be interleaved. Otherwise, a deadlock is possible.

In the example below, suppose the thread executing task 1 defers task 2. When it encounters the task scheduling point at task 3, it could suspend task 1 and begin task 2, which will result in a deadlock when it tries to enter critical region 1: both critical constructs are unnamed, so they guard the same mutual exclusion region, which the suspended task 1 still holds.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.9
* type: C
* version: omp_3.0
*/
void work()
{
   #pragma omp task
   { //Task 1
       #pragma omp task
       { //Task 2
            #pragma omp critical //Critical region 1
            {/*do work here */ }
       }
       #pragma omp critical //Critical Region 2
       {
           //Capture data for the following task
           #pragma omp task
           { /* do work here */ } //Task 3
       }
   }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.9
! type: F-fixed
! version: omp_3.0
       module example
       contains
       subroutine work
!$omp task
       ! Task 1
!$omp task
       ! Task 2
!$omp critical
       ! Critical region 1
       ! do work here
!$omp end critical
!$omp end task
!$omp critical
       ! Critical region 2
       ! Capture data for the following task
!$omp task
       !Task 3
       ! do work here
!$omp end task
!$omp end critical
!$omp end task
      end subroutine
      end module

In the following example, lock is held across a task scheduling point. However, according to the scheduling restrictions, the executing thread cannot begin executing one of the non-descendant tasks that also acquires lock before the task region is complete. Therefore, no deadlock is possible.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.10
* type: C
* version: omp_3.0
*/
#include <omp.h>
void work() {
    omp_lock_t lock;
    omp_init_lock(&lock);
#pragma omp parallel
    {
        int i;
#pragma omp for
        for (i = 0; i < 100; i++) {
#pragma omp task
            {
                // lock is shared by default in the task
                omp_set_lock(&lock);
                // Capture data for the following task
#pragma omp task   // Task Scheduling Point 1
                { /* do work here */ }
                omp_unset_lock(&lock);
            }
        }
    }
    omp_destroy_lock(&lock);
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.10
! type: F-free
! version:    omp_3.0
      module example
      include 'omp_lib.h'
      integer (kind=omp_lock_kind) lock
      integer i

      contains

      subroutine work
      call omp_init_lock(lock)
!$omp parallel
     !$omp do
      do i=1,100
         !$omp task
              ! Outer task
              call omp_set_lock(lock)    ! lock is shared by
                                         ! default in the task
                     ! Capture data for the following task
                     !$omp task     ! Task Scheduling Point 1
                              ! do work here
                     !$omp end task
               call omp_unset_lock(lock)
         !$omp end task
      end do
!$omp end parallel
      call omp_destroy_lock(lock)
      end subroutine

      end module

The following examples illustrate the use of the mergeable clause in the task construct. In this first example, the task construct has been annotated with the mergeable clause. The addition of this clause allows the implementation to reuse the data environment (including the ICVs) of the parent task for the task inside foo if the task is included or undeferred. Thus, the result of the execution may differ depending on whether the task is merged or not, and the mergeable clause therefore needs to be used with caution. In this example, the use of the mergeable clause is safe: as x is a shared variable, the outcome does not depend on whether or not the task is merged (that is, the task will always increment the same variable and will always compute the same value for x).

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.11
* type: C
* version: omp_3.1
*/
#include <stdio.h>
void foo ( )
{
   int x = 2;
   #pragma omp task shared(x) mergeable
   {
      x++;
   }
   #pragma omp taskwait
   printf("%d\n",x);  // prints 3
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.11
! type: F-free
! version:    omp_3.1
subroutine foo()
  integer :: x
  x = 2
!$omp task shared(x) mergeable
  x = x + 1
!$omp end task
!$omp taskwait
  print *, x     ! prints 3
end subroutine

This second example shows an incorrect use of the mergeable clause. Here, the created task accesses a different instance of the variable x if the task is not merged, since x is firstprivate, but it accesses the same variable x if the task is merged. As a result, the behavior of the program is unspecified: it can print two different values for x depending on the decisions taken by the implementation.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.12
* type: C
* version: omp_3.1
*/
#include <stdio.h>
void foo ( )
{
   int x = 2;
   #pragma omp task mergeable
   {
      x++;
   }
   #pragma omp taskwait
   printf("%d\n",x);  // prints 2 or 3
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.12
! type: F-free
! version:    omp_3.1
subroutine foo()
  integer :: x
  x = 2
!$omp task mergeable
  x = x + 1
!$omp end task
!$omp taskwait
  print *, x   ! prints 2 or 3
end subroutine

The following example shows the use of the final clause and the omp_in_final API call in a recursive binary search program. To reduce overhead, once a certain depth of recursion is reached, the program uses the final clause to create only included tasks, which allow additional optimizations.

The use of the omp_in_final API call allows programmers to optimize their code by specifying which parts of the program are not necessary when a task can create only included tasks (that is, when the code is inside a final task). In this example, a separate state variable is not necessary once the program reaches the finalized part of the computation, so the copying from the parent state to the new state is eliminated. The allocation of new_state on the stack could also be avoided, but it would make this example less clear. The final clause is most effective when used in conjunction with the mergeable clause, since all tasks created in a final task region are included tasks that can be merged if the mergeable clause is present.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.13
* type: C
* version: omp_3.1
*/
#include <string.h>
#include <omp.h>
#define LIMIT  3 /* arbitrary limit on recursion depth */
void check_solution(char *);
void bin_search (int pos, int n, char *state)
{
   if ( pos == n ) {
      check_solution(state);
      return;
   }
   #pragma omp task final( pos > LIMIT ) mergeable
   {
      char new_state[n];
      if (!omp_in_final() ) {
        memcpy(new_state, state, pos );
        state = new_state;
      }
      state[pos] = 0;
      bin_search(pos+1, n, state );
   }
   #pragma omp task final( pos > LIMIT ) mergeable
   {
      char new_state[n];
      if (! omp_in_final() ) {
        memcpy(new_state, state, pos );
        state = new_state;
      }
      state[pos] = 1;
      bin_search(pos+1, n, state );
   }
   #pragma omp taskwait
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.13
! type: F-free
! version:    omp_3.1
recursive subroutine bin_search(pos, n, state)
  use omp_lib
  integer :: pos, n
  character, pointer :: state(:)
  character, target, dimension(n) :: new_state1, new_state2
  integer, parameter :: LIMIT = 3
  if (pos .eq. n) then
    call check_solution(state)
    return
  endif
!$omp task final(pos > LIMIT) mergeable
  if (.not. omp_in_final()) then
    new_state1(1:pos) = state(1:pos)
    state => new_state1
  endif
  state(pos+1) = 'z'
  call bin_search(pos+1, n, state)
!$omp end task
!$omp task final(pos > LIMIT) mergeable
  if (.not. omp_in_final()) then
    new_state2(1:pos) = state(1:pos)
    state => new_state2
  endif
  state(pos+1) = 'y'
  call bin_search(pos+1, n, state)
!$omp end task
!$omp taskwait
end subroutine
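
As with the other recursive examples, bin_search is intended to be started from inside a parallel region by one thread. A hypothetical C driver could look like this (N, main, and the uninitialized state buffer are our assumptions; the snippet relies on bin_search from tasking.13 above):

#define N 20

int main(void)
{
   char state[N];
   #pragma omp parallel
   {
      #pragma omp single   // one thread starts the search; levels deeper
                           // than LIMIT run as included (final) tasks
      bin_search(0, N, state);
   }
   return 0;
}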

The following example illustrates the difference between the if and the final clauses. The if clause has a local effect: in the first nest of tasks, the task that has the if clause will be undeferred, but the task nested inside that task is not affected by the if clause and is created as usual. In contrast, the final clause affects all task constructs in the final task region but not the final task itself; in the second nest of tasks, the nested tasks are created as included tasks. Note also that the conditions for the if and final clauses are usually the opposite, as shown in the sketch after the code.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: tasking.14
* type: C
* version: omp_3.1
*/
void bar(void);

void foo ( )
{
   int i;
   #pragma omp task if(0)  // This task is undeferred
   {
      #pragma omp task     // This task is a regular task
      for (i = 0; i < 3; i++) {
          #pragma omp task     // This task is a regular task
          bar();
      }
   }
   #pragma omp task final(1) // This task is a regular task
   {
      #pragma omp task  // This task is included
      for (i = 0; i < 3; i++) {
          #pragma omp task     // This task is also included
          bar();
      }
   }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: tasking.14
! type: F-free
! version:    omp_3.1
subroutine foo()
integer i
!$omp task if(.FALSE.) ! This task is undeferred
!$omp task             ! This task is a regular task
  do i = 1, 3
    !$omp task             ! This task is a regular task
      call bar()
    !$omp end task
  enddo
!$omp end task
!$omp end task
!$omp task final(.TRUE.) ! This task is a regular task
!$omp task               ! This task is included
  do i = 1, 3
    !$omp task               ! This task is also included
     call bar()
    !$omp end task
  enddo
!$omp end task
!$omp end task
end subroutine
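
To make the last remark concrete, the if and final clauses are often used together with complementary conditions, so that shallow recursion levels generate deferred tasks while deep levels run as undeferred, included tasks. The following sketch is ours (recurse, depth, CUTOFF, and MAX_DEPTH are illustrative names, not from the original examples):

#define CUTOFF     3    /* depth at which tasks become final/undeferred */
#define MAX_DEPTH  10

void recurse(int depth)
{
   if (depth >= MAX_DEPTH) return;
   // if() is false exactly when final() is true: shallow calls create
   // deferred tasks; deep calls run as undeferred, included tasks
   #pragma omp task if(depth < CUTOFF) final(depth >= CUTOFF) mergeable
   recurse(depth + 1);
   #pragma omp taskwait
}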