10.9. Reduction#

This section covers ways to perform reductions in parallel, task, taskloop, and SIMD regions.

10.9.1. reduction Clause#

The following example demonstrates the reduction clause; note that some reductions can be expressed in the loop in several ways, as shown for the max and min reductions below:

//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.1
* type: C
* version: omp_3.1
*/
#include <math.h>
void reduction1(float *x, int *y, int n)
{
  int i, b, c;
  float a, d;
  a = 0.0;
  b = 0;
  c = y[0];
  d = x[0];
  #pragma omp parallel for private(i) shared(x, y, n) \
                          reduction(+:a) reduction(^:b) \
                          reduction(min:c) reduction(max:d)
    for (i=0; i<n; i++) {
      a += x[i];
      b ^= y[i];
      if (c > y[i]) c = y[i];
      d = fmaxf(d,x[i]);
    }
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.1
! type: F-free
SUBROUTINE REDUCTION1(A, B, C, D, X, Y, N)
    REAL :: X(*), A, D
    INTEGER :: Y(*), N, B, C
    INTEGER :: I
    A = 0
    B = 0
    C = Y(1)
    D = X(1)
    !$OMP PARALLEL DO PRIVATE(I) SHARED(X, Y, N) REDUCTION(+:A) &
    !$OMP& REDUCTION(IEOR:B) REDUCTION(MIN:C)  REDUCTION(MAX:D)
      DO I=1,N
        A = A + X(I)
        B = IEOR(B, Y(I))
        C = MIN(C, Y(I))
        IF (D < X(I)) D = X(I)
      END DO

END SUBROUTINE REDUCTION1

A common implementation of the preceding example is to treat it as if it had been written as follows:

//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.2
* type: C
*/
#include <limits.h>
#include <math.h>
void reduction2(float *x, int *y, int n)
{
  int i, b, b_p, c, c_p;
  float a, a_p, d, d_p;
  a = 0.0f;
  b = 0;
  c = y[0];
  d = x[0];
  #pragma omp parallel shared(a, b, c, d, x, y, n) \
                          private(a_p, b_p, c_p, d_p)
  {
    a_p = 0.0f;
    b_p = 0;
    c_p = INT_MAX;
    d_p = -HUGE_VALF;
    #pragma omp for private(i)
    for (i=0; i<n; i++) {
      a_p += x[i];
      b_p ^= y[i];
      if (c_p > y[i]) c_p = y[i];
      d_p = fmaxf(d_p,x[i]);
    }
    #pragma omp critical
    {
      a += a_p;
      b ^= b_p;
      if( c > c_p ) c = c_p;
      d = fmaxf(d,d_p);
    }
  }
}

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.2
! type: F-free
  SUBROUTINE REDUCTION2(A, B, C, D, X, Y, N)
    REAL :: X(*), A, D
    INTEGER :: Y(*), N, B, C
    REAL :: A_P, D_P
    INTEGER :: I, B_P, C_P
    A = 0
    B = 0
    C = Y(1)
    D = X(1)
    !$OMP PARALLEL SHARED(X, Y, A, B, C, D, N) &
    !$OMP&         PRIVATE(A_P, B_P, C_P, D_P)
      A_P = 0.0
      B_P = 0
      C_P = HUGE(C_P)
      D_P = -HUGE(D_P)
      !$OMP DO PRIVATE(I)
      DO I=1,N
        A_P = A_P + X(I)
        B_P = IEOR(B_P, Y(I))
        C_P = MIN(C_P, Y(I))
        IF (D_P < X(I)) D_P = X(I)
      END DO
      !$OMP CRITICAL
        A = A + A_P
        B = IEOR(B, B_P)
        C = MIN(C, C_P)
        D = MAX(D, D_P)
      !$OMP END CRITICAL
    !$OMP END PARALLEL
  END SUBROUTINE REDUCTION2

The following program is non-conforming because the reduction is on the intrinsic procedure name MAX but that name has been redefined to be the variable named MAX.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.3
! type: F-free
 PROGRAM REDUCTION_WRONG
 MAX = HUGE(0)
 M = 0

 !$OMP PARALLEL DO REDUCTION(MAX: M)
! MAX is no longer the intrinsic so this is non-conforming
 DO I = 1, 100
    CALL SUB(M,I)
 END DO

 END PROGRAM REDUCTION_WRONG

 SUBROUTINE SUB(M,I)
    M = MAX(M,I)
 END SUBROUTINE SUB

The following conforming program performs the reduction using the intrinsic procedure name MAX even though the intrinsic MAX has been renamed to REN.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.4
! type: F-free
MODULE M
   INTRINSIC MAX
END MODULE M

PROGRAM REDUCTION3
   USE M, REN => MAX
   N = 0
!$OMP PARALLEL DO REDUCTION(REN: N)     ! still does MAX
   DO I = 1, 100
      N = MAX(N,I)
   END DO
END PROGRAM REDUCTION3

The following conforming program performs the reduction using intrinsic procedure name MAX even though the intrinsic MAX has been renamed to MIN.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.5
! type: F-free
MODULE MOD
   INTRINSIC MAX, MIN
END MODULE MOD

PROGRAM REDUCTION4
   USE MOD, MIN=>MAX, MAX=>MIN
   REAL :: R
   R = -HUGE(0.0)

!$OMP PARALLEL DO REDUCTION(MIN: R)     ! still does MAX
   DO I = 1, 1000
      R = MIN(R, SIN(REAL(I)))
   END DO
   PRINT *, R
END PROGRAM REDUCTION4

The following example is non-conforming because the initialization (a = 0) of the original list item a is not synchronized with the update of a as a result of the reduction computation in the for loop. Therefore, the example may print an incorrect value for a.

To avoid this problem, the initialization of the original list item a should complete before any update of a as a result of the reduction clause. This can be achieved by adding an explicit barrier after the assignment a = 0, or by enclosing the assignment a = 0 in a single directive (which has an implied barrier), or by initializing a before the start of the parallel region.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.6
* type: C
* version:    omp_5.1
*/
#include <stdio.h>

int main (void)
{
  int a, i;

  #pragma omp parallel shared(a) private(i)
  {
    #pragma omp masked
    a = 0;

    // To avoid race conditions, add a barrier here.

    #pragma omp for reduction(+:a)
    for (i = 0; i < 10; i++) {
        a += i;
    }

    #pragma omp single
    printf ("Sum is %d\n", a);
  }
  return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.6
! type: F-fixed
! version:    omp_5.1
      INTEGER A, I

!$OMP PARALLEL SHARED(A) PRIVATE(I)

!$OMP MASKED
      A = 0
!$OMP END MASKED

      ! To avoid race conditions, add a barrier here.

!$OMP DO REDUCTION(+:A)
      DO I= 0, 9
         A = A + I
      END DO

!$OMP SINGLE
      PRINT *, "Sum is ", A
!$OMP END SINGLE

!$OMP END PARALLEL

      END

The following example demonstrates the reduction of array a . In C/C++ this is illustrated by the explicit use of an array section a[0:N] in the reduction clause. The corresponding Fortran example uses array syntax supported in the base language. As of the OpenMP 4.5 specification the explicit use of array section in the reduction clause in Fortran is not permitted. But this oversight has been fixed in the OpenMP 5.0 specification.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.7
* type: C
* version: omp_4.5
*/
#include <stdio.h>

#define N 100
void init(int n, float (*b)[N]);

int main(){

  int i,j;
  float a[N], b[N][N];

  init(N,b);

  for(i=0; i<N; i++) a[i]=0.0e0;

  #pragma omp parallel for reduction(+:a[0:N]) private(j)
  for(i=0; i<N; i++){
    for(j=0; j<N; j++){
       a[j] +=  b[i][j];
    }
  }
  printf(" a[0] a[N-1]: %f %f\n", a[0], a[N-1]);

  return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: reduction.7
! type: F-free
program array_red

  integer,parameter :: n=100
  integer           :: j
  real              :: a(n), b(n,n)

  call init(n,b)

  a(:) = 0.0e0

  !$omp parallel do reduction(+:a)
  do j = 1, n
     a(:) = a(:) + b(:,j)
  end do

  print*, " a(1) a(n): ", a(1), a(n)

end program

10.9.2. Task Reduction#

In OpenMP 5.0 the task_reduction clause was created for the taskgroup construct, to allow reductions among explicit tasks that have an in_reduction clause.

In the task_reduction.1 example below a reduction is performed as the algorithm traverses a linked list. The reduction statement is assigned to be an explicit task using a task construct and is specified to be a reduction participant with the in_reduction clause. A taskgroup construct encloses the tasks participating in the reduction, and specifies, with the task_reduction clause, that the taskgroup has tasks participating in a reduction. After the taskgroup region the original variable will contain the final value of the reduction.

Note: The res variable is private in the linked_list_sum routine and is not required to be shared (as in the case of a parallel construct reduction).

//%compiler: clang
//%cflags: -fopenmp

/*
* name:       task_reduction.1
* type:       C
*/

#include<stdlib.h>
#include<stdio.h>
#define N 10

typedef struct node_tag {
    int val;
    struct node_tag *next;
} node_t;

int linked_list_sum(node_t *p)
{
    int res = 0;

    #pragma omp taskgroup task_reduction(+: res)
    {
        node_t* aux = p;
        while(aux != 0)
        {
            #pragma omp task in_reduction(+: res)
            res += aux->val;

            aux = aux->next;
        }
    }
    return res;
}


int main(int argc, char *argv[]) {
    int i;
//                           Create the root node.
    node_t* root = (node_t*) malloc(sizeof(node_t));
    root->val = 1;

    node_t* aux = root;

//                           Create N-1 more nodes.
    for(i=2;i<=N;++i){
        aux->next = (node_t*) malloc(sizeof(node_t));
        aux = aux->next;
        aux->val = i;
    }

    aux->next = 0;

    #pragma omp parallel
    #pragma omp single
    {
        int result = linked_list_sum(root);
        printf( "Calculated: %d  Analytic:%d\n", result, (N*(N+1)/2) );
    }

    return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name:       task_reduction.1
! type:       F-free

module m
    type node_t
        integer :: val
        type(node_t), pointer :: next
    end type
end module m

function linked_list_sum(p) result(res)
    use m
    implicit none
    type(node_t), pointer :: p
    type(node_t), pointer :: aux
    integer :: res

    res = 0

    !$omp taskgroup task_reduction(+: res)
        aux => p
        do while (associated(aux))
            !$omp task in_reduction(+: res)
                res = res + aux%val
            !$omp end task
            aux => aux%next
        end do
    !$omp end taskgroup
end function linked_list_sum


program main
    use m
    implicit none
    type(node_t), pointer :: root, aux
    integer :: res, i
    integer, parameter :: N=10

    interface
        function linked_list_sum(p) result(res)
            use m
            implicit none
            type(node_t), pointer :: p
            integer :: res
        end function
    end interface
!                       Create the root node.
    allocate(root)
    root%val = 1
    aux => root

!                       Create N-1 more nodes.
    do i = 2,N
        allocate(aux%next)
        aux => aux%next
        aux%val = i
    end do

    aux%next => null()

    !$omp parallel
    !$omp single
        res = linked_list_sum(root)
        print *, "Calculated:", res, " Analytic:", (N*(N+1))/2
    !$omp end single
    !$omp end parallel

end program main

In OpenMP 5.0 the task reduction-modifier for the reduction clause was introduced to provide a means of performing reductions among implicit and explicit tasks.

The reduction clause of a parallel or worksharing construct may specify the task reduction-modifier to include explicit task reductions within their region, provided the reduction operators ( reduction-identifiers ) and variables ( list items ) of the participating tasks match those of the implicit tasks.

There are 2 reduction use cases (identified by USE CASE #) in the task_reduction.2 example below.

In USE CASE 1 a task modifier in the reduction clause of the parallel construct is used to include the reductions of any participating tasks, those with an in_reduction clause and matching reduction-identifiers (+) and list items (x).

Note, a taskgroup construct (with a task_reduction clause) in not necessary to scope the explicit task reduction (as seen in the example above). Hence, even without the implicit task reduction statement (without the C x++ and Fortran x=x+1 statements), the task reduction-modifier in a reduction clause of the parallel construct can be used to avoid having to create a taskgroup construct (and its task_reduction clause) around the task generating structure.

In USE CASE 2 tasks participating in the reduction are within a worksharing region (a parallel worksharing-loop construct). Here, too, no taskgroup is required, and the reduction-identifier (+) and list item (variable x) match as required.

//%compiler: clang
//%cflags: -fopenmp

/*
* name:       task_reduction.2
* type:       C
* version: omp_5.0
*/
#include <stdio.h>
int main(void){
   int N=100, M=10;
   int i, x;

// USE CASE 1  explicit-task reduction + parallel reduction clause
   x=0;
   #pragma omp parallel num_threads(M) reduction(task,+:x)
   {

     x++;                // implicit task reduction statement

     #pragma omp single
     for(i=0;i<N;i++)
       #pragma omp task in_reduction(+:x)
       x++;

   }
   printf("x=%d  =M+N\n",x);  // x= 110  =M+N


// USE CASE 2  task reduction +  worksharing reduction clause
   x=0;
   #pragma omp parallel for num_threads(M) reduction(task,+:x)
   for(i=0; i< N; i++){

      x++;

      if( i%2 == 0){
       #pragma omp task in_reduction(+:x)
       x--;
      }
   }
   printf("x=%d  =N-N/2\n",x);  // x= 50  =N-N/2

   return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name:       task_reduction.2
! type:       F-free
! version:    omp_5.0

program task_modifier

   integer :: N=100, M=10
   integer :: i, x

! USE CASE 1  explicit-task reduction + parallel reduction clause
   x=0
   !$omp parallel num_threads(M) reduction(task,+:x)

     x=x+1                   !! implicit task reduction statement

     !$omp single
       do i = 1,N
         !$omp task in_reduction(+:x)
           x=x+1
         !$omp end task
       end do
     !$omp end single

   !$omp end parallel
   write(*,'("x=",I0," =M+N")') x   ! x= 110 =M+N


! USE CASE 2  task reduction +  worksharing reduction clause
   x=0
   !$omp parallel do num_threads(M) reduction(task,+:x)
     do i = 1,N

        x=x+1

        if( mod(i,2) == 0) then
           !$omp task in_reduction(+:x)
             x=x-1
           !$omp end task
        endif

     end do
   write(*,'("x=",I0,"  =N-N/2")') x   ! x= 50 =N-N/2

end program

10.9.3. Reduction on Combined Target Constructs#

When a reduction clause appears on a combined construct that combines a target construct with another construct, there is an implicit map of the list items with a tofrom map type for the target construct. Otherwise, the list items (if they are scalar variables) would be treated as firstprivate by default in the target construct, which is unlikely to provide the intended behavior since the result of the reduction that is in the firstprivate variable would be discarded at the end of the target region.

In the following example, the use of the reduction clause on sum1 or sum2 should, by default, result in an implicit tofrom map for that variable. So long as neither sum1 nor sum2 were already present on the device, the mapping behavior ensures the value for sum1 computed in the first target construct is used in the second target construct.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_reduction.1
* type: C
* version: omp_5.0
*/
#include <stdio.h>
int f(int);
int g(int);
int main()
{
   int sum1=0, sum2=0;
   int i;
   const int n = 100;

   #pragma omp target teams distribute reduction(+:sum1)
   for (int i = 0; i < n; i++) {
      sum1 += f(i);
   }

   #pragma omp target teams distribute reduction(+:sum2)
   for (int i = 0; i < n; i++) {
      sum2 += g(i) * sum1;
   }

   printf(  "sum1 = %d, sum2 = %d\n", sum1, sum2);
   //OUTPUT: sum1 = 9900, sum2 = 147015000
   return 0;
}

int f(int res){ return res*2; }
int g(int res){ return res*3; }
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: target_reduction.1
! type: F-free
! version: omp_5.0
program target_reduction_ex1
   interface
      function f(res)
             integer :: f, res
          end function
      function g(res)
             integer :: g, res
          end function
   end interface
   integer :: sum1, sum2, i
   integer, parameter :: n = 100
   sum1 = 0
   sum2 = 0
   !$omp target teams distribute reduction(+:sum1)
       do i=1,n
          sum1 = sum1 + f(i)
       end do
   !$omp target teams distribute reduction(+:sum2)
       do i=1,n
          sum2 = sum2 + g(i)*sum1
       end do
   print *, "sum1 = ", sum1, ", sum2 = ", sum2
   !!OUTPUT: sum1 =     10100 , sum2 = 153015000
end program


integer function f(res)
   integer :: res
   f = res*2
end function
integer function g(res)
   integer :: res
   g = res*3
end function

In next example, the variables sum1 and sum2 remain on the device for the duration of the target data region so that it is their device copies that are updated by the reductions. Note the significance of mapping sum1 on the second target construct; otherwise, it would be treated by default as firstprivate and the result computed for sum1 in the prior target region may not be used. Alternatively, a target update construct could be used between the two target constructs to update the host version of sum1 with the value that is in the corresponding device version after the completion of the first construct.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_reduction.2
* type: C
* version: omp_5.0
*/
#include <stdio.h>
int f(int);
int g(int);
int main()
{
   int sum1=0, sum2=0;
   int i;
   const int n = 100;

   #pragma omp target data map(sum1,sum2)
   {
      #pragma omp target teams distribute reduction(+:sum1)
      for (int i = 0; i < n; i++) {
         sum1 += f(i);
      }

      #pragma omp target teams distribute map(sum1) reduction(+:sum2)
      for (int i = 0; i < n; i++) {
         sum2 += g(i) * sum1;
      }
   }
   printf(  "sum1 = %d, sum2 = %d\n", sum1, sum2);
   //OUTPUT: sum1 = 9900, sum2 = 147015000
   return 0;
}

int f(int res){ return res*2; }
int g(int res){ return res*3; }
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: target_reduction.2
! type: F-free
! version: omp_5.0

program target_reduction_ex2
   interface
      function f(res)
             integer :: f, res
          end function
      function g(res)
             integer :: g, res
          end function
   end interface
   integer :: sum1, sum2, i
   integer, parameter :: n = 100
   sum1 = 0
   sum2 = 0
   !$omp target data map(sum1, sum2)
       !$omp target teams distribute reduction(+:sum1)
           do i=1,n
              sum1 = sum1 + f(i)
           end do
       !$omp target teams distribute map(sum1) reduction(+:sum2)
           do i=1,n
              sum2 = sum2 + g(i)*sum1
           end do
   !$omp end target data
   print *, "sum1 = ", sum1, ", sum2 = ", sum2
   !!OUTPUT: sum1 =     10100 , sum2 = 153015000
end program


integer function f(res)
   integer :: res
   f = res*2
end function
integer function g(res)
   integer :: res
   g = res*3
end function

10.9.4. Task Reduction with Target Constructs#

The following examples illustrate how task reductions can apply to target tasks that result from a target construct with the in_reduction clause. Here, the in_reduction clause specifies that the target task participates in the task reduction defined in the scope of the enclosing taskgroup construct. Partial results from all tasks participating in the task reduction will be combined (in some order) into the original variable listed in the task_reduction clause before exiting the taskgroup region.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_task_reduction.1
* type: C
* version: omp_5.2
*/

#include <stdio.h>
#pragma omp declare target enter(device_compute)
void device_compute(int *);
void host_compute(int *);
int main()
{
   int sum = 0;

   #pragma omp parallel masked
   #pragma omp taskgroup task_reduction(+:sum)
   {
      #pragma omp target in_reduction(+:sum) nowait
          device_compute(&sum);

      #pragma omp task in_reduction(+:sum)
          host_compute(&sum);
   }
   printf(  "sum = %d\n", sum);
   //OUTPUT: sum = 2
   return 0;
}

void device_compute(int *sum){ *sum = 1; }
void   host_compute(int *sum){ *sum = 1; }
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: target_task_reduction.1
! type: F-free
! version: omp_5.2

program target_task_reduction_ex1
   interface
      subroutine device_compute(res)
      !$omp declare target enter(device_compute)
        integer :: res
      end subroutine device_compute
      subroutine host_compute(res)
        integer :: res
      end subroutine host_compute
   end interface
   integer :: sum
   sum = 0
   !$omp parallel masked
      !$omp taskgroup task_reduction(+:sum)
         !$omp target in_reduction(+:sum) nowait
            call device_compute(sum)
         !$omp end target
         !$omp task in_reduction(+:sum)
            call host_compute(sum)
         !$omp end task
      !$omp end taskgroup
   !$omp end parallel masked
   print *, "sum = ", sum
   !!OUTPUT: sum = 2
end program

subroutine device_compute(sum)
   integer :: sum
   sum = 1
end subroutine
subroutine host_compute(sum)
   integer :: sum
   sum = 1
end subroutine

In the next pair of examples, the task reduction is defined by a reduction clause with the task modifier, rather than a task_reduction clause on a taskgroup construct. Again, the partial results from the participating tasks will be combined in some order into the original reduction variable, sum.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_task_reduction.2a
* type: C
* version: omp_5.2
*/
#include <stdio.h>
#pragma omp declare target enter(device_compute)
extern void device_compute(int *);
extern void host_compute(int *);
int main()
{
   int sum = 0;

   #pragma omp parallel sections reduction(task, +:sum)
   {
      #pragma omp section
          {
             #pragma omp target in_reduction(+:sum)
             device_compute(&sum);
          }
      #pragma omp section
          {
             host_compute(&sum);
          }
   }
   printf(  "sum = %d\n", sum);
   //OUTPUT: sum = 2
   return 0;
}

void device_compute(int *sum){ *sum = 1; }
void   host_compute(int *sum){ *sum = 1; }
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: target_task_reduction.2a
! type: F-free
! version: omp_5.2

program target_task_reduction_ex2
   interface
      subroutine device_compute(res)
      !$omp declare target enter(device_compute)
        integer :: res
      end subroutine device_compute
      subroutine host_compute(res)
        integer :: res
      end subroutine host_compute
   end interface
   integer :: sum
   sum = 0
   !$omp parallel sections reduction(task,+:sum)
      !$omp section
         !$omp target in_reduction(+:sum) nowait
           call device_compute(sum)
         !$omp end target
      !$omp section
         call host_compute(sum)
   !$omp end parallel sections
   print *, "sum = ", sum
   !!OUTPUT: sum = 2
end program

subroutine device_compute(sum)
   integer :: sum
   sum = 1
end subroutine
subroutine host_compute(sum)
   integer :: sum
   sum = 1
end subroutine

Next, the task modifier is again used to define a task reduction over participating tasks. This time, the participating tasks are a target task resulting from a target construct with the in_reduction clause, and the implicit task (executing on the primary thread) that calls host_compute. As before, the partial results from these participating tasks are combined in some order into the original reduction variable.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_task_reduction.2b
* type: C
* version: omp_5.2
*/
#include <stdio.h>
#pragma omp declare target enter(device_compute)
extern void device_compute(int *);
extern void host_compute(int *);
int main()
{
   int sum = 0;

   #pragma omp parallel masked reduction(task, +:sum)
   {
       #pragma omp target in_reduction(+:sum) nowait
       device_compute(&sum);

       host_compute(&sum);
   }
   printf(  "sum = %d\n", sum);
   //OUTPUT: sum = 2
   return 0;
}

void device_compute(int *sum){ *sum = 1; }
void   host_compute(int *sum){ *sum = 1; }
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: target_task_reduction.2b
! type: F-free
! version: omp_5.2

program target_task_reduction_ex2b
   interface
      subroutine device_compute(res)
      !$omp declare target enter(device_compute)
        integer :: res
      end subroutine device_compute
      subroutine host_compute(res)
        integer :: res
      end subroutine host_compute
   end interface
   integer :: sum
   sum = 0
   !$omp parallel masked reduction(task,+:sum)
         !$omp target in_reduction(+:sum) nowait
           call device_compute(sum)
         !$omp end target
         call host_compute(sum)
   !$omp end parallel masked
   print *, "sum = ", sum
   !!OUTPUT: sum = 2
end program


subroutine device_compute(sum)
   integer :: sum
   sum = 1
end subroutine
subroutine host_compute(sum)
   integer :: sum
   sum = 1
end subroutine

10.9.5. Taskloop Reduction#

In the OpenMP 5.0 Specification the taskloop construct was extended to include the reductions.

The following two examples show how to implement a reduction over an array using taskloop reduction in two different ways. In the first example we apply the reduction clause to the taskloop construct. As it was explained above in the task reduction examples, a reduction over tasks is divided in two components: the scope of the reduction, which is defined by a taskgroup region, and the tasks that participate in the reduction. In this example, the reduction clause defines both semantics. First, it specifies that the implicit taskgroup region associated with the taskloop construct is the scope of the reduction, and second, it defines all tasks created by the taskloop construct as participants of the reduction. About the first property, it is important to note that if we add the nogroup clause to the taskloop construct the code will be nonconforming, basically because we have a set of tasks that participate in a reduction that has not been defined.

//%compiler: clang
//%cflags: -fopenmp

/*
* name:       taskloop_reduction.1
* type:       C
* version:    omp_5.0
*/
#include <stdio.h>

int array_sum(int n, int *v) {
    int i;
    int res = 0;

    #pragma omp taskloop reduction(+: res)
    for(i = 0; i < n; ++i)
        res += v[i];

    return res;
}

int main(int argc, char *argv[]) {
    int n = 10;
    int v[10] = {1,2,3,4,5,6,7,8,9,10};

    #pragma omp parallel
    #pragma omp single
    {
        int res = array_sum(n, v);
        printf("The result is %d\n", res);
    }
    return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: taskloop_reduction.1
! type: F-free
! version:    omp_5.0
function array_sum(n, v) result(res)
    implicit none
    integer :: n, v(n), res
    integer :: i

    res = 0
    !$omp taskloop reduction(+: res)
    do i=1, n
        res = res + v(i)
    end do
    !$omp end taskloop

end function array_sum

program main
    implicit none
    integer :: n, v(10), res
    integer :: i

    integer, external :: array_sum

    n = 10
    do i=1, n
        v(i) = i
    end do

    !$omp parallel
    !$omp single
    res = array_sum(n, v)
    print *, "The result is", res
    !$omp end single
    !$omp end parallel
end program main

The second example computes exactly the same value as in the preceding taskloop_reduction.1 code section, but in a very different way. First, in the array_sum function a taskgroup region is created that defines the scope of a new reduction using the task_reduction clause. After that, a task and also the tasks generated by a taskloop participate in that reduction by using the in_reduction clause on the task and taskloop constructs, respectively. Note that the nogroup clause was added to the taskloop construct. This is allowed because what is expressed with the in_reduction clause is different from what is expressed with the reduction clause. In one case the generated tasks are specified to participate in a previously declared reduction (in_reduction clause) whereas in the other case creation of a new reduction is specified and also all tasks generated by the taskloop will participate on it.

//%compiler: clang
//%cflags: -fopenmp

/*
* name:       taskloop_reduction.2
* type:       C
* version:    omp_5.0
*/
#include <stdio.h>

int array_sum(int n, int *v) {
    int i;
    int res = 0;

    #pragma omp taskgroup task_reduction(+: res)
    {
        if (n > 0) {
            #pragma omp task in_reduction(+: res)
            res = res + v[0];

            #pragma omp taskloop in_reduction(+: res) nogroup
            for(i = 1; i < n; ++i)
                res += v[i];
        }
    }

    return res;
}

int main(int argc, char *argv[]) {
    int n = 10;
    int v[10] = {1,2,3,4,5,6,7,8,9,10};

    #pragma omp parallel
    #pragma omp single
    {
        int res = array_sum(n, v);
        printf("The result is %d\n", res);
    }
    return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: taskloop_reduction.2
! type: F-free
! version:    omp_5.0
function array_sum(n, v) result(res)
    implicit none
    integer :: n, v(n), res
    integer :: i

    res = 0
    !$omp taskgroup task_reduction(+: res)
    if (n > 0) then
        !$omp task in_reduction(+: res)
        res = res + v(1)
        !$omp end task

        !$omp taskloop in_reduction(+: res) nogroup
        do i=2, n
            res = res + v(i)
        end do
        !$omp end taskloop
    endif
    !$omp end taskgroup

end function array_sum

program main
    implicit none
    integer :: n, v(10), res
    integer :: i

    integer, external :: array_sum

    n = 10
    do i=1, n
        v(i) = i
    end do

    !$omp parallel
    !$omp single
    res = array_sum(n, v)
    print *, "The result is", res
    !$omp end single
    !$omp end parallel
end program main

In the OpenMP 5.0 Specification, reduction clauses for the taskloop simd construct were also added.

The examples below compare reductions for the taskloop and the taskloop simd constructs. These examples illustrate the use of reduction clauses within “stand-alone” taskloop constructs, and the use of in_reduction clauses for tasks of taskloops to participate with other reductions within the scope of a parallel region.

taskloop reductions:

In the taskloop reductions section of the example below, taskloop 1 uses the reduction clause in a taskloop construct for a sum reduction, accumulated in asum . The behavior is as though a taskgroup construct encloses the taskloop region with a task_reduction clause, and each taskloop task has an in_reduction clause with the specifications of the reduction clause. At the end of the taskloop region asum contains the result of the reduction.

The next taskloop, taskloop 2 , illustrates the use of the in_reduction clause to participate in a previously defined reduction scope of a parallel construct.

The task reductions of task 2 and taskloop 2 are combined across the taskloop construct and the single task construct, as specified in the reduction(task, +:asum) clause of the parallel construct. At the end of the parallel region asum contains the combined result of all reductions.

taskloop simd reductions:

Reductions for the taskloop simd construct are shown in the second half of the code. Since each component construct, taskloop and simd, can accept a reduction-type clause, the taskloop simd construct is a composite construct, and the specific application of the reduction clause is defined within the taskloop simd construct section of the OpenMP 5.0 Specification. The code below illustrates use cases for these reductions.

In the taskloop simd reduction section of the example below, taskloop simd 3 uses the reduction clause in a taskloop simd construct for a sum reduction within a loop. For this case a reduction clause is used, as one would use for a simd construct. The SIMD reductions of each task are combined, and the results of these tasks are further combined just as in the taskloop construct with the reduction clause for taskloop 1 . At the end of the taskloop region asum contains the combined result of all reductions.

If a taskloop simd construct is to participate in a previously defined reduction scope, the reduction participation should be specified with a in_reduction clause, as shown in the parallel region enclosing task 4 and taskloop simd 4 code sections.

Here the taskloop simd construct’s in_reduction clause specifies participation of the construct’s tasks as a task reduction within the scope of the parallel region. That is, the results of each task of the taskloop construct component contribute to the reduction in a broader level, just as in parallel reduction a code section above. Also, each simd-component construct occurs as if it has a reduction clause, and the SIMD results of each task are combined as though to form a single result for each task (that participates in the in_reduction clause). At the end of the parallel region asum contains the combined result of all reductions.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: taskloop_simd_reduction.1
* type: C
* version: omp_5.1
*/
#include <stdio.h>
#define N 100

int main(){
   int i, a[N], asum=0;

   for(i=0;i<N;i++) a[i]=i;

   // taskloop reductions

   #pragma omp parallel masked
   #pragma omp taskloop reduction(+:asum) // taskloop 1
      for(i=0;i<N;i++){ asum += a[i]; }


   #pragma omp parallel reduction(task, +:asum) // parallel reduction a
   {
      #pragma omp masked
      #pragma omp task            in_reduction(+:asum) // task 2
         for(i=0;i<N;i++){ asum += a[i]; }

      #pragma omp masked taskloop in_reduction(+:asum) // taskloop 2
         for(i=0;i<N;i++){ asum += a[i]; }
   }

   // taskloop simd reductions

   #pragma omp parallel masked
   #pragma omp taskloop simd reduction(+:asum) // taskloop simd 3
     for(i=0;i<N;i++){ asum += a[i]; }


   #pragma omp parallel reduction(task, +:asum) // parallel reduction b
   {
      #pragma omp masked
      #pragma omp task                 in_reduction(+:asum) // task 4
         for(i=0;i<N;i++){ asum += a[i]; }

      #pragma omp masked taskloop simd in_reduction(+:asum) // taskloop
         for(i=0;i<N;i++){ asum += a[i]; }                  // simd 4

  }

  printf("asum=%d \n",asum); // output: asum=29700
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: taskloop_simd_reduction.1
! type: F-free
! version: omp_5.1
program main

  use omp_lib
  integer, parameter ::  N=100
  integer            :: i, a(N), asum=0

  a = [( i, i=1,N )]    !! initialize

!! taskloop reductions

  !$omp parallel masked
  !$omp taskloop reduction(+:asum)                  !! taskloop 1
    do i=1,N;  asum = asum + a(i);  enddo
  !$omp end taskloop
  !$omp end parallel masked


  !$omp parallel reduction(task, +:asum)            !! parallel reduction a

     !$omp masked
     !$omp task            in_reduction(+:asum)     !! task 2
       do i=1,N;  asum = asum + a(i);  enddo
     !$omp end task
     !$omp end masked

     !$omp masked taskloop in_reduction(+:asum)     !! taskloop 2
       do i=1,N;  asum = asum + a(i);  enddo
     !$omp end masked taskloop

  !$omp end parallel

!! taskloop simd reductions

  !$omp parallel masked
  !$omp taskloop simd reduction(+:asum)             !! taskloop simd 3
    do i=1,N;  asum = asum + a(i);  enddo
  !$omp end taskloop simd
  !$omp end parallel masked


  !$omp parallel reduction(task, +:asum)            !! parallel reduction b

    !$omp masked
    !$omp task                 in_reduction(+:asum) !! task 4
       do i=1,N;  asum = asum + a(i);  enddo
    !$omp end task
    !$omp end masked

    !$omp masked taskloop simd in_reduction(+:asum) !! taskloop simd 4
       do i=1,N;  asum = asum + a(i);  enddo
    !$omp end masked taskloop simd

  !$omp end parallel

  print*,"asum=",asum   !! output: asum=30300

end program

10.9.6. Reduction with the scope Construct#

The following example illustrates the use of the scope construct to perform a reduction in a parallel region. The case is useful for producing a reduction and accessing reduction variables inside a parallel region without using a worksharing-loop construct.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: scope_reduction.1
* type: C++
* version: omp_5.1
*/
#include <stdio.h>
void do_work(int n, float a[], float &s)
{
   float loc_s = 0.0f;        // local sum
   static int nthrs;
   #pragma omp for
      for (int i = 0; i < n; i++)
         loc_s += a[i];
   #pragma omp single
   {
      s = 0.0f;               // total sum
      nthrs = 0;
   }
   #pragma omp scope reduction(+:s,nthrs)
   {
      s += loc_s;
      nthrs++;
   }
   #pragma omp masked
      printf("total sum = %f, nthrs = %d\n", s, nthrs);
}

float work(int n, float a[])
{
   float s;
   #pragma omp parallel
   {
      do_work(n, a, s);
   }
   return s;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: scope_reduction.1
! type: F-free
! version: omp_5.1
subroutine do_work(n, a, s)
   implicit none
   integer n, i
   real a(*), s, loc_s
   integer, save :: nthrs

   loc_s = 0.0                ! local sum
   !$omp do
      do i = 1, n
         loc_s = loc_s + a(i)
      end do
   !$omp single
      s = 0.0                 ! total sum
      nthrs = 0
   !$omp end single
   !$omp scope reduction(+:s,nthrs)
      s = s + loc_s
      nthrs = nthrs + 1
   !$omp end scope
   !$omp masked
      print *, "total sum = ", s, ", nthrs = ", nthrs
   !$omp end masked
end subroutine

function work(n, a) result(s)
   implicit none
   integer n
   real a(*), s

   !$omp parallel
      call do_work(n, a, s)
   !$omp end parallel
end function

10.9.7. User-Defined Reduction#

The declare reduction directive can be used to specify user-defined reductions (UDR) for user data types.

In the following example, declare reduction directives are used to define min and max operations for the point data structure for computing the rectangle that encloses a set of 2-D points.

Each declare reduction directive defines new reduction identifiers, min and max , to be used in a reduction clause. The next item in the declaration list is the data type ( struct point ) used in the reduction, followed by the combiner, here the functions minproc and maxproc perform the min and max operations, respectively, on the user data (of type struct point ). In the function argument list are two special OpenMP variable identifiers, omp_in and omp_out, that denote the two values to be combined in the “real” function; the omp_out identifier indicates which one is to hold the result.

The initializer of the declare reduction directive specifies the initial value for the private variable of each implicit task. The omp_priv identifier is used to denote the private variable.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: udr.1
* type: C
* version: omp_4.0
*/
#include <stdio.h>
#include <limits.h>

struct point {
  int x;
  int y;
};

void minproc ( struct point *out, struct point *in )
{
  if ( in->x < out->x ) out->x = in->x;
  if ( in->y < out->y ) out->y = in->y;
}

void maxproc ( struct point *out, struct point *in )
{
  if ( in->x > out->x ) out->x = in->x;
  if ( in->y > out->y ) out->y = in->y;
}

#pragma omp declare reduction(min : struct point : \
        minproc(&omp_out, &omp_in)) \
 initializer( omp_priv = { INT_MAX, INT_MAX } )

#pragma omp declare reduction(max : struct point : \
        maxproc(&omp_out, &omp_in)) \
 initializer( omp_priv = { 0, 0 } )

void find_enclosing_rectangle ( int n, struct point points[] )
{
  struct point minp = { INT_MAX, INT_MAX }, maxp = {0,0};
  int i;

#pragma omp parallel for reduction(min:minp) reduction(max:maxp)
  for ( i = 0; i < n; i++ ) {
     minproc(&minp, &points[i]);
     maxproc(&maxp, &points[i]);
  }
  printf("min = (%d, %d)\n", minp.x, minp.y);
  printf("max = (%d, %d)\n", maxp.x, maxp.y);
}

The following example shows the corresponding code in Fortran. The declare reduction directives are specified as part of the declaration in subroutine find_enclosing_rectangle and the procedures that perform the min and max operations are specified as subprograms.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: udr.1
! type: F-free
! version: omp_4.0
module data_type

  type :: point
    integer :: x
    integer :: y
  end type

end module data_type

subroutine find_enclosing_rectangle ( n, points )
  use data_type
  implicit none
  integer :: n
  type(point) :: points(*)

  !$omp declare reduction(min : point : minproc(omp_out, omp_in)) &
  !$omp&  initializer( omp_priv = point( HUGE(0), HUGE(0) ) )

  !$omp declare reduction(max : point : maxproc(omp_out, omp_in)) &
  !$omp&  initializer( omp_priv = point( 0, 0 ) )

  type(point) :: minp = point( HUGE(0), HUGE(0) ), maxp = point( 0, 0 )
  integer :: i

  !$omp parallel do reduction(min:minp) reduction(max:maxp)
  do i = 1, n
     call minproc(minp, points(i))
     call maxproc(maxp, points(i))
  end do
  print *, "min = (", minp%x, minp%y, ")"
  print *, "max = (", maxp%x, maxp%y, ")"

 contains
  subroutine minproc ( out, in )
    implicit none
    type(point), intent(inout) :: out
    type(point), intent(in) :: in

    out%x = min( out%x, in%x )
    out%y = min( out%y, in%y )
  end subroutine minproc

  subroutine maxproc ( out, in )
    implicit none
    type(point), intent(inout) :: out
    type(point), intent(in) :: in

    out%x = max( out%x, in%x )
    out%y = max( out%y, in%y )
  end subroutine maxproc

end subroutine

The following example shows the same computation as udr.1 but it illustrates that you can craft complex expressions in the user-defined reduction declaration. In this case, instead of calling the minproc and maxproc functions we inline the code in a single expression.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: udr.2
* type: C
* version: omp_4.0
*/
#include <stdio.h>
#include <limits.h>

struct point {
  int x;
  int y;
};

#pragma omp declare reduction(min : struct point : \
        omp_out.x = omp_in.x > omp_out.x  ? omp_out.x : omp_in.x, \
        omp_out.y = omp_in.y > omp_out.y  ? omp_out.y : omp_in.y ) \
        initializer( omp_priv = { INT_MAX, INT_MAX } )

#pragma omp declare reduction(max : struct point : \
        omp_out.x = omp_in.x < omp_out.x  ? omp_out.x : omp_in.x,  \
        omp_out.y = omp_in.y < omp_out.y  ? omp_out.y : omp_in.y ) \
        initializer( omp_priv = { 0, 0 } )

void find_enclosing_rectangle ( int n, struct point points[] )
{
  struct point minp = { INT_MAX, INT_MAX }, maxp = {0,0};
  int i;

#pragma omp parallel for reduction(min:minp) reduction(max:maxp)
  for ( i = 0; i < n; i++ ) {
    if ( points[i].x < minp.x ) minp.x = points[i].x;
    if ( points[i].y < minp.y ) minp.y = points[i].y;
    if ( points[i].x > maxp.x ) maxp.x = points[i].x;
    if ( points[i].y > maxp.y ) maxp.y = points[i].y;
  }
  printf("min = (%d, %d)\n", minp.x, minp.y);
  printf("max = (%d, %d)\n", maxp.x, maxp.y);
}

The corresponding code of the same example in Fortran is very similar except that the assignment expression in the declare reduction directive can only be used for a single variable, in this case through a type structure constructor point( … ) .

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: udr.2
! type: F-free
! version: omp_4.0
module data_type

  type :: point
    integer :: x
    integer :: y
  end type

end module data_type

subroutine find_enclosing_rectangle ( n, points )
  use data_type
  implicit none
  integer :: n
  type(point) :: points(*)

  !$omp declare reduction( min : point :  &
  !$omp&   omp_out = point(min( omp_out%x, omp_in%x ), &
  !$omp&                   min( omp_out%y, omp_in%y )) ) &
  !$omp&   initializer( omp_priv = point( HUGE(0), HUGE(0) ) )

  !$omp declare reduction( max : point :  &
  !$omp&   omp_out = point(max( omp_out%x, omp_in%x ), &
  !$omp&                   max( omp_out%y, omp_in%y )) ) &
  !$omp&   initializer( omp_priv = point( 0, 0 ) )

  type(point) :: minp = point( HUGE(0), HUGE(0) ), maxp = point( 0, 0 )
  integer :: i

  !$omp parallel do reduction(min: minp) reduction(max: maxp)
  do i = 1, n
     minp%x = min(minp%x, points(i)%x)
     minp%y = min(minp%y, points(i)%y)
     maxp%x = max(maxp%x, points(i)%x)
     maxp%y = max(maxp%y, points(i)%y)
  end do
  print *, "min = (", minp%x, minp%y, ")"
  print *, "max = (", maxp%x, maxp%y, ")"

end subroutine

The following example shows the use of special variables in arguments for combiner (omp_in and omp_out) and initializer (omp_priv and omp_orig) routines. This example returns the maximum value of an array and the corresponding index value. The declare reduction directive specifies a user-defined reduction operation maxloc for data type struct mx_s . The function mx_combine is the combiner and the function mx_init is the initializer.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: udr.3
* type: C
* version: omp_4.0
*/

#include <stdio.h>
#define N 100

struct mx_s {
   float value;
   int index;
};

/* prototype functions for combiner and initializer in
   the declare reduction */
void mx_combine(struct mx_s *out, struct mx_s *in);
void mx_init(struct mx_s *priv, struct mx_s *orig);

#pragma omp declare reduction(maxloc: struct mx_s: \
        mx_combine(&omp_out, &omp_in)) \
        initializer(mx_init(&omp_priv, &omp_orig))

void mx_combine(struct mx_s *out, struct mx_s *in)
{
   if ( out->value < in->value ) {
      out->value = in->value;
      out->index = in->index;
   }
}

void mx_init(struct mx_s *priv, struct mx_s *orig)
{
   priv->value = orig->value;
   priv->index = orig->index;
}

int main(void)
{
   struct mx_s mx;
   float val[N], d;
   int i, count = N;

   for (i = 0; i < count; i++) {
      d = (N*0.8f - i);
      val[i] = N * N - d * d;
   }

   mx.value = val[0];
   mx.index = 0;
   #pragma omp parallel for reduction(maxloc: mx)
   for (i = 1; i < count; i++) {
      if (mx.value < val[i])
      {
         mx.value = val[i];
         mx.index = i;
      }
   }

   printf("max value = %g, index = %d\n", mx.value, mx.index);
   /* prints 10000, 80 */

   return 0;
}

Below is the corresponding Fortran version of the above example. The declare reduction directive specifies the user-defined operation maxloc for user-derived type mx_s . The combiner mx_combine and the initializer mx_init are specified as subprograms.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: udr.3
! type: F-free
! version: omp_4.0
program max_loc
   implicit none

   type :: mx_s
      real value
      integer index
   end type

   !$omp declare reduction(maxloc: mx_s: &
   !$omp&        mx_combine(omp_out, omp_in)) &
   !$omp&        initializer(mx_init(omp_priv, omp_orig))

   integer, parameter :: N = 100
   type(mx_s) :: mx
   real :: val(N), d
   integer :: i, count

   count = N
   do i = 1, count
      d = N*0.8 - i + 1
      val(i) = N * N - d * d
   enddo

   mx%value = val(1)
   mx%index = 1
   !$omp parallel do reduction(maxloc: mx)
   do i = 2, count
      if (mx%value < val(i)) then
         mx%value = val(i)
         mx%index = i
      endif
   enddo

   print *, 'max value = ', mx%value, ' index = ', mx%index
   ! prints 10000, 81

 contains

 subroutine mx_combine(out, in)
   implicit none
   type(mx_s), intent(inout) :: out
   type(mx_s), intent(in) :: in

   if ( out%value < in%value ) then
      out%value = in%value
      out%index = in%index
   endif
 end subroutine mx_combine

 subroutine mx_init(priv, orig)
   implicit none
   type(mx_s), intent(out) :: priv
   type(mx_s), intent(in) :: orig

   priv%value = orig%value
   priv%index = orig%index
 end subroutine mx_init

end program

The following example explains a few details of the user-defined reduction in Fortran through modules. The declare reduction directive is declared in a module ( data_red ). The reduction-identifier .add. is a user-defined operator that is to allow accessibility in the scope that performs the reduction operation. The user-defined operator .add. and the subroutine dt_init specified in the initializer clause are defined in the same subprogram.

The reduction operation (that is, the reduction clause) is in the main program. The reduction identifier .add. is accessible by use association. Since .add. is a user-defined operator, the explicit interface should also be accessible by use association in the current program unit. Since the declare reduction associated to this reduction clause has the initializer clause, the subroutine specified on the clause must be accessible in the current scoping unit. In this case, the subroutine dt_init is accessible by use association.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: udr.4
! type: F-free
! version: omp_4.0
module data_red
! Declare data type.
  type dt
    real :: r1
    real :: r2
  end type

! Declare the user-defined operator .add.
  interface operator(.add.)
    module procedure addc
  end interface

! Declare the user-defined reduction operator .add.
!$omp declare reduction(.add.:dt:omp_out=omp_out.add.omp_in) &
!$omp& initializer(dt_init(omp_priv))

 contains
! Declare the initialization routine.
  subroutine dt_init(u)
    type(dt) :: u
    u%r1 = 0.0
    u%r2 = 0.0
  end subroutine

! Declare the specific procedure for the .add. operator.
  function addc(x1, x2) result(xresult)
    type(dt), intent(in) :: x1, x2
    type(dt) :: xresult
    xresult%r1 = x1%r1 + x2%r2
    xresult%r2 = x1%r2 + x2%r1
  end function

end module data_red

program main
  use data_red, only : dt, dt_init, operator(.add.)

  type(dt) :: xdt1, xdt2
  integer :: i

  xdt1 = dt(1.0,2.0)
  xdt2 = dt(2.0,3.0)

! The reduction operation
!$omp parallel do reduction(.add.: xdt1)
  do i = 1, 10
    xdt1 = xdt1 .add. xdt2
  end do
!$omp end parallel do

  print *, xdt1

end program

The following example uses user-defined reductions to declare a plus (+) reduction for a C++ class. As the declare reduction directive is inside the context of the V class the expressions in the declare reduction directive are resolved in the context of the class. Also, note that the initializer clause uses a copy constructor to initialize the private variables of the reduction and it uses as parameter to its original variable by using the special variable omp_orig.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: udr.5
* type: C++
* version: omp_4.0
*/
class V {
   float *p;
   int n;

public:
   V( int _n )     : n(_n)  { p = new float[n]; }
   V( const V& m ) : n(m.n) { p = new float[n]; }
   ~V() { delete[] p; }

   V& operator+= ( const V& );

   #pragma omp declare reduction( + : V : omp_out += omp_in ) \
           initializer(omp_priv(omp_orig))
};

The following examples shows how user-defined reductions can be defined for some STL containers. The first declare reduction defines the plus (+) operation for std::vector by making use of the std::transform algorithm. The second and third define the merge (or concatenation) operation for std::vector and std::list . It shows how the user-defined reduction operation can be applied to specific data types of an STL.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: udr.6
* type: C++
* version: omp_4.0
*/
#include <algorithm>
#include <list>
#include <vector>

#pragma omp declare reduction( + : std::vector<int> : \
     std::transform (omp_out.begin(), omp_out.end(),  \
        omp_in.begin(), omp_in.end(),std::plus<int>()))

#pragma omp declare reduction( merge : std::vector<int> : \
     omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()))

#pragma omp declare reduction( merge : std::list<int> : \
     omp_out.merge(omp_in))