12.6. declare variant Directive#

A declare variant directive specifies an alternate function, called a function variant, to be used in place of the base function when the trait within the match clause matches the OpenMP context at a given call site. The base function follows the directive in the C and C++ languages. In Fortran, either a subroutine or function may be used as the base function, and the declare variant directive must be in the specification part of that subroutine or function (unless a base-proc-name modifier is used, as in the case of a procedure declaration statement). See the OpenMP 5.0 Specification for details on the modifier.

When multiple declare variant directives are used, a function variant becomes a candidate for replacing the base function if the context at the base function call matches the traits of all selectors in the match clause. If there are multiple candidates, a score is assigned to each one using rules for each of the selector traits, and the candidate with the highest score is selected. The scoring algorithm can be found in the OpenMP 5.0 Specification.

In the first example, the vxv() function is called within a parallel region, a target region, and in a sequential part of the program. Two function variants, p_vxv() and t_vxv(), are defined for the first two regions by using parallel and target selectors (within the construct trait set) in a match clause. The p_vxv() function variant includes a for construct (do construct for Fortran) for the parallel region, while t_vxv() includes a distribute simd construct for the target region. The t_vxv() function is explicitly compiled for the device using a declare target directive.

Since the two declare variant directives have no selectors that match traits for the context of the base function call in the sequential part of the program, the base vxv() function is used there, as expected. (The vectors in the p_vxv and t_vxv functions have been multiplied by 3 and 2, respectively, for checking the validity of the replacement. Normally the purpose of a function variant is to produce the same results by a different method.)

//%compiler: clang
//%cflags: -fopenmp

/*
* name: declare_variant.1
* type: C
* version: omp_5.1
*/

#define N 100
#include <stdio.h>
#include <omp.h>

// Forward declarations of the function variants named in the
// declare variant directives below.
void p_vxv(int *v1, int *v2, int *v3, int n);
void t_vxv(int *v1, int *v2, int *v3, int n);

// vxv -- base function.
//
// Stores the element-wise product v1[k] * v2[k] into v3[k] for every
// k in [0, n).
//
// The two directives name p_vxv as the replacement at call sites inside
// a parallel construct and t_vxv at call sites inside a target
// construct; a call site matching neither selector runs this body.
#pragma omp declare variant( p_vxv ) match( construct={parallel} )
#pragma omp declare variant( t_vxv ) match( construct={target}   )
void vxv(int *v1,int *v2,int *v3,int n)
{
   int k = 0;
   while (k < n) {
      v3[k] = v1[k] * v2[k];
      ++k;
   }
}

// p_vxv -- function variant of vxv for call sites inside a parallel
// construct.
//
// Stores v1[k] * v2[k] * 3 into v3[k] for every k in [0, n).  The loop
// carries a worksharing "for" directive.  The extra factor of 3 exists
// only so the printed result shows which version ran.
void p_vxv(int *v1,int *v2,int *v3,int n)
{
   #pragma omp for
   for (int k = 0; k < n; k++) {
      const int prod = v1[k] * v2[k];
      v3[k] = prod * 3;
   }
}

// t_vxv -- function variant of vxv for call sites inside a target
// construct.
//
// Stores v1[k] * v2[k] * 2 into v3[k] for every k in [0, n).  The loop
// carries a "distribute simd" directive, and the enclosing
// begin/end declare target pair requests a device version of the
// function.  The extra factor of 2 exists only so the printed result
// shows which version ran.
#pragma omp begin declare target
void t_vxv(int *v1,int *v2,int *v3,int n)
{
   #pragma omp distribute simd
   for (int k = 0; k < n; k++) {
      const int prod = v1[k] * v2[k];
      v3[k] = prod * 2;
   }
}
#pragma omp end declare target

// Driver: calls vxv() from a parallel region, a target region and
// sequential code, printing the first and last result element each time.
int main()
{
   int v1[N], v2[N], v3[N];
   const int last = N-1;

   // Initialize: v1[k] = k+1, v2[k] = -(k+1), v3[k] = 0.
   for (int k = 0; k < N; k++) {
      v1[k] =   k + 1;
      v2[k] = -(k + 1);
      v3[k] =   0;
   }

   // Call site inside a parallel construct.
   #pragma omp parallel
   {
      vxv(v1,v2,v3,N);
   }
   printf(" %d  %d\n",v3[0],v3[last]); //from p_vxv --  output: -3  -30000

   // Call site inside a target construct: the inputs are mapped to the
   // device and the result is mapped back.
   #pragma omp target teams map(to: v1[:N],v2[:N]) map(from: v3[:N])
   {
      vxv(v1,v2,v3,N);
   }
   printf(" %d  %d\n",v3[0],v3[last]); //from t_vxv --  output: -2  -20000

   // Sequential call site: neither construct selector matches.
   vxv(v1,v2,v3,N);
   printf(" %d  %d\n",v3[0],v3[last]); //from   vxv --  output: -1  -10000

   return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: declare_variant.1
! type: F-free
! version: omp_5.0

module subs
  use omp_lib
contains

   !! vxv -- base procedure.
   !! Stores v1(k) * v2(k) into v3(k) for k = 1 .. size(v1).
   !! The two directives name p_vxv as the replacement at call sites
   !! inside a parallel construct and t_vxv at call sites inside a
   !! target construct.
   subroutine vxv(v1, v2, v3)
      integer,intent(in)  :: v1(:),v2(:)
      integer,intent(out) :: v3(:)
      integer             :: npts
      !$omp  declare variant( p_vxv ) match( construct={parallel} )
      !$omp  declare variant( t_vxv ) match( construct={target}   )

      npts = size(v1)
      v3(1:npts) = v1(1:npts) * v2(1:npts)

   end subroutine vxv

   !! p_vxv -- function variant for call sites inside a parallel construct.
   !! Stores v1(k) * v2(k) * 3 into v3(k); the loop carries a
   !! worksharing do directive.  The factor 3 only marks which version ran.
   subroutine p_vxv(v1, v2, v3)
      integer,intent(in)  :: v1(:),v2(:)
      integer,intent(out) :: v3(:)
      integer             :: k, npts

      npts = size(v1)

      !$omp do
      do k = 1, npts
         v3(k) = v1(k) * v2(k) * 3
      end do

   end subroutine p_vxv

   !! t_vxv -- function variant for call sites inside a target construct.
   !! Stores v1(k) * v2(k) * 2 into v3(k); the loop carries a
   !! distribute simd directive and the procedure is marked
   !! declare target.  The factor 2 only marks which version ran.
   subroutine t_vxv(v1, v2, v3)
      integer,intent(in)  :: v1(:),v2(:)
      integer,intent(out) :: v3(:)
      integer             :: k, npts
      !$omp declare target

      npts = size(v1)

      !$omp distribute simd
      do k = 1, npts
         v3(k) = v1(k) * v2(k) * 2
      end do

   end subroutine t_vxv

end module subs


!! Driver: calls vxv from a parallel region, a target region and
!! sequential code, printing the first and last result element each time.
program main
   use omp_lib
   use subs
   implicit none
   integer,parameter :: N = 100
   integer           :: v1(N), v2(N), v3(N)
   integer           :: i        !! loop index, declared explicitly

   !! init: v1(i) = i, v2(i) = -i, v3(i) = 0
   do i = 1, N
      v1(i) =  i
      v2(i) = -i
      v3(i) =  0
   end do

   !! call site inside a parallel construct
   !$omp parallel
      call vxv(v1,v2,v3)
   !$omp end parallel
   print *, v3(1),v3(N)    !! from p_vxv -- output: -3  -30000

   !! call site inside a target construct; inputs are mapped to the
   !! device and the result is mapped back
   !$omp target teams map(to: v1,v2) map(from: v3)
      call vxv(v1,v2,v3)
   !$omp end target teams
   print *, v3(1),v3(N)    !! from t_vxv -- output: -2  -20000

   !! sequential call site: neither construct selector matches
   call vxv(v1,v2,v3)
   print *, v3(1),v3(N)    !! from   vxv -- output: -1  -10000

end program main

In this example, traits from the device set are used to select a function variant. In the declare variant directive, an isa selector specifies that if the implementation of the "core-avx512" instruction set is detected at compile time, the avx512_saxpy() function variant is used for the call to base_saxpy().

A compilation of avx512_saxpy() is aware of the AVX-512 instruction set that supports 512-bit vector extensions (for Xeon or Xeon Phi architectures). Within avx512_saxpy(), the parallel for simd construct performs parallel execution, and takes advantage of 64-byte data alignment. When the avx512_saxpy() function variant is not selected, the base function base_saxpy(), which contains only a basic parallel for construct, is used for the call to base_saxpy().

//%compiler: clang
//%cflags: -fopenmp

/*
* name: declare_variant.2
* type: C
* version: omp_5.0
*/
#include <omp.h>

// Forward declarations: the variant must be declared before the
// declare variant directive that names it.
void   base_saxpy(int, float, float *, float *);
void avx512_saxpy(int, float, float *, float *);

// base_saxpy -- base function.
//
// Updates y[k] = s*x[k] + y[k] for every k in [0, n) inside a
// "parallel for" construct.  The directive names avx512_saxpy as the
// replacement when the device isa trait "core-avx512" matches.
#pragma omp declare variant( avx512_saxpy ) \
                      match( device={isa("core-avx512")} )
void base_saxpy(int n, float s, float *x, float *y)
{
   #pragma omp parallel for
   for (int k = 0; k < n; k++) {
      y[k] = s*x[k] + y[k];
   }
}

// avx512_saxpy -- function variant of base_saxpy.
//
// Updates y[k] = s*x[k] + y[k] for every k in [0, n) inside a
// "parallel for simd" construct with simdlen(16).  The aligned clause
// states that x and y are 64-byte aligned (assumed for AVX-512), so
// callers must pass buffers that satisfy this.
void avx512_saxpy(int n, float s, float *x, float *y)
{
   #pragma omp parallel for simd simdlen(16) aligned(x,y:64)
   for (int k = 0; k < n; k++) {
      y[k] = s*x[k] + y[k];
   }
}

// Above may be in another file scope.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define N 1000

// Driver: builds two 64-byte aligned arrays, verifies the alignment,
// runs base_saxpy() on them and prints the first and last result.
int main()
{
   // The aligned attribute is attached to each array separately.  When
   // it is written once after the last declarator of "x[N],y[N]" it
   // applies to y only, leaving the alignment of x up to the compiler.
   static float x[N] __attribute__ ((aligned(64)));
   static float y[N] __attribute__ ((aligned(64)));
   float s=2.0;
                         // Check for 64-byte aligned
   if( ((intptr_t)y)%64 != 0 || ((intptr_t)x)%64 != 0 )
   { printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); }

   for(int i=0; i<N; i++){ x[i]=i+1; y[i]=i+1; } // initialize

   // Replaced by avx512_saxpy() when the isa selector matches.
   base_saxpy(N,s,x,y);

   printf("y[0],y[N-1]: %5.0f %5.0f\n",y[0],y[N-1]);
   //output: y[0],y[N-1]: 3  3000

   return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: declare_variant.2
! type: F-free
! version: omp_5.0

module subs
  use omp_lib
contains

   !! base_saxpy -- base procedure.
   !! Updates y = s*x + y with a whole-array assignment.
   !!   s : scalar multiplier (read only)
   !!   x : input vector      (read only)
   !!   y : vector updated in place
   !! The directive names avx512_saxpy as the replacement when the
   !! device isa trait "core-avx512" matches.
   subroutine base_saxpy(s,x,y)              !! base function
      real,intent(in)    :: s,x(:)
      real,intent(inout) :: y(:)
     !$omp  declare variant( avx512_saxpy ) &
     !$omp&           match( device={isa("core-avx512")} )

      y = s*x + y

   end subroutine base_saxpy

   !! avx512_saxpy -- function variant of base_saxpy.
   !! Same update, y(i) = s*x(i) + y(i) for i = 1 .. size(x), written as
   !! a loop under a parallel do simd construct.  The dummy arguments
   !! are declared exactly as in base_saxpy so the two interfaces match.
   subroutine avx512_saxpy(s,x,y)               !! function variant
      real,intent(in)    :: s,x(:)
      real,intent(inout) :: y(:)
      integer            :: i,n
      n=size(x)
                             !!assume 64-byte alignment for AVX-512
      !$omp parallel do simd simdlen(16) aligned(x,y: 64)
      do i = 1,n
         y(i) = s*x(i) + y(i)
      end do

   end subroutine avx512_saxpy

end module subs


!! Driver: allocates two vectors, verifies their alignment, runs
!! base_saxpy on them and prints the first and last result.
program main
   use omp_lib
   use subs
   implicit none

   integer, parameter :: N=1000, align=64
   real, allocatable  :: x(:),y(:)
   real               :: s = 2.0e0
   integer            :: i

   allocate(x(N),y(N))   !! Assumes allocation is 64-byte aligned
                         !! (using compiler options, or another
                         !! allocation method).

                         !! loc is non-standard, but found everywhere
                         !! remove these lines if not available
                         !! Report an error when EITHER array is
                         !! misaligned, hence .or. rather than .and.
   if(modulo(loc(x),align) /= 0 .or. modulo(loc(y),align) /=0 ) then
      print*,"ERROR: x|y not 64-byte aligned"; stop
   endif

   do i=1,N  !! initialize
     x(i)=i
     y(i)=i
   end do

   !! Replaced by avx512_saxpy when the isa selector matches.
   call base_saxpy(s,x,y)

   write(*,'("y(1),y(N):",2f6.0)') y(1),y(N) !!output: y... 3. 3000.

   deallocate(x,y)

end program main