11.2. Memory Allocators#

OpenMP memory allocators can be used to allocate memory with specific allocator traits. In the following example an OpenMP allocator is used to specify an alignment for arrays x and y . The general approach for attributing traits to variables allocated by OpenMP is to create or specify a pre-defined memory space , create an array of traits , and then form an allocator from the memory space and trait. The allocator is then specified in an OpenMP allocation (using an API omp_alloc() function for C/C++ code and an allocators directive for Fortran code in the allocators.1 example).

In the example below the xy_memspace variable is declared and assigned the default memory space ( omp_default_mem_space ). Next, an array for traits is created. Since only one trait will be used, the array size is 1 . A trait is a structure in C/C++ and a derived type in Fortran, containing 2 components: a key and a corresponding value (key-value pair). The trait key used here is omp_atk_alignment (an enum for C/C++ and a parameter for Fortran) and the trait value of 64 is specified in the xy_traits declaration. These declarations are followed by a call to the omp_init_allocator() function to combine the memory space ( xy_memspace ) and the traits ( xy_traits ) to form an allocator ( xy_alloc ).

In the C/C++ code the API omp_alloc() function is used to allocate space, similar to malloc, except that the allocator is specified as the second argument. In Fortran an allocators directive is used to specify an allocator for the following Fortran allocate statement. A variable list in the allocate clause may be supplied if the allocator is to be applied to a subset of variables in the Fortran allocate statement. Here, the xy_alloc allocator is specified in the allocator modifier of the allocate clause, and the set of all variables used in the allocate statement is specified in the list.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: allocators.1
* type: C
* version: omp_5.0
*/
#include    <omp.h>
#include  <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define N 1000

/*
 * Allocates two float arrays of N elements through a user-defined OpenMP
 * allocator that requests 64-byte alignment, initializes them, computes
 * y = s*x + y with SIMD worksharing loops, and prints y[0] and y[N-1].
 *
 * Returns 0 on success. Exits with status 1 if the allocator cannot be
 * created, if either allocation fails, or if the returned storage is not
 * 64-byte aligned.
 */
int main()
{
   float  *x, *y;
   float s=2.0;

   /* Memory space + one alignment trait (key/value pair) -> allocator. */
   omp_memspace_handle_t  xy_memspace = omp_default_mem_space;
   omp_alloctrait_t       xy_traits[1]= {{omp_atk_alignment, 64}};
   omp_allocator_handle_t xy_alloc    =
                           omp_init_allocator(xy_memspace,1,xy_traits);

   /* omp_init_allocator returns omp_null_allocator when it cannot create
      an allocator that satisfies the request. */
   if( xy_alloc == omp_null_allocator )
   { printf("ERROR: cannot create allocator\n"); exit(1); }

   x=(float *)omp_alloc(N*sizeof(float), xy_alloc);
   y=(float *)omp_alloc(N*sizeof(float), xy_alloc);

   /* Check for allocation failure before the pointers are used. */
   if( x == NULL || y == NULL )
   { printf("ERROR: allocation of x|y failed\n"); exit(1); }

   if( ((intptr_t)(y))%64 != 0 || ((intptr_t)(x))%64 != 0 )
   { printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); }

   #pragma omp parallel
   {
      #pragma omp for simd simdlen(16) aligned(x,y:64)
      for(int i=0; i<N; i++){ x[i]=i+1; y[i]=i+1; } // initialize

      #pragma omp for simd simdlen(16) aligned(x,y:64)
      for(int i=0; i<N; i++) y[i] = s*x[i] + y[i];
   }

   printf("y[0],y[N-1]: %5.0f %5.0f\n",y[0],y[N-1]);
   // output y[0],y[N-1]: 3 3000

   /* Release the storage before destroying the allocator that owns it. */
   omp_free(x, xy_alloc);
   omp_free(y, xy_alloc);
   omp_destroy_allocator(xy_alloc);

   return 0;
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: allocators.1
! type: F-free
! version: omp_5.2
! Allocates arrays x and y through a user-defined OpenMP allocator that
! requests 64-byte alignment, initializes them, computes y = s*x + y with
! SIMD worksharing loops, and prints y(1) and y(N).
program main
 use omp_lib
 implicit none

 integer, parameter :: N=1000
 real, allocatable  :: x(:),y(:)
 real               :: s = 2.0e0
 integer            :: i

 ! Memory space + one alignment trait (key/value pair) -> allocator.
 integer(omp_memspace_handle_kind ) :: xy_memspace = omp_default_mem_space
 type(   omp_alloctrait           ) :: xy_traits(1) = &
                                    [omp_alloctrait(omp_atk_alignment,64)]
 integer(omp_allocator_handle_kind) :: xy_alloc

   xy_alloc   =    omp_init_allocator(   xy_memspace, 1, xy_traits)

   ! Apply xy_alloc to every variable of the following allocate statement.
   !$omp allocators allocate(allocator(xy_alloc): x, y)
   allocate(x(N),y(N))
                         !! loc is non-standard, but found everywhere
                         !! remove these lines if not available
   ! Report an error when EITHER array is misaligned (hence .or.).
   if(modulo(loc(x),64) /= 0 .or. modulo(loc(y),64) /= 0 ) then
      print*,"ERROR: x|y not 64-byte aligned"
      stop 1
   endif

   !$omp parallel

      !$omp do simd simdlen(16) aligned(x,y: 64) !! 64B aligned
      do i=1,N  !! initialize
        x(i)=i
        y(i)=i
      end do

      !$omp do simd simdlen(16) aligned(x,y: 64) !! 64B aligned
      do i = 1,N
         y(i) = s*x(i) + y(i)
      end do

   !$omp end parallel

   write(*,'("y(1),y(N):",2f6.0)') y(1),y(N) !!output: y... 3. 3000.

   ! Deallocate before destroying the allocator used for the allocation.
   deallocate(x,y)
   call omp_destroy_allocator(xy_alloc)

end program main

When using the allocators construct with optional clauses in Fortran code, users should be aware of the behavior of a reallocation.

In the following example, the a variable is allocated with 64-byte alignment through the align modifier in the allocate clause of the allocators construct. In the (reallocation) assignment a = b, the newly allocated object a will not be allocated with the 64-byte alignment, but with the 32-byte alignment prescribed by the trait of the my_alloctr allocator. It is best to avoid this problem by constructing and using an allocator (not the align modifier) with the required alignment in the allocators construct. Note that the subsequent deallocation of a must precede the destruction of the allocator used in the allocation of a.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: allocators.2
! type: F-free
! version: omp_5.2
! Shows that an alignment given through the align modifier applies only to
! the initial allocation: when the assignment a = b reallocates a, only the
! allocator (with its own 32-byte alignment trait) is used.
program main
   use omp_lib
   implicit none

   integer, parameter :: align_32=32
   real, allocatable  :: a(:,:)
   real               :: b(10,10)

   integer(omp_memspace_handle_kind ) :: my_memspace
   type(   omp_alloctrait           ) :: my_traits(1)
   integer(omp_allocator_handle_kind) :: my_alloctr

   my_memspace  =  omp_default_mem_space
   my_traits    = [omp_alloctrait(omp_atk_alignment,align_32)]
!                                     allocator alignment ^^
   my_alloctr   =  omp_init_allocator(my_memspace, 1, my_traits)

   !$omp allocators allocate(allocator(my_alloctr), align(64): a)
   allocate(a(5,5)) ! 64-byte aligned by clause <---------^^

   b = 0.0  ! define b so that the assignment below reads defined values

   a = b  ! shapes differ (5x5 vs 10x10), so a is reallocated:
          ! reallocation occurs with 32-byte alignment
          ! uses just my_alloctr (32-byte align from allocator)

   deallocate(a)  ! Uses my_alloctr in deallocation.
   call omp_destroy_allocator(my_alloctr)  ! only after a is deallocated

end program main

When creating and using an allocators construct within a Fortran procedure for allocating storage (and subsequently destroying the allocator with the omp_destroy_allocator routine), users should be aware of the necessity of using an explicit Fortran deallocation instead of relying on auto-deallocation.

In the following example, a user-defined allocator is used in the allocation of the c variable, and then the allocator is destroyed. Auto-deallocation at the end of the broken_auto_deallocation procedure will fail without the allocator, hence an explicit deallocation should be used (before the call to the omp_destroy_allocator routine). Note that an allocator may be specified directly in the allocate clause without using the allocator complex modifier, so long as no other modifier is specified in the clause.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: allocators.3
! type: F-free
! version: omp_5.2
! Intentionally incorrect example: the allocator used to allocate c is
! destroyed while c is still allocated, so the automatic deallocation of c
! at the end of the procedure has no allocator to work with.
! An explicit deallocate(c) placed before the omp_destroy_allocator call
! would avoid the problem.
subroutine broken_auto_deallocation
   use omp_lib
   implicit none
   integer, parameter :: align_32=32
   real, allocatable  :: c(:)

   integer(omp_memspace_handle_kind ) :: my_memspace
   type(   omp_alloctrait           ) :: my_traits(1)
   integer(omp_allocator_handle_kind) :: my_alloctr

   ! Build an allocator from the default memory space and a single
   ! alignment trait (key omp_atk_alignment, value align_32).
   my_memspace  =  omp_default_mem_space
   my_traits    = [omp_alloctrait(omp_atk_alignment,align_32)]
   my_alloctr   =  omp_init_allocator(my_memspace, 1, my_traits)

   ! The allocator is named directly in the allocate clause (no
   ! allocator(...) modifier) because no other modifier is present.
   !$omp allocators allocate(my_alloctr: c)
   allocate(c(100))

   !...

   ! c is still allocated at this point; no deallocate(c) precedes the
   ! destruction of the allocator.
   call omp_destroy_allocator(my_alloctr)
   ! Auto-deallocation of c fails,
   ! because my_alloctr is no longer available.

end subroutine

The allocate directive is a convenient way to apply an OpenMP allocator to the allocation of declared variables.

This example illustrates the allocation of specific types of storage in a program for use in libraries, privatized variables, and with offloading.

Two groups of variables, { v1, v2 } and { v3, v4 }, are used with the allocate directive, and the { v5, v6 } pair is used with the allocate clause. Here we explicitly use predefined allocators omp_high_bw_mem_alloc and omp_default_mem_alloc with the allocate directive in CASE 1. Similar effects are achieved for private variables of a task by using the allocate clause, as shown in CASE 2.

Note, when the allocate directive does not specify an allocator clause, an implementation-defined default, stored in the def-allocator-var ICV, is used (not illustrated here). Users can set and get the default allocator with the omp_set_default_allocator and omp_get_default_allocator API routines.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: allocators.4
* type: C
* version: omp_5.1
*/
#include <omp.h>
#include <stdio.h>

/* Prototypes of helper routines; their definitions are not part of this
   listing. */
void my_init(double *,double *,int, double *,double *,int,
             double *,double *,int);
void lib_saxpy(double *,double *,double,int);
void my_gather(double *,double *,int);

/* my_gpu_vxv is declared inside a declare target region because it is
   called from within a target construct. */
#pragma omp begin declare target
void my_gpu_vxv(double *, double *, int);
#pragma omp end  declare target

/* Array sizes (element counts). Parenthesized so that the macros expand
   safely inside larger expressions such as x % Nhb or x / Nbg. */
#define Nhb (1024*1024)      // high bandwidth
#define Nbg (1024*1024*64)   // big memory, default
#define Nll (1024*1024)      // low latency memory

/*
 * Illustrates selecting the kind of memory used for declared variables.
 *
 * CASE 1 uses the allocate directive with predefined allocators:
 * v1,v2 with omp_high_bw_mem_alloc and v3,v4 with omp_default_mem_alloc.
 * CASE 2 uses the allocate clause on a task so that the private copies of
 * v5,v6 are allocated with omp_low_lat_mem_alloc.
 */
void test_allocate() {

  double  v1[Nhb], v2[Nhb];
  double  v3[Nbg], v4[Nbg];
  double  v5[Nll], v6[Nll];

/*** CASE 1: USING ALLOCATE DIRECTIVE ***/
  #pragma omp allocate(v1,v2) allocator(omp_high_bw_mem_alloc)
  #pragma omp allocate(v3,v4) allocator(omp_default_mem_alloc)

  my_init(v1,v2,Nhb, v3,v4,Nbg, v5,v6,Nll);

  lib_saxpy(v1,v2,5.0,Nhb);

  #pragma omp target map(to: v3[0:Nbg], v4[0:Nbg]) map(from:v3[0:Nbg])
  my_gpu_vxv(v3,v4,Nbg);

/*** CASE 2: USING ALLOCATE CLAUSE ***/
  #pragma omp task private(v5,v6) \
                   allocate(allocator(omp_low_lat_mem_alloc): v5,v6)
  {
    my_gather(v5,v6,Nll);
  }

}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: allocators.4
! type: F-free
! version: omp_5.1
! Illustrates selecting the kind of memory used for declared variables.
!
! CASE 1 uses the allocate directive with predefined allocators:
! v1,v2 with omp_high_bw_mem_alloc and v3,v4 with omp_default_mem_alloc.
! CASE 2 uses the allocate clause on a task so that the private copies of
! v5,v6 are allocated with omp_low_lat_mem_alloc.
subroutine test_allocate
   use omp_lib
   implicit none

   ! Explicit interface so that my_gpu_vxv carries the declare target
   ! attribute required for the call inside the target region.
   interface
     subroutine my_gpu_vxv(va,vb,n)
     !$omp declare target
     integer :: n
     double precision  :: va(n), vb(n)
     end subroutine
   end interface

   integer,parameter :: Nhb=1024*1024,   & !! high bandwidth
                        Nbg=1024*1024*64,& !! big memory, default
                        Nll=1024*1024      !! low latency memory

   double precision  ::  v1(Nhb), v2(Nhb)
   double precision  ::  v3(Nbg), v4(Nbg)
   double precision  ::  v5(Nll), v6(Nll)

 !*** CASE 1: USING ALLOCATE DIRECTIVE ***!
   !$omp allocate(v1,v2) allocator(omp_high_bw_mem_alloc)
   !$omp allocate(v3,v4) allocator(omp_default_mem_alloc)

   call my_init(v1,v2,Nhb, v3,v4,Nbg, v5,v6,Nll)

   ! Double-precision scalar, matching the double argument in the C
   ! prototype of lib_saxpy; the call has an implicit interface, so a
   ! kind mismatch would not be diagnosed by the compiler.
   call lib_saxpy(v1,v2,5.0d0,Nhb)

   !$omp target map(to: v3, v4) map(from:v3)
      call my_gpu_vxv(v3,v4,Nbg)
   !$omp end target

 !*** CASE 2: USING ALLOCATE CLAUSE ***!
   !$omp task private(v5,v6) &
   !$omp&     allocate(allocator(omp_low_lat_mem_alloc): v5,v6)
      call my_gather(v5,v6,Nll)
   !$omp end task

end subroutine test_allocate