6.17. Device Routines#

6.17.1. omp_is_initial_device Routine#

The following example shows how the omp_is_initial_device runtime library routine can be used to query if a code is executing on the initial host device or on a target device. The example then sets the number of threads in the parallel region based on where the code is executing.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: device.1
* type: C
* version: omp_5.1
*/
#include <stdio.h>
#include <omp.h>

#pragma omp begin declare target
   void vec_mult(float *p, float *v1, float *v2, int N);
   extern float *p, *v1, *v2;
   extern int N;
#pragma omp end declare target

extern void init_vars(float *, float *, int);
extern void output(float *, int);

void foo()
{
   init_vars(v1, v2, N);
   #pragma omp target device(42) map(p[:N], v1[:N], v2[:N])
   {
      vec_mult(p, v1, v2, N);
   }
   output(p, N);
}

void vec_mult(float *p, float *v1, float *v2, int N)
{
   int i;
   int nthreads;
   if (!omp_is_initial_device())
   {
      printf("1024 threads on target device\n");
      nthreads = 1024;
   }
   else
   {
      printf("8 threads on initial device\n");
      nthreads = 8;
   }
   #pragma omp parallel for private(i) num_threads(nthreads)
   for (i=0; i<N; i++)
     p[i] = v1[i] * v2[i];
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: device.1
! type: F-free
! version:    omp_4.0
module params
   integer,parameter :: N=1024
end module params
module vmult
contains
   subroutine vec_mult(p, v1, v2, N)
   use omp_lib, ONLY : omp_is_initial_device
   !$omp declare target
   real    :: p(N), v1(N), v2(N)
   integer :: i, nthreads, N
      if (.not. omp_is_initial_device()) then
         print*, "1024 threads on target device"
         nthreads = 1024
      else
         print*, "8 threads on initial device"
         nthreads = 8
      endif
      !$omp parallel do private(i) num_threads(nthreads)
      do i = 1,N
        p(i) = v1(i) * v2(i)
      end do
   end subroutine vec_mult
end module vmult
program prog_vec_mult
use params
use vmult
real :: p(N), v1(N), v2(N)
   call init(v1,v2,N)
   !$omp target device(42) map(p, v1, v2)
      call vec_mult(p, v1, v2, N)
   !$omp end target
   call output(p, N)
end program

6.17.2. omp_get_num_devices Routine#

The following example shows how the omp_get_num_devices runtime library routine can be used to determine the number of devices.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: device.2
* type: C
* version: omp_4.0
*/
#include <omp.h>
extern void init(float *, float *, int);
extern void output(float *, int);
void vec_mult(float *p, float *v1, float *v2, int N)
{
   int i;
   init(v1, v2, N);
   int ndev = omp_get_num_devices();
   int do_offload = (ndev>0 && N>1000000);
   #pragma omp target if(do_offload) \
                      map(to: v1[0:N], v2[:N]) \
                      map(from: p[0:N])
   #pragma omp parallel for if(N>1000) private(i)
   for (i=0; i<N; i++)
     p[i] = v1[i] * v2[i];
   output(p, N);
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: device.2
! type: F-free
! version:    omp_4.0
subroutine vec_mult(p, v1, v2, N)
use omp_lib, ONLY : omp_get_num_devices
real    :: p(N), v1(N), v2(N)
integer :: N, i, ndev
logical :: do_offload
   call init(v1, v2, N)
   ndev = omp_get_num_devices()
   do_offload = (ndev>0) .and. (N>1000000)
   !$omp target if(do_offload) map(to: v1, v2) map(from: p)
   !$omp parallel do if(N>1000)
      do i=1,N
         p(i) = v1(i) * v2(i)
      end do
   !$omp end target
   call output(p, N)
end subroutine

6.17.3. omp_set_default_device and omp_get_default_device Routines#

The following example shows how the omp_set_default_device and omp_get_default_device runtime library routines can be used to set the default device and determine the default device respectively.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: device.3
* type: C
* version: omp_4.0
*/
#include <omp.h>
#include <stdio.h>
void foo(void)
{
   int default_device = omp_get_default_device();
   printf("Default device = %d\n", default_device);
   omp_set_default_device(default_device+1);
   if (omp_get_default_device() != default_device+1)
      printf("Default device is still = %d\n", default_device);
}
!!%compiler: gfortran
!!%cflags: -fopenmp

! name: device.3
! type: F-free
! version:    omp_4.0
program foo
use omp_lib, ONLY : omp_get_default_device, omp_set_default_device
integer :: old_default_device, new_default_device
   old_default_device = omp_get_default_device()
   print*, "Default device = ", old_default_device
   new_default_device = old_default_device + 1
   call omp_set_default_device(new_default_device)
   if (omp_get_default_device() == old_default_device) &
      print*,"Default device is STILL = ", old_default_device
end program

6.17.4. Device and Host Memory Association#

The association of device memory with host memory can be established by calling the omp_target_associate_ptr API routine as part of the mapping. The following example shows the use of this routine to associate device memory of size CS, allocated by the omp_target_alloc routine and pointed to by the device pointer dev_ptr, with a chunk of the host array arr starting at index ioff. In Fortran, the intrinsic function c_loc is called to obtain the corresponding C pointer (h_ptr) of arr(ioff) for use in the call to the API routine.

Since the reference count of the resulting mapping is infinite, it is necessary to use the target update directive (or the always modifier in a map clause) to accomplish a data transfer between host and device. The explicit mapping of the array section arr[ioff:CS] (or arr(ioff:ioff+CS-1) in Fortran) on the target construct ensures that the allocated and associated device memory is used when referencing the array arr in the target region. The device pointer dev_ptr cannot be accessed directly after a call to the omp_target_associate_ptr routine.

After the target region, the device pointer is disassociated from the current chunk of the host memory by calling the omp_target_disassociate_ptr routine before working on the next chunk. The device memory is freed by calling the omp_target_free routine at the end.

//%compiler: clang
//%cflags: -fopenmp

/*
* @@name:       target_associate_ptr.1
* @@type:       C
* @@compilable: yes
* @@linkable:   yes
* @@expect:     success
* @@version:    omp_4.5
*/
#include <stdio.h>
#include <omp.h>

#define CS 50
#define N  (CS*2)

int main() {
  int arr[N];
  int *dev_ptr;
  int dev;

  for (int i = 0; i < N; i++)
    arr[i] = i;

  dev = omp_get_default_device();

  // Allocate device memory
  dev_ptr = (int *)omp_target_alloc(sizeof(int) * CS, dev);

  // Loop over chunks
  for (int ioff = 0; ioff < N; ioff += CS) {

    // Associate device memory with one chunk of host memory
    omp_target_associate_ptr(&arr[ioff], dev_ptr,
                             sizeof(int) * CS, 0, dev);

    printf("before: arr[%d]=%d\n", ioff, arr[ioff]);

    // Update the device data
    #pragma omp target update to(arr[ioff:CS]) device(dev)

    // Explicit mapping of arr to make sure that we use the allocated 
    // and associated memory.  No host-device data update here.
    #pragma omp target map(tofrom : arr[ioff:CS]) device(dev)
      for (int i = 0; i < CS; i++) {
        arr[i+ioff]++;
      }

    // Update the host data
    #pragma omp target update from(arr[ioff:CS]) device(dev)

    printf("after: arr[%d]=%d\n", ioff, arr[ioff]);

    // Disassociate device pointer from the current chunk of host memory
    // before next use
    omp_target_disassociate_ptr(&arr[ioff], dev);
  }

  // Free device memory
  omp_target_free(dev_ptr, dev);

  return 0;
}
/* Outputs:
  before: arr[0]=0
  after: arr[0]=1
  before: arr[50]=50
  after: arr[50]=51
 */
!!%compiler: gfortran
!!%cflags: -fopenmp

! @@name:       target_associate_ptr.1
! @@type:       F-free
! @@compilable: yes
! @@linkable:   yes
! @@expect:     success
! @@version:    omp_5.1
program target_associate
  use omp_lib
  use, intrinsic :: iso_c_binding
  implicit none

  integer, parameter :: CS = 50
  integer, parameter :: N  = CS*2
  integer, target :: arr(N)
  type(c_ptr) :: h_ptr, dev_ptr
  integer(c_size_t) :: csize, dev_off
  integer(c_int) :: dev
  integer :: i, ioff, s

  do i = 1, N
    arr(i) = i
  end do

  dev = omp_get_default_device()
  csize = c_sizeof(arr(1)) * CS

  ! Allocate device memory
  dev_ptr = omp_target_alloc(csize, dev)
  dev_off = 0

  ! Loop over chunks
  do ioff = 1, N, CS

    ! Associate device memory with one chunk of host memory
    h_ptr = c_loc(arr(ioff))
    s = omp_target_associate_ptr(h_ptr, dev_ptr, csize, dev_off, dev)

    print *, "before: arr(", ioff, ")=", arr(ioff)

    ! Update the device data
    !$omp target update to(arr(ioff:ioff+CS-1)) device(dev)

    ! Explicit mapping of arr to make sure that we use the allocated 
    ! and associated memory.  No host-device data update here.
    !$omp target map(tofrom: arr(ioff:ioff+CS-1)) device(dev)
      do i = 0, CS-1
        arr(i+ioff) = arr(i+ioff) + 1
      end do
    !$omp end target

    ! Update the host data
    !$omp target update from(arr(ioff:ioff+CS-1)) device(dev)

    print *, "after: arr(", ioff, ")=", arr(ioff)

    ! Disassociate device pointer from the current chunk of host memory
    ! before next use
    s = omp_target_disassociate_ptr(h_ptr, dev)
  end do

  ! Free device memory
  call omp_target_free(dev_ptr, dev)

end
! Outputs:
!  before: arr( 1 )= 1
!  after: arr( 1 )= 2
!  before: arr( 51 )= 51
!  after: arr( 51 )= 52

6.17.5. Target Memory and Device Pointers Routines#

The following example shows how to create space on a device, transfer data to and from that space, and free the space, using API calls. The API calls directly execute allocation, copy and free operations on the device, without invoking any mapping through a target directive. The omp_target_alloc routine allocates space and returns a device pointer for referencing the space in the omp_target_memcpy API routine on the host. The omp_target_free routine frees the space on the device.

The example also illustrates how to access that space in a target region by exposing the device pointer in an is_device_ptr clause.

The example creates an array of cosine values on the default device, to be used on the host device. The function fails if a default device is not available.

//%compiler: clang
//%cflags: -fopenmp

/*
* name: device.4
* type: C
* version: omp_4.5
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>

void get_dev_cos(double *mem, size_t s)
{
   int h, t, i;
   double * mem_dev_cpy;
   h = omp_get_initial_device();
   t = omp_get_default_device();

   if (omp_get_num_devices() < 1 || t < 0){
      printf(" ERROR: No device found.\n");
      exit(1);
   }

   mem_dev_cpy = (double *)omp_target_alloc( sizeof(double) * s, t);
   if(mem_dev_cpy == NULL){
      printf(" ERROR: No space left on device.\n");
      exit(1);
   }

                          /* dst  src */
   omp_target_memcpy(mem_dev_cpy, mem, sizeof(double)*s,
                              0,    0,
                              t,   h);

   #pragma omp target is_device_ptr(mem_dev_cpy) device(t)
   #pragma omp teams distribute parallel for
     for(i=0;i<s;i++){ mem_dev_cpy[i] = cos((double)i); } /* init data */

                   /* dst  src */
    omp_target_memcpy(mem, mem_dev_cpy, sizeof(double)*s,
                      0,             0,
                      h,             t);

    omp_target_free(mem_dev_cpy, t);
}

The following Fortran example illustrates how to use the omp_target_alloc and omp_target_memcpy functions to directly allocate device storage and transfer data to and from a device. It also shows how to check for the presence of device data with the omp_target_is_present function and to associate host and device storage with the omp_target_associate_ptr function.

In Section 1 of the code, 40 bytes of storage are allocated on the default device with the omp_target_alloc function, which returns a value (of type C_PTR ) that contains the device address of the storage. In the subsequent target construct, cp is specified on the is_device_ptr clause to instruct the compiler that cp is a device pointer. The device pointer ( cp ) is then associated with the Fortran pointer ( fp ) via the c_f_pointer routine inside the target construct. As a result, fp points to the storage on the device that is allocated by the omp_target_alloc routine. In the target region, the value 4 is assigned to the storage on the device, using the Fortran pointer. A trivial test checks that all values were correctly assigned. The Fortran pointer ( fp ) is nullified before the end of the target region. After the target construct, the space on the device is freed with the omp_target_free function, using the device cp pointer which is set to null after the call.

In Section 2, the content of the storage allocated on the host is directly copied to the OpenMP allocated storage on the device. First, storage is allocated for the device and host using omp_target_alloc. Next, on the host the device pointer, returned from the allocation omp_target_alloc function, is associated with a Fortran pointer, and values are assigned to the storage. Similarly, values are assigned on the device to the device storage, after associating a Fortran pointer ( fp_dst ) with the device’s storage pointer ( cp_dst ).

Next the omp_target_memcpy function directly copies the host data to the device storage, specified by the respective host and device pointers. This copy will overwrite -1 values in the device storage, and is checked in the next target construct. Keyword arguments are used here for clarity. (A positional argument list is used in the next Section.)

In Section 3, space is allocated (with a Fortran ALLOCATE statement) and initialized using a host Fortran pointer ( h_fp ), and the address of the storage is directly assigned to a host C pointer ( h_cp ). The following omp_target_is_present function returns 0 (false, of integer(C_INT) type) to indicate that h_cp does not have any corresponding storage on the default device.

Next, the same amount of space is allocated on the default device with the omp_target_alloc function, which returns a device pointer ( d_cp ). The device pointer d_cp and host pointer h_cp are then associated using the omp_target_associate_ptr function. The device storage to which d_cp points becomes the corresponding storage of the host storage to which h_cp points. The following omp_target_is_present call confirms this, by returning a non-zero value of integer(C_INT) type for true.

After the association, the content of the host storage is copied to the device using the omp_target_memcpy function. In the final target construct an array section of h_fp is mapped to the device, and evaluated for correctness. The mapping establishes a connection of h_fp with the corresponding device data in the target construct, but does not produce an update on the device because the previous omp_target_associate_ptr routine sets the reference count of the mapped object to infinity, meaning a mapping without the always modifier will not update the device object.

!!%compiler: gfortran
!!%cflags: -fopenmp

! name: device.4
! type: F-free
! version: omp_5.0
program device_mem
  use omp_lib
  use, intrinsic            :: iso_c_binding

  integer(kind=4),parameter :: N = 10
  type(c_ptr)               :: cp
  integer(c_int), pointer   :: fp(:)
  integer(c_int)            :: rc, host_dev, targ_dev
  integer(c_size_t)         :: int_bytes

  integer, pointer  :: fp_src(:), fp_dst(:)    ! Section 2 vars
  type(c_ptr)       :: cp_src,    cp_dst       ! Section 2 vars

  integer, pointer  :: h_fp(:)                 ! Section 3 vars
  type(c_ptr)       :: h_cp,    d_cp           ! Section 3 vars

  integer :: i

  host_dev  = omp_get_initial_device()
  targ_dev  = omp_get_default_device()
  int_bytes = C_SIZEOF(rc)

!------------------------------------------------Section 1 vv-----------
  cp = omp_target_alloc(N*int_bytes, targ_dev)

  !$omp target is_device_ptr(cp) device(targ_dev) !fp implicit map
     call c_f_pointer(cp, fp, [ N ])              !fp becomes associated
     fp(:) = 4
     if( all(fp == 4) ) print*,"PASSED 1 of 5"
     nullify(fp)                   !fp must be returned as disassociated
  !$omp end target

  call omp_target_free(cp, targ_dev)
  cp = c_null_ptr

!------------------------------------------------Section 2 vv-----------

   cp_src = omp_target_alloc((N+1)*int_bytes, host_dev)
   cp_dst = omp_target_alloc(  N  *int_bytes, targ_dev)

!           Initialize host array (src)
   call c_f_pointer(cp_src, fp_src, [N+1])
   fp_src = [(i,i=1,N+1)]

   !$omp target device(targ_dev) is_device_ptr(cp_dst)
     call c_f_pointer(cp_dst, fp_dst, [N])   ! fp_dst becomes associated
     fp_dst(:) = -1                          ! Initial device storage
     nullify(fp_dst)                         ! return as disassociated
   !$omp end target

!  Copy subset of host (src) array to device (dst) array
   rc = omp_target_memcpy(                                             &
            dst=cp_dst,             src=cp_src,    length=N*int_bytes, &
            dst_offset=0_c_size_t,  src_offset=int_bytes,              &
            dst_device_num=targ_dev,src_device_num=host_dev)

!  Check dst array on device

   !$omp target device(targ_dev) is_device_ptr(cp_dst)
     call c_f_pointer(cp_dst, fp_dst, [N])
     if ( all(fp_dst == [(i,i=1,N)]) ) print*,"PASSED 2 of 5"
     nullify(fp_dst)
   !$omp end target

!------------------------------------------------Section 3 vv-----------

   !allocate host memory and initialize.
   allocate(h_fp(N), source=[(i,i=1,N)])

   h_cp = c_loc(h_fp)
             ! Device is not aware of allocation on host
   if(omp_target_is_present(h_cp, targ_dev) == 0) &
      print*, "PASSED 3 of 5"

             ! Allocate device memory
   d_cp = omp_target_alloc(c_sizeof(h_fp(1))*size(h_fp), targ_dev)

             ! now associate host and device storage
   rc=omp_target_associate_ptr(h_cp,d_cp,c_sizeof(h_fp(1))*size(h_fp), &
                               0_c_size_t,targ_dev)

             ! check presence of device data, associated w. host pointer
   if(omp_target_is_present(h_cp, targ_dev) /= 0) &
      print*,"PASSED 4 of 5"

             ! copy from host to device via C pointers
   rc=omp_target_memcpy(d_cp,       h_cp,c_sizeof(h_fp(1))*size(h_fp), &
                        0_c_size_t, 0_c_size_t,                        &
                        targ_dev,   host_dev)

             ! validate the device data in the target region
             ! no data copy here since the reference count is infinity
   !$omp target device(targ_dev) map(h_fp)
     if ( all(h_fp == [(i,i=1,N)]) ) print*, "PASSED 5 of 5"
   !$omp end target

   call omp_target_free(d_cp,targ_dev)
   deallocate(h_fp)
end program