lib:pocl

Librairie générant une version de OpenCL purement CPU.

Compilation

Suivre la procédure décrite dans le readme et respecter la compatibilité avec les versions de clang.

export LLVM_VERSION=<major LLVM version>
apt install -y python3-dev libpython3-dev build-essential ocl-icd-libopencl1 \
    cmake git pkg-config libclang-${LLVM_VERSION}-dev clang \
    llvm-${LLVM_VERSION} make ninja-build ocl-icd-libopencl1 ocl-icd-dev \
    ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo dialog apt-utils \
    libxml2-dev libclang-cpp${LLVM_VERSION}-dev libclang-cpp${LLVM_VERSION} \
    llvm-${LLVM_VERSION}-dev

Pour forcer la compilation de OpenCL version CPU, ajouter l'option DEFAULT_ENABLE_ICD=0.

cd <directory-with-pocl-sources>
mkdir build
cd build
cmake .. -DDEFAULT_ENABLE_ICD=0
make
# and optionally
make install

Exemple

Pris dans le livre OpenCL Programming by Example

Pour passer de la version OpenCL à la version PoCL, il suffit de remplacer l'include CL/opencl.h par pocl_opencl.h et de changer les 2 références à CL_DEVICE_TYPE_GPU par CL_DEVICE_TYPE_CPU.

opencc.cpp

#include <stdio.h>
#include <stdlib.h>
 
// A décommenter
//#include "pocl_opencl.h"
 
// A commenter
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/opencl.h>
#endif
 
#define VECTOR_SIZE 409600000
 
//OpenCL kernel which is run for every work item created.
const char *saxpy_kernel =
"__kernel                                   \n"
"void saxpy_kernel(float alpha,             \n"
"                  __global float *A,       \n"
"                  __global float *B,       \n"
"                  __global float *C)       \n"
"{                                          \n"
"    //Get the index of the work-item       \n"
"    int index = get_global_id(0);          \n"
"    C[index] = alpha* A[index] + B[index]; \n"
"}                                          \n";
 
int main(void) {
  int i;
  // Allocate space for vectors A, B and C
  float alpha = 2.0;
  float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
  float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
  float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
  for(i = 0; i < VECTOR_SIZE; i++)
  {
    A[i] = i;
    B[i] = VECTOR_SIZE - i;
    C[i] = 0;
  }
 
  // Get platform and device information
  cl_platform_id * platforms = NULL;
  cl_uint     num_platforms;
  //Set up the Platform
  cl_int clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
  platforms = (cl_platform_id *)
  malloc(sizeof(cl_platform_id)*num_platforms);
  clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
 
  //Get the devices list and choose the device you want to run on
  cl_device_id     *device_list = NULL;
  cl_uint           num_devices;
 
  // A passer en CL_DEVICE_TYPE_CPU
  clStatus = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_GPU, 0,NULL, &num_devices);
  device_list = (cl_device_id *) 
  malloc(sizeof(cl_device_id)*num_devices);
  // A passer en CL_DEVICE_TYPE_CPU
  clStatus = clGetDeviceIDs( platforms[0],CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
 
  // Create one OpenCL context for each device in the platform
  cl_context context;
  context = clCreateContext( NULL, num_devices, device_list, NULL, NULL, &clStatus);
 
  // Create a command queue
  cl_command_queue command_queue = clCreateCommandQueue(context, device_list[0], 0, &clStatus);
 
  // Create memory buffers on the device for each vector
  cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY,VECTOR_SIZE * sizeof(float), NULL, &clStatus);
  cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY,VECTOR_SIZE * sizeof(float), NULL, &clStatus);
  cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,VECTOR_SIZE * sizeof(float), NULL, &clStatus);
 
  // Copy the Buffer A and B to the device
  clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
  clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);
 
  // Create a program from the kernel source
  cl_program program = clCreateProgramWithSource(context, 1,(const char **)&saxpy_kernel, NULL, &clStatus);
 
  // Build the program
  clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);
 
  // Create the OpenCL kernel
  cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
 
  // Set the arguments of the kernel
  clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
  clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
  clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
  clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
 
  // Execute the OpenCL kernel on the list
  size_t global_size = VECTOR_SIZE; // Process the entire lists
  size_t local_size = 64;           // Process one item at a time
  clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
 
  // Read the cl memory C_clmem on device to the host variable C
  clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
 
  // Clean up and wait for all the comands to complete.
  clStatus = clFlush(command_queue);
  clStatus = clFinish(command_queue);
 
  // Display the result to the screen
  //for(i = 0; i < VECTOR_SIZE; i++)
  //  printf("%f * %f + %f = %f\n", alpha, A[i], B[i], C[i]);
 
  // Finally release all OpenCL allocated objects and host buffers.
  clStatus = clReleaseKernel(kernel);
  clStatus = clReleaseProgram(program);
  clStatus = clReleaseMemObject(A_clmem);
  clStatus = clReleaseMemObject(B_clmem);
  clStatus = clReleaseMemObject(C_clmem);
  clStatus = clReleaseCommandQueue(command_queue);
  clStatus = clReleaseContext(context);
  free(A);
  free(B);
  free(C);
  free(platforms);
  free(device_list);
  return 0;
}

Compilation / exécution pour OpenCL

g++ -o opencc opencc.cpp -lOpenCL
time ./opencc
real    0m3,532s
user    0m2,259s
sys     0m1,209s

Compilation / exécution pour PoCL

g++ -g -o opencc opencc.cpp -I ~/prog/pocl-3.1/build -I ~/prog/pocl-3.1/poclu/ -L ~/prog/pocl-3.1/build/lib/CL -lpocl -lOpenCL
LD_LIBRARY_PATH="/home/legarrec/prog/pocl-3.1/build/lib/CL" POCL_BUILDING=1 time ./opencc
4.95user 14.23system 0:22.90elapsed 83%CPU (0avgtext+0avgdata 7643848maxresident)k
2624936inputs+2176outputs (47118major+959861minor)pagefaults 0swaps

Il faut mettre POCL_BUILDING=1 si les librairies libOpenCL et libpocl sont dans le dossier de compilation et non le dossier d'installation (make install non exécuté).