Main Application
#define    NX    2048
extern void scal_cpu_func(void *buffers[], void *_args);
extern void scal_sse_func(void *buffers[], void *_args);
extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_opencl_func(void *buffers[], void *_args);
    
    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
#ifdef STARPU_USE_CUDA
    
    .cuda_funcs = { scal_cuda_func, NULL },
#endif
#ifdef STARPU_USE_OPENCL
    
    .opencl_funcs = { scal_opencl_func, NULL },
#endif
    .nbuffers = 1,
};
#ifdef STARPU_USE_OPENCL
#endif
int main(int argc, char **argv)
{
    
    float vector[NX];
    unsigned i;
    for (i = 0; i < NX; i++)
        vector[i] = 1.0f;
    fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);
    
#ifdef STARPU_USE_OPENCL
               "examples/basic_examples/vector_scal_opencl_kernel.cl", &
programs, NULL);
 
#endif
    
                                NX, sizeof(vector[0]));
    float factor = 3.14;
    
    
    
    
    
#ifdef STARPU_USE_OPENCL
#endif
    
    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
    return 0;
}
  
CPU Kernel
#include <starpu.h>
#include <xmmintrin.h>
/*
 * Plain C implementation of the scaling codelet.
 *
 * buffers: codelet buffers; buffers[0] is the StarPU vector interface of the
 *          float vector to scale in place.
 * cl_arg:  pointer to the float scaling factor.
 */
void scal_cpu_func(void *buffers[], void *cl_arg)
{
    unsigned i;
    /* Read the factor once instead of dereferencing it on every iteration. */
    const float factor = *(float *) cl_arg;

    /* Number of elements in the vector. */
    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
    /* Local (CPU-side) pointer to the vector data. */
    float *val = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);

    for (i = 0; i < n; i++)
        val[i] *= factor;
}
/*
 * SSE implementation of the scaling codelet: multiplies four floats at a time
 * and finishes the last n % 4 elements with scalar code.
 *
 * buffers: codelet buffers; buffers[0] is the StarPU vector interface of the
 *          float vector to scale in place.
 * cl_arg:  pointer to the float scaling factor.
 */
void scal_sse_func(void *buffers[], void *cl_arg)
{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    /* Largest multiple of 4 that does not exceed n. */
    unsigned int n_vectorized = n - (n % 4);
    float factor = *(float *) cl_arg;
    /* The factor replicated into all four SSE lanes. */
    __m128 FACTOR = _mm_set1_ps(factor);
    unsigned int i;

    /*
     * Unaligned loads/stores are used on purpose: nothing guarantees that the
     * registered buffer is 16-byte aligned, and an aligned access through a
     * __m128 pointer would fault on a misaligned address.
     */
    for (i = 0; i < n_vectorized; i += 4)
        _mm_storeu_ps(vector + i,
                      _mm_mul_ps(FACTOR, _mm_loadu_ps(vector + i)));

    /* Scalar tail for the elements that do not fill a whole SSE register. */
    for (; i < n; ++i)
        vector[i] = factor * vector[i];
}
  
CUDA Kernel
/*
 * Multiplies the first n elements of val by factor, in place.
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks. Any grid
 * size is valid: each thread starts at its global index and advances by the
 * total number of launched threads, so a launch with at least n threads
 * processes exactly one element per thread. No shared memory is used.
 *
 * n:      number of elements in val
 * val:    device pointer to the elements to scale
 * factor: value each element is multiplied by
 */
static __global__ void vector_mult_cuda(unsigned n, float *val,
                                        float factor)
{
        /* size_t keeps the index from wrapping around when n is close to
         * the largest unsigned value. */
        const size_t stride = (size_t) gridDim.x * blockDim.x;
        size_t i;

        for (i = (size_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
               val[i] *= factor;
}
/*
 * CUDA implementation of the scaling codelet: launches vector_mult_cuda on
 * the calling worker's StarPU stream and waits for it to finish.
 *
 * buffers: codelet buffers; buffers[0] is the StarPU vector interface of the
 *          float vector to scale in place (device copy).
 * _args:   host pointer to the float scaling factor.
 */
extern "C" void scal_cuda_func(void *buffers[], void *_args)
{
        float *factor = (float *)_args;

        /* Number of elements in the vector. */
        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
        /* Device pointer to the vector data. */
        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);

        /* A grid of zero blocks is an invalid launch configuration. */
        if (n == 0)
                return;

        unsigned threads_per_block = 64;
        /* Ceiling division: enough blocks to give every element a thread. */
        unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
        cudaStream_t stream = starpu_cuda_get_local_stream();

        vector_mult_cuda<<<nblocks,threads_per_block, 0, stream>>>
                        (n, val, *factor);

        /* Kernel launches do not return errors; fetch launch failures here. */
        cudaError_t status = cudaGetLastError();
        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);

        /*
         * The launch is asynchronous. Unless the codelet is declared as
         * asynchronous, StarPU considers the task complete once this
         * function returns, so wait for the kernel before returning.
         */
        status = cudaStreamSynchronize(stream);
        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
}
  
OpenCL Kernel
Invoking the Kernel
void scal_opencl_func(void *buffers[], void *_args)
{
    float *factor = _args;
    int id, devid, err;                   
    cl_kernel kernel;                     
    cl_command_queue queue;               
    cl_event event;                       
    
    
    {  
                                         "vector_mult_opencl", 
                                         devid);
         err = clSetKernelArg(kernel, 0, sizeof(n), &n);
         err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
         err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
    }
    {   
        size_t global=n;
        size_t local;
        size_t s;
        cl_device_id device;
        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                                        sizeof(local), &local, &s);
        if (local > global) local=global;
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
                                     NULL, &event);
    }
    {  
         clFinish(queue);
         clReleaseEvent(event);
    }
}
  
Source of the Kernel
/*
 * Scales val in place: the work-item with global id g multiplies val[g] by
 * factor. Work-items whose id is not below nx do nothing.
 */
__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
{
        const int gid = get_global_id(0);

        /* Padding work-items past the end of the vector exit immediately. */
        if (gid >= nx)
                return;

        val[gid] = val[gid] * factor;
}