Main Application
/* Number of float elements in the vector that main() allocates and fills. */
#define NX 2048
/*
 * Prototypes of the kernel implementations of the scaling operation.
 * All four share one signature: an array of untyped buffer pointers and an
 * untyped pointer to the kernel argument.
 */
extern void scal_cpu_func(void *buffers[], void *_args);
extern void scal_sse_func(void *buffers[], void *_args);
extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_opencl_func(void *buffers[], void *_args);
/*
 * Designated initialiser that lists every available implementation of the
 * scaling kernel.
 * NOTE(review): the declarator this initialiser belongs to (its type and
 * variable name) is not present in the visible source; it has to be restored
 * before this compiles.
 */
{
/* CPU implementations: the plain C version and the SSE version. */
.cpu_funcs = { scal_cpu_func, scal_sse_func },
/* The same two CPU implementations, given by name as strings. */
.cpu_funcs_name = { "scal_cpu_func", "scal_sse_func" },
/* The CUDA implementation is listed only when STARPU_USE_CUDA is defined. */
#ifdef STARPU_USE_CUDA
.cuda_funcs = { scal_cuda_func },
#endif
/* The OpenCL implementation is listed only when STARPU_USE_OPENCL is defined. */
#ifdef STARPU_USE_OPENCL
.opencl_funcs = { scal_opencl_func },
#endif
/* A single buffer is declared. */
.nbuffers = 1,
/* Mode of that buffer; presumably STARPU_RW means read-write access - TODO confirm. */
.modes = { STARPU_RW }
};
#ifdef STARPU_USE_OPENCL
#endif
/*
 * Fill an NX-element vector with ones and print its first element to stderr
 * twice, once labelled BEFORE and once labelled AFTER.
 *
 * NOTE(review): nothing between the two prints modifies the vector in the
 * visible code, and `factor` is assigned but never read. The empty
 * STARPU_USE_OPENCL sections look like placeholders for code that is not
 * shown here.
 *
 * argc, argv - unused.
 * Returns 0.
 */
int main(int argc, char **argv)
{
	float vector[NX];
	unsigned idx = 0;

	/* Set every element to 1.0f. */
	while (idx < NX)
	{
		vector[idx] = 1.0f;
		idx++;
	}

	fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);

#ifdef STARPU_USE_OPENCL
#endif

	/* Scaling factor; not consumed by any statement visible in this function. */
	float factor = 3.14;

#ifdef STARPU_USE_OPENCL
#endif

	fprintf(stderr, "AFTER First element is %f\n", vector[0]);

	return 0;
}
CPU Kernel
#include <xmmintrin.h>
/*
 * CPU implementation of the scaling kernel: multiplies val[0] .. val[n-1] in
 * place by the value that `factor` points to.
 *
 * NOTE(review): `n`, `val` and `factor` are used but not declared in the
 * visible body, and neither `buffers` nor `cl_arg` is read. Presumably the
 * missing declarations derive them from buffers[0] and cl_arg - TODO restore
 * and confirm against the complete example.
 */
void scal_cpu_func(void *buffers[], void *cl_arg)
{
unsigned i;
/* Scale the elements one at a time. */
for (i = 0; i < n; i++)
val[i] *= *factor;
}
/*
 * SSE implementation of the scaling kernel: multiplies `vector` in place by
 * the float that cl_arg points to, four elements per iteration, then handles
 * the n % 4 leftover elements with scalar code.
 *
 * NOTE(review): `n`, `vector` and `FACTOR` are used but never declared in the
 * visible body, and `buffers` is not read. Presumably the missing lines derive
 * `n` and `vector` from buffers[0] and declare FACTOR as an __m128 - TODO
 * restore and confirm against the complete example.
 */
void scal_sse_func(void *buffers[], void *cl_arg)
{
/* Number of complete groups of four floats. */
unsigned int n_iterations = n/4;
/*
 * View the data as packed groups of four floats.
 * NOTE(review): accessing memory through an __m128 pointer presumably
 * requires `vector` to be 16-byte aligned - confirm.
 */
__m128 *VECTOR = (__m128*) vector;
float factor = *(float *) cl_arg;
/* Replicate the scalar factor into all four lanes. */
FACTOR = _mm_set1_ps(factor);
unsigned int i;
/* Vectorised part: one multiply per group of four elements. */
for (i = 0; i < n_iterations; i++)
VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
/* Scalar tail: the elements that do not fill a complete group of four. */
unsigned int remainder = n%4;
if (remainder != 0)
{
/* Index of the first element not covered by the vectorised loop. */
unsigned int start = 4 * n_iterations;
for (i = start; i < start+remainder; ++i)
{
vector[i] = factor * vector[i];
}
}
}
CUDA Kernel
/*
 * Multiply the first n elements of val by factor, in place.
 *
 * Launch layout: a one-dimensional grid of one-dimensional blocks. Any grid
 * size is valid. Each thread starts at its global index and advances by the
 * total number of launched threads until it reaches n, so every element is
 * scaled exactly once whether or not the grid covers n. When the grid does
 * cover n, each thread handles at most one element, as before.
 * Uses no shared memory.
 *
 * n      - number of elements to scale
 * val    - array of at least n floats, dereferenced on the device
 * factor - multiplier applied to each element
 */
static __global__ void vector_mult_cuda(unsigned n, float *val, float factor)
{
	/* Widen to size_t before multiplying so the products are not computed in 32 bits. */
	const size_t stride = (size_t)gridDim.x * blockDim.x;
	size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

	for (; i < n; i += stride)
		val[i] *= factor;
}
/*
 * CUDA implementation of the scaling kernel: launches vector_mult_cuda over
 * n elements of val with the factor that _args points to.
 *
 * NOTE(review): `n` and `val` are used but not declared in the visible body,
 * and `buffers` is not read. Presumably they are derived from buffers[0] -
 * TODO restore and confirm against the complete example.
 * NOTE(review): the visible body neither checks the launch for errors nor
 * waits for the stream after launching - confirm whether that happens in
 * lines not shown here.
 */
extern "C" void scal_cuda_func(void *buffers[], void *_args)
{
float *factor = (float *)_args;
unsigned threads_per_block = 64;
/* Round up so that a final, partially filled block covers the tail of the vector. */
unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
/* Launch on the stream returned by starpu_cuda_get_local_stream(), with no dynamic shared memory. */
vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
}
OpenCL Kernel
Invoking the Kernel
/*
 * OpenCL implementation of the scaling kernel: sets the three arguments of the
 * "vector_mult_opencl" kernel, enqueues it over n work-items, waits for the
 * queue to drain and releases the event.
 *
 * NOTE(review): this function is incomplete as shown. `n` and `val` are used
 * but never declared, `kernel`, `queue`, `devid` and `device` are used without
 * any visible assignment, and `id` is never used. Restore the missing lines
 * before compiling - TODO confirm against the complete example.
 */
void scal_opencl_func(void *buffers[], void *_args)
{
/* Implicit conversion from void *: valid in C, not in C++. */
float *factor = _args;
int id, devid, err;
cl_kernel kernel;
cl_command_queue queue;
cl_event event;
{
/*
 * NOTE(review): the next two lines are the tail of a call whose beginning is
 * missing from the visible source; presumably it loads the kernel named
 * "vector_mult_opencl" for device `devid` - TODO confirm.
 */
"vector_mult_opencl",
devid);
/*
 * Kernel arguments in order: element count, data buffer, scaling factor.
 * NOTE(review): the results are accumulated into `err`, but `err` is never
 * inspected in the visible code.
 */
err = clSetKernelArg(kernel, 0, sizeof(n), &n);
err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
}
{
/* One work-item per element. */
size_t global=n;
size_t local;
size_t s;
/* NOTE(review): `device` is passed below without being assigned in the visible code. */
cl_device_id device;
/* Query the work-group size limit for this kernel on `device`. */
err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
/* Never request a work-group larger than the total number of work-items. */
if (local > global) local=global;
/*
 * NOTE(review): `global` is not rounded up to a multiple of `local`; some
 * OpenCL versions presumably reject a global size that the local size does
 * not divide evenly - confirm.
 */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
}
{
/* Wait for the commands queued on `queue`, then release the kernel's event. */
clFinish(queue);
clReleaseEvent(event);
}
}
Source of the Kernel
/*
 * Multiply the first nx elements of val by factor, in place.
 * Each work-item handles the single element selected by its global id in
 * dimension 0.
 *
 * nx     - number of elements to scale
 * val    - global-memory array holding the elements
 * factor - multiplier applied to each element
 */
__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
{
	const int gid = get_global_id(0);

	/* Work-items whose id is nx or beyond have nothing to do. */
	if (gid >= nx)
		return;

	val[gid] = val[gid] * factor;
}