299 lines
9.2 KiB
C
299 lines
9.2 KiB
C
#include "opencl.h"
|
|
#include "utils.h"
|
|
#include <CL/cl.h>
|
|
#include <assert.h>
|
|
#include <linux/limits.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include "stb/stb_image.h"
|
|
#include "stb/stb_image_write.h"
|
|
|
|
cl_int opencl_init(cl_struct *cl, const char *program_path)
|
|
{
|
|
cl_platform_id platform_id = NULL;
|
|
cl_uint ret_num_devices;
|
|
cl_uint ret_num_platforms;
|
|
cl_int ret;
|
|
|
|
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1,
|
|
&(cl->device_id), &ret_num_devices);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
cl->context = clCreateContext(NULL, 1, &(cl->device_id), NULL, NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
cl_queue_properties properties[] = {
|
|
CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0,
|
|
};
|
|
|
|
cl->command_queue = clCreateCommandQueueWithProperties(cl->context,
|
|
cl->device_id, properties, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = opencl_load_kernel(cl, program_path, "gaussian_blur", &cl->kernel, &cl->program);
|
|
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_load_kernel(cl_struct *cl, const char *path, const char* kernel_name,
|
|
cl_kernel *kernel, cl_program *program)
|
|
{
|
|
assert(cl);
|
|
assert(path);
|
|
assert(kernel_name);
|
|
assert(kernel);
|
|
assert(program);
|
|
|
|
cl_int ret;
|
|
|
|
FILE *fp;
|
|
char *source_str;
|
|
size_t source_size;
|
|
const size_t max_source_size = 10000;
|
|
|
|
fp = fopen(path, "r");
|
|
if (!fp) {
|
|
fprintf(stderr, "failed to open kernel source file.\n");
|
|
exit(1);
|
|
}
|
|
|
|
source_str = malloc(max_source_size);
|
|
assert(source_str);
|
|
|
|
source_size = fread(source_str, 1, max_source_size, fp);
|
|
fprintf(stderr, "kernel: read %ld bytes\n", source_size);
|
|
|
|
fclose(fp);
|
|
|
|
*program = clCreateProgramWithSource(cl->context, 1,
|
|
(const char **)&source_str, (const size_t *)&source_size, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clBuildProgram(*program, 1, &(cl->device_id), NULL, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
if (ret != CL_SUCCESS) {
|
|
char build_log[2000];
|
|
size_t ret_size;
|
|
ret = clGetProgramBuildInfo(*program, cl->device_id, CL_PROGRAM_BUILD_LOG, 2000,
|
|
build_log, &ret_size);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
fprintf(stderr, "build: read %ld bytes: %s\n", ret_size, build_log);
|
|
fprintf(stderr, "build failed, exiting\n");
|
|
exit(1);
|
|
}
|
|
|
|
*kernel = clCreateKernel(*program, kernel_name, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
free(source_str);
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_init_copy_host_buffers(cl_struct *cl, Matrixd *mat1, Matrixd *mat2)
|
|
{
|
|
cl_int ret;
|
|
|
|
cl->mat1 = mat1;
|
|
cl->mat2 = mat2;
|
|
|
|
cl->cl_buffer1 = clCreateBuffer(cl->context, CL_MEM_READ_WRITE,
|
|
matrixd_size(cl->mat1) * sizeof(double), NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
cl->cl_buffer2 = clCreateBuffer(cl->context, CL_MEM_READ_WRITE,
|
|
matrixd_size(cl->mat2) * sizeof(double), NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
// size_t result_size = (cl->mat1 > cl->mat2) ? matrixd_size(cl->mat1) : matrixd_size(cl->mat2);
|
|
size_t result_size = matrixd_size(cl->mat1);
|
|
|
|
cl->cl_result = clCreateBuffer(cl->context, CL_MEM_READ_WRITE,
|
|
result_size * sizeof(double), NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
|
|
// copy host memory to device memory
|
|
ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer1, CL_TRUE, 0,
|
|
matrixd_size(cl->mat1) * sizeof(double), cl->mat1->buf, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer2, CL_TRUE, 0,
|
|
matrixd_size(cl->mat2) * sizeof(double), cl->mat2->buf, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_init_copy_host_buffers_mu8(cl_struct *cl, Matrixu8 *mat1, Matrixd *kernel)
|
|
{
|
|
cl_int ret;
|
|
|
|
cl->matu8 = mat1;
|
|
cl->kernel_m = kernel;
|
|
|
|
cl_image_format format = {.image_channel_order = CL_RGBA,
|
|
.image_channel_data_type = CL_UNSIGNED_INT8};
|
|
cl_image_desc desc;
|
|
memset(&desc, 0, sizeof(cl_image_desc));
|
|
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
|
|
desc.image_width = cl->matu8->w;
|
|
desc.image_height = cl->matu8->h;
|
|
|
|
cl->cl_buffer1 = clCreateImage(cl->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
|
&format, &desc, cl->matu8->buf, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
cl->cl_buffer2 = clCreateBuffer(cl->context, CL_MEM_READ_ONLY,
|
|
matrixd_size(cl->kernel_m) * sizeof(double), NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
cl->cl_result = clCreateImage(cl->context, CL_MEM_WRITE_ONLY,
|
|
&format, &desc, NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
cl->sampler = clCreateSamplerWithProperties(cl->context, NULL, &ret);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
// copy host memory to device memory
|
|
ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer2, CL_TRUE, 0,
|
|
matrixd_size(cl->kernel_m) * sizeof(double), cl->kernel_m->buf, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_execute_image(cl_struct* cl)
|
|
{
|
|
cl_int ret;
|
|
|
|
ret = clSetKernelArg(cl->kernel, 0, sizeof(cl_mem), (void *)&(cl->sampler));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 1, sizeof(cl_mem), (void *)&(cl->cl_buffer1));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 2, sizeof(cl_mem), (void *)&(cl->cl_buffer2));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 3, sizeof(cl_mem), (void *)&(cl->cl_result));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 4, sizeof(int32_t), (void *)&cl->matu8->w);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 5, sizeof(int32_t), (void *)&cl->matu8->h);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
int k = cl->kernel_m->w-1/2;
|
|
ret = clSetKernelArg(cl->kernel, 6, sizeof(int32_t), (void *)&k);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
size_t mat1_size = cl->matu8->w * cl->matu8->h;
|
|
size_t mat2_size = matrixd_size(cl->kernel_m);
|
|
|
|
size_t local_size = 16;
|
|
size_t local_items_size[2] = {local_size, local_size};
|
|
size_t global_items_size[2];
|
|
size_t work_groups1 = (cl->matu8->w + local_size + 1) / local_size;
|
|
size_t work_groups2 = (cl->matu8->h + local_size + 1) / local_size;
|
|
global_items_size[0] = work_groups1 * local_size;
|
|
global_items_size[1] = work_groups2 * local_size;
|
|
ret = clEnqueueNDRangeKernel(cl->command_queue, cl->kernel, 2, NULL,
|
|
global_items_size, local_items_size, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_execute(cl_struct* cl)
|
|
{
|
|
cl_int ret;
|
|
|
|
ret = clSetKernelArg(cl->kernel, 0, sizeof(cl_mem), (void *)&(cl->cl_buffer1));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 1, sizeof(cl_mem), (void *)&(cl->cl_buffer2));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 2, sizeof(cl_mem), (void *)&(cl->cl_result));
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 3, sizeof(uint32_t), (void *)&cl->mat1->w);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
ret = clSetKernelArg(cl->kernel, 4, sizeof(uint32_t), (void *)&cl->mat1->h);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
int k = cl->mat2->w-1/2;
|
|
ret = clSetKernelArg(cl->kernel, 5, sizeof(uint32_t), (void *)&k);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
size_t mat1_size = matrixd_size(cl->mat1);
|
|
size_t mat2_size = matrixd_size(cl->mat2);
|
|
|
|
size_t local_size = 16;
|
|
size_t local_items_size[2] = {local_size, local_size};
|
|
size_t global_items_size[2];
|
|
size_t work_groups1 = (cl->mat1->w + local_size + 1) / local_size;
|
|
size_t work_groups2 = (cl->mat1->h + local_size + 1) / local_size;
|
|
global_items_size[0] = work_groups1 * local_size;
|
|
global_items_size[1] = work_groups2 * local_size;
|
|
|
|
ret = clEnqueueNDRangeKernel(cl->command_queue, cl->kernel, 2, NULL,
|
|
global_items_size, local_items_size, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_read_result(cl_struct *cl)
|
|
{
|
|
cl_int ret;
|
|
|
|
clFinish(cl->command_queue);
|
|
|
|
size_t result_size = matrixd_size(cl->mat1);
|
|
|
|
double *result_buffer = calloc(result_size, sizeof(double));
|
|
ret = clEnqueueReadBuffer(cl->command_queue, cl->cl_result, CL_TRUE, 0,
|
|
result_size * sizeof(double), result_buffer, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
for (size_t i = 0; i < result_size; i+=100) {
|
|
printf("%ld: %lf\n", i, result_buffer[i]);
|
|
}
|
|
|
|
free(result_buffer);
|
|
return ret;
|
|
}
|
|
|
|
cl_int opencl_read_result_image(cl_struct* cl)
|
|
{
|
|
cl_int ret;
|
|
|
|
clFinish(cl->command_queue);
|
|
|
|
size_t result_size = matrixu8_size(cl->matu8) * 4;
|
|
|
|
uint8_t *result_buffer = calloc(result_size, 1);
|
|
|
|
size_t origin[] = {0, 0, 0};
|
|
size_t region[] = {cl->matu8->w, cl->matu8->h, 1};
|
|
|
|
ret = clEnqueueReadImage(cl->command_queue, cl->cl_result, CL_TRUE, origin,
|
|
region, 0, 0, result_buffer, 0, NULL, NULL);
|
|
CHECK_CL_ERROR(ret);
|
|
|
|
stbi_write_bmp("image_blurred.bmp", cl->matu8->w, cl->matu8->h, 4, result_buffer);
|
|
|
|
free(result_buffer);
|
|
return ret;
|
|
}
|