2023-05-17 17:45:47 +02:00

299 lines
9.2 KiB
C

#include "opencl.h"
#include "utils.h"
#include <CL/cl.h>
#include <assert.h>
#include <linux/limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stb/stb_image.h"
#include "stb/stb_image_write.h"
cl_int opencl_init(cl_struct *cl, const char *program_path)
{
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
CHECK_CL_ERROR(ret);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1,
&(cl->device_id), &ret_num_devices);
CHECK_CL_ERROR(ret);
cl->context = clCreateContext(NULL, 1, &(cl->device_id), NULL, NULL, &ret);
CHECK_CL_ERROR(ret);
cl_queue_properties properties[] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0,
};
cl->command_queue = clCreateCommandQueueWithProperties(cl->context,
cl->device_id, properties, &ret);
CHECK_CL_ERROR(ret);
ret = opencl_load_kernel(cl, program_path, "gaussian_blur", &cl->kernel, &cl->program);
return ret;
}
cl_int opencl_load_kernel(cl_struct *cl, const char *path, const char* kernel_name,
cl_kernel *kernel, cl_program *program)
{
assert(cl);
assert(path);
assert(kernel_name);
assert(kernel);
assert(program);
cl_int ret;
FILE *fp;
char *source_str;
size_t source_size;
const size_t max_source_size = 10000;
fp = fopen(path, "r");
if (!fp) {
fprintf(stderr, "failed to open kernel source file.\n");
exit(1);
}
source_str = malloc(max_source_size);
assert(source_str);
source_size = fread(source_str, 1, max_source_size, fp);
fprintf(stderr, "kernel: read %ld bytes\n", source_size);
fclose(fp);
*program = clCreateProgramWithSource(cl->context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
CHECK_CL_ERROR(ret);
ret = clBuildProgram(*program, 1, &(cl->device_id), NULL, NULL, NULL);
CHECK_CL_ERROR(ret);
if (ret != CL_SUCCESS) {
char build_log[2000];
size_t ret_size;
ret = clGetProgramBuildInfo(*program, cl->device_id, CL_PROGRAM_BUILD_LOG, 2000,
build_log, &ret_size);
CHECK_CL_ERROR(ret);
fprintf(stderr, "build: read %ld bytes: %s\n", ret_size, build_log);
fprintf(stderr, "build failed, exiting\n");
exit(1);
}
*kernel = clCreateKernel(*program, kernel_name, &ret);
CHECK_CL_ERROR(ret);
free(source_str);
return ret;
}
cl_int opencl_init_copy_host_buffers(cl_struct *cl, Matrixd *mat1, Matrixd *mat2)
{
cl_int ret;
cl->mat1 = mat1;
cl->mat2 = mat2;
cl->cl_buffer1 = clCreateBuffer(cl->context, CL_MEM_READ_WRITE,
matrixd_size(cl->mat1) * sizeof(double), NULL, &ret);
CHECK_CL_ERROR(ret);
cl->cl_buffer2 = clCreateBuffer(cl->context, CL_MEM_READ_WRITE,
matrixd_size(cl->mat2) * sizeof(double), NULL, &ret);
CHECK_CL_ERROR(ret);
// size_t result_size = (cl->mat1 > cl->mat2) ? matrixd_size(cl->mat1) : matrixd_size(cl->mat2);
size_t result_size = matrixd_size(cl->mat1);
cl->cl_result = clCreateBuffer(cl->context, CL_MEM_READ_WRITE,
result_size * sizeof(double), NULL, &ret);
CHECK_CL_ERROR(ret);
// copy host memory to device memory
ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer1, CL_TRUE, 0,
matrixd_size(cl->mat1) * sizeof(double), cl->mat1->buf, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer2, CL_TRUE, 0,
matrixd_size(cl->mat2) * sizeof(double), cl->mat2->buf, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
return ret;
}
cl_int opencl_init_copy_host_buffers_mu8(cl_struct *cl, Matrixu8 *mat1, Matrixd *kernel)
{
cl_int ret;
cl->matu8 = mat1;
cl->kernel_m = kernel;
cl_image_format format = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_UNSIGNED_INT8};
cl_image_desc desc;
memset(&desc, 0, sizeof(cl_image_desc));
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = cl->matu8->w;
desc.image_height = cl->matu8->h;
cl->cl_buffer1 = clCreateImage(cl->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
&format, &desc, cl->matu8->buf, &ret);
CHECK_CL_ERROR(ret);
cl->cl_buffer2 = clCreateBuffer(cl->context, CL_MEM_READ_ONLY,
matrixd_size(cl->kernel_m) * sizeof(double), NULL, &ret);
CHECK_CL_ERROR(ret);
cl->cl_result = clCreateImage(cl->context, CL_MEM_WRITE_ONLY,
&format, &desc, NULL, &ret);
CHECK_CL_ERROR(ret);
cl->sampler = clCreateSamplerWithProperties(cl->context, NULL, &ret);
CHECK_CL_ERROR(ret);
// copy host memory to device memory
ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer2, CL_TRUE, 0,
matrixd_size(cl->kernel_m) * sizeof(double), cl->kernel_m->buf, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
return ret;
}
cl_int opencl_execute_image(cl_struct* cl)
{
cl_int ret;
ret = clSetKernelArg(cl->kernel, 0, sizeof(cl_mem), (void *)&(cl->sampler));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 1, sizeof(cl_mem), (void *)&(cl->cl_buffer1));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 2, sizeof(cl_mem), (void *)&(cl->cl_buffer2));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 3, sizeof(cl_mem), (void *)&(cl->cl_result));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 4, sizeof(int32_t), (void *)&cl->matu8->w);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 5, sizeof(int32_t), (void *)&cl->matu8->h);
CHECK_CL_ERROR(ret);
int k = cl->kernel_m->w-1/2;
ret = clSetKernelArg(cl->kernel, 6, sizeof(int32_t), (void *)&k);
CHECK_CL_ERROR(ret);
size_t mat1_size = cl->matu8->w * cl->matu8->h;
size_t mat2_size = matrixd_size(cl->kernel_m);
size_t local_size = 16;
size_t local_items_size[2] = {local_size, local_size};
size_t global_items_size[2];
size_t work_groups1 = (cl->matu8->w + local_size + 1) / local_size;
size_t work_groups2 = (cl->matu8->h + local_size + 1) / local_size;
global_items_size[0] = work_groups1 * local_size;
global_items_size[1] = work_groups2 * local_size;
ret = clEnqueueNDRangeKernel(cl->command_queue, cl->kernel, 2, NULL,
global_items_size, local_items_size, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
return ret;
}
cl_int opencl_execute(cl_struct* cl)
{
cl_int ret;
ret = clSetKernelArg(cl->kernel, 0, sizeof(cl_mem), (void *)&(cl->cl_buffer1));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 1, sizeof(cl_mem), (void *)&(cl->cl_buffer2));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 2, sizeof(cl_mem), (void *)&(cl->cl_result));
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 3, sizeof(uint32_t), (void *)&cl->mat1->w);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(cl->kernel, 4, sizeof(uint32_t), (void *)&cl->mat1->h);
CHECK_CL_ERROR(ret);
int k = cl->mat2->w-1/2;
ret = clSetKernelArg(cl->kernel, 5, sizeof(uint32_t), (void *)&k);
CHECK_CL_ERROR(ret);
size_t mat1_size = matrixd_size(cl->mat1);
size_t mat2_size = matrixd_size(cl->mat2);
size_t local_size = 16;
size_t local_items_size[2] = {local_size, local_size};
size_t global_items_size[2];
size_t work_groups1 = (cl->mat1->w + local_size + 1) / local_size;
size_t work_groups2 = (cl->mat1->h + local_size + 1) / local_size;
global_items_size[0] = work_groups1 * local_size;
global_items_size[1] = work_groups2 * local_size;
ret = clEnqueueNDRangeKernel(cl->command_queue, cl->kernel, 2, NULL,
global_items_size, local_items_size, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
return ret;
}
cl_int opencl_read_result(cl_struct *cl)
{
cl_int ret;
clFinish(cl->command_queue);
size_t result_size = matrixd_size(cl->mat1);
double *result_buffer = calloc(result_size, sizeof(double));
ret = clEnqueueReadBuffer(cl->command_queue, cl->cl_result, CL_TRUE, 0,
result_size * sizeof(double), result_buffer, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
for (size_t i = 0; i < result_size; i+=100) {
printf("%ld: %lf\n", i, result_buffer[i]);
}
free(result_buffer);
return ret;
}
cl_int opencl_read_result_image(cl_struct* cl)
{
cl_int ret;
clFinish(cl->command_queue);
size_t result_size = matrixu8_size(cl->matu8) * 4;
uint8_t *result_buffer = calloc(result_size, 1);
size_t origin[] = {0, 0, 0};
size_t region[] = {cl->matu8->w, cl->matu8->h, 1};
ret = clEnqueueReadImage(cl->command_queue, cl->cl_result, CL_TRUE, origin,
region, 0, 0, result_buffer, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
stbi_write_bmp("image_blurred.bmp", cl->matu8->w, cl->matu8->h, 4, result_buffer);
free(result_buffer);
return ret;
}