#include "opencl.h" #include "utils.h" #include #include #include #include #include #include #include #include "stb/stb_image.h" #include "stb/stb_image_write.h" cl_int opencl_init(cl_struct *cl, const char *program_path) { cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); CHECK_CL_ERROR(ret); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &(cl->device_id), &ret_num_devices); CHECK_CL_ERROR(ret); cl->context = clCreateContext(NULL, 1, &(cl->device_id), NULL, NULL, &ret); CHECK_CL_ERROR(ret); cl_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0, }; cl->command_queue = clCreateCommandQueueWithProperties(cl->context, cl->device_id, properties, &ret); CHECK_CL_ERROR(ret); ret = opencl_load_kernel(cl, program_path, "gaussian_blur", &cl->kernel, &cl->program); return ret; } cl_int opencl_load_kernel(cl_struct *cl, const char *path, const char* kernel_name, cl_kernel *kernel, cl_program *program) { assert(cl); assert(path); assert(kernel_name); assert(kernel); assert(program); cl_int ret; FILE *fp; char *source_str; size_t source_size; const size_t max_source_size = 10000; fp = fopen(path, "r"); if (!fp) { fprintf(stderr, "failed to open kernel source file.\n"); exit(1); } source_str = malloc(max_source_size); assert(source_str); source_size = fread(source_str, 1, max_source_size, fp); fprintf(stderr, "kernel: read %ld bytes\n", source_size); fclose(fp); *program = clCreateProgramWithSource(cl->context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); CHECK_CL_ERROR(ret); ret = clBuildProgram(*program, 1, &(cl->device_id), NULL, NULL, NULL); CHECK_CL_ERROR(ret); if (ret != CL_SUCCESS) { char build_log[2000]; size_t ret_size; ret = clGetProgramBuildInfo(*program, cl->device_id, CL_PROGRAM_BUILD_LOG, 2000, build_log, &ret_size); CHECK_CL_ERROR(ret); fprintf(stderr, "build: read %ld bytes: %s\n", ret_size, build_log); fprintf(stderr, "build failed, exiting\n"); exit(1); } *kernel = clCreateKernel(*program, kernel_name, &ret); CHECK_CL_ERROR(ret); free(source_str); return ret; } cl_int opencl_init_copy_host_buffers(cl_struct *cl, Matrixd *mat1, Matrixd *mat2) { cl_int ret; cl->mat1 = mat1; cl->mat2 = mat2; cl->cl_buffer1 = clCreateBuffer(cl->context, CL_MEM_READ_WRITE, matrixd_size(cl->mat1) * sizeof(double), NULL, &ret); CHECK_CL_ERROR(ret); cl->cl_buffer2 = clCreateBuffer(cl->context, CL_MEM_READ_WRITE, matrixd_size(cl->mat2) * sizeof(double), NULL, &ret); CHECK_CL_ERROR(ret); // size_t result_size = (cl->mat1 > cl->mat2) ? matrixd_size(cl->mat1) : matrixd_size(cl->mat2); size_t result_size = matrixd_size(cl->mat1); cl->cl_result = clCreateBuffer(cl->context, CL_MEM_READ_WRITE, result_size * sizeof(double), NULL, &ret); CHECK_CL_ERROR(ret); // copy host memory to device memory ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer1, CL_TRUE, 0, matrixd_size(cl->mat1) * sizeof(double), cl->mat1->buf, 0, NULL, NULL); CHECK_CL_ERROR(ret); ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer2, CL_TRUE, 0, matrixd_size(cl->mat2) * sizeof(double), cl->mat2->buf, 0, NULL, NULL); CHECK_CL_ERROR(ret); return ret; } cl_int opencl_init_copy_host_buffers_mu8(cl_struct *cl, Matrixu8 *mat1, Matrixd *kernel) { cl_int ret; cl->matu8 = mat1; cl->kernel_m = kernel; cl_image_format format = {.image_channel_order = CL_RGBA, .image_channel_data_type = CL_UNSIGNED_INT8}; cl_image_desc desc; memset(&desc, 0, sizeof(cl_image_desc)); desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = cl->matu8->w; desc.image_height = cl->matu8->h; cl->cl_buffer1 = clCreateImage(cl->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format, &desc, cl->matu8->buf, &ret); CHECK_CL_ERROR(ret); cl->cl_buffer2 = clCreateBuffer(cl->context, CL_MEM_READ_ONLY, matrixd_size(cl->kernel_m) * sizeof(double), NULL, &ret); CHECK_CL_ERROR(ret); cl->cl_result = clCreateImage(cl->context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &ret); CHECK_CL_ERROR(ret); cl->sampler = clCreateSamplerWithProperties(cl->context, NULL, &ret); CHECK_CL_ERROR(ret); // copy host memory to device memory ret = clEnqueueWriteBuffer(cl->command_queue, cl->cl_buffer2, CL_TRUE, 0, matrixd_size(cl->kernel_m) * sizeof(double), cl->kernel_m->buf, 0, NULL, NULL); CHECK_CL_ERROR(ret); return ret; } cl_int opencl_execute_image(cl_struct* cl) { cl_int ret; ret = clSetKernelArg(cl->kernel, 0, sizeof(cl_mem), (void *)&(cl->sampler)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 1, sizeof(cl_mem), (void *)&(cl->cl_buffer1)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 2, sizeof(cl_mem), (void *)&(cl->cl_buffer2)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 3, sizeof(cl_mem), (void *)&(cl->cl_result)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 4, sizeof(int32_t), (void *)&cl->matu8->w); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 5, sizeof(int32_t), (void *)&cl->matu8->h); CHECK_CL_ERROR(ret); int k = cl->kernel_m->w-1/2; ret = clSetKernelArg(cl->kernel, 6, sizeof(int32_t), (void *)&k); CHECK_CL_ERROR(ret); size_t mat1_size = cl->matu8->w * cl->matu8->h; size_t mat2_size = matrixd_size(cl->kernel_m); size_t local_size = 16; size_t local_items_size[2] = {local_size, local_size}; size_t global_items_size[2]; size_t work_groups1 = (cl->matu8->w + local_size + 1) / local_size; size_t work_groups2 = (cl->matu8->h + local_size + 1) / local_size; global_items_size[0] = work_groups1 * local_size; global_items_size[1] = work_groups2 * local_size; ret = clEnqueueNDRangeKernel(cl->command_queue, cl->kernel, 2, NULL, global_items_size, local_items_size, 0, NULL, NULL); CHECK_CL_ERROR(ret); return ret; } cl_int opencl_execute(cl_struct* cl) { cl_int ret; ret = clSetKernelArg(cl->kernel, 0, sizeof(cl_mem), (void *)&(cl->cl_buffer1)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 1, sizeof(cl_mem), (void *)&(cl->cl_buffer2)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 2, sizeof(cl_mem), (void *)&(cl->cl_result)); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 3, sizeof(uint32_t), (void *)&cl->mat1->w); CHECK_CL_ERROR(ret); ret = clSetKernelArg(cl->kernel, 4, sizeof(uint32_t), (void *)&cl->mat1->h); CHECK_CL_ERROR(ret); int k = cl->mat2->w-1/2; ret = clSetKernelArg(cl->kernel, 5, sizeof(uint32_t), (void *)&k); CHECK_CL_ERROR(ret); size_t mat1_size = matrixd_size(cl->mat1); size_t mat2_size = matrixd_size(cl->mat2); size_t local_size = 16; size_t local_items_size[2] = {local_size, local_size}; size_t global_items_size[2]; size_t work_groups1 = (cl->mat1->w + local_size + 1) / local_size; size_t work_groups2 = (cl->mat1->h + local_size + 1) / local_size; global_items_size[0] = work_groups1 * local_size; global_items_size[1] = work_groups2 * local_size; ret = clEnqueueNDRangeKernel(cl->command_queue, cl->kernel, 2, NULL, global_items_size, local_items_size, 0, NULL, NULL); CHECK_CL_ERROR(ret); return ret; } cl_int opencl_read_result(cl_struct *cl) { cl_int ret; clFinish(cl->command_queue); size_t result_size = matrixd_size(cl->mat1); double *result_buffer = calloc(result_size, sizeof(double)); ret = clEnqueueReadBuffer(cl->command_queue, cl->cl_result, CL_TRUE, 0, result_size * sizeof(double), result_buffer, 0, NULL, NULL); CHECK_CL_ERROR(ret); for (size_t i = 0; i < result_size; i+=100) { printf("%ld: %lf\n", i, result_buffer[i]); } free(result_buffer); return ret; } cl_int opencl_read_result_image(cl_struct* cl) { cl_int ret; clFinish(cl->command_queue); size_t result_size = matrixu8_size(cl->matu8) * 4; uint8_t *result_buffer = calloc(result_size, 1); size_t origin[] = {0, 0, 0}; size_t region[] = {cl->matu8->w, cl->matu8->h, 1}; ret = clEnqueueReadImage(cl->command_queue, cl->cl_result, CL_TRUE, origin, region, 0, 0, result_buffer, 0, NULL, NULL); CHECK_CL_ERROR(ret); stbi_write_bmp("image_blurred.bmp", cl->matu8->w, cl->matu8->h, 4, result_buffer); free(result_buffer); return ret; }