/*
 * Windowed weighted sum over a 2D grid, with edge clamping.
 *
 * Each work-item (x, y) computes
 *
 *   result[x * mat1_h + y] =
 *       sum over kx, ky in [0, 2*k] of
 *           buf1[cx * mat1_h + cy] * buf2[kx * (2*k + 1) + ky]
 *
 * where cx = clamp(x + kx - k, 0, mat1_w - 1) and
 *       cy = clamp(y + ky - k, 0, mat1_h - 1),
 * i.e. samples that fall outside the grid reuse the nearest edge element.
 *
 * Parameters:
 *   buf1    input grid, indexed as x * mat1_h + y (x in [0, mat1_w),
 *           y in [0, mat1_h)).
 *   buf2    window coefficients, indexed as kx * (2*k + 1) + ky.
 *           NOTE(review): presumably holds (2*k + 1) * (2*k + 1) elements;
 *           confirm against the host-side allocation.
 *   result  output grid, same indexing as buf1.
 *   mat1_w  grid extent along global dimension 0.
 *   mat1_h  grid extent along global dimension 1.
 *   k       window radius; the window is (2*k + 1) elements per side.
 *
 * Work-items whose (x, y) lies outside the grid return without writing,
 * so the global work size may be padded in either dimension.
 */
__kernel void template(__global double *buf1, __global double *buf2,
                       __global double *result, int mat1_w, int mat1_h, int k)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    /*
     * Check each coordinate on its own: a test on the flattened index alone
     * would let a work-item with y >= mat1_h (and small x) alias a different,
     * valid cell and overwrite its result.
     */
    if (x >= mat1_w || y >= mat1_h) {
        return;
    }

    int window = 2 * k + 1; /* window side length, loop-invariant */
    double sum = 0.0;

    for (int kx = 0; kx < window; kx++) {
        /* Clamp the sampled x coordinate to the grid. */
        int src_x = min(max(x + kx - k, 0), mat1_w - 1);
        int src_row = src_x * mat1_h;
        /* Parenthesised stride: kx * (2*k + 1), not (kx * 2 * k) + 1. */
        int coef_row = kx * window;

        for (int ky = 0; ky < window; ky++) {
            /* Clamp the sampled y coordinate to the grid. */
            int src_y = min(max(y + ky - k, 0), mat1_h - 1);

            sum += buf1[src_row + src_y] * buf2[coef_row + ky];
        }
    }

    result[x * mat1_h + y] = sum;
}