Parallelizing a for loop (1D Naive Convolution) in CUDA
Can you please help me convert this nested loop into a CUDA kernel? Here is the function I am trying to convert into a CUDA kernel:
// convolution on the host
void conv(int* a, int* b, int* out) {
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            out[i + j] += a[i] * b[j];
}
I have tried hard to parallelize this code.
Here is my attempt:
__global__ void conv_kernel(int* a, int* b, int* out) {
    int i = blockIdx.x;
    int j = threadIdx.x;

    __shared__ int temp[n];

    __syncthreads();
    temp[i + j] = a[i] * b[j];
    __syncthreads();

    int sum = 0;
    for (int k = 0; k < n; k++)
        sum += temp[k];
    out[i + j] = sum;
}
Your CPU conv function appears to be doing this (for n = 4, for example):
a0b0    a0b1    a0b2    a0b3                            +     ^
        a1b0    a1b1    a1b2    a1b3                    +     n
                a2b0    a2b1    a2b2    a2b3            +    rows
                        a3b0    a3b1    a3b2    a3b3    =     v
------------------------------------------------------------
out0    out1    out2    out3    out4    out5    out6
        <-          (2*n)-1 columns            ->
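In other words (my own restatement of the diagram, not part of the original question), each out value is the sum of one column above, e.g. for n = 4:

    out[0] = a[0]*b[0]
    out[1] = a[0]*b[1] + a[1]*b[0]
    out[2] = a[0]*b[2] + a[1]*b[1] + a[2]*b[0]
    out[3] = a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]
    out[4] =             a[1]*b[3] + a[2]*b[2] + a[3]*b[1]
    out[5] =                         a[2]*b[3] + a[3]*b[2]
    out[6] =                                     a[3]*b[3]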
Your convolution is (to me) distinguished by the fact that it is convolving 2 signals of equal length. Since the GPU likes to work on "large" problems, this implies n should be large. However, one immediate problem with your conv_kernel realization is that it implies that the block dimension will be used to index into a, and the thread dimension will be used to index into b. But the thread dimension (threadIdx.x) is limited to 512 or 1024 for current CUDA GPUs. That will relegate us to solving pretty small problems only.
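For instance (my inference; the launch configuration is not shown in the question), a kernel indexed this way would presumably have to be launched as something like conv_kernel<<<n, n>>>(d_a, d_b, d_out), which caps n at 1024 on current hardware.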
There are various other problems with your realization. One problem is that the shared memory size allocated is not enough to fit the i+j range (which can go from 0 to 2*(n-1)). That is trivial to fix of course, but the more serious issue is that I don't see a way to map your arithmetic onto anything resembling the desired pattern above. After spending a little while thinking about your kernel, I discarded it.
The convolution problem has a great deal of research associated with it, and it can be optimized in various ways for massively parallel architectures like the GPU. Therefore I will focus on two simple realizations which I can suggest, based on the diagram above.
The first realization simply re-creates the diagram above: we create an intermediate temp array to store all the individual axby products, calculating and storing those products in conv_kernel. We then launch a second kernel (sum_kernel) which simply sums columns of the temp array to produce the various out values. The first kernel requires n threads, which successively calculate each row of the diagram above, in a slanting fashion, as we iterate through n for-loop iterations, one per row. The second kernel requires (2*n)-1 threads, one for each column/out value.
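Concretely (this is just my restatement of the indexing used in conv_kernel and sum_kernel in the listing further below), temp can be viewed as an n x (2*n) row-major matrix, one row of the diagram per for-loop iteration:

    // conv_kernel: thread idx handles b[idx]; its product for row i lands in column idx+i
    temp[(i * 2*n) + (idx + i)] = a[i] * b[idx];   // same as temp[idx + (i*2*n) + i]

    // sum_kernel: thread idx adds straight down column idx
    int my_sum = 0;
    for (int i = 0; i < n; i++)
        my_sum += temp[(i * 2*n) + idx];
    out[idx] = my_sum;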
My second realization (conv_kernel2) dispenses with the need for a temp array altogether, and instead assigns one thread to each column/out value. Each thread iterates through the n rows, computing the necessary products row-by-row and summing those products "on-the-fly". The sum result is then stored directly in the out array.
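Written out explicitly (my restatement of the bounds test that appears in conv_kernel2 below), for each output index idx only the in-range products contribute:

    // pseudocode for one thread of conv_kernel2
    // the i range keeps both a[i] and b[idx-i] in bounds
    int my_sum = 0;
    for (int i = max(0, idx - (n-1)); i <= min(idx, n-1); i++)
        my_sum += a[i] * b[idx - i];
    out[idx] = my_sum;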
Considering only the calculations, and not the time required for data movement/initialization, the GPU realizations begin to be faster than the naive single-threaded CPU implementation at around n=512, on the K20X GPU I happened to be using. The second realization is also commended by the fact that the only data movement required is for a, b, and the result. The first realization additionally requires that the temp array be allocated and initialized to all zeroes. Since the size of the temp array is proportional to n*n, the second realization also has the benefit that it does not require this temporary storage.
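To make that n*n scaling concrete: at n=4096 with int data, the temp array is 2*4096*4096*4 bytes, i.e. 128Mbytes, compared with only 16Kbytes each for a and b.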
Here's a fully worked test case, running and timing the CPU realization you provided plus the two different GPU realizations that I created:
$ cat t617.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

#define n 4096
#define rg 10
#define usecpsec 1000000ULL
#define ntpb 256

// convolution on the host
void conv(int* a, int* b, int* out) {
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            out[i + j] += a[i] * b[j];
}

unsigned long long dtime_usec(unsigned long long prev){
  timeval tv1;
  gettimeofday(&tv1,0);
  return ((tv1.tv_sec * usecpsec)+tv1.tv_usec) - prev;
}

// first realization, step 1: each thread handles one b value and
// writes its products into one "slanted" row of the temp array
__global__ void conv_kernel(int* a, int *b, int* temp) {
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < n){
      int my_b = b[idx];
      for (int i = 0; i < n; i++)
        temp[idx + (i*2*n) + i] = my_b * a[i];
      }
}

// first realization, step 2: each thread sums one column of temp
__global__ void sum_kernel(int *temp, int *out){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (2*n)-1){
      int my_sum = 0;
      for (int i = 0; i < n; i++) my_sum += temp[idx + (i*2*n)];
      out[idx] = my_sum;}
}

// second realization: one thread per out value, products summed on-the-fly
__global__ void conv_kernel2(int *a, int *b, int *out){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (2*n)-1){
      int my_sum = 0;
      for (int i = 0; i < n; i++)
        if (((idx < n) && (i <= idx)) || ((idx >= n) && (i > (idx-n)))) my_sum += a[i]*b[idx-i];
      out[idx] = my_sum;
    }
}

int main(){

  int *h_a, *d_a, *h_result, *d_result, *result, *h_b, *d_b, *a, *b, *d_temp;

  b        = (int *)malloc(n*sizeof(int));
  a        = (int *)malloc(n*sizeof(int));
  h_a      = (int *)malloc(n*sizeof(int));
  h_b      = (int *)malloc(n*sizeof(int));
  h_result = (int *)malloc(2*n*sizeof(int));
  result   = (int *)malloc(2*n*sizeof(int));

  cudaMalloc(&d_b, n*sizeof(int));
  cudaMalloc(&d_a, n*sizeof(int));
  cudaMalloc(&d_result, 2*n*sizeof(int));
  cudaMalloc(&d_temp, 2*n*n*sizeof(int));

  for (int i=0; i < n; i++){
    a[i] = rand()%rg;
    b[i] = rand()%rg;
    h_a[i] = a[i];
    h_b[i] = b[i];}

  for (int i=0; i < 2*n; i++){
    result[i] = 0;
    h_result[i] = 0;}

  unsigned long long cpu_time = dtime_usec(0);
  conv(a, b, result);
  cpu_time = dtime_usec(cpu_time);

  cudaMemcpy(d_a, h_a, n*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, n*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_result, 0, 2*n*sizeof(int));
  cudaMemset(d_temp, 0, 2*n*n*sizeof(int));

  unsigned long long gpu_time = dtime_usec(0);
  conv_kernel<<<(n+ntpb-1)/ntpb,ntpb>>>(d_a, d_b, d_temp);
  sum_kernel<<<((2*(n-1))+ntpb-1)/ntpb, ntpb>>>(d_temp, d_result);
  cudaDeviceSynchronize();
  gpu_time = dtime_usec(gpu_time);

  cudaMemcpy(h_result, d_result, 2*n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2*n; i++)
    if (result[i] != h_result[i]) {printf("mismatch at %d, cpu: %d, gpu %d\n", i, result[i], h_result[i]); return 1;}
  printf("Finished.  Results match.  cpu time: %lluus, gpu time: %lluus\n", cpu_time, gpu_time);

  cudaMemset(d_result, 0, 2*n*sizeof(int)); // just for error checking; conv_kernel2 requires no initialization of the result

  gpu_time = dtime_usec(0);
  conv_kernel2<<<((2*(n-1))+ntpb-1)/ntpb,ntpb>>>(d_a, d_b, d_result);
  cudaDeviceSynchronize();
  gpu_time = dtime_usec(gpu_time);

  cudaMemcpy(h_result, d_result, 2*n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2*n; i++)
    if (result[i] != h_result[i]) {printf("mismatch2 at %d, cpu: %d, gpu %d\n", i, result[i], h_result[i]); return 1;}
  printf("Finished.  Results match.  cpu time: %lluus, gpu2 time: %lluus\n", cpu_time, gpu_time);
  return 0;
}
$ nvcc -arch=sm_35 -o t617 t617.cu
$ ./t617
Finished.  Results match.  cpu time: 69059us, gpu time: 3204us
Finished.  Results match.  cpu time: 69059us, gpu2 time: 1883us
$ nvcc -arch=sm_35 -O3 -o t617 t617.cu
$ ./t617
Finished.  Results match.  cpu time: 13750us, gpu time: 3214us
Finished.  Results match.  cpu time: 13750us, gpu2 time: 1886us
$
(Note that using the -O3 parameter makes a significant difference in the CPU code execution time.)
As mentioned, I consider both of my examples to be quite "naive" GPU code (neither uses shared memory, for example), but they may give you some ideas for how to get started.
For brevity of presentation, I have dispensed with CUDA error checking. However, any time you are having trouble with a CUDA code, I suggest you perform proper CUDA error checking. In the case of your conv_kernel, I believe it would have indicated some errors (if you had tried to run it). As a quick test, you can also run any CUDA code with cuda-memcheck to see if any API errors are occurring.
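If it helps, here is one common error-checking pattern (a sketch added for completeness; it is not actually used in the listings in this answer):

    // check the most recent CUDA API / kernel-launch error and abort on failure
    #define cudaCheckErrors(msg) \
      do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
          fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                  msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
          exit(1); \
        } \
      } while (0)

    // usage, e.g. immediately after a kernel launch:
    //   conv_kernel<<<grid, block>>>(d_a, d_b, d_temp);
    //   cudaCheckErrors("conv_kernel launch failed");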
EDIT: I tried a simple shared memory version of conv_kernel2, but it wasn't any faster. I believe the reason for this is that these data sets (at n=4096, a and b are 16Kbytes each, out is approximately 32Kbytes) are small enough to fit comfortably in the GPU L2 cache, with no thrashing.
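(For scale, a, b, and out together amount to only about 64Kbytes here, whereas the L2 cache on a K20X is on the order of 1.5Mbytes.)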
However, for newer architectures (cc 3.5 and newer), the CUDA compiler can make additional optimizations if the read-only input data is identified as such to the kernel. Therefore, if we change the conv_kernel2 definition to:
__global__ void conv_kernel2(const int * __restrict__ a, const int * __restrict__ b, int *out){
then we will witness slightly improved execution times, in my case:
$ ./t617
Finished.  Results match.  cpu time: 13792us, gpu time: 3209us
Finished.  Results match.  cpu time: 13792us, gpu2 time: 1626us
$
I also created a modified version of the code which has the following features:

- n is specified on the command line
- only the CPU conv and the GPU conv_kernel2 are included
- the time cost to move the data to/from the GPU is included in the GPU timing measurement
- a typedef ... mytype; is provided so that the code can be re-compiled easily to test behavior with various datatypes
- a "speedup factor" is printed out, which is the CPU time divided by the GPU time

Modified code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

// rg*rg*maxn must fit within mytype
#define maxn 100000
#define rg 10
#define usecpsec 1000000ULL
#define ntpb 256

typedef double mytype;

void conv(const mytype *a, const mytype *b, mytype *out, int n) {
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            out[i + j] += a[i] * b[j];
}

unsigned long long dtime_usec(unsigned long long prev){
  timeval tv1;
  gettimeofday(&tv1,0);
  return ((tv1.tv_sec * usecpsec)+tv1.tv_usec) - prev;
}

__global__ void conv_kernel2(const mytype * __restrict__ a, const mytype * __restrict__ b, mytype *out, const int n){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (2*n)-1){
      mytype my_sum = 0;
      for (int i = 0; i < n; i++)
        if (((idx < n) && (i <= idx)) || ((idx >= n) && (i > (idx-n)))) my_sum += a[i]*b[idx-i];
      out[idx] = my_sum;
    }
}

int main(int argc, char *argv[]){

  mytype *h_a, *d_a, *h_result, *d_result, *result, *h_b, *d_b, *a, *b;

  if (argc != 2) {printf("must specify n on the command line\n"); return 1;}
  int my_n = atoi(argv[1]);
  if ((my_n < 1) || (my_n > maxn)) {printf("n out of range\n"); return 1;}

  b        = (mytype *)malloc(my_n*sizeof(mytype));
  a        = (mytype *)malloc(my_n*sizeof(mytype));
  h_a      = (mytype *)malloc(my_n*sizeof(mytype));
  h_b      = (mytype *)malloc(my_n*sizeof(mytype));
  h_result = (mytype *)malloc(2*my_n*sizeof(mytype));
  result   = (mytype *)malloc(2*my_n*sizeof(mytype));

  cudaMalloc(&d_b, my_n*sizeof(mytype));
  cudaMalloc(&d_a, my_n*sizeof(mytype));
  cudaMalloc(&d_result, 2*my_n*sizeof(mytype));

  for (int i=0; i < my_n; i++){
    a[i] = rand()%rg;
    b[i] = rand()%rg;
    h_a[i] = a[i];
    h_b[i] = b[i];}

  for (int i=0; i < 2*my_n; i++){
    result[i] = 0;
    h_result[i] = 0;}

  unsigned long long cpu_time = dtime_usec(0);
  conv(a, b, result, my_n);
  cpu_time = dtime_usec(cpu_time);

  cudaMemset(d_result, 0, 2*my_n*sizeof(mytype));

  // include host<->device data movement in the GPU timing
  unsigned long long gpu_time = dtime_usec(0);
  cudaMemcpy(d_a, h_a, my_n*sizeof(mytype), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, my_n*sizeof(mytype), cudaMemcpyHostToDevice);
  conv_kernel2<<<((2*(my_n-1))+ntpb-1)/ntpb,ntpb>>>(d_a, d_b, d_result, my_n);
  cudaDeviceSynchronize();
  cudaMemcpy(h_result, d_result, 2*my_n*sizeof(mytype), cudaMemcpyDeviceToHost);
  gpu_time = dtime_usec(gpu_time);

  for (int i = 0; i < 2*my_n; i++)
    if (result[i] != h_result[i]) {printf("mismatch2 at %d, cpu: %f, gpu %f\n", i, (double)result[i], (double)h_result[i]); return 1;}
  printf("Finished.  Results match.  cpu time: %lluus, gpu time: %lluus\n", cpu_time, gpu_time);
  printf("cpu/gpu = %f\n", cpu_time/(float)gpu_time);
  return 0;
}
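For reference, it can be built and run the same way as before, with the problem size given on the command line (the file name here is just my placeholder):

    $ nvcc -arch=sm_35 -O3 -o t617_mod t617_mod.cu
    $ ./t617_mod 4096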