c - Parallelizing a for loop (1D Naive Convolution) in CUDA


Can you please help me convert this nested for loop into a CUDA kernel? Here is the function I am trying to convert into a CUDA kernel:

// convolution on host void conv(int* a, int* b, int* out) {      (int = 0; < n; ++i)         (int j = 0; j < n; ++j)             out[i + j] += a[i] * b[j]; } 

I have tried hard to parallelize the code.
Here is my attempt:

// The asker's attempt (shown as posted, with the text-extraction damage repaired:
// restored CUDA identifier casing, the dropped `for` keyword, and the missing
// loop/index variable). NOTE(review): this kernel is NOT correct, as discussed
// below — in particular, the shared array temp[n] is too small for the index
// i+j, which ranges up to 2*(n-1), so the write is out of bounds.
__global__ void conv_kernel(int* a, int* b, int* out) {
    int i = blockIdx.x;   // block index selects a[i]
    int j = threadIdx.x;  // thread index selects b[j] (limited to 512/1024 threads)

    __shared__ int temp[n];  // BUG: must hold (2*n)-1 entries, not n

    __syncthreads();
    temp[i + j] = a[i] * b[j];  // BUG: out-of-bounds for i+j >= n
    __syncthreads();

    int sum = 0;
    for (int k = 0; k < n; k++)
        sum += temp[k];
    out[i + j] = sum;
}

Your CPU conv function appears to be doing this (for n = 4, for example):

a0b0  a0b1  a0b2  a0b3                   +     ^       a1b0  a1b1  a1b2  a1b3             +     n             a2b0  a2b1  a2b2  a2b3       +    rows                   a3b0  a3b1  a3b2  a3b3 =     v ------------------------------------------ out0  out1  out2  out3  out4  out5  out6     <-  (2*n)-1 columns -> 

Your convolution is (to me) distinguished by the fact that it is convolving two signals of equal length. Since the GPU likes to work on "large" problems, this implies n should be large. One immediate problem with your conv_kernel realization is that it implies the block dimension is used to index into a, and the thread dimension is used to index into b. But the thread dimension (threadIdx.x) is limited to 512 or 1024 for current CUDA GPUs. That would relegate us to solving only pretty small problems.

There are various other problems with your realization. One problem is that the shared memory size allocated is not enough to fit the i+j range (which can go from 0 to 2*(n-1)). That is trivial to fix, of course, but the more serious issue is that I don't see a way to map your arithmetic onto anything resembling the desired pattern above. After spending a little while thinking about your kernel, I discarded it.

The convolution problem has a great deal of research associated with it, and it can be optimized in various ways for massively parallel architectures such as the GPU. Therefore I will focus on two simple realizations that suggest themselves based on the diagram above.

The first realization simply re-creates the diagram above. We create an intermediate temp array to store all the individual a(x)b(y) products, calculating and storing these products in conv_kernel. We then launch a second kernel (sum_kernel) that sums the columns of the temp array to produce the various out values. The first kernel requires n threads, which successively calculate each row of the diagram above, in a slanting fashion, as we iterate through n for-loop iterations, one per row. The second kernel requires (2*n)-1 threads, one for each column/out value.

My second realization (conv_kernel2) dispenses with the need for a temp array: it assigns one thread to each column/out value, and iterates through the n rows, computing the necessary products row-by-row and summing those products "on-the-fly". The sum result is then stored directly in the out array.

Considering only the calculations, and not the time required for data movement/initialization, the GPU realizations begin to be faster than the naive single-threaded CPU implementation at around n=512 on a K20x GPU, which is what I happened to be using. The second realization is also commended by the fact that the only data movement required is for a, b, and the result. The first realization additionally requires that the temp array be allocated and initialized to all zeroes. Since the size of the temp array is proportional to n*n, the second realization also has the benefit that it does not require this temporary storage.

Here's a worked test case, running and timing the CPU realization you provided plus the two different GPU realizations I created:

$ cat t617.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

#define n 4096
#define rg 10
#define usecpsec 1000000ULL
#define ntpb 256

// naive single-threaded CPU convolution (reference implementation);
// out must hold (2*n)-1 values and be zero-initialized by the caller
void conv(int* a, int* b, int* out) {

    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            out[i + j] += a[i] * b[j];
}

// microsecond timer: call with 0 to get a timestamp, or with a previous
// timestamp to get the elapsed time since that timestamp
unsigned long long dtime_usec(unsigned long long prev){
  timeval tv1;
  gettimeofday(&tv1,0);
  return ((tv1.tv_sec * usecpsec)+tv1.tv_usec) - prev;
}

// realization 1, kernel 1: one thread per b value; each thread walks the
// n rows of the product matrix, storing its product a[i]*b[idx] into
// row i, column idx+i of the (2*n)-wide temp matrix
__global__ void conv_kernel(int* a, int *b, int* temp) {

    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < n){
      int my_b = b[idx];
      for (int i = 0; i < n; i++)
        temp[idx + (i*2*n) + i] = my_b * a[i];
      }
}

// realization 1, kernel 2: one thread per out value; each thread sums
// its column of the temp matrix
__global__ void sum_kernel(int *temp, int *out){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (2*n)-1){
      int my_sum = 0;
      for (int i = 0; i < n; i++) my_sum += temp[idx + (i*2*n)];
      out[idx] = my_sum;}
}

// realization 2: one thread per out value; products are computed and
// summed on-the-fly, so no temp storage is needed
__global__ void conv_kernel2(int *a, int *b, int *out){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (2*n)-1){
      int my_sum = 0;
      for (int i = 0; i < n; i++)
        if (((idx < n) && (i <= idx)) || ((idx >= n) && (i > (idx-n)))) my_sum += a[i]*b[idx-i];
      out[idx] = my_sum;
    }
}

int main(){

  int *h_a, *d_a, *h_result, *d_result, *result, *h_b, *d_b, *a, *b, *d_temp;

  b   = (int *)malloc(n*sizeof(int));
  a   = (int *)malloc(n*sizeof(int));
  h_a = (int *)malloc(n*sizeof(int));
  h_b = (int *)malloc(n*sizeof(int));
  h_result = (int *)malloc(2*n*sizeof(int));
  result   = (int *)malloc(2*n*sizeof(int));

  cudaMalloc(&d_b, n*sizeof(int));
  cudaMalloc(&d_a, n*sizeof(int));
  cudaMalloc(&d_result, 2*n*sizeof(int));
  cudaMalloc(&d_temp, 2*n*n*sizeof(int));

  for (int i=0; i < n; i++){
    a[i] = rand()%rg;
    b[i] = rand()%rg;
    h_a[i] = a[i];
    h_b[i] = b[i];}

  for (int i=0; i < 2*n; i++){
    result[i]   = 0;
    h_result[i] = 0;}

  unsigned long long cpu_time = dtime_usec(0);
  conv(a, b, result);
  cpu_time = dtime_usec(cpu_time);

  cudaMemcpy(d_a, h_a, n*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, n*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_result, 0, 2*n*sizeof(int));
  cudaMemset(d_temp, 0, 2*n*n*sizeof(int));

  unsigned long long gpu_time = dtime_usec(0);
  conv_kernel<<<(n+ntpb-1)/ntpb,ntpb>>>(d_a, d_b, d_temp);
  // fix: grid must cover (2*n)-1 threads; (2*(n-1)) could fall one short
  sum_kernel<<<((2*n-1)+ntpb-1)/ntpb, ntpb>>>(d_temp, d_result);
  cudaDeviceSynchronize();
  gpu_time = dtime_usec(gpu_time);

  cudaMemcpy(h_result, d_result, 2*n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2*n; i++) if (result[i] != h_result[i]) {printf("mismatch at %d, cpu: %d, gpu %d\n", i, result[i], h_result[i]); return 1;}
  // fix: %llu for unsigned long long (%ld is the wrong conversion)
  printf("finished.  results match.  cpu time: %lluus, gpu  time: %lluus\n", cpu_time, gpu_time);

  cudaMemset(d_result, 0, 2*n*sizeof(int)); // for error checking; kernel2 requires no initialization of the result

  gpu_time = dtime_usec(0);
  conv_kernel2<<<((2*n-1)+ntpb-1)/ntpb,ntpb>>>(d_a, d_b, d_result);
  cudaDeviceSynchronize();
  gpu_time = dtime_usec(gpu_time);

  cudaMemcpy(h_result, d_result, 2*n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2*n; i++) if (result[i] != h_result[i]) {printf("mismatch2 at %d, cpu: %d, gpu %d\n", i, result[i], h_result[i]); return 1;}
  printf("finished.  results match.  cpu time: %lluus, gpu2 time: %lluus\n", cpu_time, gpu_time);
  return 0;
}
$ nvcc -arch=sm_35 -o t617 t617.cu
$ ./t617
finished.  results match.  cpu time: 69059us, gpu  time: 3204us
finished.  results match.  cpu time: 69059us, gpu2 time: 1883us
$ nvcc -arch=sm_35 -O3 -o t617 t617.cu
$ ./t617
finished.  results match.  cpu time: 13750us, gpu  time: 3214us
finished.  results match.  cpu time: 13750us, gpu2 time: 1886us
$

(Note that using the -O3 parameter makes a significant difference in the CPU code execution time.)

As mentioned, I consider both of my examples to be quite "naive" as GPU code goes (neither uses shared memory, for example), but they may give you some ideas for how to get started.

For brevity of presentation, I have dispensed with CUDA error checking. However, any time you are having trouble with CUDA code, I suggest you perform proper CUDA error checking. In the case of your conv_kernel, I believe it would have indicated some errors (if you had tried to run it). As a quick test, you can also run any CUDA code with cuda-memcheck to see if any API errors are occurring.

EDIT: I tried a simple shared memory version of conv_kernel2 but it wasn't any faster. I believe the reason is that these data sets (at n=4096, a and b are 16Kbytes each, out is approximately 32Kbytes) are small enough to fit in the GPU L2 cache, with no thrashing.

However, on newer architectures (cc 3.5 and newer) the CUDA compiler can make additional optimizations if the read-only input data is identified as such to the kernel. Therefore, if we change the conv_kernel2 definition to:

__global__ void conv_kernel2(const int * __restrict__ a, const int * __restrict__ b, int *out){ 

then we witness improved execution times, in my case:

$ ./t617 finished.  results match.  cpu time: 13792us, gpu  time: 3209us finished.  results match.  cpu time: 13792us, gpu2 time: 1626us $ 

I created a modified version of the code which does the following:

  1. n is specified on the command line
  2. only the CPU conv and the GPU conv_kernel2 are included.
  3. the time cost to move the data to/from the GPU is included in the GPU timing measurement
  4. a typedef ... mytype; is provided so the code can be re-compiled to test behavior with various datatypes.
  5. a "speedup factor" is printed out, which is the CPU time divided by the GPU time.

Modified code:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

// rg*rg*maxn must fit within mytype

#define maxn 100000
#define rg 10
#define usecpsec 1000000ULL
#define ntpb 256

typedef double mytype;

// naive single-threaded CPU convolution (reference implementation);
// out must hold (2*n)-1 values and be zero-initialized by the caller
void conv(const mytype *a, const mytype *b, mytype* out, int n) {

    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            out[i + j] += a[i] * b[j];
}

// microsecond timer: call with 0 to get a timestamp, or with a previous
// timestamp to get the elapsed time since that timestamp
unsigned long long dtime_usec(unsigned long long prev){
  timeval tv1;
  gettimeofday(&tv1,0);
  return ((tv1.tv_sec * usecpsec)+tv1.tv_usec) - prev;
}

// one thread per out value; const __restrict__ on the inputs lets the
// compiler apply read-only-data optimizations on cc 3.5+ devices
__global__ void conv_kernel2(const mytype * __restrict__ a, const mytype * __restrict__ b, mytype *out, const int n){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (2*n)-1){
      mytype my_sum = 0;
      for (int i = 0; i < n; i++)
        if (((idx < n) && (i <= idx)) || ((idx >= n) && (i > (idx-n)))) my_sum += a[i]*b[idx-i];
      out[idx] = my_sum;
    }
}

int main(int argc, char *argv[]){

  mytype *h_a, *d_a, *h_result, *d_result, *result, *h_b, *d_b, *a, *b;
  if (argc != 2) {printf("must specify n on the command line\n"); return 1;}
  int my_n = atoi(argv[1]);
  if ((my_n < 1) || (my_n > maxn)) {printf("n out of range\n"); return 1;}
  b   = (mytype *)malloc(my_n*sizeof(mytype));
  a   = (mytype *)malloc(my_n*sizeof(mytype));
  h_a = (mytype *)malloc(my_n*sizeof(mytype));
  h_b = (mytype *)malloc(my_n*sizeof(mytype));
  h_result = (mytype *)malloc(2*my_n*sizeof(mytype));
  result   = (mytype *)malloc(2*my_n*sizeof(mytype));

  cudaMalloc(&d_b, my_n*sizeof(mytype));
  cudaMalloc(&d_a, my_n*sizeof(mytype));
  cudaMalloc(&d_result, 2*my_n*sizeof(mytype));

  for (int i=0; i < my_n; i++){
    a[i] = rand()%rg;
    b[i] = rand()%rg;
    h_a[i] = a[i];
    h_b[i] = b[i];}

  for (int i=0; i < 2*my_n; i++){
    result[i]   = 0;
    h_result[i] = 0;}

  unsigned long long cpu_time = dtime_usec(0);
  conv(a, b, result, my_n);
  cpu_time = dtime_usec(cpu_time);

  cudaMemset(d_result, 0, 2*my_n*sizeof(mytype));

  // GPU timing deliberately includes host<->device data movement
  unsigned long long gpu_time = dtime_usec(0);
  cudaMemcpy(d_a, h_a, my_n*sizeof(mytype), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, my_n*sizeof(mytype), cudaMemcpyHostToDevice);
  // fix: grid must cover (2*n)-1 threads; (2*(my_n-1)) can fall one thread
  // short when 2*my_n-2 is an exact multiple of ntpb (e.g. my_n=129, ntpb=256)
  conv_kernel2<<<((2*my_n-1)+ntpb-1)/ntpb,ntpb>>>(d_a, d_b, d_result, my_n);
  cudaDeviceSynchronize();
  cudaMemcpy(h_result, d_result, 2*my_n*sizeof(mytype), cudaMemcpyDeviceToHost);
  gpu_time = dtime_usec(gpu_time);

  // fix: mytype is double here, so print with %f (not %d, which is UB);
  // the (double) casts keep this line correct for integer mytype too
  for (int i = 0; i < 2*my_n; i++) if (result[i] != h_result[i]) {printf("mismatch2 at %d, cpu: %f, gpu %f\n", i, (double)result[i], (double)h_result[i]); return 1;}
  // fix: %llu for unsigned long long (%ld is the wrong conversion)
  printf("finished.  results match.  cpu time: %lluus, gpu time: %lluus\n", cpu_time, gpu_time);
  printf("cpu/gpu = %f\n", cpu_time/(float)gpu_time);
  return 0;
}

Comments

Popular posts from this blog

c++ - OpenMP unpredictable overhead -

ruby on rails - RuntimeError: Circular dependency detected while autoloading constant - ActiveAdmin.register Role -

javascript - Wordpress slider, not displayed 100% width -