c - Using CUDA thrust::max_element to find the max element in an array sometimes returns an incorrect result -


I have a 2^20-element array that is filled on the device; these numbers should be the same every time. I then move the array to the host and search for the max element in the array. This technique works for a 2^10-element array, but once I begin going larger I begin to get random answers, and I am not sure whether Thrust is messing up or the device calculations are.

The answer that max_element should return is 0.094479. The first time the program is run, the code outputs the correct answer; after that, the correct answer only shows up randomly every few runs.

The GPU is a Tesla K20 running 5.0; I also tested this on a GTX 780, with the same issue both times.

// host code

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Launches kernel<<<blocks, threads>>> on a device buffer of `total` floats,
 * times the launch with CUDA events, copies the buffer back to the host,
 * prints every element, and prints the maximum found by thrust::max_element
 * together with the launch configuration.
 *
 * Returns 0 on success and 1 if any CUDA call failed. On failure nothing is
 * printed from the host buffer, so a failed launch or copy can never be
 * mistaken for a computed result.
 */
int main(void)
{
    // Parenthesised because the macro may expand to a bare product.
    const int n = (total);
    const size_t bytes = sizeof(float) * (size_t)n;

    // Heap allocation: at the 2^20 elements mentioned in the post a local
    // array would be 4 MB, which does not fit on every default stack.
    // The trailing () zero-initialises the buffer.
    float *h_c = new float[n]();
    float *d_c = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float mil = 0.0f;

    bool ok = cudaOk(cudaMalloc((void **)&d_c, bytes), "cudaMalloc")
           && cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")
           && cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");

    if (ok) {
        kernel<<<blocks, threads>>>(d_c);

        // A launch does not return a status itself; a bad configuration is
        // only visible through cudaGetLastError(), and faults inside the
        // kernel surface at the following synchronising calls.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)")
          && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
          && cudaOk(cudaEventElapsedTime(&mil, start, stop), "cudaEventElapsedTime")
          && cudaOk(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
    }

    if (ok) {
        for (int y = 0; y < n; y++) {
            printf(" %d: host c: %f \n", y, h_c[y]);
        }

        // Raw host pointers, so the search runs on the host.
        float *result = thrust::max_element(h_c, h_c + n);
        printf("max is: %f \n", *result);

        printf("time:  %f \n", mil / 1000);
        printf("threads:  %d \n", threads);
        printf("blocks:  %d \n", blocks);
        printf("total:  %d \n", n);
    }

    // Release everything that was actually created, on success and failure.
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    cudaFree(d_c);
    delete[] h_c;
    cudaDeviceReset();

    return ok ? 0 : 1;
}

device code

#include <thrust/extrema.h>
#include <math.h>
#include <stdio.h>

#define arraysize 15
#define threads 1024
#define blocks 32
// Parenthesised so the product survives being pasted into a larger expression.
#define total (threads * blocks)

/*
 * kernel: one thread per subset of the arraysize index positions.
 *
 * The global thread id is read as a bitmask: bit x set means index x of the
 * a/b tables takes part. Over the selected indices the thread computes
 *
 *     acosf( dot(a, b) / (sqrtf(sum of a^2) * sqrtf(sum of b^2)) )
 *
 * and stores it in cc[thread id]. If that value is NaN (thread 0 selects
 * nothing and evaluates 0/0, for example) it stores -2 instead.
 *
 * Launch layout: 1-D grid of 1-D blocks, kernel<<<blocks, threads>>>.
 * cc must point to at least `total` floats; threads whose id is >= total
 * return without writing. No shared memory is used.
 */
__global__ void kernel(float *cc)
{
    const int threadid = threadIdx.x + blockDim.x * blockIdx.x;

    // cc holds `total` elements; never write past it, whatever the launch is.
    if (threadid >= total) {
        return;
    }

    const int a[arraysize] = {1192, 1315, 1462, 1484, 1476, 1443, 1508, 1489,
                              1470, 1573, 1633, 1539, 1600, 1707, 1701};
                              //, 1682, 1688, 1681, 1694, 1728};
    const int b[arraysize] = {1162, 1337, 1282, 1491, 1508, 1517, 1488, 1513,
                              1539, 1576, 1626, 1634, 1573, 1786, 1741};
                              //, 1782, 1755, 1669, 1700, 1826};

    // binary digits of threadid, least significant bit at index 0
    int binarynumber[arraysize];
    // per-index products a[x] * b[x] for the selected positions
    int c[arraysize];
    // selected entries of a and b (0 where the position is not selected)
    int asumarr[arraysize];
    int bsumarr[arraysize];

    for (int i = 0; i < arraysize; i++) {
        c[i] = 0;
        asumarr[i] = 0;
        bsumarr[i] = 0;
        // The conversion loop below only writes digits up to the highest set
        // bit of threadid. Without this, the higher entries are read
        // uninitialised and the result varies from run to run.
        binarynumber[i] = 0;
    }

    // Convert threadid to binary. The bound on i keeps the writes inside
    // binarynumber even if an id needs more than arraysize bits.
    int quotient = threadid;
    int i = 0;
    while (quotient != 0 && i < arraysize) {
        binarynumber[i++] = quotient % 2;
        quotient = quotient / 2;
    }

    // For every selected position record the product and the two operands.
    for (int x = arraysize - 1; x >= 0; x--) {
        if (binarynumber[x] == 1) {
            c[x] = a[x] * b[x];
            asumarr[x] = a[x];
            bsumarr[x] = b[x];
        }
    }

    // dot product of the selected entries
    int dotsum = 0;
    // sums of squares of the selected entries of a and of b
    int asum = 0;
    int bsum = 0;

    for (int j = 0; j < arraysize; ++j) {
        dotsum += c[j];
        // NOTE(review): asum/bsum are int, so each += adds in float and
        // converts back to int. Left as in the original so the numeric
        // results are unchanged; integer squares would be exact.
        asum += powf(asumarr[j], 2);
        bsum += powf(bsumarr[j], 2);
    }

    float sqsum1 = sqrtf(asum);
    float sqsum2 = sqrtf(bsum);
    float sqsum = sqsum1 * sqsum2;

    float div = dotsum / sqsum;
    float finalvalue = acosf(div);

    // x == x is false only for NaN; store the -2 sentinel in that case.
    if (finalvalue == finalvalue) {
        cc[threadid] = finalvalue;
    } else {
        cc[threadid] = -2;
    }
}

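It seems to be a case of using improperly initialized/uninitialized variables: binarynumber is only written for the digits produced by the conversion loop, so for most thread ids its higher entries are read without ever having been set.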

After I added the following line:

// Zero every per-thread scratch array before use.
for (int i = 0; i < arraysize; i++) {
    c[i] = 0;
    asumarr[i] = 0;
    bsumarr[i] = 0;
    binarynumber[i] = 0; // add this line
}

I was no longer able to reproduce the issue.


Comments

Popular posts from this blog

c++ - OpenMP unpredictable overhead -

ruby on rails - RuntimeError: Circular dependency detected while autoloading constant - ActiveAdmin.register Role -

javascript - Wordpress slider, not displayed 100% width -