C - Using CUDA thrust::max_element to find the max element in an array sometimes returns an incorrect result
I have a 2^20-element array being filled on the device; these numbers should be the same every time. I then move the array to the host and search for the max element in the array. This technique works for a 2^10-element array, but once the array gets larger I begin to get random answers, and I am not sure whether Thrust is messing up or the device calculations are.
The answer max_element should return is 0.094479. The first time the program is run, the code outputs the correct answer, but a wrong answer randomly shows up every few runs.
The GPU is a Tesla K20 running CUDA 5.0; I also tested on a GTX 780, with the same issue both times.
// Host code.
//
// Launches `kernel` with `blocks` x `threads` threads so that every thread
// writes one float into a device buffer of `total` elements, times the launch
// with CUDA events, copies the buffer back to the host, prints every element,
// and finally prints the maximum found by thrust::max_element together with
// the launch configuration.
//
// Returns 0 on success, 1 if any CUDA runtime call (or the launch) fails.

// Evaluates a CUDA runtime call and, on failure, reports it and leaves main().
// Only meant to be used inside main() because it executes `return 1`.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

int main( void )
{
    // Static storage: the buffer is `total` floats, which is too large for the
    // stack once the problem size grows (e.g. 2^20 elements = 4 MB).
    static float h_c[total];

    float *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_c, sizeof(float) * (total)));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));

    // One thread per output element.
    kernel<<<blocks, threads>>>(d_c);
    // A launch does not return a status; bad launch configurations only show
    // up through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop));
    // Waiting on `stop` also surfaces faults raised while the kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float mil = 0;
    CUDA_CHECK(cudaEventElapsedTime(&mil, start, stop));

    CUDA_CHECK(cudaMemcpy(h_c, d_c, sizeof(float) * (total), cudaMemcpyDeviceToHost));

    for (int y = 0; y < (total); y++) {
        printf(" %d: host c: %f \n", y, h_c[y]);
    }

    // Raw host pointers make Thrust run the search on the host.
    float *result = thrust::max_element(h_c, h_c + (total));
    printf("max is: %f \n", *result);

    printf("time: %f \n", mil / 1000);
    printf("threads: %d \n", threads);
    printf("blocks: %d \n", blocks);
    printf("total: %d \n", total);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaDeviceReset());

    return 0;
}
device code
#include <thrust/extrema.h>
#include <math.h>
#include <stdio.h>

#define arraysize 15
#define threads 1024
#define blocks 32
// Parenthesised so the macro expands safely inside larger expressions.
#define total (threads * blocks)

// For every thread, treats the global thread id as a bit mask that selects a
// subset of the positions of the two fixed vectors `a` and `b`, and stores in
// cc[threadid] the angle (acos of the normalised dot product) between the two
// selected sub-vectors. If the result is NaN (e.g. thread 0 selects nothing,
// giving 0/0), -2 is stored instead.
//
// Launch layout: 1D grid of 1D blocks, `blocks` x `threads`; `cc` must hold at
// least `total` floats. No shared memory is used.
__global__ void kernel(float *cc)
{
    // Global thread id; doubles as the subset bit mask.
    const int threadid = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail in case the launch covers more threads than `cc` holds.
    if (threadid >= total) {
        return;
    }

    const int a[arraysize] = {1192, 1315, 1462, 1484, 1476, 1443, 1508, 1489,
                              1470, 1573, 1633, 1539, 1600, 1707, 1701};
    const int b[arraysize] = {1162, 1337, 1282, 1491, 1508, 1517, 1488, 1513,
                              1539, 1576, 1626, 1634, 1573, 1786, 1741};

    // Binary digits of threadid, least significant bit first.
    int binarynumber[arraysize];
    // Per-position products and selected operands (0 where not selected).
    int c[arraysize];
    int asumarr[arraysize];
    int bsumarr[arraysize];

    for (int i = 0; i < arraysize; i++) {
        c[i] = 0;
        asumarr[i] = 0;
        bsumarr[i] = 0;
        // The conversion below only writes as many digits as threadid has
        // significant bits; every higher digit must be a defined 0, otherwise
        // the selection loop reads garbage and the result is nondeterministic.
        binarynumber[i] = 0;
    }

    // Convert threadid to binary. The bound on `i` keeps the writes inside the
    // array even if the launch is enlarged beyond 2^arraysize threads.
    int quotient = threadid;
    int i = 0;
    while (quotient != 0 && i < arraysize) {
        binarynumber[i++] = quotient % 2;
        quotient = quotient / 2;
    }

    // Pick the positions whose bit is set.
    for (int x = arraysize - 1; x >= 0; x--) {
        if (binarynumber[x] == 1) {
            c[x] = a[x] * b[x];
            asumarr[x] = a[x];
            bsumarr[x] = b[x];
        }
    }

    int dotsum = 0;  // dot product of the selected sub-vectors
    int asum = 0;    // sum of squares of the selected entries of a
    int bsum = 0;    // sum of squares of the selected entries of b
    for (int j = 0; j < arraysize; ++j) {
        dotsum += c[j];
        // NOTE: accumulating a float into an int round-trips through float on
        // every step; kept as in the original so the numeric results match.
        asum += powf((float)asumarr[j], 2.0f);
        bsum += powf((float)bsumarr[j], 2.0f);
    }

    const float sqsum1 = sqrtf(asum);
    const float sqsum2 = sqrtf(bsum);
    const float sqsum = sqsum1 * sqsum2;
    const float div = dotsum / sqsum;

    // acos of dot / (|a| * |b|).
    const float finalvalue = acosf(div);

    // NaN compares unequal to itself; replace it with the sentinel -2.
    if (finalvalue == finalvalue) {
        cc[threadid] = finalvalue;
    } else {
        cc[threadid] = -2.0f;
    }
}
It seems to be a case of using improperly initialized/uninitialized variables.
After I added the following line:
for(int i = 0; i < arraysize; i++){
    c[i] = 0;
    asumarr[i] = 0;
    bsumarr[i] = 0;
    binarynumber[i] = 0;  // add this line
}
I am no longer able to reproduce the issue.
Comments
Post a Comment