gpu - Is prefix scan CUDA sample code in gpugems3 correct? -

i've written piece of code call kernel in gpugem3

but results got bunch of negative numbers instead of prefix scan. i'm wondering if kernel call wrong or there wrong gpugem3 code?

here code:

#include <stdio.h> #include <sys/time.h> #include <cuda.h>    __global__ void kernel(int *g_odata, int  *g_idata, int n, int dim) {      extern __shared__ int temp[];// allocated on invocation     int thid = threadidx.x;     int offset = 1;      temp[2*thid] = g_idata[2*thid]; // load input shared memory     temp[2*thid+1] = g_idata[2*thid+1];     (int d = n>>1; d > 0; d >>= 1) // build sum in place tree     {     __syncthreads();     if (thid < d)     {     int ai = offset*(2*thid+1)-1;     int bi = offset*(2*thid+2)-1;     temp[bi] += g_idata[ai];     }     offset *= 2;     }     if (thid == 0) { temp[n - 1] = 0; } // clear last element     (int d = 1; d < n; d *= 2) // traverse down tree & build scan     {     offset >>= 1;     __syncthreads();     if (thid < d)     {     int ai = offset*(2*thid+1)-1;     int bi = offset*(2*thid+2)-1;     int t = temp[ai];     temp[ai] = temp[bi];     temp[bi] += t;     }     }     __syncthreads();     g_odata[2*thid] = temp[2*thid]; // write results device memory     g_odata[2*thid+1] = temp[2*thid+1];  }   void initialize(int  *h_in,int num_items) {     int j;    for(j=0;j<num_items;j++)         h_in[j]=j;        printf(" input: ");          printf("\n\n");    }   int main(int argc, char** argv) {     int num_items = 512;       int*  h_in = new int[num_items];       // initialize problem      initialize(h_in, num_items);       int *d_in = null;     cudamalloc((void**)&d_in, sizeof(int) * num_items);   if(cudasuccess!=    cudamemcpy(d_in, h_in, sizeof(int) * num_items, cudamemcpyhosttodevice)) fprintf(stderr,"could not copy gpu");      // allocate device output array     int *d_out = null;     cudamalloc((void**)&d_out, sizeof(int) * (num_items+1));       kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);       int* h_out= new int[num_items+1];     if( cudasuccess !=cudamemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudamemcpydevicetohost))fprintf(stderr,"could not copy back");     int i;     printf(" \n");     for(i=0;i<num_items;i++)     printf(" ,%d ",h_out[i]);     // cleanup     if (h_in) delete[] h_in;     if (h_out) delete[] h_out;     if (d_in) cudafree(d_in);     if (d_out) cudafree(d_out);      printf("\n\n");      return 0; }

it seems you've made @ least 1 error in transcribing code gpu gems 3 chapter kernel. line incorrect:

temp[bi] += g_idata[ai];

it should be:

temp[bi] += temp[ai];

when make 1 change code have posted, seems print out correct (exclusive-scan) prefix sum me. there's few other things mention:

even without change, results close correct. if you're getting different stuff (e.g. negative numbers) may have problem machine setup or cuda install. suggest using more rigorous cuda error checking have (although machine setup problem should have been indicated in 1 of checks.)
the routine crafted have limitations. can used in single threadblock, have bank conflicts on shared memory access, , limited in data set size can handled single threadblock (this routine produces 2 output elements per thread, data set size expected equal twice number of threads). has been covered, dynamic shared memory allocation needs large data set size (ie. twice thread size, in number of elements).
this may useful learning, if want robust, fast prefix scan, advised use routine thrust or cub instead of own code, if derived (old) article.

the following code similar yours, has above issues fixed, , have templated kernel use various datatypes:

#include <stdio.h> #define dsize 512 #define cudacheckerrors(msg) \     { \         cudaerror_t __err = cudagetlasterror(); \         if (__err != cudasuccess) { \             fprintf(stderr, "fatal error: %s (%s @ %s:%d)\n", \                 msg, cudageterrorstring(__err), \                 __file__, __line__); \             fprintf(stderr, "*** failed - aborting\n"); \             exit(1); \         } \     } while (0)   typedef int mytype;  template <typename t> __global__ void prescan(t *g_odata, t *g_idata, int n) {   extern __shared__ t temp[];  // allocated on invocation   int thid = threadidx.x;   int offset = 1;   temp[2*thid] = g_idata[2*thid]; // load input shared memory   temp[2*thid+1] = g_idata[2*thid+1];   (int d = n>>1; d > 0; d >>= 1)                    // build sum in place tree   {     __syncthreads();     if (thid < d)     {       int ai = offset*(2*thid+1)-1;       int bi = offset*(2*thid+2)-1;       temp[bi] += temp[ai];     }     offset *= 2;   }   if (thid == 0) { temp[n - 1] = 0; } // clear last element   (int d = 1; d < n; d *= 2) // traverse down tree & build scan     {       offset >>= 1;       __syncthreads();       if (thid < d)       {          int ai = offset*(2*thid+1)-1;          int bi = offset*(2*thid+2)-1;          t t = temp[ai];          temp[ai] = temp[bi];          temp[bi] += t;       }     }   __syncthreads();   g_odata[2*thid] = temp[2*thid]; // write results device memory   g_odata[2*thid+1] = temp[2*thid+1]; }  int main(){    mytype *h_i, *d_i, *h_o, *d_o;   int dszp = (dsize)*sizeof(mytype);    h_i = (mytype *)malloc(dszp);   h_o = (mytype *)malloc(dszp);   if ((h_i == null) || (h_o == null)) {printf("malloc fail\n"); return 1;}   cudamalloc(&d_i, dszp);   cudamalloc(&d_o, dszp);   cudacheckerrors("cudamalloc fail");   (int = 0 ; < dsize; i++){     h_i[i] = i;     h_o[i] = 0;}   cudamemset(d_o, 0, dszp);   cudacheckerrors("cudamemset fail");   cudamemcpy(d_i, h_i, dszp, cudamemcpyhosttodevice);   cudacheckerrors("cudamemcpy 1 fail");   prescan<<<1,dsize/2, dszp>>>(d_o, d_i, dsize);   cudadevicesynchronize();   cudacheckerrors("kernel fail");   cudamemcpy(h_o, d_o, dszp, cudamemcpydevicetohost);   cudacheckerrors("cudamemcpy 2 fail");   mytype psum = 0;   (int =1; < dsize; i++){     psum += h_i[i-1];     if (psum != h_o[i]) {printf("mismatch @ %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}     }   return 0; }

Search This Blog

ANgular

gpu - Is prefix scan CUDA sample code in gpugems3 correct? -

Comments

Post a Comment

Popular posts from this blog

c# - Validate object ID from GET to POST -

node.js - Custom Model Validator SailsJS -

php - Find a regex to take part of Email -