c++ - How to load data in global memory into shared memory SAFELY in CUDA? -
my kernel:
__global__ void mykernel(float * devdata, float * devvec, float * devstrfac, int natom, int vecno) { extern __shared__ float sdata[]; int idx = blockidx.x * blockdim.x + threadidx.x; float qx=devvec[3*idx]; float qy=devvec[3*idx+1]; float qz=devvec[3*idx+2]; __syncthreads();//sync_1 float c=0.0,s=0.0; (int iatom=0; iatom<natom; iatom += blockdim.x) { float rtx = devdata[3*(iatom + threadidx.x)];//tag_0 float rty = devdata[3*(iatom + threadidx.x)+1]; float rtz = devdata[3*(iatom + threadidx.x)+2]; __syncthreads();//sync_2 sdata[3*threadidx.x] = rtx;//tag_1 sdata[3*threadidx.x + 1] = rty; sdata[3*threadidx.x + 2] = rtz; __syncthreads();//sync_3 int end_offset= min(blockdim.x, natom - iatom); (int cur_offset=0; cur_offset<end_offset; cur_offset++) { float rx = sdata[3*cur_offset]; float ry = sdata[3*cur_offset + 1]; float rz = sdata[3*cur_offset + 2]; //sync_4 float theta = rx*qx + ry*qy + rz*qz; theta = theta - lrint (theta); theta = theta * 2 * 3.1415926;//reduce theta [-pi,pi] float ct,st; sincosf(theta,&st,&ct); c += ct; s += st; } } devstrfac[idx] += c*c + s*s; }
why "__syncthreads()" labeled sync_2 needed? without sync_2, sdata[] wrong numbers , wrong results. line "tag_1" use results of line "tag_0", in mind sync_2 no need. wrong? if due disorderd instruction executing, should put __syncthreads() in line "sync_4"?
consider 1 warp of thread block finishing first iteration , starting next one, while other warps still working on first iteration. if don't have __syncthreads
@ label sync2
, end warp writing shared memory while others reading shared memory, race condition.
you might move __syncthreads()
@ label sync2
end of outer loop sake of clarity.
"cuda-memcheck --tool racecheck" should tell problem is.
Comments
Post a Comment