cuda - Why does Hyper-Q selectively overlap async HtoD and DtoH transfer on my cc5.2 hardware?
There's an old Parallel Forall blog post that demonstrates using streams and async memcpys to generate overlap between kernels and memcpys, and between HtoD and DtoH memcpys. I ran the full Async sample given there on a GTX Titan X, and here's the result:
As you can see, when the HtoD, kernel, and DtoH calls are issued back-to-back in a single loop, there isn't any overlap between the HtoD and DtoH transfers. However, when they are issued separately in three loops, there is overlap between HtoD and DtoH.
If Hyper-Q did what it claims to do, there should also be HtoD and DtoH overlap in the first version of the loop launching (as is the case with a Tesla K20c). My understanding was that on devices of compute capability 3.5 and above, which support Hyper-Q, the user shouldn't need to worry about tailoring the launch order anymore.
I also ran the CUDA 7.0 simpleHyperQ sample. With CUDA_DEVICE_MAX_CONNECTIONS set to 32, I can get 32 concurrent kernels running, so Hyper-Q is working in that case.
I am under 64-bit Windows 8.1, driver version 353.06 and CUDA 7.0, compiling with Visual Studio 2013, targeting the x64 platform in release mode, with the code generation property set to compute_52,sm_52. CUDA_DEVICE_MAX_CONNECTIONS is set to an ample 32.
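For completeness, here is a minimal sketch (not from the original post) of how CUDA_DEVICE_MAX_CONNECTIONS can be raised programmatically; this mirrors the _putenv_s call in the code below. The variable must be set before the CUDA context is created, and on Linux setenv would be used instead of the Windows-specific _putenv_s:

#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    // Must happen before the first runtime call that creates the context.
    _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32"); // Windows; setenv("CUDA_DEVICE_MAX_CONNECTIONS", "32", 1) on Linux
    cudaFree(0); // force context creation so the setting is picked up from the start
    // ... create streams and launch work ...
    return 0;
}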
Since I can't post more links, the full code of the Async sample (with a slight modification) is posted below.
// Copyright 2012 NVIDIA Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>   // atoi, _putenv_s
#include <string.h>   // memset
#include <assert.h>   // assert() in checkCuda (debug builds)

// Convenience function for checking CUDA runtime API results;
// can be wrapped around any runtime API call. No-op in release builds.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}

__global__ void kernel(float *a, int offset)
{
    int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
    float x = (float)i;
    float s = sinf(x);
    float c = cosf(x);
    a[i] = a[i] + sqrtf(s * s + c * c);
}

float maxError(float *a, int n)
{
    float maxE = 0;
    for (int i = 0; i < n; i++) {
        float error = fabs(a[i] - 1.0f);
        if (error > maxE) maxE = error;
    }
    return maxE;
}

int main(int argc, char **argv)
{
    _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");

    const int blockSize = 256, nStreams = 4;
    const int n = 4 * 1024 * blockSize * nStreams;
    const int streamSize = n / nStreams;
    const int streamBytes = streamSize * sizeof(float);
    const int bytes = n * sizeof(float);

    int devId = 0;
    if (argc > 1) devId = atoi(argv[1]);

    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, devId));
    printf("Device : %s\n", prop.name);
    checkCuda(cudaSetDevice(devId));

    // allocate pinned host memory and device memory
    float *a, *d_a;
    checkCuda(cudaMallocHost((void**)&a, bytes));   // host pinned
    checkCuda(cudaMalloc((void**)&d_a, bytes));     // device

    float ms; // elapsed time in milliseconds

    // create events and streams
    cudaEvent_t startEvent, stopEvent, dummyEvent;
    cudaStream_t stream[nStreams];
    checkCuda(cudaEventCreate(&startEvent));
    checkCuda(cudaEventCreate(&stopEvent));
    checkCuda(cudaEventCreate(&dummyEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamCreate(&stream[i]));

    // baseline case - sequential transfer and execute
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    kernel<<<n / blockSize, blockSize>>>(d_a, 0);
    checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for sequential transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // asynchronous version 1: loop over {copy, kernel, copy}
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes,
                                  cudaMemcpyHostToDevice, stream[i]));
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes,
                                  cudaMemcpyDeviceToHost, stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // asynchronous version 2:
    // loop over copy, loop over kernel, loop over copy
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes,
                                  cudaMemcpyHostToDevice, stream[i]));
    }
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    }
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes,
                                  cudaMemcpyDeviceToHost, stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // cleanup
    checkCuda(cudaEventDestroy(startEvent));
    checkCuda(cudaEventDestroy(stopEvent));
    checkCuda(cudaEventDestroy(dummyEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamDestroy(stream[i]));
    cudaFree(d_a);
    cudaFreeHost(a);

    cudaDeviceReset();

    return 0;
}
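For reference, a hedged sketch of how the sample might be built and profiled outside Visual Studio (assumptions not in the original post: nvcc is on the PATH, the file is saved as async.cu, and the -arch flag matches the compute_52,sm_52 setting mentioned above; on Linux the MSVC-specific _putenv_s call would need to be replaced with setenv):

nvcc -arch=sm_52 -o async async.cu
nvprof ./async    (or open the executable in nvvp / the Visual Profiler to inspect the transfer and kernel timeline)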
What you are observing is an artifact of running the code on the Windows WDDM platform. The WDDM subsystem has a lot of latency that other platforms are not hampered by, so to improve overall performance, the CUDA WDDM driver performs command batching. This can interfere with the expected ordering or timing of concurrent operations and command overlap, which is what you are seeing here.
The solution is either to use the Windows TCC driver, which requires a supported Tesla or Quadro card, or to change to a non-WDDM platform such as Linux. The latter seems to have solved the problem in this case.
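If staying on WDDM is unavoidable, a workaround that is often suggested (not part of this answer's original text, just a hedged sketch reusing the variable names from the sample above) is to nudge the driver into submitting its batched commands after the work for each stream has been issued, for example with cudaStreamQuery:

for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes,
                              cudaMemcpyHostToDevice, stream[i]));
    kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes,
                              cudaMemcpyDeviceToHost, stream[i]));
    // cudaStreamQuery does not block; it may return cudaErrorNotReady,
    // but it tends to flush the WDDM command batch for this stream.
    cudaStreamQuery(stream[i]);
}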