cuda - Why does Hyper-Q selectively overlap async HtoD and DtoH transfer on my cc5.2 hardware? -


There's an old Parallel Forall blog post that demonstrates using streams and async memcpys to generate overlap between kernels and memcpys, and between HtoD and DtoH memcpys. I ran the full async sample given there on a GTX Titan X, and here's the result:

As you can see, when the HtoD, kernel, and DtoH calls are issued back-to-back in a single loop, there isn't any overlap between the HtoD and DtoH transfers. However, when they are issued separately in three loops, there is overlap between HtoD and DtoH.

If Hyper-Q did what it claims to do, there should also be HtoD/DtoH overlap in the first version of loop launching (as is the case on a Tesla K20c). My understanding was that on devices of compute capability 3.5 and above, which support Hyper-Q, the user shouldn't have to worry about tailoring the launch order anymore.

I also ran the CUDA 7.0 simpleHyperQ sample. With CUDA_DEVICE_MAX_CONNECTIONS set to 32, I can get 32 concurrent kernels running, so Hyper-Q is working in that case.

I am on 64-bit Windows 8.1, driver version 353.06 and CUDA 7.0, compiling with Visual Studio 2013, targeting the x64 platform in release mode, with the code-generation property set to compute_52,sm_52. CUDA_DEVICE_MAX_CONNECTIONS is set to an ample 32.

Since I can't post more links, the full code of the async sample (with slight modification) is posted below.

// Copyright 2012 NVIDIA Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <stdio.h>
#include <assert.h>  // FIX: checkCuda() calls assert() in debug builds; this include was missing

// Convenience function for checking CUDA runtime API results.
// Can be wrapped around any runtime API call. No-op in release builds.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}

// Adds sqrtf(sin^2 + cos^2) == 1.0f to each element of the chunk
// a[offset .. offset + gridDim.x*blockDim.x). Launched with a 1-D grid;
// the caller sizes the grid to exactly cover the chunk, so no bounds
// check is performed here.
__global__ void kernel(float *a, int offset)
{
    int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
    float x = (float)i;
    float s = sinf(x);
    float c = cosf(x);
    a[i] = a[i] + sqrtf(s * s + c * c);
}

// Returns the maximum absolute deviation of a[0..n) from the expected
// value 1.0f (each element should end up exactly 1 after the kernel).
float maxError(float *a, int n)
{
    float maxE = 0;
    for (int i = 0; i < n; i++) {
        float error = fabs(a[i] - 1.0f);
        if (error > maxE) maxE = error;
    }
    return maxE;
}

int main(int argc, char **argv)
{
    // Raise the number of hardware work queues (Hyper-Q connections).
    // NOTE(review): this must be set before the CUDA context is created,
    // i.e. before the first runtime API call — kept first for that reason.
    _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");

    const int blockSize = 256, nStreams = 4;
    const int n = 4 * 1024 * blockSize * nStreams;
    const int streamSize = n / nStreams;                  // elements per stream chunk
    const int streamBytes = streamSize * sizeof(float);
    const int bytes = n * sizeof(float);

    int devId = 0;
    if (argc > 1) devId = atoi(argv[1]);

    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, devId));
    printf("Device : %s\n", prop.name);
    checkCuda(cudaSetDevice(devId));

    // Allocate pinned host memory and device memory. Pinned host memory is
    // required for cudaMemcpyAsync to actually overlap with kernel execution.
    float *a, *d_a;
    checkCuda(cudaMallocHost((void**)&a, bytes));   // host pinned
    checkCuda(cudaMalloc((void**)&d_a, bytes));     // device

    float ms; // elapsed time in milliseconds

    // Create events and streams
    cudaEvent_t startEvent, stopEvent, dummyEvent;
    cudaStream_t stream[nStreams];
    checkCuda(cudaEventCreate(&startEvent));
    checkCuda(cudaEventCreate(&stopEvent));
    checkCuda(cudaEventCreate(&dummyEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamCreate(&stream[i]));

    // Baseline case - sequential transfer and execute
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    kernel<<<n / blockSize, blockSize>>>(d_a, 0);
    checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for sequential transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // Asynchronous version 1: loop over {copy, kernel, copy}
    // per stream, issuing all three operations for stream i before
    // moving on to stream i+1.
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
                                  streamBytes, cudaMemcpyHostToDevice,
                                  stream[i]));
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
                                  streamBytes, cudaMemcpyDeviceToHost,
                                  stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // Asynchronous version 2:
    // loop over copy, loop over kernel, loop over copy — all HtoD copies
    // are issued first, then all kernels, then all DtoH copies.
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
                                  streamBytes, cudaMemcpyHostToDevice,
                                  stream[i]));
    }
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    }
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
                                  streamBytes, cudaMemcpyDeviceToHost,
                                  stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // Cleanup
    checkCuda(cudaEventDestroy(startEvent));
    checkCuda(cudaEventDestroy(stopEvent));
    checkCuda(cudaEventDestroy(dummyEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamDestroy(stream[i]));
    cudaFree(d_a);
    cudaFreeHost(a);

    cudaDeviceReset();

    return 0;
}

What you are observing is an artifact of running the code on the Windows WDDM platform. The WDDM subsystem has a lot of latency that other platforms are not hampered by, so to improve overall performance, the CUDA WDDM driver performs command batching. This can interfere with the expected ordering or timing of concurrent operations and command overlap, which is what you are seeing here.

The solution is either to use the Windows TCC driver, which requires a supported Tesla or Quadro card, or to switch to a non-WDDM platform such as Linux. The latter seems to have solved the problem in your case.


Comments

Popular posts from this blog

c# - Validate object ID from GET to POST -

node.js - Custom Model Validator SailsJS -

php - Find a regex to take part of Email -