cuda_streamed_buffer.h

// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_

#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include "ceres/cuda_buffer.h"

namespace ceres::internal {

// Most contemporary CUDA devices are capable of overlapping kernel execution
// with host-to-device transfers. This class copies data to GPU memory in
// batches and processes each copied batch asynchronously, in parallel with
// subsequent transfers.
// Data is copied into a fixed-size buffer on the GPU (containing at most
// max_buffer_size values), and this memory is re-used once the previous batch
// of values has been processed by the user-provided callback.
// The host-to-device copy uses a temporary pinned buffer if required. Each
// batch of values has a size of kValuesPerBatch, except possibly the last one.
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
 public:
  // If the hardware supports only one concurrent host-to-device copy, or a
  // single host-to-device copy is able to reach peak bandwidth, two streams
  // are sufficient to reach maximum efficiency:
  //  - If transferring a batch of values takes more time than processing it
  //  on the gpu, then at every moment one of the streams will be transferring
  //  data and the other stream will be either processing data or idle; the
  //  whole process will be bounded by the host-to-device copy.
  //  - If transferring a batch of values takes less time than processing it
  //  on the gpu, then at every moment one of the streams will be processing
  //  data and the other stream will be either performing computations or
  //  transferring data, and the whole process will be bounded by the
  //  computations.
  static constexpr int kNumBatches = 2;
  // max_buffer_size is the maximal size (in elements of type T) of the array
  // to be pre-allocated in gpu memory. The size of the array determines the
  // size of the batch of values for simultaneous copying and processing. It
  // should be large enough to allow highly-parallel execution of user kernels;
  // making it too large increases latency.
  CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
      : kValuesPerBatch(max_buffer_size / kNumBatches),
        context_(context),
        values_gpu_(context, kValuesPerBatch * kNumBatches) {
    static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
    CHECK_GE(max_buffer_size, kNumBatches);
    // Pre-allocate a buffer of page-locked memory for transfers from regular
    // cpu memory. Because we will only be writing into that buffer from the
    // cpu, the memory is allocated with the cudaHostAllocWriteCombined flag.
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values_cpu_pinned_,
                           sizeof(T) * kValuesPerBatch * kNumBatches,
                           cudaHostAllocWriteCombined));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess,
               cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
    }
  }

  CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;

  ~CudaStreamedBuffer() {
    CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
    }
  }

  // Transfer num_values values from host memory pointed to by from, calling
  //  callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
  // after scheduling the transfer of each batch of data. The user-provided
  // callback should process the data at device_pointer only in the
  // stream_to_use stream (device_pointer will be re-used in the next callback
  // invocation with the same stream).
  //
  // The two diagrams below describe the operation in two possible scenarios,
  // depending on whether the input data is stored in page-locked memory. In
  // this example max_buffer_size = 2 * K, num_values = N * K and the callback
  // schedules a single asynchronous launch of
  //  Kernel<<<..., stream_to_use>>>(device_pointer,
  //                                 size_of_batch,
  //                                 offset_of_batch)
  //
  // a. Copying from page-locked memory
  // In this case no host-side copy is necessary, and this method just
  // schedules a bunch of interleaved memory copies and callback invocations:
  //
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
  //      stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
  //      stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
  //      stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K,
  //      K * sizeof(T), H->D, stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the gpu
  // (assuming that the kernel invoked by the callback takes less time than the
  // host-to-device copy):
  //  +-------------------+-------------------+
  //  |     Stream #0     |     Stream #1     |
  //  +-------------------+-------------------+
  //  | Copy host->device |                   |
  //  |                   |                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  +-------------------+                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Copy host->device | Kernel            |
  //  |                   +-------------------+
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  |                  ...                  |
  //  +---------------------------------------+
  //
  // b. Copying from regular memory
  // In this case a copy from regular memory to page-locked memory is required
  // in order to get asynchronous operation. Because the pinned memory on the
  // host side is reused, additional synchronization is required. On each
  // iteration the method performs the following actions:
  //  - Wait until the previous copy operation in the stream has completed
  //  - Copy a batch of values from the input array into pinned memory
  //  - Asynchronously launch the host-to-device copy
  //  - Set up an event for synchronization on copy completion
  //  - Invoke the callback (which launches the kernel asynchronously)
  //
  // Invocations are performed with the following arguments:
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T),
  //      H->D, stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //      K * sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T),
  //      H->D, stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //      K * sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaEventSynchronize(copy_finished_(i % 2))
  //    - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K,
  //      values_cpu_pinned_ + (i % 2) * K, K * sizeof(T), H->D,
  //      stream_(i % 2))
  //    - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the cpu and
  // gpu (assuming that the kernel invoked by the callback takes less time than
  // the host-to-device copy and the copy in cpu memory, and that the copy in
  // cpu memory is faster than the host-to-device copy):
  //  +----------------------------+-------------------+-------------------+
  //  |         CPU thread         |     Stream #0     |     Stream #1     |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      |                   |                   |
  //  |                            |                   |                   |
  //  +----------------------------+-------------------|                   |
  //  | Copy to pinned memory      | Copy host->device |                   |
  //  |                            |                   |                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Kernel            | Copy host->device |
  //  |                            +-------------------+                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Copy host->device | Kernel            |
  //  |                            |                   +-------------------+
  //  |            ...                       ...                ...        |
  //  +----------------------------+---------------------------------------+
  //
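  // As a usage illustration only (a minimal sketch, not code from this file:
  // ExampleKernel, kMaxBatchedValues, num_blocks, block_size, values and
  // num_values are hypothetical names, and a valid ContextImpl* context is
  // assumed):
  //
  //  CudaStreamedBuffer<double> streamed_buffer(context, kMaxBatchedValues);
  //  streamed_buffer.CopyToGpu(
  //      values, num_values,
  //      [](double* device_pointer,
  //         int size_of_batch,
  //         int offset_of_batch,
  //         cudaStream_t stream_to_use) {
  //        // Process only the batch that was just scheduled, and only in
  //        // stream_to_use.
  //        ExampleKernel<<<num_blocks, block_size, 0, stream_to_use>>>(
  //            device_pointer, size_of_batch, offset_of_batch);
  //      });
  //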
  template <typename Fun>
  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
    // This synchronization is not required in some cases, but we perform it
    // in order to avoid a situation where the user callback depends on data
    // that is still being computed in the default stream.
    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
    // If the pointer to the input data does not correspond to page-locked
    // memory, the host-to-device memory copy might be executed synchronously
    // (with a copy to pinned memory happening inside the driver). In that
    // case we copy into a pre-allocated array of page-locked memory ourselves.
    const bool copy_to_pinned_memory =
        MemoryTypeResultsInSynchronousCopy(from);
    T* batch_values_gpu[kNumBatches];
    T* batch_values_cpu[kNumBatches];
    auto streams = context_->streams_;
    for (int i = 0; i < kNumBatches; ++i) {
      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
    }
    int batch_id = 0;
    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
      const int num_values_batch =
          std::min(num_values - offset, kValuesPerBatch);
      const T* batch_from = from + offset;
      T* batch_to = batch_values_gpu[batch_id];
      auto stream = streams[batch_id];
      auto copy_finished = copy_finished_[batch_id];
      if (copy_to_pinned_memory) {
        // Copying values to the temporary buffer should start only after the
        // previous copy from the temporary buffer to the device has completed.
        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
        batch_from = batch_values_cpu[batch_id];
      }
      CHECK_EQ(cudaSuccess,
               cudaMemcpyAsync(batch_to,
                               batch_from,
                               sizeof(T) * num_values_batch,
                               cudaMemcpyHostToDevice,
                               stream));
      if (copy_to_pinned_memory) {
        // The next copy to the temporary buffer can start right after the
        // asynchronous copy has completed (and might start before the kernels
        // launched asynchronously in the stream by the user-supplied callback
        // have completed).
        // No explicit synchronization is required when copying data from
        // page-locked memory, because the memory copy and the user kernel
        // execution on the corresponding part of the values_gpu_ array are
        // serialized by the stream.
        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
      }
      callback(batch_to, num_values_batch, offset, stream);
      batch_id = (batch_id + 1) % kNumBatches;
    }
    // Explicitly synchronize on all CUDA streams that were utilized.
    for (int i = 0; i < kNumBatches; ++i) {
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
    }
  }

 private:
  // It is necessary for all host-to-device copies to be completely
  // asynchronous. This requires the source memory to be allocated as
  // page-locked memory.
  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
    cudaPointerAttributes attributes;
    auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
    // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
    // host pointer returns cudaErrorInvalidValue.
    if (status == cudaErrorInvalidValue) {
      return true;
    }
#endif
    CHECK_EQ(status, cudaSuccess);
    // This class only supports cpu memory as a source.
    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
    // If the host memory was allocated (or registered) with the CUDA API, or
    // is managed memory, then the call to cudaMemcpyAsync will be
    // asynchronous. In the case of managed memory it might be slightly better
    // to perform a single invocation of the user-provided callback (and hope
    // that page migration will provide similar throughput with zero effort
    // from our side).
    return attributes.type == cudaMemoryTypeUnregistered;
  }

  const int kValuesPerBatch;
  ContextImpl* context_ = nullptr;
  CudaBuffer<T> values_gpu_;
  T* values_cpu_pinned_ = nullptr;
  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};
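
// Usage note (not part of the class above): CopyToGpu detects page-locked
// inputs via cudaPointerGetAttributes (see MemoryTypeResultsInSynchronousCopy)
// and then skips the staging copy, as in scenario (a). A minimal, hypothetical
// sketch of how a caller could pin an existing host allocation up front so
// that path is taken (the names values and num_values are illustrative):
//
//  std::vector<double> values(num_values);
//  CHECK_EQ(cudaSuccess,
//           cudaHostRegister(values.data(),
//                            values.size() * sizeof(double),
//                            cudaHostRegisterDefault));
//  // ... repeated calls to CudaStreamedBuffer<double>::CopyToGpu(...) ...
//  CHECK_EQ(cudaSuccess, cudaHostUnregister(values.data()));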

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_