Commit 39ea874

[tmp] non-blocking somehow not working
1 parent 2e88470 commit 39ea874

File tree

3 files changed: +29 -16 lines changed


csrc/pthread_backend.cpp

Lines changed: 16 additions & 16 deletions
@@ -1,7 +1,5 @@
 #include "pthread_backend.h"
 
-#include <iostream>
-
 void PthreadAsyncIO::write(int fd, void *buffer, size_t n_bytes, unsigned long long offset, callback_t callback) {
     auto fut = this->pool.submit_task(
         [fd, buffer, n_bytes, offset] {
@@ -81,21 +79,23 @@ void PthreadAsyncIO::synchronize() {
 void PthreadAsyncIO::register_file(int fd) {}
 
 void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long offset, callback_t callback, std::optional<torch::Tensor> pinned) {
+    auto stream = c10::cuda::getCurrentCUDAStream();
+    at::cuda::CUDAStreamGuard guard(stream); // https://pytorch.org/cppdocs/notes/tensor_cuda_stream.html
+    auto event_ptr = std::make_shared<c10::Event>(torch::kCUDA); // make a shared ptr here since event is not copyable
+    if (t.is_cuda()) {
+        if (pinned.has_value()) {
+            pinned.value().copy_(t, /*non_blocking*/ true);
+            t = pinned.value();
+        } else {
+            t = t.to(t.options().device(c10::DeviceType::CPU), /*non_blocking*/ true, /*copy*/ false); // modified from torch::Tensor::cpu()
+        }
+    }
+    event_ptr->record(stream);
     auto fut = this->pool.submit_task(
-        [fd, t, offset, pinned] {
-            torch::Tensor cpu_tensor;
-            if (t.is_cuda()) {
-                if (pinned.has_value()) {
-                    pinned.value().copy_(t);
-                    cpu_tensor = pinned.value();
-                } else {
-                    cpu_tensor = t.to(torch::kCPU);
-                }
-            } else {
-                cpu_tensor = t;
-            }
-            void *buf = cpu_tensor.data_ptr();
-            size_t n_bytes = cpu_tensor.numel() * cpu_tensor.element_size();
+        [fd, t, offset, pinned, event_ptr] {
+            event_ptr->synchronize(); // sync with comm stream
+            void *buf = t.data_ptr();
+            size_t n_bytes = t.numel() * t.element_size();
             return pwrite(fd, buf, n_bytes, offset);
         }
     );
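The key change above: the device-to-host copy is issued non-blocking on the caller's stream, a c10::Event is recorded right after it, and the pooled worker thread calls event_ptr->synchronize() before pwrite, so it never reads the host buffer before the copy has landed. One caveat worth noting (and possibly related to the "non-blocking somehow not working" commit message): a non_blocking D2H copy only overlaps with other work when the destination is pinned memory, so the pageable fallback in the else branch degrades to an effectively synchronous copy. Below is a minimal Python sketch of the same record-then-synchronize handshake; the helper name async_d2h_write, the file path, and the threading setup are illustrative, not part of this repo.

import os
import threading

import torch

def async_d2h_write(fd: int, t: torch.Tensor, pinned: torch.Tensor, offset: int) -> threading.Thread:
    # Sketch of the commit's pattern: issue the D2H copy on a dedicated
    # comm stream, record an event, and let a worker thread wait on it.
    comm_stream = torch.cuda.Stream()
    with torch.cuda.stream(comm_stream):
        pinned.copy_(t, non_blocking=True)  # async only because `pinned` is page-locked
        event = torch.cuda.Event()
        event.record(comm_stream)           # marks completion of the copy

    def worker():
        event.synchronize()                 # block this CPU thread until the copy lands
        buf = pinned.numpy().tobytes()      # host buffer is now safe to read
        os.pwrite(fd, buf, offset)

    th = threading.Thread(target=worker)
    th.start()
    return th

# Usage: the staging buffer must be allocated with pin_memory=True,
# otherwise copy_(..., non_blocking=True) silently runs synchronously.
t = torch.randn(1024, device="cuda")
pinned = torch.empty(t.shape, dtype=t.dtype, pin_memory=True)
fd = os.open("/tmp/ckpt.bin", os.O_WRONLY | os.O_CREAT)
async_d2h_write(fd, t, pinned, 0).join()
os.close(fd)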

include/pthread_backend.h

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,9 @@
 #include <queue>
 #include <tuple>
 #include <functional>
+#include <iostream>
+#include <c10/cuda/CUDAStream.h>
+#include <c10/cuda/CUDAGuard.h>
 
 #include "asyncio.h"
 #include "threadpool.hpp"

tensornvme/async_file_io.py

Lines changed: 10 additions & 0 deletions
@@ -1,4 +1,5 @@
 import ctypes
+import torch
 from functools import partial
 from torch import Tensor
 from typing import List, Optional
@@ -16,6 +17,7 @@ def __init__(self, fp: IOBase, n_entries: int = 16, backend=None) -> None:
         self.offset = 0
         # must ensure the data is not garbage collected
         self.buffers = []
+        self.comm_stream = torch.cuda.Stream()
 
     def write(self, data: bytes) -> int:
         ptr = ctypes.cast(data, ctypes.POINTER(ctypes.c_char))
@@ -36,6 +38,14 @@ def write_tensor(self, tensor: Tensor, pinned: Optional[Tensor] = None) -> None:
         self.io.write_tensor(tensor, self.offset, partial(AsyncFileWriter.gc_callback, self.buffers, len(self.buffers) - 1), pinned)
         self.offset += tensor.numel() * tensor.element_size()
 
+    def write_gpu_tensor(self, tensor: Tensor, pinned: Optional[Tensor] = None) -> None:
+        assert tensor.device.type == 'cuda', f"tensor must be on cuda device, got {tensor.device}"
+        with torch.cuda.stream(self.comm_stream):
+            self.write_tensor(tensor, pinned)
+
+    def sync_before_step(self):
+        self.comm_stream.synchronize()
+
     @staticmethod
     def gc_callback(listt: List, idx: int) -> None:
         listt[idx] = None
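Taken together, the Python side now offers a two-phase contract: write_gpu_tensor stages the D2H copy on the writer's dedicated comm_stream, and sync_before_step drains that stream so the GPU tensor can be safely mutated (e.g. by an optimizer step) while the actual pwrite continues in the backend's thread pool. A hedged usage sketch follows; the checkpoint path, shapes, and the pinned staging buffer are illustrative, and flushing the backend's pending writes is out of scope here.

import torch
from tensornvme.async_file_io import AsyncFileWriter

t = torch.randn(4, 1024, device="cuda")
# Pinned staging buffer: without it the non-blocking D2H copy
# falls back to a synchronous pageable copy.
pinned = torch.empty(t.shape, dtype=t.dtype, pin_memory=True)

with open("/tmp/ckpt.bin", "wb") as f:
    writer = AsyncFileWriter(f, n_entries=16)
    writer.write_gpu_tensor(t, pinned)  # D2H copy issued on writer.comm_stream
    writer.sync_before_step()           # comm_stream drained: safe to mutate t now
    t.add_(1.0)                         # e.g. an optimizer step updating the weights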
