
Commit 1cd9007

Author: Dalong

support float16 tensor (#45)

* support float16 tensor
* use tensor.data_ptr directly
* add fp16 test
* fix
* add MAG240M Example
* fix bugs
* add fp16 test
* fix bugs
* update
* update distribute training
* update parameters
* add readme

1 parent a1e2413 commit 1cd9007

13 files changed: +601 lines, -27 lines

csrc/include/qvf/dist_tensor_client.h

Lines changed: 8 additions & 5 deletions
@@ -11,6 +11,7 @@
 #include <infinity/queues/QueuePairFactory.h>
 #include <infinity/requests/RequestToken.h>
 #include <torch/extension.h>
+#include <ATen/ATen.h>
 #include <chrono>
 #include <deque>
 #include <thread>
@@ -98,15 +99,17 @@ class DistTensorClient {
         {tensor_shape[0], tensor_shape[1]}, tensor_option);
   }
 
-  void register_float32_tensor(torch::Tensor& float_tensor) {
+  void register_float_tensor(torch::Tensor& float_tensor) {
     QUIVER_FEATURE_ASSERT(
         float_tensor.dim() == 2,
         "Only support 2-dimensional tensor, But got %d-dimensional tensor\n",
         float_tensor.dim());
-    uint64_t size_in_bytes = 4 * float_tensor.numel();
+
+    uint64_t size_in_bytes = float_tensor.element_size() * float_tensor.numel();
 
     tensor_buffer = new infinity::memory::Buffer(
-        context, float_tensor.data_ptr<float>(), size_in_bytes);
+        context, float_tensor.data_ptr(), size_in_bytes);
+
     tensor_token = tensor_buffer->createRegionToken();
   }
 
@@ -134,12 +137,12 @@ class DistTensorClient {
                       torch::Tensor& local_offsets,
                       torch::Tensor& remote_offsets) {
     QUIVER_FEATURE_ASSERT(
-        reinterpret_cast<uint64_t>(res_tensor.data_ptr<float>()) ==
+        reinterpret_cast<uint64_t>(res_tensor.data_ptr()) ==
            tensor_buffer->getAddress(),
        "Result Tensor is not created from registered buffer");
 
    pipes[server_rank]->read(tensor_buffer, local_offsets, remote_offsets,
-                            res_tensor.size(1) * 4);
+                            res_tensor.size(1) * res_tensor.element_size());
   }
 
   void collect_inner(CollectionTask collection_task) {
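The core of the change above is replacing the hard-coded 4-byte element width with `element_size()`, so the same buffer-registration and per-row read-length logic covers float32 and float16 alike. A minimal Python sketch of that arithmetic (the 8 x 128 shape is made up for illustration):

    import torch

    # element_size() is what the buffer-size and read-length math now relies on
    # instead of a hard-coded 4 bytes per element.
    fp32 = torch.zeros(8, 128, dtype=torch.float32)
    fp16 = torch.zeros(8, 128, dtype=torch.float16)

    print(fp32.element_size())  # 4 bytes per element
    print(fp16.element_size())  # 2 bytes per element

    # Per-row read length, mirroring res_tensor.size(1) * res_tensor.element_size():
    print(fp32.size(1) * fp32.element_size())  # 512 bytes per 128-dim float32 row
    print(fp16.size(1) * fp16.element_size())  # 256 bytes per 128-dim float16 row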

csrc/include/qvf/dist_tensor_server.h

Lines changed: 4 additions & 2 deletions
@@ -16,6 +16,7 @@
 #include <vector>
 
 #include <torch/extension.h>
+#include <ATen/ATen.h>
 
 namespace qvf {
 class DistTensorServer {
@@ -51,9 +52,10 @@ class DistTensorServer {
 
   void serve_tensor(torch::Tensor& data) {
     std::cout << "Registering Buffer, Please Wait..." << std::endl;
-    uint64_t size_in_bytes = data.numel() * 4;
+    uint64_t size_in_bytes = data.numel() * data.element_size();
+
     feature_buffer = new infinity::memory::Buffer(
-        context, data.data_ptr<float>(), size_in_bytes);
+        context, data.data_ptr(), size_in_bytes);
     bufferToken = feature_buffer->createRegionToken();
     server_thread = std::thread(run, qpFactory, bufferToken,
                                 qp_per_pipe * (world_size - 1));
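On the server side the registered buffer now spans `data.numel() * data.element_size()` bytes, so serving float16 features halves the size of the RDMA-registered region. A rough back-of-the-envelope check in Python, using the toy NODE_COUNT and FEATURE_DIM values from examples/mag240m/config.py further below (not the full MAG240M feature matrix):

    import torch

    num_nodes, feature_dim = 1_200_000, 128  # NODE_COUNT and FEATURE_DIM in config.py
    numel = num_nodes * feature_dim

    # serve_tensor() registers numel * element_size bytes, so the dtype's element
    # width directly sets the size of the pinned buffer.
    for dtype in (torch.float32, torch.float16):
        element_size = torch.tensor([], dtype=dtype).element_size()
        print(dtype, numel * element_size / 2**20, "MiB")
    # float32 -> ~586 MiB, float16 -> ~293 MiB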

csrc/src/register.cpp

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ void register_DistTensorClient(pybind11::module& m) {
       .def("create_registered_float32_tensor",
            &qvf::DistTensorClient::create_registered_float32_tensor,
            py::call_guard<py::gil_scoped_release>())
-      .def("register_float32_tensor",
-           &qvf::DistTensorClient::register_float32_tensor,
+      .def("register_float_tensor",
+           &qvf::DistTensorClient::register_float_tensor,
            py::call_guard<py::gil_scoped_release>())
       .def("create_registered_float32_tensor_cuda",
            &qvf::DistTensorClient::create_registered_float32_tensor_cuda,
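From Python, the rename means a half-precision tensor can be passed to the binding directly. The sketch below only illustrates the call shape; the module name and how the client is constructed are assumptions about the rest of the library, so those parts are left as comments:

    import torch

    # Hypothetical setup; the import path and constructor arguments are assumptions.
    # from qvf import DistTensorClient
    # client = DistTensorClient(...)

    features_fp16 = torch.randn(8, 128).to(torch.float16)

    # data_ptr() is untyped on the Python side as well, which is why the renamed
    # binding no longer needs to be dtype-specific:
    print(features_fp16.data_ptr(), features_fp16.element_size())

    # client.register_float_tensor(features_fp16)    # new name added by this commit
    # client.register_float32_tensor(features_fp16)  # old name, removed by this commit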

examples/mag240m/README.md

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Introduction
+
+The distributed training setup on the MAG240M dataset is almost the same as the [official example in DGL](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M), except that we use `Quiver-Feature` for distributed feature collection.
+
+Our implementation is much faster than DGL's official example while achieving similar accuracy.
+
+# Data Preprocess & Partition
+
+First, run [preprocess.py](./preprocess.py) to generate `graph.dgl` and `full.npy`; see [DGL's official guide](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M) for more details.
+
+Then we use [Range Partition](../../docs/partition_methods.md) to partition the feature data; see [process_quiver.py](./process_quiver.py) for the details.
+
+![](../../docs/imgs/range_partition.png)
+
+
+# Running Training Script
+
+On each machine, please run:
+
+    python3 distributed_training.py \
+        --rootdir . \
+        --graph-path ./graph.dgl \
+        --feature-partition-path ./feature_part.pt \
+        --server_world_size 2 \
+        --server_rank 0
+
+Remember to:
+
+- Set the shm size limit as large as your physical memory size, for example:
+
+      sudo mount -o remount,size=300G /dev/shm
+
+- Set `MASTER_IP` (see [config.py](./config.py)) to your master node's IP address.
+
+
+The validation accuracy is 0.680. Since we do not have ground-truth test labels, we do not report test accuracy.
+
+# Performance
+
+With 2 machines and 1 GPU per machine, each epoch takes 2 minutes 10 seconds to train and 15 seconds to validate. This is 3x faster than [DGL's performance result](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M).
+
+
+# Hardware Configuration
+
+We use 2 machines, each with 377GB of memory, connected by 100Gbps InfiniBand. Running the training script consumes around 256GB of memory.
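For readers curious what the Range Partition mentioned in the README amounts to, here is a minimal Python sketch of the idea: each machine owns a contiguous range of node IDs, so finding a node's owner is a single comparison against the range boundaries. The helper names and the even-split policy are illustrative assumptions, not the actual process_quiver.py implementation:

    import bisect
    import torch

    def range_partition(features: torch.Tensor, world_size: int):
        # Split rows into contiguous per-machine ranges of roughly equal size.
        num_nodes = features.size(0)
        step = (num_nodes + world_size - 1) // world_size  # ceiling division
        boundaries = [min(i * step, num_nodes) for i in range(world_size + 1)]
        parts = [features[boundaries[i]:boundaries[i + 1]] for i in range(world_size)]
        return parts, boundaries

    def owner_of(node_id: int, boundaries) -> int:
        # The owner is the range whose lower bound is the largest one <= node_id.
        return bisect.bisect_right(boundaries, node_id) - 1

    features = torch.randn(10, 4).to(torch.float16)
    parts, boundaries = range_partition(features, world_size=2)
    print([tuple(p.shape) for p in parts])  # [(5, 4), (5, 4)]
    print(owner_of(7, boundaries))          # node 7 lives on machine 1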

examples/mag240m/config.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+PORT_NUMBER = 3344
+MASTER_IP = "155.198.152.17"
+#MASTER_IP = "127.0.0.1"
+HLPER_PORT = 5678
+NODE_COUNT = 1200000
+FEATURE_DIM = 128
+FEATURE_TYPE_SIZE = 4
+SAMPLE_NUM = 80000
+ITER_NUM = 10
+POST_LIST_SIZE = 128
+QP_NUM = 8
+TX_DEPTH = 2048
+CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE
+TEST_TLB_OPTIMIZATION = True
+
+# For MAG240M Training
+SAMPLE_PARAM = [15, 25]
+BATCH_SIZE = 1024
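A couple of these values are derived rather than independent, so the arithmetic may be worth spelling out. Note that FEATURE_TYPE_SIZE = 4 corresponds to float32 features; with the float16 support added in this commit, a half-precision feature file would have an element size of 2. A small Python snippet that just redoes the config arithmetic (names mirror config.py):

    TX_DEPTH = 2048
    POST_LIST_SIZE = 128
    CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE
    print(CTX_POLL_BATCH)  # 16

    FEATURE_DIM = 128
    FEATURE_TYPE_SIZE = 4  # bytes per element for float32
    print(FEATURE_DIM * FEATURE_TYPE_SIZE)  # 512 bytes per feature row in float32
    print(FEATURE_DIM * 2)                  # 256 bytes per row if features were float16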
