From e7f695dc71d87fce4f6becd4ddf43f3f7d3b6b5d Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 15:11:52 +0800 Subject: [PATCH 01/19] change / into * reverse number --- python/singa/tensor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/singa/tensor.py b/python/singa/tensor.py index 441431fc25..ae9ba5ba83 100644 --- a/python/singa/tensor.py +++ b/python/singa/tensor.py @@ -472,9 +472,9 @@ def __idiv__(self, x): x (float or Tensor): ''' if isinstance(x, Tensor): - self.data /= x.data + self.data *= 1/x.data else: - self.data /= float(x) + self.data *= 1/float(x) return self ''' From 2cc6ba497c5b6631b11e33c0db8f581597c505b2 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 15:29:39 +0800 Subject: [PATCH 02/19] align train.py with vc12 --- examples/cifar10/train.py | 50 +++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py index b2ab4af689..d785aa84c1 100644 --- a/examples/cifar10/train.py +++ b/examples/cifar10/train.py @@ -31,24 +31,25 @@ import os import argparse +# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python')) from singa import utils from singa import optimizer from singa import device from singa import tensor +from singa.proto import core_pb2 from caffe import caffe_net -import cnn +import alexnet import vgg import resnet +from datetime import datetime +import time def load_dataset(filepath): print('Loading data file %s' % filepath) with open(filepath, 'rb') as fd: - try: - cifar10 = pickle.load(fd, encoding='latin1') - except TypeError: - cifar10 = pickle.load(fd) + cifar10 = pickle.load(fd) image = cifar10['data'].astype(dtype=np.uint8) image = image.reshape((-1, 3, 32, 32)) label = np.asarray(cifar10['labels'], dtype=np.uint8) @@ -129,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, dev = device.get_default_device() else: print('Using GPU') - dev = device.create_cuda_gpu() + dev = device.create_cuda_gpu_on(2) net.to_device(dev) opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay) @@ -137,16 +138,27 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, opt.register(p, specs) tx = tensor.Tensor((batch_size, 3, 32, 32), dev) - ty = tensor.Tensor((batch_size,), dev, tensor.int32) + ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt) train_x, train_y, test_x, test_y = data num_train_batch = train_x.shape[0] // batch_size num_test_batch = test_x.shape[0] // batch_size idx = np.arange(train_x.shape[0], dtype=np.int32) - for epoch in range(max_epoch): + fileTimeLog =open("epochTimeLog.text","a") + for epoch in range(1): np.random.shuffle(idx) loss, acc = 0.0, 0.0 print('Epoch %d' % epoch) - for b in range(num_train_batch): + print(datetime.now().timetz()) # miliseconds + print(int(round(time.time()*1000))) + fileTimeLog.write('Epoch %d: ' % epoch) + fileTimeLog.write(str(int(round(time.time()*1000)))) + fileTimeLog.write('\n') + for b in range(10): #num_train_batch): + print ("start of iteration %d: " %b) + #time.sleep(1) + fileTimeLog.write('iteration %d: ' % b) + fileTimeLog.write(str(int(round(time.time()*1000)))) + fileTimeLog.write('\n') x = train_x[idx[b * batch_size: (b + 1) * batch_size]] y = train_y[idx[b * batch_size: (b + 1) * batch_size]] tx.copy_from_numpy(x) @@ -164,7 +176,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, print(info) loss, acc = 0.0, 0.0 - for b in range(num_test_batch): + for b in range(0): x = test_x[b * batch_size: (b + 1) * batch_size] y = test_y[b * batch_size: (b + 1) * batch_size] tx.copy_from_numpy(x) @@ -175,14 +187,16 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, print('test loss = %f, test accuracy = %f' % ((loss / num_test_batch), (acc / num_test_batch))) + fileTimeLog.close() net.save('model', 20) # save model params into checkpoint file if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train dcnn for cifar10') - parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'], - default='vgg') + parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet', 'caffe'], + default='alexnet') parser.add_argument('data', default='cifar-10-batches-py') parser.add_argument('--use_cpu', action='store_true') + parser.add_argument('batch_size',type=int, default=100) args = parser.parse_args() assert os.path.exists(args.data), \ 'Pls download the cifar10 dataset via "download_data.py py"' @@ -194,22 +208,22 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, net = caffe_net.create_net(args.use_cpu) # for cifar10_full_train_test.prototxt train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) # for cifar10_quick_train_test.prototxt # train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004, # use_cpu=args.use_cpu) - elif args.model == 'cnn': + elif args.model == 'alexnet': train_x, test_x = normalize_for_alexnet(train_x, test_x) - net = cnn.create_net(args.use_cpu) + net = alexnet.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) elif args.model == 'vgg': train_x, test_x = normalize_for_vgg(train_x, test_x) net = vgg.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) else: train_x, test_x = normalize_for_alexnet(train_x, test_x) net = resnet.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) \ No newline at end of file From 611ad3ccf15f8a281465445798257440dbe370b2 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 15:33:39 +0800 Subject: [PATCH 03/19] revert back train.py --- examples/cifar10/train.py | 50 ++++++++++++++------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py index d785aa84c1..b2ab4af689 100644 --- a/examples/cifar10/train.py +++ b/examples/cifar10/train.py @@ -31,25 +31,24 @@ import os import argparse -# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python')) from singa import utils from singa import optimizer from singa import device from singa import tensor -from singa.proto import core_pb2 from caffe import caffe_net -import alexnet +import cnn import vgg import resnet -from datetime import datetime -import time def load_dataset(filepath): print('Loading data file %s' % filepath) with open(filepath, 'rb') as fd: - cifar10 = pickle.load(fd) + try: + cifar10 = pickle.load(fd, encoding='latin1') + except TypeError: + cifar10 = pickle.load(fd) image = cifar10['data'].astype(dtype=np.uint8) image = image.reshape((-1, 3, 32, 32)) label = np.asarray(cifar10['labels'], dtype=np.uint8) @@ -130,7 +129,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, dev = device.get_default_device() else: print('Using GPU') - dev = device.create_cuda_gpu_on(2) + dev = device.create_cuda_gpu() net.to_device(dev) opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay) @@ -138,27 +137,16 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, opt.register(p, specs) tx = tensor.Tensor((batch_size, 3, 32, 32), dev) - ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt) + ty = tensor.Tensor((batch_size,), dev, tensor.int32) train_x, train_y, test_x, test_y = data num_train_batch = train_x.shape[0] // batch_size num_test_batch = test_x.shape[0] // batch_size idx = np.arange(train_x.shape[0], dtype=np.int32) - fileTimeLog =open("epochTimeLog.text","a") - for epoch in range(1): + for epoch in range(max_epoch): np.random.shuffle(idx) loss, acc = 0.0, 0.0 print('Epoch %d' % epoch) - print(datetime.now().timetz()) # miliseconds - print(int(round(time.time()*1000))) - fileTimeLog.write('Epoch %d: ' % epoch) - fileTimeLog.write(str(int(round(time.time()*1000)))) - fileTimeLog.write('\n') - for b in range(10): #num_train_batch): - print ("start of iteration %d: " %b) - #time.sleep(1) - fileTimeLog.write('iteration %d: ' % b) - fileTimeLog.write(str(int(round(time.time()*1000)))) - fileTimeLog.write('\n') + for b in range(num_train_batch): x = train_x[idx[b * batch_size: (b + 1) * batch_size]] y = train_y[idx[b * batch_size: (b + 1) * batch_size]] tx.copy_from_numpy(x) @@ -176,7 +164,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, print(info) loss, acc = 0.0, 0.0 - for b in range(0): + for b in range(num_test_batch): x = test_x[b * batch_size: (b + 1) * batch_size] y = test_y[b * batch_size: (b + 1) * batch_size] tx.copy_from_numpy(x) @@ -187,16 +175,14 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, print('test loss = %f, test accuracy = %f' % ((loss / num_test_batch), (acc / num_test_batch))) - fileTimeLog.close() net.save('model', 20) # save model params into checkpoint file if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train dcnn for cifar10') - parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet', 'caffe'], - default='alexnet') + parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'], + default='vgg') parser.add_argument('data', default='cifar-10-batches-py') parser.add_argument('--use_cpu', action='store_true') - parser.add_argument('batch_size',type=int, default=100) args = parser.parse_args() assert os.path.exists(args.data), \ 'Pls download the cifar10 dataset via "download_data.py py"' @@ -208,22 +194,22 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, net = caffe_net.create_net(args.use_cpu) # for cifar10_full_train_test.prototxt train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004, - use_cpu=args.use_cpu,batch_size=args.batch_size) + use_cpu=args.use_cpu) # for cifar10_quick_train_test.prototxt # train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004, # use_cpu=args.use_cpu) - elif args.model == 'alexnet': + elif args.model == 'cnn': train_x, test_x = normalize_for_alexnet(train_x, test_x) - net = alexnet.create_net(args.use_cpu) + net = cnn.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004, - use_cpu=args.use_cpu,batch_size=args.batch_size) + use_cpu=args.use_cpu) elif args.model == 'vgg': train_x, test_x = normalize_for_vgg(train_x, test_x) net = vgg.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005, - use_cpu=args.use_cpu,batch_size=args.batch_size) + use_cpu=args.use_cpu) else: train_x, test_x = normalize_for_alexnet(train_x, test_x) net = resnet.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4, - use_cpu=args.use_cpu,batch_size=args.batch_size) \ No newline at end of file + use_cpu=args.use_cpu) From 9974d5dc751a7643fbb0d548d604df85d150ec28 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 15:37:31 +0800 Subject: [PATCH 04/19] revert back train.py --- examples/cifar10/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py index b2ab4af689..6e343540c3 100644 --- a/examples/cifar10/train.py +++ b/examples/cifar10/train.py @@ -212,4 +212,4 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, train_x, test_x = normalize_for_alexnet(train_x, test_x) net = resnet.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu) \ No newline at end of file From 7309dc0ffc7981db35af5eeb10aababe0e284d65 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 15:41:06 +0800 Subject: [PATCH 05/19] update train iteration number --- examples/cifar10/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py index 6e343540c3..4a4d94fe30 100644 --- a/examples/cifar10/train.py +++ b/examples/cifar10/train.py @@ -142,11 +142,11 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, num_train_batch = train_x.shape[0] // batch_size num_test_batch = test_x.shape[0] // batch_size idx = np.arange(train_x.shape[0], dtype=np.int32) - for epoch in range(max_epoch): + for epoch in range(1): np.random.shuffle(idx) loss, acc = 0.0, 0.0 print('Epoch %d' % epoch) - for b in range(num_train_batch): + for b in range(20): x = train_x[idx[b * batch_size: (b + 1) * batch_size]] y = train_y[idx[b * batch_size: (b + 1) * batch_size]] tx.copy_from_numpy(x) @@ -164,7 +164,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, print(info) loss, acc = 0.0, 0.0 - for b in range(num_test_batch): + for b in range(0): x = test_x[b * batch_size: (b + 1) * batch_size] y = test_y[b * batch_size: (b + 1) * batch_size] tx.copy_from_numpy(x) From 25fe79dbe17382ccad07c03a8b6956d7af0b8d51 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 16:03:50 +0800 Subject: [PATCH 06/19] change cnmem src, common.h, common.cc and the cmakelist --- .DS_Store | Bin 0 -> 8196 bytes CMakeLists.txt | 2 +- include/singa/core/common.h | 26 +++++----- src/.DS_Store | Bin 0 -> 8196 bytes src/CMakeLists.txt | 1 + src/core/.DS_Store | Bin 0 -> 6148 bytes src/core/common/common.cc | 98 ++++++++++++++++++++++++++++++++++++ 7 files changed, 114 insertions(+), 13 deletions(-) create mode 100644 .DS_Store create mode 100644 src/.DS_Store create mode 100644 src/core/.DS_Store create mode 100644 src/core/common/common.cc diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e74703a65a58eddc3bf6ead31e5ced542081bb7d GIT binary patch literal 8196 zcmeHMTWl0n82=R4;= z|37E{?|d^SXBGgkrJ&aXQ~>~^OCVoP)isJ37uSLk2`r_ANb%t2GM1B}-iMjo0`Jfe zG7vHlG7vHlG7vKGe_(+2Y*E-E`@S3v>yUwvfg6$m@qS3qB`_7>q(uMKL6v_5AQVRc zzfhZt24Nz=RDhEbX@m;OP(m4s!4U&xIN1|{Un;;!i8358IDB9*GX^IV^k=90CwzCn zl*F(O83-9z$bfi!O2CCYWHX~zpWiLh^^(c2Fj7*wWa%=dC`>71C%T87(QZ%k3T}&* zKj!yL-N*#(wBuMaZ7p-uG)4#c>Q=}0G}AWnSGgbWHErD)ZE*^=?)vvRqIts<>rXVu zqvPWZ^}Fj5^}8A->JsDi$sNhML}SCwi3!FkYwP!QoEjNBJAQ8B!53so7`zssd|rwd zX7jV$VypZ<5qUn7llgr;lb^ag)^V^?9ptNu^s~Eip6S@`Q6ujX4)SuB67R@5_L00H z#(T4lRT#0|L0*y7tZc#3JfqpNy3MnO+vgc)Jkc6(oxIoTJNB6rD{O5cxhWj-)YuLJJ5A_yJ z*UVT(+SW#l3LI0Esj{_|RnePcvBg~b3K{cs!Su#>4P#S$WqFxe9lKTGSJ-i#e#K!& z7c=SpNUS&h86O)s)*ua3p~ zg}qtV4V^JoliDa(+oz!>m6Sb&wvs4pqb=(mI1UEnU<{ssC*di$0I$MjxB_p&`|vS* z3ZKE}@GblZKf|x^JNyBE!e7Wx!R5FDqqrVxa04cB5AMY#+=ngLiHGoB?7}pjz>|0i zH5|emo<g1!o2x-7Wi{SIsxr1_#?V_`M-Vf@Bi0T&S5ek10e%*8NkxE?zR@PE%w$IXYB-Cr|1$z z_)SXmL#XnP #include #include "singa/utils/logging.h" - +#include #ifdef USE_CUDA #include #include @@ -52,24 +52,25 @@ typedef struct _Cuda { } Cuda; typedef struct _Opencl { } Opencl; } // namespace lang +class Device; /// Block represent a chunk of memory (on device or host). class Block { public: - Block(void* ptr, size_t size, size_t offset = 0) - : data_(ptr), size_(size), offset_(offset) { + Block(void* ptr, size_t size, size_t offset = 0, Device* ptrDevice = nullptr) + : data_(ptr), size_(size), offset_(offset), ptrDevice_(ptrDevice) { ref_count_ = 1; // std::make_shared>(1); } // Disabled as it is not used currently. // Block(void* ptr, size_t size, size_t offset, std::shared_ptr> // ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {} - void* mutable_data() { - initialized_ = true; - return static_cast(data_) + offset_; - } - const void* data() const { - CHECK(initialized_) << "Must initialize data before reading it"; - return static_cast(data_) + offset_; - } + void* mutable_data() ; + + const void* data() const; + + void* get_data() ; + + void update_data(void* data_new) ; + size_t size() const { return size_; } size_t offset() const { return offset_; } int IncRefCount() { @@ -89,6 +90,7 @@ class Block { void* data_ = nullptr; size_t size_ = 0; size_t offset_ = 0; + Device* ptrDevice_; bool initialized_ = false; // Disabled as it is not used currently. // std::shared_ptr> ref_count_ = nullptr; @@ -114,4 +116,4 @@ typedef struct _Context { } Context; } // namespace singa -#endif // SINGA_CORE_COMMON_H_ +#endif // SINGA_CORE_COMMON_H_ \ No newline at end of file diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a87953d857c6cf9354df29d312f03c027a6411d3 GIT binary patch literal 8196 zcmeHMTWl0n82Ice&DxX3#+ zgbaiXgbaiXgbaiX{2v&gJzEsE#J(>_!#ZRjWZ;HmK)fGfbO}rZI3v-2bx`FW0SLtr zz%SJ1qCpr7FcIL4L>i%jGL%q;VsOMj8BX?C;Fkz+MxqP{3=SU{%#6Va1^wCS{xRPj zFd;FlLk2Wo$+=jZ|kmqpQD;LO0oV# zDPA)*)zY-5A=b3JWx63Y)f8`zH^g?f?3$ittg60gfA{IJ$+J`ErXPGkCWXO^0V=La z^1{LlEw|7izfhDsU(E39`3%*Sk?uo1YKpHZ&=2p;dZuH${YK6uOz{erQt!?<_R*Xn z#s@Nvl^?U+6tB!^Rwi$0p3!Dmedbxi9rTO|Pqc7`ATOBLfA~ zHPe=nw6!s#5+@a9u6%t}bS78K0pg9Lz~r|jO)rVOG_J; zH0@Sf8ShaB3X)A}+@~I3mDr#RW}LAx%F~tRwnzu#AFyFD_hHTTdb5U2(~Io7Y9o>3 z!rrFqhRztPOKg_w?bA?~h|8Wr+er;2%7t`fcT+ueOGw?ayJc%s<)x%u#dqGm zLyhufBvU!svQt%g8R=M#?rEjb<)m0PS5sBJUOk{D6!DO8@Pn25sLGUouD**f3va-C z@DZu*OZWkPf?wb_Qrt4Ez$(()Ef~cOcsp*vZPTtm#zUmPB=+MV z9>*cfph4;zMH6$Rz5R}qs4a7H3osQl|60>b^DSP%DqUwk)k^*1Ld+b{qC literal 0 HcmV?d00001 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7dd9bf7751..aa8d41f05b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,6 +30,7 @@ AUX_SOURCE_DIRECTORY(io io_source) AUX_SOURCE_DIRECTORY(io/network io_source) LIST(APPEND singa_sources ${io_source}) +AUX_SOURCE_DIRECTORY(core/common core_source) AUX_SOURCE_DIRECTORY(core/device core_source) AUX_SOURCE_DIRECTORY(core/memory core_source) AUX_SOURCE_DIRECTORY(core/scheduler core_source) diff --git a/src/core/.DS_Store b/src/core/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 +#include +#include +//TODO(junzhe) ifdef to counter verify +///only include mutable_data() and data() + +namespace singa { + +void* Block::mutable_data() { + //std::cout<<"mutable_data() "<AppendInfo(temp); + } + //TODO(junzhe) this should not happen, can verify and remove + if (data_ == nullptr) { + //cout<<"to sleep"<GetRealGpuPtrInfo(this); + cout<<"slept to get data_ updated: "<(data_) + offset_; + } + + +const void* Block::data() const { + CHECK(initialized_) << "Must initialize data before reading it"; + //std::cout<<"data() "<AppendInfo(temp); + } + + //TODO(junzhe) this should not happen, can verify and remove + if (data_ == nullptr) { + //cout<<"to sleep"<GetRealGpuPtrInfo(this); + cout<<"slept to get data_ updated"<SwapOutInfo(this); + // ptrDevice_->SwapInInfo(this); + + return static_cast(data_) + offset_; + } + +void* Block::get_data() { + return data_; +} + +void Block::update_data(void* data_new) { + data_ = data_new; + std::cout<<"results update_data:: "< Date: Mon, 13 Aug 2018 16:21:27 +0800 Subject: [PATCH 07/19] disable common.cc appendInfo, for device src done first. --- src/core/common/common.cc | 83 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/src/core/common/common.cc b/src/core/common/common.cc index b232398d27..73ec0b0c6e 100644 --- a/src/core/common/common.cc +++ b/src/core/common/common.cc @@ -28,27 +28,28 @@ namespace singa { void* Block::mutable_data() { + //TODO(junzhe) go back to enable it after device done //std::cout<<"mutable_data() "<AppendInfo(temp); - } - //TODO(junzhe) this should not happen, can verify and remove - if (data_ == nullptr) { - //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"slept to get data_ updated: "<AppendInfo(temp); + // } + // //TODO(junzhe) this should not happen, can verify and remove + // if (data_ == nullptr) { + // //cout<<"to sleep"<GetRealGpuPtrInfo(this); + // cout<<"slept to get data_ updated: "<(data_) + offset_; } @@ -56,30 +57,28 @@ void* Block::mutable_data() { const void* Block::data() const { CHECK(initialized_) << "Must initialize data before reading it"; - //std::cout<<"data() "<AppendInfo(temp); - } + //TODO(junzhe) go back to enable it after device done + // if (ptrDevice_!=nullptr){ + // //Append info. + // stringstream strm2; + // strm2<AppendInfo(temp); + // } + + // //TODO(junzhe) this should not happen, can verify and remove + // if (data_ == nullptr) { + // //cout<<"to sleep"<GetRealGpuPtrInfo(this); + // cout<<"slept to get data_ updated"<GetRealGpuPtrInfo(this); - cout<<"slept to get data_ updated"<SwapOutInfo(this); - // ptrDevice_->SwapInInfo(this); return static_cast(data_) + offset_; } From c07dcc6c622bc0aec0e48e7b1beab63a38c83601 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 13 Aug 2018 16:54:12 +0800 Subject: [PATCH 08/19] update device and memory family src --- include/singa/core/device.h | 171 +++++- include/singa/core/memory.h | 143 ++++- src/core/device/cuda_gpu.cc | 21 +- src/core/device/device.cc | 46 +- src/core/device/platform.cc | 6 +- src/core/device/swap_gpu.cc | 1144 +++++++++++++++++++++++++++++++++++ src/core/memory/memory.cc | 1079 ++++++++++++++++++++++++++++++++- 7 files changed, 2595 insertions(+), 15 deletions(-) create mode 100644 src/core/device/swap_gpu.cc diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 1a960d8ae7..36b10d0d56 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "singa/singa_config.h" #include "singa/core/common.h" @@ -64,6 +65,11 @@ class Device { /// Called by Tensor. void FreeBlock(Block* block); + + void AppendInfo(string blockInfo); + void* GetRealGpuPtrInfo(const Block* block_); + void SwapOutInfo(const Block* block_); + void SwapInInfo(const Block* block_); /// Return the size (bytes) of memory in use /// TODO(wangwei) override this function for all devices. @@ -102,6 +108,8 @@ class Device { int id() const { return id_; } + virtual void* GetRealGpuPtr(const Block* block_) = 0; + private: Device() {}; @@ -117,6 +125,11 @@ class Device { /// Free device memory. virtual void Free(void* ptr) = 0; + virtual void MakeMetaTable(Block* block,void* data_,int size) = 0; + virtual void Append(string blockInfo) = 0; + + virtual void SwapOut(const Block* block_) = 0; + virtual void SwapIn(const Block* block_) = 0; protected: int id_ = 0; @@ -158,6 +171,11 @@ class CppCPU : public Device { /// Free cpu memory. void Free(void* ptr) override; + void MakeMetaTable(Block* block,void* data_,int size) override {} + void Append(string blockInfo) override {} + void* GetRealGpuPtr(const Block* block_) override {} + void SwapOut(const Block* block_) override {} + void SwapIn(const Block* block_) override {} }; @@ -188,16 +206,159 @@ class CudaGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; + void MakeMetaTable(Block* block,void* data_,int size) override {} + void Append(string blockInfo) override; + void* GetRealGpuPtr(const Block* block_) override; + void SwapOut(const Block* block_) override; + void SwapIn(const Block* block_) override; private: void Setup(); private: - shared_ptr pool_; + shared_ptr pool_; }; /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host. +///SwapGPU +struct onePieceMsg{ + /* + members: [ptr, size, MallocFree, idx] + */ + string ptr; + size_t size; + int MallocFree; + int idx; + double t; + onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} +}; + +struct BlockMeta{ + /* + block Meta. + */ + Block* block_ = nullptr; + void* data_ = nullptr; + void* cpu_ptr = nullptr; + size_t size = 0; + cudaEvent_t out_event; + cudaEvent_t in_event; + cudaStream_t out_stream; + cudaStream_t in_stream; +}; + +struct SwapBlock{ + + string ptr; + string cat; //A1, A2, A3... + int name; + size_t size; + int r_idx; //out idx + int d_idx; //in idx + double r_time; // out time + double d_time; //in time + double dt; //delta t: t2'-t1' + double pri; //look at here if big enough TODO(junzhe) + double dto; //t2-t1 + double wdto = 0; //t2-t1 weighted by swap_load + double r_idx_ready; //r_idx + buffer, could be set during selection. + //int free = -1; //when it is freed + //below as per planned. + int i1; + int i1p; + int i2; + int i2p; + double t1; + double t2; + double t1p; + double t2p; + SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): + ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {} +}; +/// Device able to Swap memory between Nvidia GPU and Swap +class SwapGPU : public Device { + public: + ~SwapGPU(); + /// Construct the device using default mem pool setting. + SwapGPU(int id = 0); + /// Construct the device given the physical device ID and memory pool. + SwapGPU(int id, std::shared_ptr pool); + + void SetRandSeed(unsigned seed) override; + size_t GetAllocatedMem() override; + + protected: + void DoExec(function&& fn, int executor) override; + + void CopyToFrom(void* dst, const void* src, size_t nBytes, + CopyDirection direction, Context* ctx) override; + + /// Allocate cpu memory. + void* Malloc(int size) override; + + /// Free cpu memory. + void Free(void* ptr) override; + void MakeMetaTable(Block* block,void* data_,int size) override; + int swap_test(vectorvec_block,int &maxLen, int &location); + void swap_sched(vectorvec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); + void swap_plan(); + vector swap_select(vectorvec_swap,double maxLoad,double memLimit,string mode); + vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); + void Test_sched_switch_swap(); + void DeploySwap(); + void Append(string blockInfo) override; + void* GetRealGpuPtr(const Block* block_) override; + void SwapOut(const Block* block_) override; + void SwapIn(const Block* block_) override; + + //changed to intake data_ instead + void SwapOut_idx(const int r_idx); + void SwapIn_idx(const int r_idx); + + private: + void Setup(); + ///Tables needed + //r_idx->BlockMeta + mapTable_meta; + mapTable_block_meta; //TODO(junzhe) for measure speed only. + mapTable_not_at_device; //int refers to its r_idx of the block/meta + //mapTable_block_size; //Table block_ -> size TODO(junzhe) no need, can call block_->size() + + //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D. + map>Table_sched; // changed to with sync_r_idx + + + //vec_block + vectorvec_block; //iteration 0-3 + vectorvec_block_fresh; //iteration 4 5 6 + vectorglobal_load; + vectororigin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use. + vectorvec_run; + int asyncSwapFlag = 0; //0 for sync, 1 for async. + int testFlag = 0; //0 means open for test, 1 means no need test anymore. + int gc = 0; //global counter each time Malloc/Free, add 1. + int globeCounter = -1; + int maxLen = 0; + int location = 0; + //design requirement + float memLimit_ratio = 0.70; + size_t smallest_block = 1<<20; //1 MB + int data_buffer = 4; // used to control readyIdx + int mutable_data_buffer = 6; + double maxLoad; + int maxIdx; + double total_swapInTime = 0; + double total_swapOutTime = 0; + double tempTime = 0; + double tempTime2 = 0; + double tempTime_baseline; //vec_run[0] time + int maxLen_threshold = 1000; + + private: + shared_ptr pool_; +}; + #endif // USE_CUDA #ifdef USE_OPENCL @@ -248,6 +409,12 @@ class OpenclDevice : public singa::Device { /// Converts the void pointer into a Buffer object, then deletes the object. /// This has the effect of freeing up device memory. void Free(void* ptr) override; + void MakeMetaTable(Block* block,void* data_,int size) override {} + void Append(string blockInfo) override {} + void* GetRealGpuPtr(const Block* block_) override {} + void SwapOut(const Block* block_) override {} + void SwapIn(const Block* block_) override {} + private: @@ -338,4 +505,4 @@ class Platform { } // namespace singa -#endif // SINGA_CORE_DEVICE_H_ +#endif // SINGA_CORE_DEVICE_H_ \ No newline at end of file diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h index f664f95ced..cc40d0433c 100644 --- a/include/singa/core/memory.h +++ b/include/singa/core/memory.h @@ -23,6 +23,17 @@ #include #include "singa/proto/core.pb.h" #include "singa/singa_config.h" +//for SmartMemPool +#include +#include +#include +#include +#include +#include +#include +#include /* malloc, free, rand */ +#include +using namespace std; #ifdef USE_CUDA #include "cnmem.h" @@ -38,7 +49,10 @@ class DeviceMemPool { public: virtual void Malloc(void** ptr, const size_t size) = 0; virtual void Free(void* ptr) = 0; - + virtual void Append(string blockInfo) = 0; + + virtual void SwapOut(void* data_) = 0; + virtual void SwapIn(void* data_) = 0; /// Return a pair for free and total memory managed by this pool. virtual std::pair GetMemUsage() { return std::make_pair(0u, 0u); @@ -60,7 +74,10 @@ class CnMemPool : public DeviceMemPool { void Malloc(void** ptr, const size_t size); void Free(void* ptr); - + void Append(string blockInfo){} + + void SwapOut(void* data_) override {} + void SwapIn(void* data_) override {} std::pair GetMemUsage() override; // release all memory and set cnmem manager to unintialized @@ -85,7 +102,127 @@ class CudaMemPool : public DeviceMemPool { public: void Malloc(void** ptr, const size_t size) override; void Free(void* ptr) override; +void Append(string blockInfo){} + + void SwapOut(void* data_) override {} + void SwapIn(void* data_) override {} +}; + +//for SmartMemPool +struct lookUpElement{ + /* + for memory pool Malloc look-up table. + */ + int r_idx; + int d_idx; + size_t size; + size_t offset; + void* ptr; + int Occupied; //0 is free, 1 is occupied. + int crossItr; + int Occupied_backup; +}; + +///class mem-pool SmartMemPool +class SmartMemPool: public DeviceMemPool { +public: + SmartMemPool(const MemPoolConf &conf); //constructor + //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. + void Malloc(void** ptr, const size_t size); + void Free(void* ptr); + ~SmartMemPool(); + void getMaxLoad(void); + std::pair GetMemUsage() override; + void Append(string blockInfo); + + void SwapOut(void* data_) override {} + void SwapIn(void* data_) override {} +protected: + void Init(); +private: + MemPoolConf conf_; + // whether the (global) memory pool has been initialized + bool initialized_ = false; + // lock on the initialized variable + std::mutex mtx_; + + string colorMethod; + int mallocFlag =0; //0 for cudaMalloc, 1 for coloringMalloc + int gc =0; //global counter each time Malloc/Free, add 1. + int globeCounter=-1; + int loadLogFlag =1; //record when its 1. + void* ptrPool = NULL; + int idxRange = 0; + size_t offset = 0; + size_t offsetCrossItr=0; //cross iteration offset. + int maxLen =0; + int location=0; + vector vec; + vector vec_block_RW; + vector vec_block_RWMF; + mapTable_r2d; //full duration info, cross-iteration duration. + mapTable_d2r; + //mapTable_r2Ver; + vector>Vec_r2Ver; //b. replace Table_r2Ver + map>Table_load; //gc, + mapTable_p2s; //For tracking load in Free. add when allocate, delete when deallocate. + mapTable_p2r; //ptr for arrival idx, for look up Table during free + int checkPoint=300; //for reduce number of test. + size_t maxTotalLoad; + size_t maxMemUsage; + float memRatio; +}; + + +//for Swap +struct swapLookUpElement{ + /* + book keep the block info and status + */ + void* data_ = nullptr; + void* realGpuPtr = nullptr; + void* realCpuPtr = nullptr; + + int location; //1 is at GPU, 2 is at CPU. 3 on the way C2G, 4 on the way G2C. + size_t size; //size may used as of now. +}; + +struct SwapMeta{ + /* + for copy between block and info. + */ + size_t swapSize; + void* ptr; + void* d_ptr; //not used for +}; + +class Swap : public DeviceMemPool { +public: + Swap(const MemPoolConf &conf); //constructor + //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. + void Malloc(void** ptr, const size_t size); + void Free(void* ptr); + ~Swap(); + void getMaxLoad(void); + std::pair GetMemUsage() override; + void Append(string blockInfo); + + void SwapOut(void* data_); + void SwapIn(void* data_); +protected: + void Init(); +private: + MemPoolConf conf_; + // whether the (global) memory pool has been initialized + bool initialized_ = false; + // lock on the initialized variable + std::mutex mtx_; + vector vec_block; + size_t swapLimit = 1<<23; //8MB + mapTable_id2LookUpElement; //old TODO(junzhe) remove + map>Table_Meta; }; + #endif } // namespace singa -#endif // SINGA_CORE_MEMORY_H_ +#endif // SINGA_CORE_MEMORY_H_ \ No newline at end of file diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc index f6603d3632..16de1ef5ca 100644 --- a/src/core/device/cuda_gpu.cc +++ b/src/core/device/cuda_gpu.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include "singa/core/device.h" #include "singa/utils/cuda_utils.h" namespace singa { @@ -47,7 +48,7 @@ const int kNumCudaStream = 1; CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) { MemPoolConf conf; conf.add_device(id); - pool_ = std::make_shared(conf); + pool_ = std::make_shared(conf); Setup(); } @@ -122,5 +123,21 @@ void CudaGPU::Free(void* ptr) { } } +void CudaGPU::Append(string blockInfo){ + pool_->Append(blockInfo); +} + +void* CudaGPU::GetRealGpuPtr(const Block* block_){ + return nullptr; +} + +void CudaGPU::SwapOut(const Block* block_){ + +} + +void CudaGPU::SwapIn(const Block* block_){ + +} + } // namespace singa -#endif // USE_CUDA +#endif // USE_CUDA \ No newline at end of file diff --git a/src/core/device/device.cc b/src/core/device/device.cc index cda1b9f942..b2988f3615 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -17,6 +17,9 @@ */ #include "singa/core/device.h" +#include +#include +#include namespace singa { Device::Device(int id, int num_executors) @@ -37,7 +40,11 @@ Block* Device::NewBlock(int size) { << "from size_t to int. In that case, the size is too large."; if (size > 0) { void* ptr = Malloc(size); - return new Block(ptr, size); + Block* block_ = new Block(ptr, size,0,this); + //std::cout<<"(reference) from device.cc after, data_, block_ device: "<mutable_data()); + //TODO(junzhe) to merge it + auto tempPtr = block->mutable_data(); + //cout<<"FreeBlock: "<mutable_data()); + + //Add Append for free here. + stringstream strm1; + strm1< &devices, size_t init_size) { vector > ret; for (auto device : devices) { - auto dev = std::make_shared(device, pool); + auto dev = std::make_shared(device, pool); ret.push_back(dev); } return ret; @@ -170,14 +170,12 @@ Platform::CreateOpenclDevices(const size_t num_devices) { } return (int)total_num_devices; } - static const std::vector> Platform::CreateOpenclDevices(const std::vector &id) { - } */ #endif // USE_OPENCL } // namespace singa -#endif +#endif \ No newline at end of file diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc new file mode 100644 index 0000000000..694d1d4806 --- /dev/null +++ b/src/core/device/swap_gpu.cc @@ -0,0 +1,1144 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "singa/singa_config.h" +#ifdef USE_CUDA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // std::tuple, std::get, std::tie, std::ignore +#include "singa/core/device.h" +#include "singa/utils/cuda_utils.h" + + +using namespace std; +namespace singa { + +const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice, + cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToDevice}; + +///functions to be used +///Section for structs and respective sorting function: +// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm + + + +struct less_than_ptrIdx{ + /* + sort onePieceMsg by ptr and then idx. + */ + inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2) + { + return ((struct1.ptr swap_split(string s, string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + string token; + vector res; + while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; +} + +///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg +//vector of pairMsg is used in run. +//vector of iterMsg is used in test. + +vector swap_strVec_2_pieceMsgVec(vector vec, int &idxRange){ + /* + convert vector of string into vector of onePieceMsg, sorted by ptr + and then idx, and update idxRange to pieceMsgVec size. + format of onePieceMsg [ptr, size/-1, flag, idx, timestamp] + flag: 1 for malloc, -1 for free, 2 for read, 3 for layer,4 for mutable + version on 5/29, with equval blockInfo length: flag, block_, size, t + */ + vectoronePieceMsgVec_; + + for (int i=0;i v = swap_split(vec[i], " "); + int MallocFree; + if (v[0]=="Malloc"){ + MallocFree = 1; + }else if (v[0]=="Free"){ + MallocFree = -1; + }else if (v[0]=="Mutable"){ + MallocFree = 4; + }else if (v[0]=="Read"){ + MallocFree = 2; + }else if (v[0]=="Layer"){ + MallocFree = 3; + } + //onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + size_t result; + stringstream convert(v[2]); + if (!(convert>>result)){ + result =-1; + cout<<"error for converting size from str to int."<>tempTime; + tempMsg.t =tempTime; + onePieceMsgVec_.push_back(tempMsg); + } + + sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx()); + idxRange = static_cast(onePieceMsgVec_.size()); + + return onePieceMsgVec_; +}// end of strVec_2_pieceMsgVec function + + +vector Swap_piece2rep (vectoronePieceMsgVec_){ + vectoroneIterMsgVec_; + string tempStr; + int tempIdx=0; + for (int i=0;irep; // vector of size_delta, name it as rep for simlisity. + for (int i =0; irep, int &maxLen, int &location, int maxLen_threshold, int gc ){ + int idxRange = (int)rep.size(); + int threshold = maxLen_threshold; + vector>maxLen_location; + + for (int i=0; ithreshold){ + break; + } + for (int len=1; len<(idxRange-i);len++){ + if (maxLen>threshold){ + break; + } + if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLenstruct2.dto); + } +}; + +struct less_than_wdto{ + /* + sort SwapBlock by weighted dto, descending + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.wdto>struct2.wdto); + } +}; + +struct less_than_r_idx_ready{ + /* + sort SwapBlock by r_idx_ready, ascending + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.r_idx_readystruct2.pri); + } +}; + +struct less_than_Idx_Swap{ + /* + sort onePieceMsg_Swap by idx. + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.r_idxstruct2.d_idx); + } +}; + + +pair load_over_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx,int maxLen){ + //input: vec_load, memLimit, range [start_idx, end_idx) + //return range overlimit [first_over_limit, first_below_limit) + int first_over_limit = start_idx; + int first_below_limit = end_idx; + + for (int i = start_idx+maxLen; i < end_idx+maxLen; i++){ + if (vec_load[i] > memLimit){ + first_over_limit = i-maxLen; + break; + } + } + + for (int i = end_idx+maxLen; i > first_over_limit+maxLen; i--){ + if (vec_load[i] > memLimit){ + first_below_limit = i-1-maxLen; + break; + } + } + if (first_over_limit == start_idx) first_over_limit = -1; + if (first_below_limit == end_idx) first_below_limit = -1; + + return std::make_pair(first_over_limit, first_below_limit); +} + +pair load_below_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){ + //input: vec_load, memLimit, range [start_idx, end_idx] + //return range overlimit [first_over_limit, first_below_limit) + int first_below_limit = maxIdx; + int last_below_limit = maxIdx; + + for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){ + if (vec_load[i] > memLimit){ + first_below_limit = i+1-maxLen; + break; + } + } + + for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){ + if (vec_load[i] > memLimit){ + last_below_limit = i-1-maxLen; + break; + } + } + + return std::make_pair(first_below_limit, last_below_limit); +} + +pair load_peak(vectorvec_load_test,int maxLen){ + double maxLoad_test = 0; + int maxIdx_test = 0; + for (int i = maxLen; i < maxLen*2; i++){ + if (maxLoad_test < vec_load_test[i]){ + maxLoad_test = vec_load_test[i]; + maxIdx_test = i - maxLen; + } + } + return std::make_pair(maxLoad_test,maxIdx_test); +} + +void load_update(vector& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){ + //update load [start_idx, end_idx) by plusMinus*size + //if (start_idx < end_idx){ + for (int i = start_idx+maxLen; i(size) * plusMinus; + } + // } else { + // for (int i = start_idx; i < maxLen; i++){ + // vec_load[i] = vec_load[i] + static_cast(size) * plusMinus; + // } + // for (int i = 0; i < end_idx; i++){ //TODO(junzhe) NOTE, end_idx excluded + // vec_load[i] = vec_load[i] + static_cast(size) * plusMinus; + // } + // } +} + +vector SwapGPU::swap_select(vectorvec_swap,double maxLoad,double memLimit,string mode){ + vectorvec_swap_selct; + vectorvec_swap_reject; + if (mode == "dto"){ + sort(vec_swap.begin(),vec_swap.end(),less_than_dto()); + } + if (mode == "pri"){ + sort(vec_swap.begin(),vec_swap.end(),less_than_pri()); + } + if (mode == "wdto"){ + //TODO(junzhe) time complexity + for (int i = 0; i < vec_swap.size(); i++){ + auto itm = vec_swap[i]; + for (int j = itm.r_idx; j < itm.d_idx; j++){ + itm.wdto += origin_load[i+maxLen] - memLimit; + } + } + sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); + } + + size_t load_swap_selct = 0; + if (mode != "r_idx"){ + for (int i =0; iload_swap_selct){ + vec_swap_selct.push_back(vec_swap[i]); + load_swap_selct+=vec_swap[i].size; + //cout<<"Item selected: (r_idx, d_idx, dto) "<(vec_swap[i].dto/1000000)<load_swap_selct){ + + // } + // for (int i =0; i SwapGPU::swap_load_ideal(vectorvec_load,vector vec_swap_selct){ + auto vec_load_return = vec_load; + for (int i =0; ivec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode){ + /* + update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit. + */ + //TODO(junzhe) wordy, can merge in common part. + if (mode == "no-overhead"){ + //update i1p + //sort by r_idx for i1p update + sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); + for (int i = 0; i 0){ + readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); + } + itm.i1 = readyIdx; + itm.t1 = vec_run[readyIdx].t; + itm.t1p = itm.t1 + SwapOutTime(itm.size); + while (itm.t1p > vec_run[readyIdx].t){ + readyIdx++; + } + itm.i1p = readyIdx; + vec_swap_selct[i] = itm; + } + //update i2p + sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs()); + for (int i =0; i 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } + itm.i2 = needIdx; + double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size); + while (prepareTime < vec_run[needIdx].t){ + needIdx--; + } + itm.i2p = needIdx; + itm.t2p = prepareTime; + vec_swap_selct[i] = itm; + load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary + } + + } + if (mode == "stick-to-limit"){ + sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); + for (int i = 0; i 0){ + readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); + } + cout<<"||compare with last i1p "< vec_run[readyIdx].t){ //TODO(junzhe) reduce time complexity. + readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead. + } + //get min compare with maxIdx and readyIdx. + readyIdx = std::min(maxIdx,readyIdx); + cout<<"||count swap time "< 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } + cout<<"||compare with last i2p "< 0){ + cout< itm.t2p)) { + overhead+=(vec_run[tempOverLimit_.second].t - itm.t2p); + load_update(vec_load_temp,itm.i2p,tempOverLimit_.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary + itm.i2p = tempOverLimit_.second+1; + auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen); + } + cout<<"||count overlimit "<vec_block,int &maxLen, int &location){ + + ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx. + int idxRange = 0; + vector vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange); + cout<<"size of vec_pieceMsg & vec_block: "< vec_rep = Swap_piece2rep(vec_pieceMsg); + //int idxRange3=0; //rename TODO(junzhe) + //int maxLen=0, location =0; + repPatternDetector(vec_rep,maxLen,location,maxLen_threshold,gc); + cout<<"maxLen and location are: "< v = swap_split(vec_block[location+i], " "); + if (v[0]=="Malloc"){ + shift_counter = i; + break; + } + } + location =location+shift_counter; + cout<<"shift_counter is "< vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange); + cout<<"size of vec_pieceMsg & vec_block: "<temp_vec_run(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]); + vec_run = temp_vec_run; + fstream file_vec_run("vec_run.csv", ios::in|ios::out|ios::app); + for (int i =0; ivec_load(&global_load[location],&global_load[location+3*maxLen]); + origin_load = vec_load; + //load before swap, write in + fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); + for (int i=0; ivec_swap; + size_t load_swap = 0; + for (int i =1; i= smallest_block) && (vec_run[i-1].idxmaxIdx) + && (vec_run[i-1].ptr ==vec_run[i].ptr) + && ((vec_run[i-1].MallocFree==3) or (vec_run[i-1].MallocFree==2) or (vec_run[i-1].MallocFree==4))) + { + SwapBlock itm(vec_run[i].ptr, vec_run[i].size, vec_run[i-1].idx, vec_run[i].idx, vec_run[i-1].t, vec_run[i].t); + itm.dto = itm.d_time-itm.r_time; + itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size); + if (itm.dt>=0){ + itm.pri = itm.dt * itm.size; + } else { + itm.pri = itm.dt * 1/itm.size; + } + //cat A + if (vec_run[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } + if (vec_run[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} + if (vec_run[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} + + vec_swap.push_back(itm); + load_swap+=itm.size; + cout<<"Items Swappable: (r_idx, d_idx, cat, MB, dt/us, PS) || "<(conf); + Setup(); + +} + +SwapGPU::SwapGPU(int id, std::shared_ptr pool) + : Device(id, kNumCudaStream) { + CHECK(pool != nullptr); + pool_ = pool; + Setup(); +} + +void SwapGPU::Setup() { + lang_ = kCuda; + ctx_.stream = NULL; // use the default sync stream + // TODO(wangwei) create one handle for each steam? + CUDA_CHECK(cudaSetDevice(id_)); + // use curandCreateGeneratorHost for CudaHost device + CURAND_CHECK( + curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT)); + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + SetRandSeed(seed); + // TODO(wangwei) if one generator per stream, then need diff offset per gen? + CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0)); + CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle))); + +#ifdef USE_CUDNN + // TODO(wangwei) create one handle for each stream? + auto status = cudnnCreate(&ctx_.cudnn_handle); + CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status); +#endif // USE_CUDNN +} + +void SwapGPU::SetRandSeed(unsigned seed) { + CHECK(ctx_.curand_generator); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed)); +} + +void SwapGPU::DoExec(function&& fn, int executor) { fn(&ctx_); } + +void SwapGPU::CopyToFrom(void* dst, const void* src, size_t nBytes, + CopyDirection direction, Context* ctx) { + cudaMemcpy(dst, src, nBytes, copyKind[direction]); + // TODO(wangwei) use async copy + // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream); +} + +size_t SwapGPU::GetAllocatedMem() { + if (pool_ != nullptr) { + auto ret = pool_->GetMemUsage(); + return ret.second - ret.first; + } + LOG(ERROR) << "The memory pool is not set"; + return 0u; +} + +/// Allocate gpu memory. +void* SwapGPU::Malloc(int size) { + + void* ptr = nullptr; + if (size > 0) { + CUDA_CHECK(cudaSetDevice(id_)); + pool_->Malloc((void**)&ptr, size); + // TODO(wangwei) remove the memset. + CUDA_CHECK(cudaMemset(ptr, 0, size)); + } + //cout<<"malloc done"<Free(ptr); + } + + //cout<<"free done"< maxLen_threshold) { + testFlag = 1; + cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<vec_load2(&global_load[location],&global_load[location+3*maxLen]); + origin_load = vec_load2; + //load before swap, write in + fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); + for (int i=0; i(Table_sched.find(r_gc)->second); + auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second); + auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second); + auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second); + if (swap_dir == 0){ SwapOut_idx(swap_idx); + cout<<"Swap Out "<second; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaEventSynchronize(last_meta.in_event); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. + last_meta.block_->update_data(nullptr); + cout<<"to free data_"<Free(last_meta.data_); + last_meta.data_ = nullptr; //not really needed TODO(junzhe) + cout<<"sync out "<second = last_meta; + } + if (sync_dir == 1){ + //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe) + auto last_meta = Table_meta.find(sync_idx)->second; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaEventSynchronize(last_meta.out_event); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + Table_not_at_device.erase(last_meta.block_); + last_meta.block_->update_data(last_meta.data_); + cout<<"sync in "<second = last_meta; + } + cout<<"-------"< v = swap_split(blockInfo, " "); + void* tempPtr; + stringstream convert(v[1]); + convert>>tempPtr; + auto tempBlock_ = static_cast(tempPtr); + + // insert size, malloc : flag, block_, size, t; others: insert size t. + if (v.size() != 4) { + stringstream strm1; + strm1<size(); + string tempStr1 = strm1.str(); + blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2]; + } + // update global load + if (maxLen < maxLen_threshold){ + if (v[0] == "Malloc"){ + if (global_load.size()>0){ + global_load.push_back(global_load[global_load.size()-1]+tempBlock_->size()); + } else { + global_load.push_back(tempBlock_->size()); + } + } else if (v[0] == "Free"){ + global_load.push_back(global_load[global_load.size()-1]-tempBlock_->size()); + } else { + global_load.push_back(global_load[global_load.size()-1]); + } + } + //cout<size()<maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){ + fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app); + for (int i =0; i maxLen_threshold) { + //cout<get_data()<second.block_ = tempBlock_; + Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data(); + } + } + if ((maxLen>maxLen_threshold) && ((gc-location)%(maxLen) == 0)){ + if (tempTime != 0){ + fstream file_time("itr_time.csv", ios::in|ios::out|ios::app); + auto t_now = (std::chrono::system_clock::now()).time_since_epoch().count(); + file_time<<(float)(t_now - tempTime)/(float)(1000000)<second)->second; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaEventSynchronize(reading_meta.in_event); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + //cout<<"GetRealGpuPtr, overhead is: "<second<<" "<update_data(reading_meta.data_); + + //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<second<second; + cudaEventCreate (&meta.out_event); + //cout<<"right before cudaMemcpyAsync Out"<second = meta; + //cout<<"time for asynchrous: "<second; + cudaEventCreate (&meta.in_event); + //cout<<"update block and data of r_idx: "<Malloc((void**)&ptr, meta.size); + //cout<<"expected results update_data:: "<second = meta; + //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update. + //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + //cout<<"time for asynchrous: "<size() > 1<<20) { + fstream file_block5("speed.csv", ios::in|ios::out|ios::app); + BlockMeta meta; + meta.data_ = meta.block_->get_data(); + void* tempPtr = nullptr; + cudaMallocHost(&tempPtr,block_->size()); //pinned memory. + meta.cpu_ptr = tempPtr; + Table_block_meta[block_] = meta; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaError_t err; + err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_->size(),cudaMemcpyDeviceToHost); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + file_block5<<"Out "<size()<<' '<size() > 1<<20) { + fstream file_block5("speed.csv", ios::in|ios::out|ios::app); + BlockMeta meta = Table_block_meta.find(block_)->second; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaError_t err; + err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_->size(),cudaMemcpyHostToDevice); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + file_block5<<"In "<size()<<' '< +#include //a. +#include +//for SmartMemoryPool +using namespace std; #ifdef USE_CUDA @@ -94,6 +98,7 @@ void CnMemPool::Malloc(void **ptr, const size_t size) { void CnMemPool::Free(void *ptr) { CHECK(initialized_) << "Cannot free the memory as the pool is not initialzied"; + //cout<<"(normal)to free ptr "< colorRange; + vector> colorOccupied; +}; +Vertex::Vertex(int n, size_t s, int r1, int d1){ + name =n; + size = s; + r = r1; + d = d1; +}//end of class Vertex + + +///Section for structs and respective sorting function: +// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm +struct onePieceMsg{ + /* + members: [ptr, size, MallocFree, idx] + */ + string ptr; + size_t size; + int MallocFree; + int idx; + onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} +}; + + +struct less_than_ptrIdx{ + /* + sort onePieceMsg by ptr and then idx. + */ + inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2) + { + return ((struct1.ptrstruct2.size); + } +}; + +struct less_than_size_rIdx{ + /* + sort onePairMsg by descending size and r_idx + */ + inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2) + { + return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx split(string s, string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + string token; + vector res; + while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; +} + +///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg +//vector of pairMsg is used in run. +//vector of iterMsg is used in test. + +vector strVec_2_pieceMsgVec(vector vec, int &idxRange){ + /* + convert vector of string into vector of onePieceMsg, sorted by ptr and then idx, and update idxRange to pieceMsgVec size. + */ + vectoronePieceMsgVec_; + for (int i=0;i v = split(vec[i], " "); + if (v[0]=="Malloc"){ + //convert v[2] from str to size_t + size_t result; + stringstream convert(v[2]); + if (!(convert>>result)){ + result =-1; + cout<<"error for converting size from str to int."<(onePieceMsgVec_.size()); + + return onePieceMsgVec_; +}// end of strVec_2_pieceMsgVec function + + +pair,vector> pieceMsgVec_2_pairOfPairMsgVec(vectoronePieceMsgVec_, int idxRange){ + /* + pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks. + */ + vectoronePairMsgVec_1; + vectoronePairMsgVec_2; + int i=0; + + //while loop processes a pair at each time, if got a pair. + while (i<(onePieceMsgVec_.size()-1)){ + //condition A: start with free. do nothing. + if (onePieceMsgVec_[i].MallocFree==-1){ + i+=1; + } + //condition B: start with Malloc, next item same ptr and is free. + if ((onePieceMsgVec_[i].MallocFree==1)&& (onePieceMsgVec_[i+1].MallocFree==-1)&&((onePieceMsgVec_[i].ptr==onePieceMsgVec_[i+1].ptr))){ + onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,onePieceMsgVec_[i+1].idx); + onePairMsgVec_1.push_back(tempPair); + i+=2; + } + // condition C: start with Malloc, no free. + if ((onePieceMsgVec_[i].MallocFree==1)&&(onePieceMsgVec_[i].ptr!=onePieceMsgVec_[i+1].ptr)){ + onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,idxRange); + onePairMsgVec_2.push_back(tempPair); + i+=1; + } + }//end of while + //condition D: if still left with the last item + if ((i,vector>pairOfPairMsgVec_(onePairMsgVec_1,onePairMsgVec_2); + + return pairOfPairMsgVec_; +}//end of pieceMsgVec_2_pairOfPairMsgVec function + +///Section of coloring algorithm. mergeSeg and then FFallocation when building edges of the graph. +vector> mergeSeg(vector> colorOccupied){ + /* + version 12/9 11am -- modify to accomodate unsigned int/size_t + input:the collection of color ranges that is once occupied by some block during a block's life time. + function: merge consecutive/overlapping segments of colorOccupied + output: merged segments in ascending order. + time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of colorOccupied. + */ + sort(colorOccupied.begin(), colorOccupied.end()); + + if(colorOccupied.size()<=1){ + return colorOccupied; + } + + int m = 0; + while (m<(colorOccupied.size()-1)){ + + if ((colorOccupied[m].second +2)> colorOccupied[m+1].first){ + pairtempItem(colorOccupied[m].first,max(colorOccupied[m].second,colorOccupied[m+1].second)); + //remove m+1 and m + colorOccupied.erase(colorOccupied.begin()+m+1); + colorOccupied.erase(colorOccupied.begin()+m); + //insert the combined range + colorOccupied.insert(colorOccupied.begin()+m,tempItem); + }else{ + m+=1; + } + }//end of while loop + + //verify if mergeSeg is completed. O(n^2) optional +// if(colorOccupied.size()>1){ +// for (int i=0;i<(colorOccupied.size()-1);i++){ +// if(colorOccupied[i].second>=colorOccupied[i+1].first){ +// cout<<"error in mergeSeg"< FFallocation(vector> colorMerged,size_t size, size_t local_offset){ + /* + version 12/2 4pm + First Fit weighted coloring + return a pair standing for colorRange. + local_offset shifts the returned colorRange, allowing multiple run(). + local_offset not changable, whereas offset is changable. + */ + // condition A: if no occupied, put after the local_offset + if (colorMerged.size()==0){ + return pair(0+local_offset,size-1+local_offset); + } + + // condition B: able to fit before first block, after the local_offset + if ((size+local_offset)<(colorMerged[0].first+1)){ + return pair(0+local_offset,size-1+local_offset); + } + + size_t yLocation= -1; + if (colorMerged.size()>1) { + int n = 0; + while (n<(colorMerged.size()-1)){ + // condition C: able to fit in between middle blocks. + if ((colorMerged[n+1].first-colorMerged[n].second-1)>=size){ + yLocation = colorMerged[n].second+1; + break; + } + n+=1; + }//end of while loop. + // condition D: allocate after the last block. + if (yLocation == -1){ + yLocation = colorMerged[colorMerged.size()-1].second+1; + } + }// end of if loop, conditon C and D. + + // condition E: colorMeger len =1, allocate after the last block. + if (colorMerged.size()==1){ + yLocation = colorMerged[0].second+1; + } + + if (yLocation==-1){ + cout<<"error in FFallocation!!!"<(yLocation,yLocation+size-1); +}//end of FFallocation function + + +pair BFallocation(vector> colorMerged,size_t size, size_t local_offset){ + /* + version 12/11 1pm + Best Fit allocation, input and output same as FFallocation + */ + // condition A: if no occupied, put after the local_offset + if (colorMerged.size()==0){ + return pair(0+local_offset,size-1+local_offset); + } + //condition B: if size=1, able to fit before the first block + if ((colorMerged.size()==1)&&((size+local_offset)<(colorMerged[0].first+1))){ + return pair(0+local_offset,size-1+local_offset); + } + + //condition C: else of B + if ((colorMerged.size()==1)&&((size+local_offset)>=(colorMerged[0].first+1))){ + return pair(colorMerged[0].second+1,colorMerged[0].second+size); + } + + //condition D and E: + size_t yLocation=-1; + pairtempHole(-1,-1); // n, hole size between n and n+1 + if (colorMerged.size()>1) { + int n = 0; + while (n<(colorMerged.size()-1)){ + // condition C: able to fit in between middle blocks. select smallest. + if (((colorMerged[n+1].first-colorMerged[n].second-1)>=size)&&((colorMerged[n+1].first-colorMerged[n].second-1)(yLocation,yLocation+size-1); +} + +vector colorSomeVertices(vector pairMsgVec_, size_t &offset,string colorMethod){ + /* + color all or 1/2 vertices using mergeSeg() and FFallocation(), with update offset. + time complexity: O(n^2). + */ + size_t local_offset = offset; //feed into FFallocation, shall never change. + int m = static_cast(pairMsgVec_.size()); + //init all vertices + vectorvertices; + for (int i=0; i>colorMerged = mergeSeg(vertices[i].colorOccupied); + + if(colorMethod=="FF"){ + vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset); + + }else{ //BF + vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, local_offset); + } + + //update of offset, largest memory footprint as well. + if (vertices[i].colorRange.second >=offset){ + offset = vertices[i].colorRange.second+1; + } + }//end of for loop. + + return vertices; +} + + +///get cross-iteration duration pairs +pair,map> cross_itr_durations(vectorvec_double, int location, int maxLen, int &doubleRange){ + + vectoronePieceMsgVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange); + pair,vector>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_2,doubleRange); + + mapTable_r2d; //full duration info, cross-iteration duration. + mapTable_d2r; + for (int i=0;i,map>(Table_r2d,Table_d2r); +} + +/// main run funtion +vector run(vectorvec, int &idxRange, size_t &offset, size_t &offsetCrossItr,string colorMethod){ + /* + run function, input vector of strings, return colored vertices, + update idxRange, offset. + time complexity: O(n^2) where n is maxLen. + */ + vectoronePieceMsgVec_ = strVec_2_pieceMsgVec(vec,idxRange); + pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_,idxRange); + //1. normal blocks 2. cross-iteration blocks. + vectorpairMsgVec_1 = pairOfPairMsgVec_.first; + vectorpairMsgVec_2 = pairOfPairMsgVec_.second; + + vectorvertices_2 = colorSomeVertices(pairMsgVec_2,offset,colorMethod); + for (int i=0; ivertices = colorSomeVertices(pairMsgVec_1,offset,colorMethod); + //merge + vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end()); + + return vertices; +} + + +///Section of test functions. +vector pairOfPairMsgVec_2_repSeq(pair,vector>pairOfPairMsgVec_){ + int counter_1M=0; int counter_1F=0; int counter_2=0; + vectoronePairMsgVec_1 = pairOfPairMsgVec_.first; + vectoronePairMsgVec_2 = pairOfPairMsgVec_.second; + vectoroneIterMsgVec_; + for (int i =0; i(onePairMsgVec_1[i].d_idx-onePairMsgVec_1[i].r_idx); + oneIterMsg tempIterF(temp_s_d,-1,onePairMsgVec_1[i].d_idx); + oneIterMsgVec_.push_back(tempIterF); + counter_1F++; + } + + for (int i =0; irep; // vector of size_delta, name it as rep for simlisity. + for (int i =0; i maxRepeatedSeg(vectorrep, int idxRange, int &maxLen, int &location){ + /* + get max repeated non-overlapping Seg of a vector, return the repeated segment, + update maxLen, and location of where Seg starts to repeat. + brtue force method using equal() + time complexity O(n^2) + */ + for (int i=0; isubSeq(&rep[location],&rep[location+maxLen]); + if(!(equal(rep.begin()+location,rep.begin()+maxLen-1+location,subSeq.begin()) && equal(rep.begin()+location+maxLen,rep.begin()+2*maxLen-1+location,subSeq.begin()))){ + cout<<"error in get the maxRep"<subSeq, int &maxLen, int &location){ + /* + to cut, in case the repeated Seg contains multiple iterations. + */ + int tempMaxLen=0; + int tempLocation =0; + int tempIdxRange = maxLen; + + vectortempSubSeq = maxRepeatedSeg(subSeq,tempIdxRange,tempMaxLen, tempLocation); + //TODO(junzhe), tunable threshold. + int threshold =50; + if (tempMaxLen>threshold){ + maxLen = tempMaxLen; + location += tempLocation; + cout<<"max length get cut"<vec3, int &maxLen, int &location){ + /* + main function of test, returns globeCounter, which is when flag shall be switched, + update maxLen and location of where the repeated Seg starts. + */ + cout<<"====================== test ========================="<onePieceMsgVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3); + pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_3,idxRange3); + vectorrep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_); + + //get repeated sub vector. + vectorsubSeq = maxRepeatedSeg(rep,idxRange3,maxLen,location); + //cout<100){ //TODO(junzhe) tunable threshold. + cout<<"new location and maxLen: "< vertices){ + size_t s = vertices.size(); + int i,j; + for (i=0; ivec_run(&vec[location],&vec[location+maxLen]); + + vectorvertices = run(vec_run, idxRange,offset,offsetCrossItr, colorMethod); + + //here to verify if the coloring got overlapping. TODO(junzhe) optional + //overlap_test(vertices); + + //obtain the cross-iteration duration info + int doubleRange=0; + vectorvec_double(&vec[location],&vec[location+2*maxLen]); + pair,map>pairs =cross_itr_durations(vec_double, location,maxLen,doubleRange); + Table_r2d = pairs.first; + Table_d2r = pairs.second; + + //update ptrPool + cudaMalloc(&ptrPool,offset); //poolSize or memory foot print offset. + cout<<"ptrPool is: "<second; + temp.size =vertices[i].size; + temp.offset=vertices[i].colorRange.first; + temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char)); + temp.Occupied =0; + temp.crossItr = vertices[i].crossItr; + temp.Occupied_backup =0; + //build tables for lookup. + Vec_r2Ver[vertices[i].r].second= temp; + } + } + + if(mallocFlag==0){ + /// 2. if flag=0, malloc/cudaMalloc + cudaMalloc(ptr, size); + allocatedPtr = *ptr; + //update load + if(loadLogFlag==1){ + if (gc>0){ + Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second); + }else{ //very first block + Table_load[gc]=make_pair(size,0); + } + } + //push_back the string for later test and run. + string tempStr1 ="Malloc "; + stringstream strm2; + strm2<second.first,Table_load.find(gc-1)->second.second+size); + } + //file<<" Condition M2, addr: "<<*ptr<second.first,Table_load.find(gc-1)->second.second+size); + } + //file<<" Condition M4, addr: "<<*ptr<second.first+size,Table_load.find(gc-1)->second.second); + } + //file<<" Condition M3, addr: "<<*ptr<checkPoint)){ + cout<<"gc and GC before test: "<0)){ + getMaxLoad(); + loadLogFlag=0; + } + + gc++; + Table_p2s[allocatedPtr]=size; //update it for load tracking purpose. + *ptr = allocatedPtr; + ///update block_RWMF + string tempStr1 ="Malloc "; + stringstream strm2; + strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); + stringstream strm4; + strm4<second; + + if ((globeCounter==-1)||(gcsecond.first-deallocatedSize,Table_load.find(gc-1)->second.second); + } + /// before flag switch, for sure all free shall be done by free() + cudaFree(ptr); + }else{ + if (!(Table_p2r.find(ptr)==Table_p2r.end())){ + int resp_rIdx = Table_p2r.find(ptr)->second; + Table_p2r.erase(ptr); + + if (ptr == Vec_r2Ver[resp_rIdx].second.ptr){ + //Condition F2, from M2 + Vec_r2Ver[resp_rIdx].second.Occupied =0; //freed, able to allocate again. + //file<<" Condition F2, addr: "<0) && ((float)((char*)ptr-((char*)ptrPool+2*offsetCrossItr*sizeof(char)))<0)){ + Vec_r2Ver[resp_rIdx].second.Occupied_backup =0; + }else{ + Vec_r2Ver[resp_rIdx].second.Occupied =0; + } + } + //update load + if(loadLogFlag==1){ + Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second-deallocatedSize); + } + }else{ + //update load + if(loadLogFlag==1){ + Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second); + } + //file<<" Condition F3, addr: "<(std::chrono::system_clock::now().time_since_epoch()).count(); + stringstream strm4; + strm4<cudaLoadLog; + for (int i=0; isecond.first); + } + size_t maxCudaLoad = *max_element(cudaLoadLog.begin(),cudaLoadLog.end()); + int idxMaxCudaLoad = static_cast(distance(cudaLoadLog.begin(),max_element(cudaLoadLog.begin(),cudaLoadLog.end()))); + + vectorcolorLoadLog; + for (int i=0; isecond.second); + } + size_t maxColorLoad = *max_element(colorLoadLog.begin(),colorLoadLog.end()); + int idxMaxColorLoad = static_cast(distance(colorLoadLog.begin(),max_element(colorLoadLog.begin(),colorLoadLog.end()))); + size_t offsetCudaLoad = Table_load.find(idxMaxColorLoad)->second.first; + + maxTotalLoad = max(maxCudaLoad,maxColorLoad+offsetCudaLoad); + maxMemUsage = max(maxCudaLoad,offset+offsetCudaLoad); + memRatio = (float)maxMemUsage/(float)maxTotalLoad; + + cout<<"=============================memory usage stats print: ================================"< SmartMemPool::GetMemUsage() { + //TODO(junzhe) note here the pair is different from that of CnMemPool. + return std::make_pair(maxMemUsage, maxTotalLoad); +} + +void SmartMemPool::Append(string blockInfo) { + //TODO(junzhe) add idx later + vec_block_RW.push_back(blockInfo); + vec_block_RWMF.push_back(blockInfo); +} + +///Swap: +Swap::Swap(const MemPoolConf &conf){ + conf_ = conf; +} + +void Swap::Init(){ + //TODO(junzhe) Note, this is dummy here, not catter multiple GPU. + mtx_.lock(); + if(!initialized_){ + initialized_ =true; + } + mtx_.unlock(); +} + +void Swap::Malloc(void** ptr, const size_t size){ + //cout<<"to malloc"<meta = std::make_pair(cpu, gpu); + Table_Meta[*ptr] = meta; + + swapLookUpElement temp; + temp.size = size; + Table_id2LookUpElement[*ptr] = temp; + // int i = 0; + // if (!(Table_id2LookUpElement.find(*ptr)==Table_id2LookUpElement.end())){ + // i = i + 1; + // temp.data_ = *ptr +i*sizeof(char); + // while(!(Table_id2LookUpElement.find(temp.data_)==Table_id2LookUpElement.end())){ + // //TODO(swap) verify this loop, can simplify as well. + // i = i + 1 + // temp.data_ = *ptr +i*sizeof(char); + // } + // } else { + // temp.data_ = *ptr; + // } + // temp.realGpuPtr = *ptr; + // temp.location = 1; + // temp.size = size; + + // create before swap. + // if (size>swapLimit){ + // temp.realCpuPtr = malloc(size); + // } + // stringstream strm1; + // strm1<second.second.swapSize; + Table_Meta.find(data_)->second.first.ptr = malloc(swapSize); + SwapMeta cpu, gpu; + cpu = Table_Meta.find(data_)->second.first; + gpu = Table_Meta.find(data_)->second.second; + cudaError_t err; + err=cudaMemcpy(cpu.ptr,gpu.ptr,gpu.swapSize,cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + fstream file_block3("blockInfo_swapOut.text", ios::in|ios::out|ios::app); + file_block3<second.first; + gpu = Table_Meta.find(data_)->second.second; + gpu.ptr=nullptr; + cudaError_t status = cudaMalloc(&gpu.ptr, gpu.swapSize); + CHECK_EQ(status, cudaError_t::cudaSuccess); + Table_Meta.find(data_)->second.second.ptr=gpu.ptr; + //cout<<"after alloc:1 "<second.second.ptr<second.size; + // cpu.ptr=malloc(cpu.swapSize); + // gpu=cpu; + // cudaMalloc(&gpu.ptr,cpu.swapSize); + + // cudaError_t err; + // err=cudaMemcpy(gpu.ptr, cpu.ptr ,cpu.swapSize,cudaMemcpyHostToDevice); + // printf("2. swapIn done.\n"); + +///below partial copy + // SwapMeta h_meta; + // SwapMeta* d_meta; + // h_meta.swapSize=Table_id2LookUpElement.find(data_)->second.size; + // h_meta.h_ptr=malloc(h_meta.swapSize); + // cudaMalloc(d_meta,sizeof(SwapMeta)); + // cudaMemcpy(d_meta,&h_meta,sizeof(SwapMeta),cudaMemcpyHostToDevice); + + // void** d_ptr; + // cudaMalloc() + // void* h_ptr=malloc(tempSize); + +///below is to swapIn swapped out items + // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + // size_t tempSize = Table_id2LookUpElement.find(data_)->second.size; + // void** tempPtr; + // cudaMalloc(tempPtr,tempSize); + // cout<<"1. to swapIn."<second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice); + // if (err != cudaSuccess) + // { + // fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); + // exit(EXIT_FAILURE); + // } + // //cudaMemcpy(data_, Table_id2LookUpElement.find(data_)->second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice); + // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + // fstream file_block4("blockInfo_swapIn.text", ios::in|ios::out|ios::app); + // file_block4<second.realCpuPtr); + // cout<<"testing after SwapIn"< Swap::GetMemUsage() { + //empty + return std::make_pair(0, 0); +} + +Swap::~Swap(){ + //put in notes + fstream file_block1("blockInfo.text", ios::in|ios::out|ios::app); + for (int i=0; i< vec_block.size();i++){ + file_block1< Date: Mon, 13 Aug 2018 17:16:35 +0800 Subject: [PATCH 09/19] unable common.cc appendInfo --- src/core/common/common.cc | 76 +++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/core/common/common.cc b/src/core/common/common.cc index 73ec0b0c6e..5a6f61268b 100644 --- a/src/core/common/common.cc +++ b/src/core/common/common.cc @@ -31,25 +31,25 @@ void* Block::mutable_data() { //TODO(junzhe) go back to enable it after device done //std::cout<<"mutable_data() "<AppendInfo(temp); - // } - // //TODO(junzhe) this should not happen, can verify and remove - // if (data_ == nullptr) { - // //cout<<"to sleep"<GetRealGpuPtrInfo(this); - // cout<<"slept to get data_ updated: "<AppendInfo(temp); + } + //TODO(junzhe) this should not happen, can verify and remove + if (data_ == nullptr) { + //cout<<"to sleep"<GetRealGpuPtrInfo(this); + cout<<"slept to get data_ updated: "<(data_) + offset_; } @@ -58,26 +58,26 @@ void* Block::mutable_data() { const void* Block::data() const { CHECK(initialized_) << "Must initialize data before reading it"; //TODO(junzhe) go back to enable it after device done - // if (ptrDevice_!=nullptr){ - // //Append info. - // stringstream strm2; - // strm2<AppendInfo(temp); - // } + if (ptrDevice_!=nullptr){ + //Append info. + stringstream strm2; + strm2<AppendInfo(temp); + } - // //TODO(junzhe) this should not happen, can verify and remove - // if (data_ == nullptr) { - // //cout<<"to sleep"<GetRealGpuPtrInfo(this); - // cout<<"slept to get data_ updated"<GetRealGpuPtrInfo(this); + cout<<"slept to get data_ updated"<(data_) + offset_; From 572fe4d5f23ac25a1a1a42aa00f2c861657c11c4 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Mon, 20 Aug 2018 14:37:13 +0800 Subject: [PATCH 10/19] correct swap_select() --- include/singa/core/device.h | 4 +- src/core/.DS_Store | Bin 6148 -> 6148 bytes src/core/device/swap_gpu.cc | 161 ++++++++++++++++-------------------- 3 files changed, 71 insertions(+), 94 deletions(-) diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 36b10d0d56..86c0a20b84 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -303,7 +303,7 @@ class SwapGPU : public Device { int swap_test(vectorvec_block,int &maxLen, int &location); void swap_sched(vectorvec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); void swap_plan(); - vector swap_select(vectorvec_swap,double maxLoad,double memLimit,string mode); + vector swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode); vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); void Test_sched_switch_swap(); void DeploySwap(); @@ -332,7 +332,7 @@ class SwapGPU : public Device { //vec_block vectorvec_block; //iteration 0-3 vectorvec_block_fresh; //iteration 4 5 6 - vectorglobal_load; + vectorglobal_load; // from begining vectororigin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use. vectorvec_run; int asyncSwapFlag = 0; //0 for sync, 1 for async. diff --git a/src/core/.DS_Store b/src/core/.DS_Store index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..018d123c47540aef56047ebb82a09e1df9135f84 100644 GIT binary patch delta 207 zcmZoMXfc=|#>B`mF;Q%yo}wrV0|Nsi1A_nqLq0=ZPP$=ma(-^X#Kh(GAPF{xWT0>^ z5F$we#bE$NJ~!XRC8e|^nStT@w3nPAjHu~2NHo+1YW5HK<@2yC9nSjM(_0izY;W_AvK4xj>{$am(+{342+ UKzW7)kiy9(Jj$D6L{=~Z03gE-+W-In diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 694d1d4806..59f376e93e 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -352,21 +352,12 @@ pair load_peak(vectorvec_load_test,int maxLen){ void load_update(vector& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){ //update load [start_idx, end_idx) by plusMinus*size - //if (start_idx < end_idx){ for (int i = start_idx+maxLen; i(size) * plusMinus; } - // } else { - // for (int i = start_idx; i < maxLen; i++){ - // vec_load[i] = vec_load[i] + static_cast(size) * plusMinus; - // } - // for (int i = 0; i < end_idx; i++){ //TODO(junzhe) NOTE, end_idx excluded - // vec_load[i] = vec_load[i] + static_cast(size) * plusMinus; - // } - // } } -vector SwapGPU::swap_select(vectorvec_swap,double maxLoad,double memLimit,string mode){ +vector SwapGPU::swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode){ vectorvec_swap_selct; vectorvec_swap_reject; if (mode == "dto"){ @@ -386,30 +377,14 @@ vector SwapGPU::swap_select(vectorvec_swap,double maxLoad, sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); } - size_t load_swap_selct = 0; - if (mode != "r_idx"){ - for (int i =0; iload_swap_selct){ - vec_swap_selct.push_back(vec_swap[i]); - load_swap_selct+=vec_swap[i].size; - //cout<<"Item selected: (r_idx, d_idx, dto) "<(vec_swap[i].dto/1000000)<load_swap_selct){ - - // } - // for (int i =0; i Date: Mon, 20 Aug 2018 14:44:50 +0800 Subject: [PATCH 11/19] correct swap_select() --- src/core/device/swap_gpu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 59f376e93e..7e31eb15dd 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -381,7 +381,7 @@ vector SwapGPU::swap_select(vectorvec_swap,vector load_update(tempLoad,0,maxLen,-1,vec_swap[i].size,maxLen); vec_swap_selct.push_back(vec_swap[i]); auto max_current = load_peak(tempLoad,maxLen); - newMaxLoad = max_current.first; + auto newMaxLoad = max_current.first; if (newMaxLoad < memLimit){ break; } @@ -674,7 +674,7 @@ void SwapGPU::swap_plan(){ for (int i=maxLen; i Date: Mon, 20 Aug 2018 14:57:43 +0800 Subject: [PATCH 12/19] enable swap_plan() --- src/core/device/swap_gpu.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 7e31eb15dd..19f8f08e4c 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -693,7 +693,6 @@ void SwapGPU::swap_plan(){ swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,450<<20,mode); - // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app); // for (int i=maxLen; ivec_load2(&global_load[location],&global_load[location+3*maxLen]); origin_load = vec_load2; //load before swap, write in From ed73c3ad04ff48fa4fb28aaa21df4961b9138ba9 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Sun, 26 Aug 2018 22:40:36 +0800 Subject: [PATCH 13/19] documentation delay swap_plan() by 3 more iterations update train correct swap_sched(), swap_select(),swap_plan() correct load update in swap_select vec_run changed to new 3 iterations correct vec_run36 index issue correct overhead issue, verify vec_run.t vec_run duplicate to avoid sorting issue --- examples/cifar10/{cnn.py => alexnet.py} | 0 examples/cifar10/train.py | 46 ++-- include/singa/core/device.h | 45 +++- src/core/device/swap_gpu.cc | 344 ++++++++++++++---------- 4 files changed, 263 insertions(+), 172 deletions(-) rename examples/cifar10/{cnn.py => alexnet.py} (100%) diff --git a/examples/cifar10/cnn.py b/examples/cifar10/alexnet.py similarity index 100% rename from examples/cifar10/cnn.py rename to examples/cifar10/alexnet.py diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py index 4a4d94fe30..652772ba5a 100644 --- a/examples/cifar10/train.py +++ b/examples/cifar10/train.py @@ -31,24 +31,25 @@ import os import argparse +# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python')) from singa import utils from singa import optimizer from singa import device from singa import tensor +from singa.proto import core_pb2 from caffe import caffe_net -import cnn +import alexnet import vgg import resnet +from datetime import datetime +import time def load_dataset(filepath): print('Loading data file %s' % filepath) with open(filepath, 'rb') as fd: - try: - cifar10 = pickle.load(fd, encoding='latin1') - except TypeError: - cifar10 = pickle.load(fd) + cifar10 = pickle.load(fd) image = cifar10['data'].astype(dtype=np.uint8) image = image.reshape((-1, 3, 32, 32)) label = np.asarray(cifar10['labels'], dtype=np.uint8) @@ -129,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, dev = device.get_default_device() else: print('Using GPU') - dev = device.create_cuda_gpu() + dev = device.create_cuda_gpu_on(1) net.to_device(dev) opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay) @@ -137,16 +138,27 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, opt.register(p, specs) tx = tensor.Tensor((batch_size, 3, 32, 32), dev) - ty = tensor.Tensor((batch_size,), dev, tensor.int32) + ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt) train_x, train_y, test_x, test_y = data num_train_batch = train_x.shape[0] // batch_size num_test_batch = test_x.shape[0] // batch_size idx = np.arange(train_x.shape[0], dtype=np.int32) + fileTimeLog =open("epochTimeLog.text","a") for epoch in range(1): np.random.shuffle(idx) loss, acc = 0.0, 0.0 print('Epoch %d' % epoch) - for b in range(20): + print(datetime.now().timetz()) # miliseconds + print(int(round(time.time()*1000))) + fileTimeLog.write('Epoch %d: ' % epoch) + fileTimeLog.write(str(int(round(time.time()*1000)))) + fileTimeLog.write('\n') + for b in range(15): #num_train_batch): + print ("start of iteration %d: " %b) + #time.sleep(1) + fileTimeLog.write('iteration %d: ' % b) + fileTimeLog.write(str(int(round(time.time()*1000)))) + fileTimeLog.write('\n') x = train_x[idx[b * batch_size: (b + 1) * batch_size]] y = train_y[idx[b * batch_size: (b + 1) * batch_size]] tx.copy_from_numpy(x) @@ -175,14 +187,16 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, print('test loss = %f, test accuracy = %f' % ((loss / num_test_batch), (acc / num_test_batch))) + fileTimeLog.close() net.save('model', 20) # save model params into checkpoint file if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train dcnn for cifar10') - parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'], - default='vgg') + parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet', 'caffe'], + default='alexnet') parser.add_argument('data', default='cifar-10-batches-py') parser.add_argument('--use_cpu', action='store_true') + parser.add_argument('batch_size',type=int, default=100) args = parser.parse_args() assert os.path.exists(args.data), \ 'Pls download the cifar10 dataset via "download_data.py py"' @@ -194,22 +208,22 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, net = caffe_net.create_net(args.use_cpu) # for cifar10_full_train_test.prototxt train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) # for cifar10_quick_train_test.prototxt # train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004, # use_cpu=args.use_cpu) - elif args.model == 'cnn': + elif args.model == 'alexnet': train_x, test_x = normalize_for_alexnet(train_x, test_x) - net = cnn.create_net(args.use_cpu) + net = alexnet.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) elif args.model == 'vgg': train_x, test_x = normalize_for_vgg(train_x, test_x) net = vgg.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005, - use_cpu=args.use_cpu) + use_cpu=args.use_cpu,batch_size=args.batch_size) else: train_x, test_x = normalize_for_alexnet(train_x, test_x) net = resnet.create_net(args.use_cpu) train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4, - use_cpu=args.use_cpu) \ No newline at end of file + use_cpu=args.use_cpu,batch_size=args.batch_size) \ No newline at end of file diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 86c0a20b84..8d51d61d09 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -265,14 +265,14 @@ struct SwapBlock{ double r_idx_ready; //r_idx + buffer, could be set during selection. //int free = -1; //when it is freed //below as per planned. - int i1; - int i1p; - int i2; - int i2p; - double t1; - double t2; - double t1p; - double t2p; + int i1 = 0; + int i1p = 0; + int i2 = 0; + int i2p = 0; + double t1 = 0; + double t2 = 0; + double t1p = 0; + double t2p = 0; SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {} }; @@ -299,16 +299,37 @@ class SwapGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override; + + //append info after Malloc, pair. + void MakeMetaTable(Block* block,void* data_,int size) override; + + //test iteration, return GC int swap_test(vectorvec_block,int &maxLen, int &location); + + //schedule algo void swap_sched(vectorvec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); + + //entire plan, from swap_select() to swap_sched() void swap_plan(); + + //selection algo vector swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode); + + //load profile as per synchronous swap. vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); + + //all the testing, without swap, during Append() void Test_sched_switch_swap(); + + //swap, during Append() void DeploySwap(); + + //Append at every index: malloc, free, read, mutable void Append(string blockInfo) override; + + //in case gpu ptr wrong. TODO(junzhe) to verify if needed. void* GetRealGpuPtr(const Block* block_) override; + void SwapOut(const Block* block_) override; void SwapIn(const Block* block_) override; @@ -337,11 +358,13 @@ class SwapGPU : public Device { vectorvec_run; int asyncSwapFlag = 0; //0 for sync, 1 for async. int testFlag = 0; //0 means open for test, 1 means no need test anymore. - int gc = 0; //global counter each time Malloc/Free, add 1. + int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write. int globeCounter = -1; int maxLen = 0; int location = 0; - //design requirement + int three_more_location = 0; //location at 3 more iterations later. + int three_more_globeCounter = -1; // + //design requirement TODO(junzhe) float memLimit_ratio = 0.70; size_t smallest_block = 1<<20; //1 MB int data_buffer = 4; // used to control readyIdx diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 19f8f08e4c..6768aee844 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -217,14 +217,14 @@ struct less_than_Idx{ int SwapOutTime(size_t size){ - int ans =0; //TODO(junzhe) used to be 0.29; new param as per vgg + int ans = 0; //TODO(junzhe) used to be 0.29; new param as per vgg if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;} return ans; } int SwapInTime(size_t size){ //yet to get the formula - int ans =0; //TODO(junzhe) used to be 0.13; new param as per vgg + int ans = 0; //TODO(junzhe) used to be 0.13; new param as per vgg if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;} return ans; } @@ -249,15 +249,15 @@ struct less_than_wdto{ } }; -struct less_than_r_idx_ready{ - /* - sort SwapBlock by r_idx_ready, ascending - */ - inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) - { - return (struct1.r_idx_ready load_over_limit(vectorvec_load, size_t memLimit, int start return std::make_pair(first_over_limit, first_below_limit); } -pair load_below_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){ - //input: vec_load, memLimit, range [start_idx, end_idx] - //return range overlimit [first_over_limit, first_below_limit) - int first_below_limit = maxIdx; - int last_below_limit = maxIdx; - - for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){ - if (vec_load[i] > memLimit){ - first_below_limit = i+1-maxLen; - break; - } - } - - for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){ - if (vec_load[i] > memLimit){ - last_below_limit = i-1-maxLen; - break; - } - } - - return std::make_pair(first_below_limit, last_below_limit); -} +// pair load_below_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){ +// //input: vec_load, memLimit, range [start_idx, end_idx] +// //return range overlimit [first_over_limit, first_below_limit) +// int first_below_limit = maxIdx; +// int last_below_limit = maxIdx; + +// for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){ +// if (vec_load[i] > memLimit){ +// first_below_limit = i+1-maxLen; +// break; +// } +// } + +// for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){ +// if (vec_load[i] > memLimit){ +// last_below_limit = i-1-maxLen; +// break; +// } +// } + +// return std::make_pair(first_below_limit, last_below_limit); +// } pair load_peak(vectorvec_load_test,int maxLen){ double maxLoad_test = 0; @@ -359,7 +359,7 @@ void load_update(vector& vec_load,int start_idx, int end_idx, int plusMi vector SwapGPU::swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode){ vectorvec_swap_selct; - vectorvec_swap_reject; + //vectorvec_swap_reject; if (mode == "dto"){ sort(vec_swap.begin(),vec_swap.end(),less_than_dto()); } @@ -376,16 +376,19 @@ vector SwapGPU::swap_select(vectorvec_swap,vector } sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); } - + cout<<"===============select block one by one================="<vec_swap_selct, vector&vec_loa update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit. */ //TODO(junzhe) wordy, can merge in common part. - if (mode == "no-overhead"){ - //update i1p - //sort by r_idx for i1p update + overhead = 0; + cout<<"----------------swap_sched----------------"< 0){ readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); } + cout<<" -> "< vec_run[readyIdx].t){ - readyIdx++; + total_swapOutTime+=SwapOutTime(itm.size); + while (itm.t1p > vec_run[readyIdx+maxLen].t){ //TODO(junzhe) reduce time complexity. + readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead. } + //get min compare with maxIdx and readyIdx. + readyIdx = std::min(maxIdx,readyIdx); + cout<<" || "< "< 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } + cout<<" -> "< 0){ + // cout< itm.t2p)) { + overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p); + cout<<"==== overhead added "< "< 0){ readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); } - cout<<"||compare with last i1p "< vec_run[readyIdx].t){ //TODO(junzhe) reduce time complexity. - readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead. - } - //get min compare with maxIdx and readyIdx. - readyIdx = std::min(maxIdx,readyIdx); - cout<<"||count swap time "< vec_run[readyIdx].t){ + readyIdx++; } - cout<<"||count over limit "< 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } - cout<<"||compare with last i2p "< 0){ - cout< itm.t2p)) { - overhead+=(vec_run[tempOverLimit_.second].t - itm.t2p); - load_update(vec_load_temp,itm.i2p,tempOverLimit_.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary - itm.i2p = tempOverLimit_.second+1; - auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen); - } - cout<<"||count overlimit "<temp_vec_run(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]); + vectortemp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]); vec_run = temp_vec_run; - fstream file_vec_run("vec_run.csv", ios::in|ios::out|ios::app); + fstream file_vec_run("vec_run36.csv", ios::in|ios::out|ios::app); for (int i =0; itemp_vec_run2(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]); + auto vec_run2 = temp_vec_run2; + fstream file_vec_run2("vec_run03.csv", ios::in|ios::out|ios::app); + for (int i =0; ivec_load(&global_load[location],&global_load[location+3*maxLen]); origin_load = vec_load; - //load before swap, write in - fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); + //3 iterations + fstream file_load_origin("load_origin03.csv", ios::in|ios::out|ios::app); for (int i=0; ivec_load2(&global_load[location+3*maxLen],&global_load[location+6*maxLen]); + // auto origin_load2 = vec_load2; + // //3 iterations + // fstream file_load_origin2("load_origin36.csv", ios::in|ios::out|ios::app); + // for (int i=0; ivec_swap; - size_t load_swap = 0; - for (int i =1; i= smallest_block) && (vec_run[i-1].idxmaxIdx) - && (vec_run[i-1].ptr ==vec_run[i].ptr) - && ((vec_run[i-1].MallocFree==3) or (vec_run[i-1].MallocFree==2) or (vec_run[i-1].MallocFree==4))) + if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idxmaxIdx) + && (vec_run_dup[i-1].ptr ==vec_run_dup[i].ptr) + && ((vec_run_dup[i-1].MallocFree==3) or (vec_run_dup[i-1].MallocFree==2) or (vec_run_dup[i-1].MallocFree==4))) { - SwapBlock itm(vec_run[i].ptr, vec_run[i].size, vec_run[i-1].idx, vec_run[i].idx, vec_run[i-1].t, vec_run[i].t); + SwapBlock itm(vec_run_dup[i].ptr, vec_run_dup[i].size, vec_run_dup[i-1].idx, vec_run_dup[i].idx, vec_run_dup[i-1].t, vec_run_dup[i].t); itm.dto = itm.d_time-itm.r_time; itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size); if (itm.dt>=0){ @@ -618,15 +665,16 @@ void SwapGPU::swap_plan(){ itm.pri = itm.dt * 1/itm.size; } //cat A - if (vec_run[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } - if (vec_run[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} - if (vec_run[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} + if (vec_run_dup[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } + if (vec_run_dup[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} + if (vec_run_dup[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} vec_swap.push_back(itm); - load_swap+=itm.size; + // load_swap+=itm.size; + cout< maxLen_threshold) { testFlag = 1; + three_more_globeCounter = globeCounter + 3*maxLen; + three_more_location = location + 3*maxLen; cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<vec_load2(&global_load[location],&global_load[location+3*maxLen]); - origin_load = vec_load2; - //load before swap, write in - fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); - for (int i=0; ivec_load2(&global_load[three_more_location],&global_load[three_more_location+3*maxLen]); + // origin_load = vec_load2; + // //load before swap, write in + // fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); + // for (int i=0; i Date: Wed, 29 Aug 2018 15:17:05 +0800 Subject: [PATCH 14/19] impl swap_construct_tables(), swap_update_tables(), DeploySwap() verified itm 5 indices in Table_sched vec_swap_select pass by reference in swap_sched() impl swap_update_tables(), before DeploySwap(), both at Append() for time being, remove negative r_idx itms && git push origin vd1 handle last itr by impl sizeSqn and verification to change asyncSwapFlag back to 0 --- include/singa/core/device.h | 33 +++-- src/core/device/swap_gpu.cc | 235 +++++++++++++++++++++++++++--------- src/core/memory/memory.cc | 4 +- 3 files changed, 200 insertions(+), 72 deletions(-) diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 8d51d61d09..db50dd4d4a 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -300,33 +300,41 @@ class SwapGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; + //Append at every index: malloc, free, read, mutable + void Append(string blockInfo) override; + //append info after Malloc, pair. void MakeMetaTable(Block* block,void* data_,int size) override; + //all the testing, without swap, during Append() + void Test_sched_switch_swap(); + //test iteration, return GC int swap_test(vectorvec_block,int &maxLen, int &location); - //schedule algo - void swap_sched(vectorvec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); - - //entire plan, from swap_select() to swap_sched() + //entire plan, from swap_select() to swap_sched(), swap_deploy_tables() void swap_plan(); //selection algo vector swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode); + + //schedule algo + void swap_sched(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); - //load profile as per synchronous swap. - vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); - - //all the testing, without swap, during Append() - void Test_sched_switch_swap(); + //make tables Table_sched and Table_meta + void swap_construct_tables(vectorvec_swap_selct); + + //update Table_meta, during Append() + void swap_update_tables(Block* tempBlock_); //swap, during Append() void DeploySwap(); - //Append at every index: malloc, free, read, mutable - void Append(string blockInfo) override; + + //load profile as per synchronous swap. + vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); + //in case gpu ptr wrong. TODO(junzhe) to verify if needed. void* GetRealGpuPtr(const Block* block_) override; @@ -349,6 +357,7 @@ class SwapGPU : public Device { //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D. map>Table_sched; // changed to with sync_r_idx + // vectorvec_swap_selct_global; //vec_block vectorvec_block; //iteration 0-3 @@ -356,6 +365,8 @@ class SwapGPU : public Device { vectorglobal_load; // from begining vectororigin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use. vectorvec_run; + vectoropsSequence; //sequence of operations of one middle iteration + vectorsizeSequence; //size of all operations of one middle iteration int asyncSwapFlag = 0; //0 for sync, 1 for async. int testFlag = 0; //0 means open for test, 1 means no need test anymore. int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write. diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 6768aee844..1521912b9c 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -405,7 +405,7 @@ vector SwapGPU::swap_load_ideal(vectorvec_load,vector return vec_load_return; } -void SwapGPU::swap_sched(vectorvec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode){ +void SwapGPU::swap_sched(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode){ /* update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit. */ @@ -417,7 +417,7 @@ void SwapGPU::swap_sched(vectorvec_swap_selct, vector&vec_loa for (int i = 0; i 0){ readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); @@ -445,11 +445,11 @@ void SwapGPU::swap_sched(vectorvec_swap_selct, vector&vec_loa // cout<<" ((("< "<vec_swap_selct, vector&vec_loa sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs()); for (int i =0; i 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } @@ -489,7 +490,7 @@ void SwapGPU::swap_sched(vectorvec_swap_selct, vector&vec_loa if ((tempOverLimit_3.second != -1) && (vec_run[tempOverLimit_3.second+maxLen].t > itm.t2p)) { overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p); - cout<<"==== overhead added "<vec_swap_selct, vector&vec_loa } +void SwapGPU::swap_construct_tables(vectorvec_swap_selct){ + cudaStream_t stream1; + cudaStream_t stream2; + cout<<"---------------print all 1, 1', 2', 2-----------"<(vec_swap_selct.size()-1);i>=0; i--){ + for (int i =0; i= 0){ + //TODO(junzhe) for time being, remove negative r_idx itms. + cout<(Table_sched.find(itm.i1)->second) = itm.r_idx; + std::get<1>(Table_sched.find(itm.i1)->second) = 0; + } + //i2p swap + if (Table_sched.find(itm.i2p) == Table_sched.end()){ + Table_sched[itm.i2p] = std::make_tuple(itm.r_idx,1,-1,-1); + } else { + std::get<0>(Table_sched.find(itm.i2p)->second) = itm.r_idx; + std::get<1>(Table_sched.find(itm.i2p)->second) = 1; + } + // i1p sync + if (Table_sched.find(itm.i1p) == Table_sched.end()){ + Table_sched[itm.i1p] = std::make_tuple(-1,-1,itm.r_idx,0); + } else { + std::get<2>(Table_sched.find(itm.i1p)->second) = itm.r_idx; + std::get<3>(Table_sched.find(itm.i1p)->second) = 0; + } + //i2 sync + if (Table_sched.find(itm.i2) == Table_sched.end()){ + Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1); + } else { + std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx; + std::get<3>(Table_sched.find(itm.i1p)->second) = 1; + } + + ///Make Table_meta + void* tempPtr = nullptr; + cudaMallocHost(&tempPtr,itm.size); //pinned memory. + BlockMeta meta; + meta.size = itm.size; + meta.cpu_ptr = tempPtr; + meta.out_stream = stream1; + meta.in_stream = stream2; + //meta.last_out_idx = vec_swap_selct[i].last_out_idx; + //meta.last_in_idx = vec_swap_selct[i].last_in_idx; + //meta.i2 = vec_swap_selct[i].i2; + Table_meta[itm.r_idx] = meta; + } + + } + cout<<"---------------print all 1, 1', 2', 2-----------"<"; + cout<(Table_sched.find(i)->second)<<" "; + cout<(Table_sched.find(i)->second)<<" "; + cout<(Table_sched.find(i)->second)<<" "; + cout<(Table_sched.find(i)->second)<get_data()<second.block_ = tempBlock_; + Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data(); + } + } + +} + int SwapGPU::swap_test(vectorvec_block,int &maxLen, int &location){ ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx. @@ -593,6 +683,15 @@ void SwapGPU::swap_plan(){ vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - three_more_location - maxLen; vec_pieceMsg[i].t = vec_pieceMsg[i].t - tempTime_baseline; } + + // build opsSqn, and sizeSqn + // cout<<"------printing sequenc--------"<one_itr(&vec_pieceMsg[location+4*maxLen],&vec_pieceMsg[location+5*maxLen]); + for (int i =0; itemp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]); vec_run = temp_vec_run; @@ -715,9 +814,10 @@ void SwapGPU::swap_plan(){ /// select till maxLoad_ideal, wdto auto tempLoad = origin_load; - auto memLimit_wdto = 350<<20; + auto memLimit_wdto = 550<<20; //TODO(junzhe) memLimit = maxLoad_ideal*1.4 auto vec_swap_wdto = swap_select(vec_swap,tempLoad,memLimit_wdto,"wdto"); + // vec_swap_selct_global = vec_swap_wdto; cout<<"size of vec_swap_wdto: "<(Table_sched.find(r_gc)->second); auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second); auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second); auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second); - if (swap_dir == 0){ SwapOut_idx(swap_idx); - cout<<"Swap Out "<second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaEventSynchronize(last_meta.in_event); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. + + // Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. + last_meta.block_->update_data(nullptr); - cout<<"to free data_"<Free(last_meta.data_); last_meta.data_ = nullptr; //not really needed TODO(junzhe) - cout<<"sync out "<second = last_meta; } if (sync_dir == 1){ + ///sync swap-in, including sync, update block's data_ to new gpu address, update meta. //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe) auto last_meta = Table_meta.find(sync_idx)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaEventSynchronize(last_meta.out_event); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - Table_not_at_device.erase(last_meta.block_); + // Table_not_at_device.erase(last_meta.block_); last_meta.block_->update_data(last_meta.data_); - cout<<"sync in "<second = last_meta; } - cout<<"-------"< v = swap_split(blockInfo, " "); void* tempPtr; stringstream convert(v[1]); @@ -1013,6 +1120,7 @@ void SwapGPU::Append(string blockInfo){ string tempStr1 = strm1.str(); blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2]; } + // update global load if (maxLen < maxLen_threshold){ if (v[0] == "Malloc"){ @@ -1027,36 +1135,29 @@ void SwapGPU::Append(string blockInfo){ global_load.push_back(global_load[global_load.size()-1]); } } + + //append into vec_block + vec_block.push_back(blockInfo); + + //cout<size()<maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){ - fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app); - for (int i =0; imaxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){ + // fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app); + // for (int i =0; i maxLen_threshold) { - //cout<get_data()<second.block_ = tempBlock_; - Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data(); - } - } + //print time duration per iteration if ((maxLen>maxLen_threshold) && ((gc-location)%(maxLen) == 0)){ if (tempTime != 0){ fstream file_time("itr_time.csv", ios::in|ios::out|ios::app); @@ -1067,6 +1168,18 @@ void SwapGPU::Append(string blockInfo){ tempTime = (std::chrono::system_clock::now()).time_since_epoch().count(); } + //check if last iteration, TODO(junzhe) further verify with MallocFree. + if (asyncSwapFlag == 1){ + int r_gc = (gc-location)%maxLen; + if (tempBlock_->size() != sizeSequence[r_gc]){ + asyncSwapFlag = 0; + cout<<"!!!! asyncSwapFlag changed back to 0"<second)->second; - auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - cudaEventSynchronize(reading_meta.in_event); - auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - //cout<<"GetRealGpuPtr, overhead is: "<second<<" "<update_data(reading_meta.data_); + // //here should be not update_data() + // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second; + // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + // cudaEventSynchronize(reading_meta.in_event); + // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + // //cout<<"GetRealGpuPtr, overhead is: "<second<<" "<update_data(reading_meta.data_); - //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<second<second<second = meta; //cout<<"time for asynchrous: "<second = meta; //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update. //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc index 93a17a823f..ae8293b3db 100644 --- a/src/core/memory/memory.cc +++ b/src/core/memory/memory.cc @@ -98,10 +98,12 @@ void CnMemPool::Malloc(void **ptr, const size_t size) { void CnMemPool::Free(void *ptr) { CHECK(initialized_) << "Cannot free the memory as the pool is not initialzied"; - //cout<<"(normal)to free ptr "< Date: Fri, 31 Aug 2018 11:18:55 +0800 Subject: [PATCH 15/19] enable include negative r_idx into Table_sched --- src/core/device/swap_gpu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 1521912b9c..6d101a23d2 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -558,7 +558,7 @@ void SwapGPU::swap_construct_tables(vectorvec_swap_selct){ // for (int i = static_cast(vec_swap_selct.size()-1);i>=0; i--){ for (int i =0; i= 0){ + // if (itm.r_idx >= 0){ //TODO(junzhe) for time being, remove negative r_idx itms. cout<vec_swap_selct){ //meta.last_in_idx = vec_swap_selct[i].last_in_idx; //meta.i2 = vec_swap_selct[i].i2; Table_meta[itm.r_idx] = meta; - } + // } } cout<<"---------------print all 1, 1', 2', 2-----------"< Date: Fri, 31 Aug 2018 14:14:09 +0800 Subject: [PATCH 16/19] cross iteration swap, last iteration, milestone correct swap_construct_tables(), included negative r_idx for swap_update_tables() and DeploySwap() include negative r_idx for DeploySwap() impl GetRealGpuPtr() to swapIn nullptr Block at last iteration impl GetRealGpuPtr(), and optimize data() and mutable_data() impl GetRealGpuPtr(), and optimize data() and mutable_data() verify const issue change to return tempData instead of updating data_ without remove erasing in Table_not_at_device milestone of last itr, at 550 MB --- include/singa/core/device.h | 5 +- src/core/common/common.cc | 8 +- src/core/device/swap_gpu.cc | 156 +++++++++++++++++++++++++----------- 3 files changed, 118 insertions(+), 51 deletions(-) diff --git a/include/singa/core/device.h b/include/singa/core/device.h index db50dd4d4a..ae9dd88425 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -327,9 +327,12 @@ class SwapGPU : public Device { //update Table_meta, during Append() void swap_update_tables(Block* tempBlock_); - //swap, during Append() + //swap/sync during Append() void DeploySwap(); + //exec DelpoySwap + void DeploySwap_exec(int r_gc); + //load profile as per synchronous swap. diff --git a/src/core/common/common.cc b/src/core/common/common.cc index 5a6f61268b..f3d144a9c8 100644 --- a/src/core/common/common.cc +++ b/src/core/common/common.cc @@ -48,7 +48,9 @@ void* Block::mutable_data() { //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"slept to get data_ updated: "<(tempData_) + offset_; } return static_cast(data_) + offset_; @@ -76,7 +78,9 @@ const void* Block::data() const { //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"slept to get data_ updated"<(tempData_) + offset_; } diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 6d101a23d2..82a72f80b1 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -587,7 +587,7 @@ void SwapGPU::swap_construct_tables(vectorvec_swap_selct){ Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1); } else { std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx; - std::get<3>(Table_sched.find(itm.i1p)->second) = 1; + std::get<3>(Table_sched.find(itm.i2)->second) = 1; } ///Make Table_meta @@ -605,7 +605,7 @@ void SwapGPU::swap_construct_tables(vectorvec_swap_selct){ // } } - cout<<"---------------print all 1, 1', 2', 2-----------"<vec_swap_selct){ void SwapGPU::swap_update_tables(Block* tempBlock_){ // update Table_meta's block_ and data_; update once atfer swap test is passed. - //TODO(junzhe) should not be able to update negative r_idx, as of now. + // enable to update negative r_idx. + // it's safe in below procedure, as r_gc and r_gc_n should never be the same. if (testFlag == 1) { - //cout<second.block_ = tempBlock_; Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data(); } + + //update negative r_idx + int r_gc_n = r_gc - maxLen; + if (!(Table_meta.find(r_gc_n)==Table_meta.end())){ + //cout<<"r_gc, gc and size ot Table_meta "<get_data()<second.block_ = tempBlock_; + Table_meta.find(r_gc_n)->second.data_ = tempBlock_->get_data(); + } } } @@ -1058,51 +1069,68 @@ void SwapGPU::MakeMetaTable(Block* block_,void* data_,int size){ void SwapGPU::DeploySwap(){ ///swap and sync as per schedule. int r_gc = (gc-location)%maxLen; + int r_gc_n = r_gc - maxLen; - if ((asyncSwapFlag == 1) && (!(Table_sched.find(r_gc) == Table_sched.end()))){ - cout<<"--------sched action at "<(Table_sched.find(r_gc)->second); - auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second); - auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second); - auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second); - if (swap_dir == 0){ - SwapOut_idx(swap_idx); - cout<<"----Swap Out "<second; - auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - cudaEventSynchronize(last_meta.in_event); - auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - - // Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. - - last_meta.block_->update_data(nullptr); - // cout<<"to free data_"<Free(last_meta.data_); - last_meta.data_ = nullptr; //not really needed TODO(junzhe) - cout<<"----sync out "<second = last_meta; + if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) { + cout<<"condition B"<second; - auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - cudaEventSynchronize(last_meta.out_event); - auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // Table_not_at_device.erase(last_meta.block_); - last_meta.block_->update_data(last_meta.data_); - cout<<"----sync in "<second = last_meta; + if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc) == Table_sched.end()))) { + cout<<"condition C"<(Table_sched.find(r_gc)->second); + auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second); + auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second); + auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second); + if (swap_dir == 0){ + SwapOut_idx(swap_idx); + cout<<"----Swap Out "<second; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaEventSynchronize(last_meta.in_event); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + + Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. + + last_meta.block_->update_data(nullptr); + // cout<<"to free data_"<Free(last_meta.data_); + last_meta.data_ = nullptr; //not really needed TODO(junzhe) + cout<<"----sync out "<second = last_meta; + } + if (sync_dir == 1){ + ///sync swap-in, including sync, update block's data_ to new gpu address, update meta. + //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe) + auto last_meta = Table_meta.find(sync_idx)->second; + auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaEventSynchronize(last_meta.out_event); + auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); + Table_not_at_device.erase(last_meta.block_); + last_meta.block_->update_data(last_meta.data_); + cout<<"----sync in "<second = last_meta; + } } void SwapGPU::Append(string blockInfo){ @@ -1191,6 +1219,37 @@ void SwapGPU::Append(string blockInfo){ } void* SwapGPU::GetRealGpuPtr(const Block* block_){ + // in case that block is at host memory, swapIn ad hoc. + auto r_idx = Table_not_at_device.find(block_)->second; + + // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); + cudaError_t err; + BlockMeta meta = Table_meta.find(r_idx)->second; + cudaEventCreate (&meta.in_event); + //cout<<"update block and data of r_idx: "<Malloc((void**)&ptr, meta.size); + //cout<<"expected results update_data:: "<update_data(last_meta.data_); + // cout<<"----sync in "<second = last_meta; + Table_meta.find(r_idx)->second = meta; + // //here should be not update_data() // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second; // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); @@ -1199,12 +1258,13 @@ void* SwapGPU::GetRealGpuPtr(const Block* block_){ // //cout<<"GetRealGpuPtr, overhead is: "<second<<" "<update_data(reading_meta.data_); - // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<second<update_data(static_cast(ptr)); + + cout<<"print ptr from function GetRealGpuPtr() "< Date: Mon, 10 Sep 2018 16:06:56 +0800 Subject: [PATCH 17/19] vd2: swap+pool new class of pool: SwapPool important APIs: PoolOpt(), Malloc(), Free() PoolOpt() takes in M/F sequences including those induced by swapping cross-iteration variables and last iteration case solved. record down MF after swap done, for one iteration --- examples/cifar10/train.py | 4 +- include/singa/core/device.h | 1 + include/singa/core/memory.h | 36 ++- src/core/device/cuda_gpu.cc | 2 +- src/core/device/platform.cc | 2 +- src/core/device/swap_gpu.cc | 89 ++++++- src/core/memory/memory.cc | 467 ++++++++++++++++++++---------------- 7 files changed, 376 insertions(+), 225 deletions(-) diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py index 652772ba5a..861bd65ac0 100644 --- a/examples/cifar10/train.py +++ b/examples/cifar10/train.py @@ -130,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, dev = device.get_default_device() else: print('Using GPU') - dev = device.create_cuda_gpu_on(1) + dev = device.create_cuda_gpu_on(0) net.to_device(dev) opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay) @@ -153,7 +153,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, fileTimeLog.write('Epoch %d: ' % epoch) fileTimeLog.write(str(int(round(time.time()*1000)))) fileTimeLog.write('\n') - for b in range(15): #num_train_batch): + for b in range(20): #num_train_batch): print ("start of iteration %d: " %b) #time.sleep(1) fileTimeLog.write('iteration %d: ' % b) diff --git a/include/singa/core/device.h b/include/singa/core/device.h index ae9dd88425..96bc8e9c41 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -365,6 +365,7 @@ class SwapGPU : public Device { //vec_block vectorvec_block; //iteration 0-3 vectorvec_block_fresh; //iteration 4 5 6 + vectorvec_block_mf; //itr 8 9 10 vectorglobal_load; // from begining vectororigin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use. vectorvec_run; diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h index cc40d0433c..343b4449de 100644 --- a/include/singa/core/memory.h +++ b/include/singa/core/memory.h @@ -50,6 +50,8 @@ class DeviceMemPool { virtual void Malloc(void** ptr, const size_t size) = 0; virtual void Free(void* ptr) = 0; virtual void Append(string blockInfo) = 0; + + virtual void PoolOpt(vector &vec_mf) = 0; virtual void SwapOut(void* data_) = 0; virtual void SwapIn(void* data_) = 0; @@ -74,7 +76,9 @@ class CnMemPool : public DeviceMemPool { void Malloc(void** ptr, const size_t size); void Free(void* ptr); - void Append(string blockInfo){} + void Append(string blockInfo){} + + void PoolOpt(vector &vec_mf) override {} void SwapOut(void* data_) override {} void SwapIn(void* data_) override {} @@ -102,7 +106,9 @@ class CudaMemPool : public DeviceMemPool { public: void Malloc(void** ptr, const size_t size) override; void Free(void* ptr) override; -void Append(string blockInfo){} + void Append(string blockInfo){} + + void PoolOpt(vector &vec_mf) override {} void SwapOut(void* data_) override {} void SwapIn(void* data_) override {} @@ -134,9 +140,11 @@ class SmartMemPool: public DeviceMemPool { void getMaxLoad(void); std::pair GetMemUsage() override; void Append(string blockInfo); + + void PoolOpt(vector &vec_mf) override {} - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} + void SwapOut(void* data_) override {} + void SwapIn(void* data_) override {} protected: void Init(); private: @@ -196,19 +204,22 @@ struct SwapMeta{ void* d_ptr; //not used for }; -class Swap : public DeviceMemPool { +class SwapPool : public DeviceMemPool { public: - Swap(const MemPoolConf &conf); //constructor + SwapPool(const MemPoolConf &conf); //constructor //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. void Malloc(void** ptr, const size_t size); void Free(void* ptr); - ~Swap(); + ~SwapPool(); void getMaxLoad(void); std::pair GetMemUsage() override; void Append(string blockInfo); void SwapOut(void* data_); void SwapIn(void* data_); + + //PoolOpt() construct pool based on MF info after Swap constructed. + void PoolOpt(vector &vec_mf); protected: void Init(); private: @@ -219,8 +230,15 @@ class Swap : public DeviceMemPool { std::mutex mtx_; vector vec_block; size_t swapLimit = 1<<23; //8MB - mapTable_id2LookUpElement; //old TODO(junzhe) remove - map>Table_Meta; + int poolFlag = 0; + int pc = 0; + int maxLen_mf = 0; + void* ptrPool = nullptr; + mapTable_p2r; //ptr for arrival idx, for look up Table during free + mapTable_r2v; //r-> vertex + vector>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe) + // mapTable_id2LookUpElement; //old TODO(junzhe) remove + // map>Table_Meta; }; #endif diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc index 16de1ef5ca..52d4b4fb02 100644 --- a/src/core/device/cuda_gpu.cc +++ b/src/core/device/cuda_gpu.cc @@ -48,7 +48,7 @@ const int kNumCudaStream = 1; CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) { MemPoolConf conf; conf.add_device(id); - pool_ = std::make_shared(conf); + pool_ = std::make_shared(conf); Setup(); } diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc index d64b2594c2..48b2a94520 100644 --- a/src/core/device/platform.cc +++ b/src/core/device/platform.cc @@ -128,7 +128,7 @@ Platform::CreateCudaGPUsOn(const vector &devices, size_t init_size) { conf.add_device(device); CHECK_LE(bytes, Platform::GetGPUMemSize(device).first); } - auto pool = std::make_shared(conf); + auto pool = std::make_shared(conf); vector > ret; for (auto device : devices) { diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 82a72f80b1..cd4f1fdbd9 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -42,7 +42,6 @@ const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice, ///functions to be used ///Section for structs and respective sorting function: -// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm @@ -924,7 +923,8 @@ SwapGPU::SwapGPU(int id) : Device(id, kNumCudaStream) { MemPoolConf conf; conf.add_device(id); - pool_ = std::make_shared(conf); + //TODO(junzhe) note that it has been for building SwapGPU, which doesnt matter. + pool_ = std::make_shared(conf); Setup(); } @@ -987,6 +987,26 @@ void* SwapGPU::Malloc(int size) { if (size > 0) { CUDA_CHECK(cudaSetDevice(id_)); pool_->Malloc((void**)&ptr, size); + + ///append vec_block_mf + if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) + && ((gc - maxLen) >= three_more_globeCounter)){ + string tempStr1 ="Malloc "; + stringstream strm2; + strm2<Free(ptr); + ///append vec_block_mf + if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) + && ((gc - maxLen) >= three_more_globeCounter)){ + string tempStr1 ="Free "; + stringstream strm2; + strm2<update_data(nullptr); // cout<<"to free data_"<Free(last_meta.data_); + ///append vec_block_mf + if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) + && ((gc - maxLen) >= three_more_globeCounter)){ + string tempStr1 ="Free "; + stringstream strm2; + strm2<second = last_meta; @@ -1213,8 +1263,24 @@ void SwapGPU::Append(string blockInfo){ //test moved from start of malloc/free to end of append, only gc+1 changed Test_sched_switch_swap(); - //NOTE: this gc++ includes read/write and AppendLayer as well, in addition to malloc/free. + //NOTE: this gc includes read/write and AppendLayer as well, in addition to malloc/free. gc++; + if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) == three_more_globeCounter)){ + cout<<"==================to call PoolOpt"<PoolOpt(vec_block_mf); + cout<<"==================to call PoolOpt done"<Malloc((void**)&ptr, meta.size); + ///append vec_block_mf + if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) + && ((gc - maxLen) >= three_more_globeCounter)){ + string tempStr1 ="Malloc "; + stringstream strm2; + strm2< split(string s, string delimiter) { //vector of pairMsg is used in run. //vector of iterMsg is used in test. -vector strVec_2_pieceMsgVec(vector vec, int &idxRange){ +vector strVec_2_pieceMsgVec(vector vec, int &idxRange){ /* - convert vector of string into vector of onePieceMsg, sorted by ptr and then idx, and update idxRange to pieceMsgVec size. + convert vector of string into vector of onePieceMsg_pool, sorted by ptr and then idx, and update idxRange to pieceMsgVec size. */ - vectoronePieceMsgVec_; + vectoronePieceMsg_poolVec_; for (int i=0;i v = split(vec[i], " "); if (v[0]=="Malloc"){ @@ -262,24 +262,24 @@ vector strVec_2_pieceMsgVec(vector vec, int &idxRange){ result =-1; cout<<"error for converting size from str to int."<(onePieceMsgVec_.size()); + sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx()); + idxRange = static_cast(onePieceMsg_poolVec_.size()); - return onePieceMsgVec_; + return onePieceMsg_poolVec_; }// end of strVec_2_pieceMsgVec function -pair,vector> pieceMsgVec_2_pairOfPairMsgVec(vectoronePieceMsgVec_, int idxRange){ +pair,vector> pieceMsgVec_2_pairOfPairMsgVec(vectoronePieceMsg_poolVec_, int idxRange){ /* pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks. */ @@ -288,27 +288,27 @@ pair,vector> pieceMsgVec_2_pairOfPairMsgVec(vecto int i=0; //while loop processes a pair at each time, if got a pair. - while (i<(onePieceMsgVec_.size()-1)){ + while (i<(onePieceMsg_poolVec_.size()-1)){ //condition A: start with free. do nothing. - if (onePieceMsgVec_[i].MallocFree==-1){ + if (onePieceMsg_poolVec_[i].MallocFree==-1){ i+=1; } //condition B: start with Malloc, next item same ptr and is free. - if ((onePieceMsgVec_[i].MallocFree==1)&& (onePieceMsgVec_[i+1].MallocFree==-1)&&((onePieceMsgVec_[i].ptr==onePieceMsgVec_[i+1].ptr))){ - onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,onePieceMsgVec_[i+1].idx); + if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)&&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){ + onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx); onePairMsgVec_1.push_back(tempPair); i+=2; } // condition C: start with Malloc, no free. - if ((onePieceMsgVec_[i].MallocFree==1)&&(onePieceMsgVec_[i].ptr!=onePieceMsgVec_[i+1].ptr)){ - onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,idxRange); + if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){ + onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange); onePairMsgVec_2.push_back(tempPair); i+=1; } }//end of while //condition D: if still left with the last item - if ((i colorSomeVertices(vector pairMsgVec_, size_t &offset, ///get cross-iteration duration pairs pair,map> cross_itr_durations(vectorvec_double, int location, int maxLen, int &doubleRange){ - vectoronePieceMsgVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange); - pair,vector>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_2,doubleRange); + vectoronePieceMsg_poolVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange); + pair,vector>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_2,doubleRange); mapTable_r2d; //full duration info, cross-iteration duration. mapTable_d2r; @@ -536,8 +536,8 @@ vector run(vectorvec, int &idxRange, size_t &offset, size_t &off update idxRange, offset. time complexity: O(n^2) where n is maxLen. */ - vectoronePieceMsgVec_ = strVec_2_pieceMsgVec(vec,idxRange); - pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_,idxRange); + vectoronePieceMsg_poolVec_ = strVec_2_pieceMsgVec(vec,idxRange); + pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_,idxRange); //1. normal blocks 2. cross-iteration blocks. vectorpairMsgVec_1 = pairOfPairMsgVec_.first; vectorpairMsgVec_2 = pairOfPairMsgVec_.second; @@ -643,8 +643,8 @@ int test(vectorvec3, int &maxLen, int &location){ */ cout<<"====================== test ========================="<onePieceMsgVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3); - pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_3,idxRange3); + vectoronePieceMsg_poolVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3); + pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_3,idxRange3); vectorrep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_); //get repeated sub vector. @@ -976,13 +976,13 @@ void SmartMemPool::Append(string blockInfo) { vec_block_RWMF.push_back(blockInfo); } -///Swap: -Swap::Swap(const MemPoolConf &conf){ +///SwapPool +SwapPool::SwapPool(const MemPoolConf &conf){ conf_ = conf; } -void Swap::Init(){ - //TODO(junzhe) Note, this is dummy here, not catter multiple GPU. +void SwapPool::Init(){ + mtx_.lock(); if(!initialized_){ initialized_ =true; @@ -990,203 +990,252 @@ void Swap::Init(){ mtx_.unlock(); } -void Swap::Malloc(void** ptr, const size_t size){ - //cout<<"to malloc"<meta = std::make_pair(cpu, gpu); - Table_Meta[*ptr] = meta; - - swapLookUpElement temp; - temp.size = size; - Table_id2LookUpElement[*ptr] = temp; - // int i = 0; - // if (!(Table_id2LookUpElement.find(*ptr)==Table_id2LookUpElement.end())){ - // i = i + 1; - // temp.data_ = *ptr +i*sizeof(char); - // while(!(Table_id2LookUpElement.find(temp.data_)==Table_id2LookUpElement.end())){ - // //TODO(swap) verify this loop, can simplify as well. - // i = i + 1 - // temp.data_ = *ptr +i*sizeof(char); - // } - // } else { - // temp.data_ = *ptr; - // } - // temp.realGpuPtr = *ptr; - // temp.location = 1; - // temp.size = size; - // create before swap. - // if (size>swapLimit){ - // temp.realCpuPtr = malloc(size); +void SwapPool::PoolOpt(vector &vec_mf) { + //TODO(junzhe) redo 9/17 + + ///process vec_mf of 3itr into blocks,maxLen + //assume format of string: MF ptr size; + //onePieceMsg_pool verified + // for (int i = 0; i< vec_mf.size();i++){ + // cout<<"print mf "<onePieceMsg_poolVec_; + maxLen_mf = vec_mf.size()/3; + cout<<"maxLen_mf "< v = split(vec_mf[i], " "); + // cout<<"print mf "<>result)){ + result =-1; + cout<<"error for converting size from str to int."<pairMsgVec_; + int i = 0; + // cout<<"before while loop"<=0 && onePieceMsg_poolVec_[i].idx =0 && onePieceMsg_poolVec_[i+1].idx (pairMsgVec_.size()); + vectorvertices; + for (int i=0; i>colorMerged = mergeSeg(vertices[i].colorOccupied); + + // vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset); + vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, offset); + //update of offset, largest memory footprint as well. + if (vertices[i].colorRange.second >=offset){ + offset = vertices[i].colorRange.second+1; + } + }//end of for loop. + cout<<"offset is "<second.size))){ + //not in table of negative r_idx + cudaError_t status = cudaMalloc(ptr, size); + CHECK_EQ(status, cudaError_t::cudaSuccess); + } else{ + //in the table of negative r_idx + auto tempMeta = Table_r2v.find(pc-maxLen_mf)->second; + allocatedPtr = tempMeta.ptr; + *ptr = allocatedPtr; + Table_p2r[allocatedPtr]=pc-maxLen_mf; + + } + } else{ + //8 9 10 + int r_pc = pc%maxLen_mf; + if ((Table_r2v.find(r_pc) == Table_r2v.end()) || (!(size == Table_r2v.find(r_pc)->second.size))){ + //not here, should be abnormal + cudaError_t status = cudaMalloc(ptr, size); + CHECK_EQ(status, cudaError_t::cudaSuccess); + } else{ + //in the table + auto tempMeta = Table_r2v.find(r_pc)->second; + allocatedPtr = tempMeta.ptr; + *ptr = allocatedPtr; + Table_p2r[allocatedPtr]=r_pc; -void Swap::SwapOut(void* data_){ - printf("A. to swapOut\n"); - auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - size_t swapSize = Table_Meta.find(data_)->second.second.swapSize; - Table_Meta.find(data_)->second.first.ptr = malloc(swapSize); - SwapMeta cpu, gpu; - cpu = Table_Meta.find(data_)->second.first; - gpu = Table_Meta.find(data_)->second.second; - cudaError_t err; - err=cudaMemcpy(cpu.ptr,gpu.ptr,gpu.swapSize,cudaMemcpyDeviceToHost); - if (err != cudaSuccess) - { - fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); - exit(EXIT_FAILURE); } - auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - fstream file_block3("blockInfo_swapOut.text", ios::in|ios::out|ios::app); - file_block3<second.first; - gpu = Table_Meta.find(data_)->second.second; - gpu.ptr=nullptr; - cudaError_t status = cudaMalloc(&gpu.ptr, gpu.swapSize); - CHECK_EQ(status, cudaError_t::cudaSuccess); - Table_Meta.find(data_)->second.second.ptr=gpu.ptr; - //cout<<"after alloc:1 "<second.second.ptr<second.size; - // cpu.ptr=malloc(cpu.swapSize); - // gpu=cpu; - // cudaMalloc(&gpu.ptr,cpu.swapSize); - - // cudaError_t err; - // err=cudaMemcpy(gpu.ptr, cpu.ptr ,cpu.swapSize,cudaMemcpyHostToDevice); - // printf("2. swapIn done.\n"); - -///below partial copy - // SwapMeta h_meta; - // SwapMeta* d_meta; - // h_meta.swapSize=Table_id2LookUpElement.find(data_)->second.size; - // h_meta.h_ptr=malloc(h_meta.swapSize); - // cudaMalloc(d_meta,sizeof(SwapMeta)); - // cudaMemcpy(d_meta,&h_meta,sizeof(SwapMeta),cudaMemcpyHostToDevice); - - // void** d_ptr; - // cudaMalloc() - // void* h_ptr=malloc(tempSize); - -///below is to swapIn swapped out items - // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // size_t tempSize = Table_id2LookUpElement.find(data_)->second.size; - // void** tempPtr; - // cudaMalloc(tempPtr,tempSize); - // cout<<"1. to swapIn."<second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice); - // if (err != cudaSuccess) - // { - // fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); - // exit(EXIT_FAILURE); - // } - // //cudaMemcpy(data_, Table_id2LookUpElement.find(data_)->second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice); - // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // fstream file_block4("blockInfo_swapIn.text", ios::in|ios::out|ios::app); - // file_block4<second.realCpuPtr); - // cout<<"testing after SwapIn"< Swap::GetMemUsage() { +std::pair SwapPool::GetMemUsage() { //empty return std::make_pair(0, 0); } -Swap::~Swap(){ - //put in notes - fstream file_block1("blockInfo.text", ios::in|ios::out|ios::app); - for (int i=0; i< vec_block.size();i++){ - file_block1< Date: Wed, 31 Oct 2018 14:08:53 +0800 Subject: [PATCH 18/19] add documentation --- .DS_Store | Bin 8196 -> 0 bytes CMakeLists.txt | 2 +- examples/cifar10/{alexnet.py => cnn.py} | 0 examples/cifar10/train.py | 2 +- include/singa/core/common.h | 6 +- include/singa/core/device.h | 215 ++-- include/singa/core/memory.h | 209 ++- src/.DS_Store | Bin 8196 -> 0 bytes src/core/.DS_Store | Bin 6148 -> 0 bytes src/core/common/common.cc | 51 +- src/core/device/cuda_gpu.cc | 9 +- src/core/device/device.cc | 27 +- src/core/device/swap_gpu.cc | 1487 +++++++++------------- src/core/memory/memory.cc | 1543 +++++++++++------------ 14 files changed, 1562 insertions(+), 1989 deletions(-) delete mode 100644 .DS_Store rename examples/cifar10/{alexnet.py => cnn.py} (100%) delete mode 100644 src/.DS_Store delete mode 100644 src/core/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index e74703a65a58eddc3bf6ead31e5ced542081bb7d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMTWl0n82=R4;= z|37E{?|d^SXBGgkrJ&aXQ~>~^OCVoP)isJ37uSLk2`r_ANb%t2GM1B}-iMjo0`Jfe zG7vHlG7vHlG7vKGe_(+2Y*E-E`@S3v>yUwvfg6$m@qS3qB`_7>q(uMKL6v_5AQVRc zzfhZt24Nz=RDhEbX@m;OP(m4s!4U&xIN1|{Un;;!i8358IDB9*GX^IV^k=90CwzCn zl*F(O83-9z$bfi!O2CCYWHX~zpWiLh^^(c2Fj7*wWa%=dC`>71C%T87(QZ%k3T}&* zKj!yL-N*#(wBuMaZ7p-uG)4#c>Q=}0G}AWnSGgbWHErD)ZE*^=?)vvRqIts<>rXVu zqvPWZ^}Fj5^}8A->JsDi$sNhML}SCwi3!FkYwP!QoEjNBJAQ8B!53so7`zssd|rwd zX7jV$VypZ<5qUn7llgr;lb^ag)^V^?9ptNu^s~Eip6S@`Q6ujX4)SuB67R@5_L00H z#(T4lRT#0|L0*y7tZc#3JfqpNy3MnO+vgc)Jkc6(oxIoTJNB6rD{O5cxhWj-)YuLJJ5A_yJ z*UVT(+SW#l3LI0Esj{_|RnePcvBg~b3K{cs!Su#>4P#S$WqFxe9lKTGSJ-i#e#K!& z7c=SpNUS&h86O)s)*ua3p~ zg}qtV4V^JoliDa(+oz!>m6Sb&wvs4pqb=(mI1UEnU<{ssC*di$0I$MjxB_p&`|vS* z3ZKE}@GblZKf|x^JNyBE!e7Wx!R5FDqqrVxa04cB5AMY#+=ngLiHGoB?7}pjz>|0i zH5|emo<g1!o2x-7Wi{SIsxr1_#?V_`M-Vf@Bi0T&S5ek10e%*8NkxE?zR@PE%w$IXYB-Cr|1$z z_)SXmL#XnP>(1); } // Disabled as it is not used currently. @@ -90,7 +90,7 @@ class Block { void* data_ = nullptr; size_t size_ = 0; size_t offset_ = 0; - Device* ptrDevice_; + Device* ptr_device_; bool initialized_ = false; // Disabled as it is not used currently. // std::shared_ptr> ref_count_ = nullptr; diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 96bc8e9c41..e9dcc1402d 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -66,10 +66,8 @@ class Device { /// Called by Tensor. void FreeBlock(Block* block); - void AppendInfo(string blockInfo); - void* GetRealGpuPtrInfo(const Block* block_); - void SwapOutInfo(const Block* block_); - void SwapInInfo(const Block* block_); + void AppendInfo(string block_info); + void* UpdateGpuPtrInfo(const Block* block_ptr); /// Return the size (bytes) of memory in use /// TODO(wangwei) override this function for all devices. @@ -108,7 +106,7 @@ class Device { int id() const { return id_; } - virtual void* GetRealGpuPtr(const Block* block_) = 0; + virtual void* UpdateGpuPtr(const Block* block_ptr) = 0; private: Device() {}; @@ -125,11 +123,8 @@ class Device { /// Free device memory. virtual void Free(void* ptr) = 0; - virtual void MakeMetaTable(Block* block,void* data_,int size) = 0; - virtual void Append(string blockInfo) = 0; - - virtual void SwapOut(const Block* block_) = 0; - virtual void SwapIn(const Block* block_) = 0; + virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0; + virtual void Append(string block_info) = 0; protected: int id_ = 0; @@ -171,11 +166,10 @@ class CppCPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override {} - void Append(string blockInfo) override {} - void* GetRealGpuPtr(const Block* block_) override {} - void SwapOut(const Block* block_) override {} - void SwapIn(const Block* block_) override {} + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} + void Append(string block_info) override {} + void* UpdateGpuPtr(const Block* block_ptr) override {} + }; @@ -206,11 +200,9 @@ class CudaGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override {} - void Append(string blockInfo) override; - void* GetRealGpuPtr(const Block* block_) override; - void SwapOut(const Block* block_) override; - void SwapIn(const Block* block_) override; + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} + void Append(string block_info) override; + void* UpdateGpuPtr(const Block* block_ptr) override; private: void Setup(); @@ -222,21 +214,21 @@ class CudaGPU : public Device { /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host. ///SwapGPU -struct onePieceMsg{ +struct DeviceOptInfo{ /* - members: [ptr, size, MallocFree, idx] + members: [ptr, size, operation_type, idx] */ string ptr; size_t size; - int MallocFree; + int operation_type; int idx; double t; - onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){} }; struct BlockMeta{ /* - block Meta. + meta of swapping memory blocks */ Block* block_ = nullptr; void* data_ = nullptr; @@ -249,34 +241,39 @@ struct BlockMeta{ }; struct SwapBlock{ - + /* + meta of candidate blocks + */ string ptr; - string cat; //A1, A2, A3... + string cat; //sub category of the candidate blocks, read-read, write-read, etc. int name; size_t size; + //index of last read/write before swap out, and first read/write after swap in int r_idx; //out idx int d_idx; //in idx + //index of last read/write before swap out, and first read/write after swap in double r_time; // out time double d_time; //in time - double dt; //delta t: t2'-t1' - double pri; //look at here if big enough TODO(junzhe) - double dto; //t2-t1 - double wdto = 0; //t2-t1 weighted by swap_load - double r_idx_ready; //r_idx + buffer, could be set during selection. - //int free = -1; //when it is freed - //below as per planned. - int i1 = 0; - int i1p = 0; - int i2 = 0; - int i2p = 0; - double t1 = 0; - double t2 = 0; - double t1p = 0; - double t2p = 0; - SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): - ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {} + double DOA; //Duation of Absence + double AOA; //Area of Absence + double DOA_origin; //t2-t1, DOA without taking out time spent + double WDOA = 0; //weighted DOA + double majority_voting = 0; + int r_idx_ready; //r_idx + buffer + + //below are index and time for scheduling + int idx_out_start = 0; + int idx_out_end = 0; + int idx_in_end = 0; + int idx_in_start = 0; + double t_out_start = 0; + double t_out_end = 0; + double t_in_end = 0; + double t_in_start = 0; + SwapBlock(string p, size_t s, int idx_out_start, int idx_in_end, double t_out_start, double t_in_end): + ptr(p), size(s), r_idx(idx_out_start),d_idx(idx_in_end),r_time(t_out_start), d_time(t_in_end) {} }; -/// Device able to Swap memory between Nvidia GPU and Swap +/// Device able to Swap memory between Nvidia GPU and CPU class SwapGPU : public Device { public: ~SwapGPU(); @@ -300,98 +297,92 @@ class SwapGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - //Append at every index: malloc, free, read, mutable - void Append(string blockInfo) override; + //Append at every index: free, read, mutable + void Append(string block_info) override; - //append info after Malloc, pair. - void MakeMetaTable(Block* block,void* data_,int size) override; + //append info after Malloc, as Block* is not available till Malloc() done. + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override; - //all the testing, without swap, during Append() - void Test_sched_switch_swap(); + //Detection and Plan + void DetectionPlan(); //test iteration, return GC - int swap_test(vectorvec_block,int &maxLen, int &location); + int Detection(vectorvec_block,int &iteration_length, int &location_of_2nd_iteration); - //entire plan, from swap_select() to swap_sched(), swap_deploy_tables() - void swap_plan(); + //entire plan, from SelectBlock() to Scheduling(), BuildMetaTables() + void Plan(); - //selection algo - vector swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode); + //block selection algo + vector SelectBlock(vectorvec_swap,vector temp_load,double mem_limit,string mode); //schedule algo - void swap_sched(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); + void Scheduling(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double mem_limit,string mode); - //make tables Table_sched and Table_meta - void swap_construct_tables(vectorvec_swap_selct); + //make tables table_sched and table_meta + void BuildMetaTables(vectorvec_swap_selct); - //update Table_meta, during Append() - void swap_update_tables(Block* tempBlock_); + //update table_meta, during Append() + void UpdateMetaTables(Block* block_ptr); //swap/sync during Append() void DeploySwap(); //exec DelpoySwap - void DeploySwap_exec(int r_gc); - - + void DeploySwapExec(int relative_counter); //load profile as per synchronous swap. - vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); + vector GetIdealLoad(vectorvec_load,vector vec_swap_selct); - //in case gpu ptr wrong. TODO(junzhe) to verify if needed. - void* GetRealGpuPtr(const Block* block_) override; + //in case gpu ptr wrong, updated it after swap_in ad hoc + void* UpdateGpuPtr(const Block* block_ptr) override; - void SwapOut(const Block* block_) override; - void SwapIn(const Block* block_) override; + //Swap Synchronous, for early iterations + void SwapOutSynchronous(const Block* block_ptr); + void SwapInSynchronous(const Block* block_ptr); - //changed to intake data_ instead - void SwapOut_idx(const int r_idx); - void SwapIn_idx(const int r_idx); + //Swap asynchronous, for middle iteraions + void SwapOut(const int idx); + void SwapIn(const int idx); private: void Setup(); - ///Tables needed - //r_idx->BlockMeta - mapTable_meta; - mapTable_block_meta; //TODO(junzhe) for measure speed only. - mapTable_not_at_device; //int refers to its r_idx of the block/meta - //mapTable_block_size; //Table block_ -> size TODO(junzhe) no need, can call block_->size() - - //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D. - map>Table_sched; // changed to with sync_r_idx - // vectorvec_swap_selct_global; + maptable_meta; + maptable_block_meta; //for measure speed only. + maptable_not_at_device; //int refers to its r_idx of the block/meta + map>table_sched; // changed to with sync_r_idx //vec_block - vectorvec_block; //iteration 0-3 - vectorvec_block_fresh; //iteration 4 5 6 - vectorvec_block_mf; //itr 8 9 10 - vectorglobal_load; // from begining - vectororigin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use. - vectorvec_run; - vectoropsSequence; //sequence of operations of one middle iteration - vectorsizeSequence; //size of all operations of one middle iteration - int asyncSwapFlag = 0; //0 for sync, 1 for async. - int testFlag = 0; //0 means open for test, 1 means no need test anymore. - int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write. - int globeCounter = -1; - int maxLen = 0; - int location = 0; - int three_more_location = 0; //location at 3 more iterations later. - int three_more_globeCounter = -1; // - //design requirement TODO(junzhe) - float memLimit_ratio = 0.70; + vectorvec_block; //iterations for Detection, i.e. detect iterations. + vectorvec_block_fresh; //iterations that are used for Planning, + vectorvec_block_mf; //iterations used to construct pool + vectorglobal_load; // load from begining + vectororigin_load; //3 iteration load, for planning. + vectorvec_run; + vectoroperation_sequence; //sequence of operations of one middle iteration + vectorsize_sequence; //size of all operations of one middle iteration + + int async_swap_flag = 0; //0 for sync, 1 for async. + int past_test_flag = 0; //0 means need to test, 1 means no need test anymore. + int global_index = 0; //global counter, index, add 1 after each Malloc/Free/read/write. + int global_index_threshold = -1; + int iteration_length = 0; + int location_of_2nd_iteration = 0; //index of start of 2nd iteration + int location_of_5th_iteration = 0; //index of start of 5th iteration + int three_more_iteration_global_index_threshold = -1; + + //design specs + float mem_limit_ratio = 0.70; size_t smallest_block = 1<<20; //1 MB int data_buffer = 4; // used to control readyIdx int mutable_data_buffer = 6; - double maxLoad; - int maxIdx; - double total_swapInTime = 0; - double total_swapOutTime = 0; - double tempTime = 0; - double tempTime2 = 0; - double tempTime_baseline; //vec_run[0] time - int maxLen_threshold = 1000; + double max_load; + int max_idx; + double total_swap_in_time = 0; + double total_swap_out_time = 0; + double temp_time = 0; + double temp_time_baseline; //vec_run[0] time + int iteration_length_threshold = 1000; private: shared_ptr pool_; @@ -447,11 +438,9 @@ class OpenclDevice : public singa::Device { /// Converts the void pointer into a Buffer object, then deletes the object. /// This has the effect of freeing up device memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override {} - void Append(string blockInfo) override {} - void* GetRealGpuPtr(const Block* block_) override {} - void SwapOut(const Block* block_) override {} - void SwapIn(const Block* block_) override {} + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} + void Append(string block_info) override {} + void* UpdateGpuPtr(const Block* block_ptr) override {} private: diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h index 343b4449de..b3dfd672ec 100644 --- a/include/singa/core/memory.h +++ b/include/singa/core/memory.h @@ -53,8 +53,6 @@ class DeviceMemPool { virtual void PoolOpt(vector &vec_mf) = 0; - virtual void SwapOut(void* data_) = 0; - virtual void SwapIn(void* data_) = 0; /// Return a pair for free and total memory managed by this pool. virtual std::pair GetMemUsage() { return std::make_pair(0u, 0u); @@ -80,8 +78,6 @@ class CnMemPool : public DeviceMemPool { void PoolOpt(vector &vec_mf) override {} - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} std::pair GetMemUsage() override; // release all memory and set cnmem manager to unintialized @@ -110,135 +106,116 @@ class CudaMemPool : public DeviceMemPool { void PoolOpt(vector &vec_mf) override {} - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} }; -//for SmartMemPool -struct lookUpElement{ - /* - for memory pool Malloc look-up table. - */ - int r_idx; - int d_idx; - size_t size; - size_t offset; - void* ptr; - int Occupied; //0 is free, 1 is occupied. - int crossItr; - int Occupied_backup; +//for SmartMemPool and SwapPool +struct PoolBlockMeta{ + /* + for memory pool Malloc look-up table. + */ + int r_idx; + int d_idx; + size_t size; + size_t offset; + void* ptr; + int occupied; //0 is free, 1 is occupied. + int cross_iteration; + int occupied_backup; +}; + +///struct Vertex +struct Vertex{ + int name; + size_t size; + int r; //arrive + int d; //depart + int cross_iteration =0; + pair color_range; + vector> vec_color_preoccupied; + Vertex(int n, size_t s, int r1, int d1):name(n),size(s),r(r1),d(d1){} + }; -///class mem-pool SmartMemPool + +///SmartMemPool class SmartMemPool: public DeviceMemPool { public: - SmartMemPool(const MemPoolConf &conf); //constructor - //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. - void Malloc(void** ptr, const size_t size); - void Free(void* ptr); - ~SmartMemPool(); - void getMaxLoad(void); - std::pair GetMemUsage() override; - void Append(string blockInfo); - - void PoolOpt(vector &vec_mf) override {} - - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} -protected: - void Init(); -private: - MemPoolConf conf_; - // whether the (global) memory pool has been initialized - bool initialized_ = false; - // lock on the initialized variable - std::mutex mtx_; - - string colorMethod; - int mallocFlag =0; //0 for cudaMalloc, 1 for coloringMalloc - int gc =0; //global counter each time Malloc/Free, add 1. - int globeCounter=-1; - int loadLogFlag =1; //record when its 1. - void* ptrPool = NULL; - int idxRange = 0; - size_t offset = 0; - size_t offsetCrossItr=0; //cross iteration offset. - int maxLen =0; - int location=0; - vector vec; - vector vec_block_RW; - vector vec_block_RWMF; - mapTable_r2d; //full duration info, cross-iteration duration. - mapTable_d2r; - //mapTable_r2Ver; - vector>Vec_r2Ver; //b. replace Table_r2Ver - map>Table_load; //gc, - mapTable_p2s; //For tracking load in Free. add when allocate, delete when deallocate. - mapTable_p2r; //ptr for arrival idx, for look up Table during free - int checkPoint=300; //for reduce number of test. - size_t maxTotalLoad; - size_t maxMemUsage; - float memRatio; -}; + SmartMemPool(const MemPoolConf &conf); //constructor + void Malloc(void** ptr, const size_t size); + void Free(void* ptr); + ~SmartMemPool(); + std::pair GetMemUsage() override; + void GetMaxLoad(void); + void Append(string blockInfo); + vector Plan(vectorvec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method); + int Detection(vectorvec_string_test, int &iteration_length, int &location_2nd_iteration); + void PoolOpt(vector &vec_mf) override {} -//for Swap -struct swapLookUpElement{ - /* - book keep the block info and status - */ - void* data_ = nullptr; - void* realGpuPtr = nullptr; - void* realCpuPtr = nullptr; +protected: + void Init(); +private: + MemPoolConf conf_; + // whether the (global) memory pool has been initialized + bool initialized_ = false; + // lock on the initialized variable + std::mutex mtx_; - int location; //1 is at GPU, 2 is at CPU. 3 on the way C2G, 4 on the way G2C. - size_t size; //size may used as of now. + string color_method; + int malloc_flag = 0; //0 for cudaMalloc, 1 for coloringMalloc + int global_index = 0; //global counter each time Malloc/Free, add 1. + int global_index_threshold = -1; + int load_flag = 1; //record load at 1 + void* ptr_pool = NULL; + int idx_range = 0; + size_t offset = 0; + size_t offset_cross_iteration = 0; //cross iteration offset. + int iteration_length = 0; + int location_2nd_iteration = 0; + vector vec; + vector vec_block_rw; //read write only opt info + vector vec_block_rw_mf; //read write, malloc, free opt info + maptable_ridx_to_didx; //table match from r_idx to d_idx + maptable_didx_to_ridx; //table match from d_idx to r_idx + + vector>vec_block_meta; //vec of block meta, index in the vector refering to the r_idx + map>table_load; //global_index, + maptable_ptr_to_size; //for tracking load in Free. add when allocate, delete when deallocate. + maptable_ptr_to_ridx; //ptr for arrival idx, for look up Table during free + int check_point = 300; //for reduce number of test. + size_t max_total_load; + size_t max_mem_usage; }; -struct SwapMeta{ - /* - for copy between block and info. - */ - size_t swapSize; - void* ptr; - void* d_ptr; //not used for -}; +///SwapPool class SwapPool : public DeviceMemPool { public: - SwapPool(const MemPoolConf &conf); //constructor - //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. - void Malloc(void** ptr, const size_t size); - void Free(void* ptr); - ~SwapPool(); - void getMaxLoad(void); - std::pair GetMemUsage() override; - void Append(string blockInfo); - - void SwapOut(void* data_); - void SwapIn(void* data_); + SwapPool(const MemPoolConf &conf); //constructor + void Malloc(void** ptr, const size_t size); + void Free(void* ptr); + ~SwapPool(); + std::pair GetMemUsage() override; + void Append(string blockInfo); - //PoolOpt() construct pool based on MF info after Swap constructed. - void PoolOpt(vector &vec_mf); + //PoolOpt() construct pool based on MF info after Swap constructed. + void PoolOpt(vector &vec_mf); protected: - void Init(); + void Init(); private: - MemPoolConf conf_; - // whether the (global) memory pool has been initialized - bool initialized_ = false; - // lock on the initialized variable - std::mutex mtx_; - vector vec_block; - size_t swapLimit = 1<<23; //8MB - int poolFlag = 0; - int pc = 0; - int maxLen_mf = 0; - void* ptrPool = nullptr; - mapTable_p2r; //ptr for arrival idx, for look up Table during free - mapTable_r2v; //r-> vertex - vector>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe) - // mapTable_id2LookUpElement; //old TODO(junzhe) remove - // map>Table_Meta; + MemPoolConf conf_; + // whether the (global) memory pool has been initialized + bool initialized_ = false; + // lock on the initialized variable + std::mutex mtx_; + + vector vec_block; + int pool_flag = 0; + int pool_index = 0; //like global counter in device class + int iteration_length_mf = 0; //max length of malloc free operation sequences. + void* ptr_pool = nullptr; + maptable_ptr_to_ridx; //map ptr to arrival idx, for look up Table during free + maptable_pool_meta; //table of pool block meta, key with r_idx }; #endif diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index a87953d857c6cf9354df29d312f03c027a6411d3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMTWl0n82Ice&DxX3#+ zgbaiXgbaiXgbaiX{2v&gJzEsE#J(>_!#ZRjWZ;HmK)fGfbO}rZI3v-2bx`FW0SLtr zz%SJ1qCpr7FcIL4L>i%jGL%q;VsOMj8BX?C;Fkz+MxqP{3=SU{%#6Va1^wCS{xRPj zFd;FlLk2Wo$+=jZ|kmqpQD;LO0oV# zDPA)*)zY-5A=b3JWx63Y)f8`zH^g?f?3$ittg60gfA{IJ$+J`ErXPGkCWXO^0V=La z^1{LlEw|7izfhDsU(E39`3%*Sk?uo1YKpHZ&=2p;dZuH${YK6uOz{erQt!?<_R*Xn z#s@Nvl^?U+6tB!^Rwi$0p3!Dmedbxi9rTO|Pqc7`ATOBLfA~ zHPe=nw6!s#5+@a9u6%t}bS78K0pg9Lz~r|jO)rVOG_J; zH0@Sf8ShaB3X)A}+@~I3mDr#RW}LAx%F~tRwnzu#AFyFD_hHTTdb5U2(~Io7Y9o>3 z!rrFqhRztPOKg_w?bA?~h|8Wr+er;2%7t`fcT+ueOGw?ayJc%s<)x%u#dqGm zLyhufBvU!svQt%g8R=M#?rEjb<)m0PS5sBJUOk{D6!DO8@Pn25sLGUouD**f3va-C z@DZu*OZWkPf?wb_Qrt4Ez$(()Ef~cOcsp*vZPTtm#zUmPB=+MV z9>*cfph4;zMH6$Rz5R}qs4a7H3osQl|60>b^DSP%DqUwk)k^*1Ld+b{qC diff --git a/src/core/.DS_Store b/src/core/.DS_Store deleted file mode 100644 index 018d123c47540aef56047ebb82a09e1df9135f84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKK~BUl476c|7IE1l$9!Rb5Ut7!_5(<*1P4m@z~1``zQ!B)1v9omDnwk6(1PqL z87H>WWU3~yi0JM(Y(+LAvPK~)m4>jpX*vt%6;N%ANgf~Ww)=h>4~>CNv5RxRkX|Mk z@wDf^u#Z#M4acWm3~fcC@Vnm8AecUZYK*GnZgY39Ny$BPX>yhkPIvtIQQm;=l>0UnP!r|49Qk9kPQ4Y26WZ!yB!u4 zXX}^c@vJQ^2g AjsO4v diff --git a/src/core/common/common.cc b/src/core/common/common.cc index f3d144a9c8..d6e9c5a301 100644 --- a/src/core/common/common.cc +++ b/src/core/common/common.cc @@ -22,34 +22,28 @@ #include #include #include -//TODO(junzhe) ifdef to counter verify -///only include mutable_data() and data() namespace singa { void* Block::mutable_data() { - //TODO(junzhe) go back to enable it after device done - //std::cout<<"mutable_data() "<AppendInfo(temp); + string temp_str4 = strm4.str(); + string temp = "Mutable "+temp_str2+" "+temp_str4; + ptr_device_->AppendInfo(temp); } - //TODO(junzhe) this should not happen, can verify and remove + + //update ptr after swap in done, if variable is not swapped back yet as expected. if (data_ == nullptr) { - //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"print returned tempData_ "<UpdateGpuPtrInfo(this); return static_cast(tempData_) + offset_; } @@ -59,41 +53,38 @@ void* Block::mutable_data() { const void* Block::data() const { CHECK(initialized_) << "Must initialize data before reading it"; - //TODO(junzhe) go back to enable it after device done - if (ptrDevice_!=nullptr){ + + //Append block info: opt_type, ptr, time_stamp + if (ptr_device_!=nullptr){ //Append info. stringstream strm2; strm2<AppendInfo(temp); + string temp_str4 = strm4.str(); + string temp = "Read "+temp_str2+" "+temp_str4; + ptr_device_->AppendInfo(temp); } - //TODO(junzhe) this should not happen, can verify and remove + //update ptr after swap in done, if variable is not swapped back yet as expected. if (data_ == nullptr) { - //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"print returned tempData_ "<UpdateGpuPtrInfo(this); return static_cast(tempData_) + offset_; } - return static_cast(data_) + offset_; } void* Block::get_data() { + //get data without calling data(), to avoid append block info. return data_; } void Block::update_data(void* data_new) { + //update data_, after the swap in completes. data_ = data_new; - std::cout<<"results update_data:: "<Append(blockInfo); } -void* CudaGPU::GetRealGpuPtr(const Block* block_){ +void* CudaGPU::UpdateGpuPtr(const Block* block_){ return nullptr; } -void CudaGPU::SwapOut(const Block* block_){ - -} - -void CudaGPU::SwapIn(const Block* block_){ - -} } // namespace singa #endif // USE_CUDA \ No newline at end of file diff --git a/src/core/device/device.cc b/src/core/device/device.cc index b2988f3615..59faddc5c6 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -41,9 +41,7 @@ Block* Device::NewBlock(int size) { if (size > 0) { void* ptr = Malloc(size); Block* block_ = new Block(ptr, size,0,this); - //std::cout<<"(reference) from device.cc after, data_, block_ device: "<mutable_data(); - //cout<<"FreeBlock: "<mutable_data()); - //Add Append for free here. + //append block info for free operation. stringstream strm1; strm1< SplitOptString(string s, string delimiter) { + // string delimiter + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + string token; + vector res; + while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); -/// string delimiter -vector swap_split(string s, string delimiter) { - size_t pos_start = 0, pos_end, delim_len = delimiter.length(); - string token; - vector res; - while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { - token = s.substr(pos_start, pos_end - pos_start); - pos_start = pos_end + delim_len; - res.push_back(token); - } - res.push_back(s.substr(pos_start)); - return res; + return res; } -///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg -//vector of pairMsg is used in run. -//vector of iterMsg is used in test. -vector swap_strVec_2_pieceMsgVec(vector vec, int &idxRange){ +vector DeviceOptSeqStrToStruct(vector vec, int &idx_range){ /* - convert vector of string into vector of onePieceMsg, sorted by ptr - and then idx, and update idxRange to pieceMsgVec size. - format of onePieceMsg [ptr, size/-1, flag, idx, timestamp] + convert vector of string into vector of DeviceOptInfo, sorted by ptr + and then idx, and update idx_range to pieceMsgVec size. + format of DeviceOptInfo [ptr, size/-1, flag, idx, timestamp] flag: 1 for malloc, -1 for free, 2 for read, 3 for layer,4 for mutable - version on 5/29, with equval blockInfo length: flag, block_, size, t - */ - vectoronePieceMsgVec_; + */ + vectorvec_opt_info; for (int i=0;i v = swap_split(vec[i], " "); - int MallocFree; + vector v = SplitOptString(vec[i], " "); + int operation_type; if (v[0]=="Malloc"){ - MallocFree = 1; + operation_type = 1; }else if (v[0]=="Free"){ - MallocFree = -1; + operation_type = -1; }else if (v[0]=="Mutable"){ - MallocFree = 4; + operation_type = 4; }else if (v[0]=="Read"){ - MallocFree = 2; + operation_type = 2; }else if (v[0]=="Layer"){ - MallocFree = 3; + operation_type = 3; } - //onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + //DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){} size_t result; stringstream convert(v[2]); if (!(convert>>result)){ - result =-1; - cout<<"error for converting size from str to int."<>tempTime; - tempMsg.t =tempTime; - onePieceMsgVec_.push_back(tempMsg); + convert2>>temp_time; + itm.t =temp_time; + vec_opt_info.push_back(itm); } - sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx()); - idxRange = static_cast(onePieceMsgVec_.size()); - - return onePieceMsgVec_; -}// end of strVec_2_pieceMsgVec function - - -vector Swap_piece2rep (vectoronePieceMsgVec_){ - vectoroneIterMsgVec_; - string tempStr; - int tempIdx=0; - for (int i=0;i(vec_opt_info.size()); + + return vec_opt_info; +} + + +vector DeviceOptSeqRepeatableTestPreProcess(vectorvec_opt_info){ + /* + pre process Device Operation Sequence Struct info for repeatable test, + return a vector of int for fast detection. + */ + vectorvec_opt_simplified_info; + string temp_str; + int temp_idx=0; + for (int i=0;irep; // vector of size_delta, name it as rep for simlisity. - for (int i =0; ivec_rep; // vector of size_delta, name it as vec_rep for simlisity. + for (int i =0; irep, int &maxLen, int &location, int maxLen_threshold, int gc ){ - int idxRange = (int)rep.size(); - int threshold = maxLen_threshold; - vector>maxLen_location; - - for (int i=0; ithreshold){ - break; - } - for (int len=1; len<(idxRange-i);len++){ - if (maxLen>threshold){ - break; - } - if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLenrep, int &iteration_length, int &location_of_2nd_iteration, int iteration_length_threshold, int global_index ){ + /* + repeatable test, input vector of int, + in-place update max_legth (length of iteration) + and location_of_2nd_iteration (where 2nd iteration starts) + */ + int idx_range = (int)rep.size(); + int threshold = iteration_length_threshold; + vector>iteration_length_location_of_2nd_iteration; + + for (int i=0; ithreshold){ + break; + } + for (int len=1; len<(idx_range-i);len++){ + if (iteration_length>threshold){ + break; + } + if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(iteration_lengthstruct2.dto); + return (struct1.DOA_origin>struct2.DOA_origin); } }; -struct less_than_wdto{ +struct sort_by_WDOA_descending{ /* - sort SwapBlock by weighted dto, descending + sort SwapBlock by weighted DOA_origin, descending + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.WDOA>struct2.WDOA); + } +}; + +struct sort_by_AOA_descending{ + /* + sort SwapBlock by pri, descending */ inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) { - return (struct1.wdto>struct2.wdto); + return (struct1.AOA>struct2.AOA); } }; -// struct less_than_r_idx_ready{ -// /* -// sort SwapBlock by r_idx_ready, ascending -// */ -// inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) -// { -// return (struct1.r_idx_readystruct2.pri); - } +struct sort_by_idx_ascending_swap{ + /* + sort DeviceOptInfo_Swap by idx. + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.r_idxstruct2.d_idx); + } }; -struct less_than_Idx_Swap_rvs{ - /* - sort onePieceMsg_Swap by idx. reverse - */ - inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) - { - return (struct1.d_idx>struct2.d_idx); - } +struct sort_by_majority_voting_ascending{ + /* + sort majority voting, ascending + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.majority_voting load_over_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx,int maxLen){ - //input: vec_load, memLimit, range [start_idx, end_idx) - //return range overlimit [first_over_limit, first_below_limit) +pair GetOptIdxAboveLoadLimit(vectorvec_load, size_t mem_limit, int start_idx, int end_idx,int iteration_length){ + /* + get operation index (range) that above the load limit. + input: vec_load, mem_limit, range [start_idx, end_idx) + return range overlimit [first_over_limit, first_below_limit) + */ int first_over_limit = start_idx; int first_below_limit = end_idx; - for (int i = start_idx+maxLen; i < end_idx+maxLen; i++){ - if (vec_load[i] > memLimit){ - first_over_limit = i-maxLen; + for (int i = start_idx+iteration_length; i < end_idx+iteration_length; i++){ + if (vec_load[i] > mem_limit){ + first_over_limit = i-iteration_length; break; } } - for (int i = end_idx+maxLen; i > first_over_limit+maxLen; i--){ - if (vec_load[i] > memLimit){ - first_below_limit = i-1-maxLen; + for (int i = end_idx+iteration_length; i > first_over_limit+iteration_length; i--){ + if (vec_load[i] > mem_limit){ + first_below_limit = i-1-iteration_length; break; } } + if (first_over_limit == start_idx) first_over_limit = -1; + if (first_below_limit == end_idx) first_below_limit = -1; return std::make_pair(first_over_limit, first_below_limit); } -// pair load_below_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){ -// //input: vec_load, memLimit, range [start_idx, end_idx] -// //return range overlimit [first_over_limit, first_below_limit) -// int first_below_limit = maxIdx; -// int last_below_limit = maxIdx; - -// for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){ -// if (vec_load[i] > memLimit){ -// first_below_limit = i+1-maxLen; -// break; -// } -// } - -// for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){ -// if (vec_load[i] > memLimit){ -// last_below_limit = i-1-maxLen; -// break; -// } -// } - -// return std::make_pair(first_below_limit, last_below_limit); -// } - -pair load_peak(vectorvec_load_test,int maxLen){ - double maxLoad_test = 0; - int maxIdx_test = 0; - for (int i = maxLen; i < maxLen*2; i++){ - if (maxLoad_test < vec_load_test[i]){ - maxLoad_test = vec_load_test[i]; - maxIdx_test = i - maxLen; + +pair GetLoadPeak(vectorvec_load_test,int iteration_length){ + /* + return value and index of load peak + */ + double max_load_test = 0; + int max_idx_test = 0; + for (int i = iteration_length; i < iteration_length*2; i++){ + if (max_load_test < vec_load_test[i]){ + max_load_test = vec_load_test[i]; + max_idx_test = i - iteration_length; } } - return std::make_pair(maxLoad_test,maxIdx_test); + return std::make_pair(max_load_test,max_idx_test); } -void load_update(vector& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){ - //update load [start_idx, end_idx) by plusMinus*size - for (int i = start_idx+maxLen; i(size) * plusMinus; +void UpdateLoad(vector& vec_load,int start_idx, int end_idx, int plus_minus, size_t size,int iteration_length){ + /* + update load [start_idx, end_idx) by plus_minus*size + */ + for (int i = start_idx+iteration_length; i(size) * plus_minus; } } -vector SwapGPU::swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode){ + +///define SwapGPU member functions +vector SwapGPU::SelectBlock(vectorvec_swap,vector temp_load,double mem_limit,string mode){ vectorvec_swap_selct; - //vectorvec_swap_reject; - if (mode == "dto"){ - sort(vec_swap.begin(),vec_swap.end(),less_than_dto()); + /* + select swapping blocks based on a cetain priority score or BO score; + with load updated + */ + if (mode == "DOA_origin"){ + sort(vec_swap.begin(),vec_swap.end(),sort_by_DOA_origin_descending()); } - if (mode == "pri"){ - sort(vec_swap.begin(),vec_swap.end(),less_than_pri()); + + if (mode == "AOA"){ + sort(vec_swap.begin(),vec_swap.end(),sort_by_AOA_descending()); } - if (mode == "wdto"){ - //TODO(junzhe) time complexity + + if (mode == "WDOA"){ for (int i = 0; i < vec_swap.size(); i++){ auto itm = vec_swap[i]; for (int j = itm.r_idx; j < itm.d_idx; j++){ - itm.wdto += origin_load[i+maxLen] - memLimit; + itm.WDOA += origin_load[i+iteration_length] - mem_limit; } } - sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); + sort(vec_swap.begin(),vec_swap.end(),sort_by_WDOA_descending()); } - cout<<"===============select block one by one================="< SwapGPU::swap_load_ideal(vectorvec_load,vector vec_swap_selct){ +vector SwapGPU::GetIdealLoad(vectorvec_load,vector vec_swap_selct){ + /* + get load_ideal, which is equivalent to load by synchronous swapping. + */ auto vec_load_return = vec_load; for (int i =0; i&vec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode){ +void SwapGPU::Scheduling(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double mem_limit,string mode){ /* - update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit. + Swap Scheduling algo + update idx_out_end, idx_in_start + compute overhead time + mode selection: no overhead or stick to limit. */ - //TODO(junzhe) wordy, can merge in common part. + overhead = 0; - cout<<"----------------swap_sched----------------"< 0){ - readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); + ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end); } - cout<<" -> "< vec_run[readyIdx+maxLen].t){ //TODO(junzhe) reduce time complexity. - readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead. + + itm.idx_out_start = ready_idx; + itm.t_out_start = vec_run[ready_idx+iteration_length].t; + itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size); + total_swap_out_time+=SwapOutTime(itm.size); + while (itm.t_out_end > vec_run[ready_idx+iteration_length].t){ + //ready means when able to finish swapOut, w/ or w/o overhead. + ready_idx++; } - //get min compare with maxIdx and readyIdx. - readyIdx = std::min(maxIdx,readyIdx); - cout<<" || "< "< 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } - cout<<" -> "< 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); } + itm.idx_in_end = need_idx; + double prepareTime = vec_run[need_idx+iteration_length].t - SwapInTime(itm.size); + total_swap_in_time+=SwapInTime(itm.size); + while (prepareTime < vec_run[need_idx+iteration_length].t){ + need_idx--; } - needIdx = std::max(needIdx,maxIdx+1); - cout<<" || "< 0){ - // cout< itm.t2p)) { - overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p); - cout<<"==== overhead added "< itm.t_in_start)) { + overhead+=(vec_run[temp_over_limit_3.second+iteration_length].t - itm.t_in_start); + UpdateLoad(vec_load_temp,itm.idx_in_start,temp_over_limit_3.second+1,-1,itm.size,iteration_length); + itm.idx_in_start = temp_over_limit_3.second+1; + auto temp_over_limit_4 = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length); } - cout<<" -> "< 0){ - readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); + ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end); } - itm.i1 = readyIdx; - itm.t1 = vec_run[readyIdx].t; - itm.t1p = itm.t1 + SwapOutTime(itm.size); - while (itm.t1p > vec_run[readyIdx].t){ - readyIdx++; + itm.idx_out_start = ready_idx; + itm.t_out_start = vec_run[ready_idx].t; + itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size); + while (itm.t_out_end > vec_run[ready_idx].t){ + ready_idx++; } - itm.i1p = readyIdx; + itm.idx_out_end = ready_idx; vec_swap_selct[i] = itm; } - //update i2p - sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs()); + //update idx_in_start + sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_descending_swap()); for (int i =0; i 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } - itm.i2 = needIdx; - double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size); - while (prepareTime < vec_run[needIdx].t){ - needIdx--; + int need_idx = itm.d_idx; + if (i > 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); } + itm.idx_in_end = need_idx; + double prepareTime = vec_run[need_idx].t - SwapInTime(itm.size); + while (prepareTime < vec_run[need_idx].t){ + need_idx--; } - itm.i2p = needIdx; - itm.t2p = prepareTime; + itm.idx_in_start = need_idx; + itm.t_in_start = prepareTime; vec_swap_selct[i] = itm; - load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary + UpdateLoad(vec_load_temp,itm.idx_out_end,itm.idx_in_start+1,-1,itm.size,iteration_length); } } @@ -548,365 +532,219 @@ void SwapGPU::swap_sched(vector&vec_swap_selct, vector&vec_lo } -void SwapGPU::swap_construct_tables(vectorvec_swap_selct){ +void SwapGPU::BuildMetaTables(vectorvec_swap_selct){ + /* + construct tables: table_sched, and table_meta + */ cudaStream_t stream1; cudaStream_t stream2; - cout<<"---------------print all 1, 1', 2', 2-----------"<(vec_swap_selct.size()-1);i>=0; i--){ for (int i =0; i= 0){ - //TODO(junzhe) for time being, remove negative r_idx itms. - cout<(Table_sched.find(itm.i1)->second) = itm.r_idx; - std::get<1>(Table_sched.find(itm.i1)->second) = 0; - } - //i2p swap - if (Table_sched.find(itm.i2p) == Table_sched.end()){ - Table_sched[itm.i2p] = std::make_tuple(itm.r_idx,1,-1,-1); - } else { - std::get<0>(Table_sched.find(itm.i2p)->second) = itm.r_idx; - std::get<1>(Table_sched.find(itm.i2p)->second) = 1; - } - // i1p sync - if (Table_sched.find(itm.i1p) == Table_sched.end()){ - Table_sched[itm.i1p] = std::make_tuple(-1,-1,itm.r_idx,0); - } else { - std::get<2>(Table_sched.find(itm.i1p)->second) = itm.r_idx; - std::get<3>(Table_sched.find(itm.i1p)->second) = 0; - } - //i2 sync - if (Table_sched.find(itm.i2) == Table_sched.end()){ - Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1); - } else { - std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx; - std::get<3>(Table_sched.find(itm.i2)->second) = 1; - } - ///Make Table_meta - void* tempPtr = nullptr; - cudaMallocHost(&tempPtr,itm.size); //pinned memory. - BlockMeta meta; - meta.size = itm.size; - meta.cpu_ptr = tempPtr; - meta.out_stream = stream1; - meta.in_stream = stream2; - //meta.last_out_idx = vec_swap_selct[i].last_out_idx; - //meta.last_in_idx = vec_swap_selct[i].last_in_idx; - //meta.i2 = vec_swap_selct[i].i2; - Table_meta[itm.r_idx] = meta; - // } - - } - cout<<"---------------print all 1, 1', 2', 2-----------DONE"<"; - cout<(Table_sched.find(i)->second)<<" "; - cout<(Table_sched.find(i)->second)<<" "; - cout<(Table_sched.find(i)->second)<<" "; - cout<(Table_sched.find(i)->second)<(table_sched.find(itm.idx_out_start)->second) = itm.r_idx; + std::get<1>(table_sched.find(itm.idx_out_start)->second) = 0; } + //idx_in_start swap + if (table_sched.find(itm.idx_in_start) == table_sched.end()){ + table_sched[itm.idx_in_start] = std::make_tuple(itm.r_idx,1,-1,-1); + } else { + std::get<0>(table_sched.find(itm.idx_in_start)->second) = itm.r_idx; + std::get<1>(table_sched.find(itm.idx_in_start)->second) = 1; + } + // idx_out_end sync + if (table_sched.find(itm.idx_out_end) == table_sched.end()){ + table_sched[itm.idx_out_end] = std::make_tuple(-1,-1,itm.r_idx,0); + } else { + std::get<2>(table_sched.find(itm.idx_out_end)->second) = itm.r_idx; + std::get<3>(table_sched.find(itm.idx_out_end)->second) = 0; + } + //i2 sync + if (table_sched.find(itm.idx_in_end) == table_sched.end()){ + table_sched[itm.idx_in_end] = std::make_tuple(-1,-1,itm.r_idx,1); + } else { + std::get<2>(table_sched.find(itm.idx_in_end)->second) = itm.r_idx; + std::get<3>(table_sched.find(itm.idx_in_end)->second) = 1; + } + + ///Make table_meta + void* temp_ptr = nullptr; + cudaMallocHost(&temp_ptr,itm.size); //pinned memory. + BlockMeta meta; + meta.size = itm.size; + meta.cpu_ptr = temp_ptr; + meta.out_stream = stream1; + meta.in_stream = stream2; + table_meta[itm.r_idx] = meta; } } -void SwapGPU::swap_update_tables(Block* tempBlock_){ - // update Table_meta's block_ and data_; update once atfer swap test is passed. - // enable to update negative r_idx. - // it's safe in below procedure, as r_gc and r_gc_n should never be the same. - if (testFlag == 1) { +void SwapGPU::UpdateMetaTables(Block* block_ptr){ + /* + update table_meta's block_ and data_; update once atfer swap test is passed. + enable to update negative r_idx. + it's safe in below procedure, as r_global_index and relative_counter should never be the same. + */ + + if (past_test_flag == 1) { //update positive r_idx - int r_gc = (gc-location)%maxLen; - if (!(Table_meta.find(r_gc)==Table_meta.end())){ - //cout<<"r_gc, gc and size ot Table_meta "<get_data()<second.block_ = tempBlock_; - Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data(); + int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length; + if (!(table_meta.find(r_global_index)==table_meta.end())){ + table_meta.find(r_global_index)->second.block_ = block_ptr; + table_meta.find(r_global_index)->second.data_ = block_ptr->get_data(); } //update negative r_idx - int r_gc_n = r_gc - maxLen; - if (!(Table_meta.find(r_gc_n)==Table_meta.end())){ - //cout<<"r_gc, gc and size ot Table_meta "<get_data()<second.block_ = tempBlock_; - Table_meta.find(r_gc_n)->second.data_ = tempBlock_->get_data(); + int relative_counter = r_global_index - iteration_length; + if (!(table_meta.find(relative_counter)==table_meta.end())){ + table_meta.find(relative_counter)->second.block_ = block_ptr; + table_meta.find(relative_counter)->second.data_ = block_ptr->get_data(); } } } -int SwapGPU::swap_test(vectorvec_block,int &maxLen, int &location){ +int SwapGPU::Detection(vectorvec_block,int &iteration_length, int &location_of_2nd_iteration){ + /* + test repeatability, detect iteration, and return global_index_threshold. + */ + + ///vec_str (vec_block) to vec_opt_info, sort by ptr and idx. + int idx_range = 0; + vector vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range); - ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx. - int idxRange = 0; - vector vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange); - cout<<"size of vec_pieceMsg & vec_block: "< vec_rep = Swap_piece2rep(vec_pieceMsg); - //int idxRange3=0; //rename TODO(junzhe) - //int maxLen=0, location =0; - repPatternDetector(vec_rep,maxLen,location,maxLen_threshold,gc); - cout<<"maxLen and location are: "< vec_rep = DeviceOptSeqRepeatableTestPreProcess(vec_opt_info); + RepeatableTest(vec_rep,iteration_length,location_of_2nd_iteration,iteration_length_threshold,global_index); + + //Note here location_of_2nd_iteration not exactly start of one iteration, //adjust to nearly start of one by restricting "Malloc" int shift_counter = 0; - for (int i=0;i v = swap_split(vec_block[location+i], " "); + for (int i=0;i v = SplitOptString(vec_block[location_of_2nd_iteration+i], " "); if (v[0]=="Malloc"){ shift_counter = i; break; } } - location =location+shift_counter; - cout<<"shift_counter is "< vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange); - cout<<"size of vec_pieceMsg & vec_block: "< vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range); + sort(vec_opt_info.begin(),vec_opt_info.end(),sort_by_idx_ascending()); + // scale down idx, to middle iteration. - tempTime_baseline = vec_pieceMsg[three_more_location].t; - for (int i=0; ione_itr(&vec_pieceMsg[location+4*maxLen],&vec_pieceMsg[location+5*maxLen]); + vectorone_itr(&vec_opt_info[location_of_2nd_iteration+4*iteration_length],&vec_opt_info[location_of_2nd_iteration+5*iteration_length]); for (int i =0; itemp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]); + + //3 iterations of vec_run and vec_load, max_idx and max_load + vectortemp_vec_run(&vec_opt_info[location_of_2nd_iteration+3*iteration_length],&vec_opt_info[location_of_2nd_iteration+6*iteration_length]); vec_run = temp_vec_run; - fstream file_vec_run("vec_run36.csv", ios::in|ios::out|ios::app); - for (int i =0; itemp_vec_run2(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]); + vectortemp_vec_run2(&vec_opt_info[location_of_2nd_iteration],&vec_opt_info[location_of_2nd_iteration+3*iteration_length]); auto vec_run2 = temp_vec_run2; - fstream file_vec_run2("vec_run03.csv", ios::in|ios::out|ios::app); - for (int i =0; ivec_load(&global_load[location],&global_load[location+3*maxLen]); + + vectorvec_load(&global_load[location_of_2nd_iteration],&global_load[location_of_2nd_iteration+3*iteration_length]); origin_load = vec_load; - //3 iterations - fstream file_load_origin("load_origin03.csv", ios::in|ios::out|ios::app); - for (int i=0; ivec_load2(&global_load[location+3*maxLen],&global_load[location+6*maxLen]); - // auto origin_load2 = vec_load2; - // //3 iterations - // fstream file_load_origin2("load_origin36.csv", ios::in|ios::out|ios::app); - // for (int i=0; ivec_swap; - // size_t load_swap = 0; + for (int i =1; i= smallest_block) && (vec_run_dup[i-1].idxmaxIdx) + //SwapBlock(string p, size_t s, int idx_out_start, int i2, double t1, double t2): + //ptr(p), size(s), r_idx(idx_out_start),d_idx(i2),r_time(t1), d_time(t2) {} + if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idxmax_idx) && (vec_run_dup[i-1].ptr ==vec_run_dup[i].ptr) - && ((vec_run_dup[i-1].MallocFree==3) or (vec_run_dup[i-1].MallocFree==2) or (vec_run_dup[i-1].MallocFree==4))) + && ((vec_run_dup[i-1].operation_type==3) or (vec_run_dup[i-1].operation_type==2) or (vec_run_dup[i-1].operation_type==4))) { SwapBlock itm(vec_run_dup[i].ptr, vec_run_dup[i].size, vec_run_dup[i-1].idx, vec_run_dup[i].idx, vec_run_dup[i-1].t, vec_run_dup[i].t); - itm.dto = itm.d_time-itm.r_time; - itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size); - if (itm.dt>=0){ - itm.pri = itm.dt * itm.size; + itm.DOA_origin = itm.d_time-itm.r_time; + itm.DOA = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size); + if (itm.DOA>=0){ + itm.AOA = itm.DOA * itm.size; } else { - itm.pri = itm.dt * 1/itm.size; + itm.AOA = itm.DOA * 1/itm.size; } //cat A - if (vec_run_dup[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } - if (vec_run_dup[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} - if (vec_run_dup[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} + if (vec_run_dup[i-1].operation_type == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } + if (vec_run_dup[i-1].operation_type == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} + if (vec_run_dup[i-1].operation_type == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} vec_swap.push_back(itm); - // load_swap+=itm.size; - cout< for building SwapGPU, which doesnt matter. pool_ = std::make_shared(conf); Setup(); @@ -988,21 +825,21 @@ void* SwapGPU::Malloc(int size) { CUDA_CHECK(cudaSetDevice(id_)); pool_->Malloc((void**)&ptr, size); - ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Malloc "; + ///append vec_block_mf:for swap & pool + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Malloc "; stringstream strm2; strm2<Free(ptr); - ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Free "; + ///append vec_block_mf: for swap & pool + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Free "; stringstream strm2; strm2< maxLen_threshold) { - testFlag = 1; - three_more_globeCounter = globeCounter + 3*maxLen; - three_more_location = location + 3*maxLen; - cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"< iteration_length_threshold) { + past_test_flag = 1; + three_more_iteration_global_index_threshold = global_index_threshold + 3*iteration_length; + location_of_5th_iteration = location_of_2nd_iteration + 3*iteration_length; } } ///switch flag; next idx - if ((gc+1) == three_more_globeCounter){ - swap_plan(); - asyncSwapFlag = 1; - // vectorvec_load2(&global_load[three_more_location],&global_load[three_more_location+3*maxLen]); - // origin_load = vec_load2; - // //load before swap, write in - // fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); - // for (int i=0; i= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) { - cout<<"condition B"<= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index_n) == table_sched.end()))) { + DeploySwapExec(r_global_index_n); } - if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc) == Table_sched.end()))) { - cout<<"condition C"<= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index) == table_sched.end()))) { + DeploySwapExec(r_global_index); } } } -void SwapGPU::DeploySwap_exec(int r_gc){ - cout<<"--------sched action at "<(Table_sched.find(r_gc)->second); - auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second); - auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second); - auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second); +void SwapGPU::DeploySwapExec(int r_global_index){ + //execute DeploySwap + auto swap_idx = std::get<0>(table_sched.find(r_global_index)->second); + auto swap_dir = std::get<1>(table_sched.find(r_global_index)->second); + auto sync_idx = std::get<2>(table_sched.find(r_global_index)->second); + auto sync_dir = std::get<3>(table_sched.find(r_global_index)->second); if (swap_dir == 0){ - SwapOut_idx(swap_idx); - cout<<"----Swap Out "<second; + auto last_meta = table_meta.find(sync_idx)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaEventSynchronize(last_meta.in_event); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. + table_not_at_device[last_meta.block_] = sync_idx; last_meta.block_->update_data(nullptr); - // cout<<"to free data_"<Free(last_meta.data_); ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Free "; + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Free "; stringstream strm2; strm2<second = last_meta; + last_meta.data_ = nullptr; + table_meta.find(sync_idx)->second = last_meta; } if (sync_dir == 1){ ///sync swap-in, including sync, update block's data_ to new gpu address, update meta. - //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe) - auto last_meta = Table_meta.find(sync_idx)->second; + auto last_meta = table_meta.find(sync_idx)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaEventSynchronize(last_meta.out_event); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - Table_not_at_device.erase(last_meta.block_); + table_not_at_device.erase(last_meta.block_); last_meta.block_->update_data(last_meta.data_); - cout<<"----sync in "<second = last_meta; + table_meta.find(sync_idx)->second = last_meta; } } -void SwapGPU::Append(string blockInfo){ +void SwapGPU::Append(string block_info){ + /* + Append Operation block info after each operation + Meantime execute following operations: + insert size for non-malloc operations + update global memory load + control swap flag on and off + update table_meta and table_sched + deploy swap at every index. + test moved from start of malloc/free to end of append, only global_index+1 changed + call PoolOpt to Construct Pool + */ - vector v = swap_split(blockInfo, " "); - void* tempPtr; + vector v = SplitOptString(block_info, " "); + void* temp_ptr; stringstream convert(v[1]); - convert>>tempPtr; - auto tempBlock_ = static_cast(tempPtr); + convert>>temp_ptr; + auto block_ptr = static_cast(temp_ptr); // insert size, malloc : flag, block_, size, t; others: insert size t. if (v.size() != 4) { stringstream strm1; - strm1<size(); - string tempStr1 = strm1.str(); - blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2]; + strm1<size(); + string temp_str1 = strm1.str(); + block_info = v[0] + ' ' + v[1] + ' ' + temp_str1 + ' ' + v[2]; } // update global load - if (maxLen < maxLen_threshold){ + if (iteration_length < iteration_length_threshold){ if (v[0] == "Malloc"){ if (global_load.size()>0){ - global_load.push_back(global_load[global_load.size()-1]+tempBlock_->size()); + global_load.push_back(global_load[global_load.size()-1]+block_ptr->size()); } else { - global_load.push_back(tempBlock_->size()); + global_load.push_back(block_ptr->size()); } } else if (v[0] == "Free"){ - global_load.push_back(global_load[global_load.size()-1]-tempBlock_->size()); + global_load.push_back(global_load[global_load.size()-1]-block_ptr->size()); } else { global_load.push_back(global_load[global_load.size()-1]); } } //append into vec_block - vec_block.push_back(blockInfo); - - - //cout<size()<maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){ - // fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app); - // for (int i =0; imaxLen_threshold) && ((gc-location)%(maxLen) == 0)){ - if (tempTime != 0){ - fstream file_time("itr_time.csv", ios::in|ios::out|ios::app); - auto t_now = (std::chrono::system_clock::now()).time_since_epoch().count(); - file_time<<(float)(t_now - tempTime)/(float)(1000000)<size() != size_sequence[r_global_index]){ + async_swap_flag = 0; + cout<<"!!!! async_swap_flag changed back to 0"<size() != sizeSequence[r_gc]){ - asyncSwapFlag = 0; - cout<<"!!!! asyncSwapFlag changed back to 0"<PoolOpt(vec_block_mf); - cout<<"==================to call PoolOpt done"<PoolOpt(vec_block_mf); } } -void* SwapGPU::GetRealGpuPtr(const Block* block_){ - // in case that block is at host memory, swapIn ad hoc. - auto r_idx = Table_not_at_device.find(block_)->second; - - // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); +void* SwapGPU::UpdateGpuPtr(const Block* block_ptr){ + /* + in case that block is not at device memory, swapIn ad hoc. + used in block class to update ptr after swap in done, if variable is not swapped back yet as expected. + */ + auto r_idx = table_not_at_device.find(block_ptr)->second; cudaError_t err; - BlockMeta meta = Table_meta.find(r_idx)->second; + BlockMeta meta = table_meta.find(r_idx)->second; cudaEventCreate (&meta.in_event); - //cout<<"update block and data of r_idx: "<Malloc((void**)&ptr, meta.size); - //cout<<"expected results update_data:: "<update_data(last_meta.data_); - // cout<<"----sync in "<second = last_meta; - Table_meta.find(r_idx)->second = meta; - - // //here should be not update_data() - // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second; - // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // cudaEventSynchronize(reading_meta.in_event); - // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // //cout<<"GetRealGpuPtr, overhead is: "<second<<" "<update_data(reading_meta.data_); - // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<second<update_data(static_cast(ptr)); - - cout<<"print ptr from function GetRealGpuPtr() "<second = meta; - return ptr; //TODO(junzhe) attention, based on no change here. + return ptr; } -void SwapGPU::SwapOut_idx(const int r_idx){ - //cout<<"doing asynchrous swapOut of r_idx: "< CPU, and update meta. + */ cudaError_t err; - BlockMeta meta = Table_meta.find(r_idx)->second; + BlockMeta meta = table_meta.find(idx)->second; cudaEventCreate (&meta.out_event); - //cout<<"right before cudaMemcpyAsync Out"<second = meta; - //cout<<"time for asynchrous: "<second = meta; } -void SwapGPU::SwapIn_idx(const int r_idx){ - //logic: extra meta, swap, update meta in Table - //TODO(junzhe) to clean up free(), make it in somewhere else. - auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); +void SwapGPU::SwapIn(const int idx){ + /* + memory copy asynchronously CPU -> GPU, and update meta. + */ + cudaError_t err; - BlockMeta meta = Table_meta.find(r_idx)->second; + BlockMeta meta = table_meta.find(idx)->second; cudaEventCreate (&meta.in_event); - //cout<<"update block and data of r_idx: "<Malloc((void**)&ptr, meta.size); + ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Malloc "; + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Malloc "; stringstream strm2; strm2<second = meta; - //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update. - //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - //cout<<"time for asynchrous: "<second = meta; } -void SwapGPU::SwapOut(const Block* block_){ - if (gc < 1000 && block_->size() > 1<<20) { +void SwapGPU::SwapOutSynchronous(const Block* block_ptr){ + /* + for synchronous swap, collect speed info + */ + if (global_index < 1000 && block_ptr->size() > 1<<20) { fstream file_block5("speed.csv", ios::in|ios::out|ios::app); BlockMeta meta; meta.data_ = meta.block_->get_data(); - void* tempPtr = nullptr; - cudaMallocHost(&tempPtr,block_->size()); //pinned memory. - meta.cpu_ptr = tempPtr; - Table_block_meta[block_] = meta; + void* temp_ptr = nullptr; + cudaMallocHost(&temp_ptr,block_ptr->size()); //pinned memory. + meta.cpu_ptr = temp_ptr; + table_block_meta[block_ptr] = meta; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaError_t err; - err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_->size(),cudaMemcpyDeviceToHost); + err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_ptr->size(),cudaMemcpyDeviceToHost); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - file_block5<<"Out "<size()<<' '<size()<<' '<size() > 1<<20) { +void SwapGPU::SwapInSynchronous(const Block* block_ptr){ + /* + for synchronous swap, collect speed info + */ + if (global_index < 1000 && block_ptr->size() > 1<<20) { fstream file_block5("speed.csv", ios::in|ios::out|ios::app); - BlockMeta meta = Table_block_meta.find(block_)->second; + BlockMeta meta = table_block_meta.find(block_ptr)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaError_t err; - err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_->size(),cudaMemcpyHostToDevice); + err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_ptr->size(),cudaMemcpyHostToDevice); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - file_block5<<"In "<size()<<' '<size()<<' '< colorRange; - vector> colorOccupied; -}; -Vertex::Vertex(int n, size_t s, int r1, int d1){ - name =n; - size = s; - r = r1; - d = d1; -}//end of class Vertex - ///Section for structs and respective sorting function: -// onePieceMsg_pool, onePairMsg, oneIterMsg, version 11/30 3pm -struct onePieceMsg_pool{ +// PoolOptInfo, PoolBlockLifeTime, PoolOptSimplifiedInfo +struct PoolOptInfo{ /* - members: [ptr, size, MallocFree, idx] + members: [ptr, size, operation_type, idx] */ string ptr; size_t size; - int MallocFree; + int operation_type; int idx; - onePieceMsg_pool(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + PoolOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){} }; -struct less_than_ptrIdx{ - /* - sort onePieceMsg_pool by ptr and then idx. - */ - inline bool operator() (const onePieceMsg_pool& struct1, const onePieceMsg_pool& struct2) - { - return ((struct1.ptrstruct2.size); - } -}; - -struct less_than_size_rIdx{ - /* - sort onePairMsg by descending size and r_idx - */ - inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2) - { - return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idxstruct2.size); + } }; -struct less_than_lookupIdx{ - /* - sort lookUpElement by idx. - */ - inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2) - { - return (struct1.r_idxstruct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx split(string s, string delimiter) { - size_t pos_start = 0, pos_end, delim_len = delimiter.length(); - string token; - vector res; - while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { - token = s.substr(pos_start, pos_end - pos_start); - pos_start = pos_end + delim_len; - res.push_back(token); - } - res.push_back(s.substr(pos_start)); - return res; +vector SplitString(string s, string delimiter) { + /// string delimiter + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + string token; + vector res; + while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; } -///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg -//vector of pairMsg is used in run. -//vector of iterMsg is used in test. -vector strVec_2_pieceMsgVec(vector vec, int &idxRange){ +vector PoolOptSeqStrToStruct(vector vec, int &idx_range){ /* - convert vector of string into vector of onePieceMsg_pool, sorted by ptr and then idx, and update idxRange to pieceMsgVec size. + convert vector of string into vector of PoolOptInfo, + sorted by ptr and then idx, and update idx_range to pieceMsgVec size. */ - vectoronePieceMsg_poolVec_; + vectorvec_pool_opt_info; for (int i=0;i v = split(vec[i], " "); + vector v = SplitString(vec[i], " "); if (v[0]=="Malloc"){ //convert v[2] from str to size_t size_t result; @@ -262,410 +230,346 @@ vector strVec_2_pieceMsgVec(vector vec, int &idxRange) result =-1; cout<<"error for converting size from str to int."<(onePieceMsg_poolVec_.size()); + sort(vec_pool_opt_info.begin(),vec_pool_opt_info.end(),sort_by_ptr_idx_ascending()); + idx_range = static_cast(vec_pool_opt_info.size()); - return onePieceMsg_poolVec_; -}// end of strVec_2_pieceMsgVec function + return vec_pool_opt_info; +} -pair,vector> pieceMsgVec_2_pairOfPairMsgVec(vectoronePieceMsg_poolVec_, int idxRange){ - /* - pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks. - */ - vectoronePairMsgVec_1; - vectoronePairMsgVec_2; - int i=0; - - //while loop processes a pair at each time, if got a pair. - while (i<(onePieceMsg_poolVec_.size()-1)){ - //condition A: start with free. do nothing. - if (onePieceMsg_poolVec_[i].MallocFree==-1){ - i+=1; - } - //condition B: start with Malloc, next item same ptr and is free. - if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)&&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){ - onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx); - onePairMsgVec_1.push_back(tempPair); - i+=2; - } - // condition C: start with Malloc, no free. - if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){ - onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange); - onePairMsgVec_2.push_back(tempPair); - i+=1; - } - }//end of while - //condition D: if still left with the last item - if ((i,vector> PoolOptInfoToBlockLifeTime(vectorvec_pool_opt_info, int idx_range){ + /* + convert vector of opt info into vector of block life time + return a pair of vectors: 1. normal blocks 2. cross-iteration blocks. + */ + vectorvec_block_life_time1; + vectorvec_block_life_time2; + int i=0; + + //while loop processes a pair at each time, if got a pair. + while (i<(vec_pool_opt_info.size()-1)){ + //condition: start with free. do nothing. + if (vec_pool_opt_info[i].operation_type==-1){ i+=1; } - - //sort both pairVec - sort(onePairMsgVec_1.begin(),onePairMsgVec_1.end(),less_than_size_rIdx()); - sort(onePairMsgVec_2.begin(),onePairMsgVec_2.end(),less_than_size_rIdx()); - pair,vector>pairOfPairMsgVec_(onePairMsgVec_1,onePairMsgVec_2); - - return pairOfPairMsgVec_; -}//end of pieceMsgVec_2_pairOfPairMsgVec function - -///Section of coloring algorithm. mergeSeg and then FFallocation when building edges of the graph. -vector> mergeSeg(vector> colorOccupied){ - /* - version 12/9 11am -- modify to accomodate unsigned int/size_t - input:the collection of color ranges that is once occupied by some block during a block's life time. - function: merge consecutive/overlapping segments of colorOccupied - output: merged segments in ascending order. - time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of colorOccupied. - */ - sort(colorOccupied.begin(), colorOccupied.end()); - - if(colorOccupied.size()<=1){ - return colorOccupied; - } - - int m = 0; - while (m<(colorOccupied.size()-1)){ - - if ((colorOccupied[m].second +2)> colorOccupied[m+1].first){ - pairtempItem(colorOccupied[m].first,max(colorOccupied[m].second,colorOccupied[m+1].second)); - //remove m+1 and m - colorOccupied.erase(colorOccupied.begin()+m+1); - colorOccupied.erase(colorOccupied.begin()+m); - //insert the combined range - colorOccupied.insert(colorOccupied.begin()+m,tempItem); - }else{ - m+=1; - } - }//end of while loop - - //verify if mergeSeg is completed. O(n^2) optional -// if(colorOccupied.size()>1){ -// for (int i=0;i<(colorOccupied.size()-1);i++){ -// if(colorOccupied[i].second>=colorOccupied[i+1].first){ -// cout<<"error in mergeSeg"< FFallocation(vector> colorMerged,size_t size, size_t local_offset){ - /* - version 12/2 4pm - First Fit weighted coloring - return a pair standing for colorRange. - local_offset shifts the returned colorRange, allowing multiple run(). - local_offset not changable, whereas offset is changable. - */ - // condition A: if no occupied, put after the local_offset - if (colorMerged.size()==0){ - return pair(0+local_offset,size-1+local_offset); - } - - // condition B: able to fit before first block, after the local_offset - if ((size+local_offset)<(colorMerged[0].first+1)){ - return pair(0+local_offset,size-1+local_offset); + //condition: start with Malloc, next item same ptr and is free. + if ((vec_pool_opt_info[i].operation_type==1)&& (vec_pool_opt_info[i+1].operation_type==-1)&&((vec_pool_opt_info[i].ptr==vec_pool_opt_info[i+1].ptr))){ + PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,vec_pool_opt_info[i+1].idx); + vec_block_life_time1.push_back(temp_block_life_time); + i+=2; } - - size_t yLocation= -1; - if (colorMerged.size()>1) { - int n = 0; - while (n<(colorMerged.size()-1)){ - // condition C: able to fit in between middle blocks. - if ((colorMerged[n+1].first-colorMerged[n].second-1)>=size){ - yLocation = colorMerged[n].second+1; - break; - } - n+=1; - }//end of while loop. - // condition D: allocate after the last block. - if (yLocation == -1){ - yLocation = colorMerged[colorMerged.size()-1].second+1; - } - }// end of if loop, conditon C and D. - - // condition E: colorMeger len =1, allocate after the last block. - if (colorMerged.size()==1){ - yLocation = colorMerged[0].second+1; - } - - if (yLocation==-1){ - cout<<"error in FFallocation!!!"<(yLocation,yLocation+size-1); -}//end of FFallocation function + }//end of while + //condition: if still left with the last item + if ((i,vector>pair_vec_block_life_time(vec_block_life_time1,vec_block_life_time2); + + return pair_vec_block_life_time; +} -pair BFallocation(vector> colorMerged,size_t size, size_t local_offset){ - /* - version 12/11 1pm - Best Fit allocation, input and output same as FFallocation - */ - // condition A: if no occupied, put after the local_offset - if (colorMerged.size()==0){ - return pair(0+local_offset,size-1+local_offset); - } - //condition B: if size=1, able to fit before the first block - if ((colorMerged.size()==1)&&((size+local_offset)<(colorMerged[0].first+1))){ - return pair(0+local_offset,size-1+local_offset); - } - - //condition C: else of B - if ((colorMerged.size()==1)&&((size+local_offset)>=(colorMerged[0].first+1))){ - return pair(colorMerged[0].second+1,colorMerged[0].second+size); - } - - //condition D and E: - size_t yLocation=-1; - pairtempHole(-1,-1); // n, hole size between n and n+1 - if (colorMerged.size()>1) { - int n = 0; - while (n<(colorMerged.size()-1)){ - // condition C: able to fit in between middle blocks. select smallest. - if (((colorMerged[n+1].first-colorMerged[n].second-1)>=size)&&((colorMerged[n+1].first-colorMerged[n].second-1)> MergeColoredSegments(vector> vec_color_preoccupied){ + /* + merge consecutive/overlapping segments of vec_color_preoccupied + input:the collection of color ranges that is once occupied by some block during a block's life time. + output: merged segments in ascending order. + time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of vec_color_preoccupied. + */ + sort(vec_color_preoccupied.begin(), vec_color_preoccupied.end()); + + if(vec_color_preoccupied.size()<=1){ + return vec_color_preoccupied; + } + + int m = 0; + while (m<(vec_color_preoccupied.size()-1)){ + if ((vec_color_preoccupied[m].second +2)> vec_color_preoccupied[m+1].first){ + pairtempItem(vec_color_preoccupied[m].first,max(vec_color_preoccupied[m].second,vec_color_preoccupied[m+1].second)); + //remove m+1 and m + vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m+1); + vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m); + //insert the combined range + vec_color_preoccupied.insert(vec_color_preoccupied.begin()+m,tempItem); + }else{ + m+=1; } - - return pair(yLocation,yLocation+size-1); + }//end of while loop + + return vec_color_preoccupied; } -vector colorSomeVertices(vector pairMsgVec_, size_t &offset,string colorMethod){ - /* - color all or 1/2 vertices using mergeSeg() and FFallocation(), with update offset. - time complexity: O(n^2). - */ - size_t local_offset = offset; //feed into FFallocation, shall never change. - int m = static_cast(pairMsgVec_.size()); - //init all vertices - vectorvertices; - for (int i=0; i FirstFitAllocation(vector> vec_color_merged,size_t size, size_t local_offset){ + /* + First Fit weighted coloring + return a pair standing for color_range. + local_offset shifts the returned color_range, allowing multiple Plan(). + local_offset not changable, whereas offset is changable. + */ + // condition: if no occupied, put after the local_offset + if (vec_color_merged.size()==0){ + return pair(0+local_offset,size-1+local_offset); + } + + // condition: able to fit before first block, after the local_offset + if ((size+local_offset)<(vec_color_merged[0].first+1)){ + return pair(0+local_offset,size-1+local_offset); + } + + size_t y_location= -1; + if (vec_color_merged.size()>1) { + int n = 0; + while (n<(vec_color_merged.size()-1)){ + // condition: able to fit in between middle blocks. + if ((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size){ + y_location = vec_color_merged[n].second+1; + break; + } + n+=1; + }//end of while loop. + // condition: allocate after the last block. + if (y_location == -1){ + y_location = vec_color_merged[vec_color_merged.size()-1].second+1; } + }// end of if loop, conditon C and D. + + // condition: colorMeger len =1, allocate after the last block. + if (vec_color_merged.size()==1){ + y_location = vec_color_merged[0].second+1; + } + + if (y_location==-1){ + cout<<"error in FirstFitAllocation!!!"<(y_location,y_location+size-1); +} - int **adj; - adj = new int*[m]; //TODO(junzhe) should be deleted somewhere. - // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop. - for (int i=0; i>colorMerged = mergeSeg(vertices[i].colorOccupied); - - if(colorMethod=="FF"){ - vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset); - - }else{ //BF - vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, local_offset); - } - //update of offset, largest memory footprint as well. - if (vertices[i].colorRange.second >=offset){ - offset = vertices[i].colorRange.second+1; - } - }//end of for loop. +pair BestFitAllocation(vector> vec_color_merged,size_t size, size_t local_offset){ + /* + Best Fit allocation, input and output same as FirstFitAllocation + */ + // condition: if no occupied, put after the local_offset + if (vec_color_merged.size()==0){ + return pair(0+local_offset,size-1+local_offset); + } + //condition: if size=1, able to fit before the first block + if ((vec_color_merged.size()==1)&&((size+local_offset)<(vec_color_merged[0].first+1))){ + return pair(0+local_offset,size-1+local_offset); + } + + //condition: lese of second condition + if ((vec_color_merged.size()==1)&&((size+local_offset)>=(vec_color_merged[0].first+1))){ + return pair(vec_color_merged[0].second+1,vec_color_merged[0].second+size); + } + + size_t y_location=-1; + pairtemp_hole(-1,-1); // n, hole size between n and n+1 + if (vec_color_merged.size()>1) { + int n = 0; + while (n<(vec_color_merged.size()-1)){ + // condition: able to fit in between middle blocks. select smallest. + if (((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size)&&((vec_color_merged[n+1].first-vec_color_merged[n].second-1)(y_location,y_location+size-1); } +vector AssignColorToVertices(vector vec_block_life_time, size_t &offset,string color_method){ + /* + color all or 1/2 vertices using MergeColoredSegments() and FirstFitAllocation(), with updated offset. + time complexity: O(n^2). + */ + size_t local_offset = offset; //feed into FirstFitAllocation, shall never change. + int m = static_cast(vec_block_life_time.size()); + //init all vertices + vectorvertices; + for (int i=0; i,map> cross_itr_durations(vectorvec_double, int location, int maxLen, int &doubleRange){ - - vectoronePieceMsg_poolVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange); - pair,vector>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_2,doubleRange); - - mapTable_r2d; //full duration info, cross-iteration duration. - mapTable_d2r; - for (int i=0;i,map>(Table_r2d,Table_d2r); -} - -/// main run funtion -vector run(vectorvec, int &idxRange, size_t &offset, size_t &offsetCrossItr,string colorMethod){ - /* - run function, input vector of strings, return colored vertices, - update idxRange, offset. - time complexity: O(n^2) where n is maxLen. - */ - vectoronePieceMsg_poolVec_ = strVec_2_pieceMsgVec(vec,idxRange); - pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_,idxRange); - //1. normal blocks 2. cross-iteration blocks. - vectorpairMsgVec_1 = pairOfPairMsgVec_.first; - vectorpairMsgVec_2 = pairOfPairMsgVec_.second; - - vectorvertices_2 = colorSomeVertices(pairMsgVec_2,offset,colorMethod); - for (int i=0; i>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied); + + if(color_method=="FF"){ + vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset); + + }else{ //BF + vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, local_offset); } - offsetCrossItr = offset; - offset = offsetCrossItr*2; - vectorvertices = colorSomeVertices(pairMsgVec_1,offset,colorMethod); - //merge - vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end()); - return vertices; + //update of offset, largest memory footprint as well. + if (vertices[i].color_range.second >=offset){ + offset = vertices[i].color_range.second+1; + } + }//end of for loop. + + return vertices; } -///Section of test functions. -vector pairOfPairMsgVec_2_repSeq(pair,vector>pairOfPairMsgVec_){ - int counter_1M=0; int counter_1F=0; int counter_2=0; - vectoronePairMsgVec_1 = pairOfPairMsgVec_.first; - vectoronePairMsgVec_2 = pairOfPairMsgVec_.second; - vectoroneIterMsgVec_; - for (int i =0; i(onePairMsgVec_1[i].d_idx-onePairMsgVec_1[i].r_idx); - oneIterMsg tempIterF(temp_s_d,-1,onePairMsgVec_1[i].d_idx); - oneIterMsgVec_.push_back(tempIterF); - counter_1F++; - } - - for (int i =0; irep; // vector of size_delta, name it as rep for simlisity. - for (int i =0; i,map> GetCrossIterationBlocks(vectorvec_double, int location_2nd_iteration, int iteration_length, int &double_range){ + ///get cross-iteration duration blocks + vectorvec_pool_opt_info2 = PoolOptSeqStrToStruct(vec_double,double_range); + pair,vector>pair_vec_block_life_time2=PoolOptInfoToBlockLifeTime(vec_pool_opt_info2,double_range); + + maptable_ridx_to_didx; //full duration info, cross-iteration duration. + maptable_didx_to_ridx; + for (int i=0;i,map>(table_ridx_to_didx,table_didx_to_ridx); +} - return rep; -}//end of pairOfPairMsgVec_2_repSeq function +///Section of test functions. +vector PoolOptSeqRepeatableTestPreProcess(pair,vector>pair_vec_block_life_time){ + /* + pre process pair of vector of block life time info, for ease of repeatable test. + */ + vectorvec_block_life_time1 = pair_vec_block_life_time.first; + vectorvec_block_life_time2 = pair_vec_block_life_time.second; + vectorvec_pool_opt_simplified_info; + + //process Malloc and Free pair, i.e. normal blocks + for (int i =0; i(vec_block_life_time1[i].d_idx-vec_block_life_time1[i].r_idx); + PoolOptSimplifiedInfo tempIterF(temp_s_d,-1,vec_block_life_time1[i].d_idx); + vec_pool_opt_simplified_info.push_back(tempIterF); + } + + //process Malloc-only blocks, i.e. cross-iteration blocks + for (int i =0; ivec_rep; // vector of size_delta, name it as vec_rep for simlisity. + for (int i =0; i maxRepeatedSeg(vectorrep, int idxRange, int &maxLen, int &location){ - /* - get max repeated non-overlapping Seg of a vector, return the repeated segment, - update maxLen, and location of where Seg starts to repeat. - brtue force method using equal() - time complexity O(n^2) - */ - for (int i=0; isubSeq(&rep[location],&rep[location+maxLen]); - if(!(equal(rep.begin()+location,rep.begin()+maxLen-1+location,subSeq.begin()) && equal(rep.begin()+location+maxLen,rep.begin()+2*maxLen-1+location,subSeq.begin()))){ - cout<<"error in get the maxRep"<subSeq, int &maxLen, int &location){ - /* - to cut, in case the repeated Seg contains multiple iterations. - */ - int tempMaxLen=0; - int tempLocation =0; - int tempIdxRange = maxLen; - - vectortempSubSeq = maxRepeatedSeg(subSeq,tempIdxRange,tempMaxLen, tempLocation); - //TODO(junzhe), tunable threshold. - int threshold =50; - if (tempMaxLen>threshold){ - maxLen = tempMaxLen; - location += tempLocation; - cout<<"max length get cut"< PoolRepeatableTest(vectorrep, int idx_range, int &iteration_length, int &location_2nd_iteration){ + /* + get max repeated non-overlapping Seg of a vector, return the repeated segment, + update iteration_length, and location_2nd_iteration of where Seg starts to repeat. + brtue force method using equal() + time complexity O(n^2) + */ + for (int i=0; isub_sequence(&rep[location_2nd_iteration],&rep[location_2nd_iteration+iteration_length]); + if(!(equal(rep.begin()+location_2nd_iteration,rep.begin()+iteration_length-1+location_2nd_iteration,sub_sequence.begin()) && equal(rep.begin()+location_2nd_iteration+iteration_length,rep.begin()+2*iteration_length-1+location_2nd_iteration,sub_sequence.begin()))){ + cout<<"error in get the maxRep"<vec3, int &maxLen, int &location){ +void VerifyRepeatableTest(vectorsub_sequence, int &iteration_length, int &location_2nd_iteration){ /* - main function of test, returns globeCounter, which is when flag shall be switched, - update maxLen and location of where the repeated Seg starts. - */ - cout<<"====================== test ========================="<onePieceMsg_poolVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3); - pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_3,idxRange3); - vectorrep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_); + to cut, in case the repeated Segment returned by PoolRepeatableTest contains multiple iterations. + */ + int temp_iteration_length = 0; + int temp_location_2nd_iteration = 0; + int temp_idx_range = iteration_length; + + //verify by testing its subsequence again + vectortempsub_sequence = PoolRepeatableTest(sub_sequence,temp_idx_range,temp_iteration_length, temp_location_2nd_iteration); - //get repeated sub vector. - vectorsubSeq = maxRepeatedSeg(rep,idxRange3,maxLen,location); - //cout<100){ //TODO(junzhe) tunable threshold. - cout<<"new location and maxLen: "<threshold){ + iteration_length = temp_iteration_length; + location_2nd_iteration += temp_location_2nd_iteration; } - return globeCounter; } + ///verify if coloring got overlapping -void overlap_test(vector vertices){ +void OverlapVerification(vector vertices){ size_t s = vertices.size(); int i,j; for (i=0; i vertices){ SmartMemPool::SmartMemPool(const MemPoolConf &conf){ - //TODO(junzhe) to figure out what to do here. - colorMethod = "BF"; + color_method = "BF"; conf_ = conf; } void SmartMemPool::Init(){ - //TODO(junzhe) Note, this is dummy here, not catter multiple GPU. mtx_.lock(); if(!initialized_){ initialized_ =true; @@ -689,291 +591,332 @@ void SmartMemPool::Init(){ } + +int SmartMemPool::Detection(vectorvec_string_test, int &iteration_length, int &location_2nd_iteration){ + /* + Testing repeatability from raw operation sequence + returns global_index_threshold, which is when flag shall be switched, + update iteration_length and location_2nd_iteration of where the repeated Seg starts. + */ + int idx_range_test=0; + vectorvec_pool_opt_info3 = PoolOptSeqStrToStruct(vec_string_test,idx_range_test); + pair,vector>pair_vec_block_life_time = PoolOptInfoToBlockLifeTime(vec_pool_opt_info3,idx_range_test); + vectorvec_rep = PoolOptSeqRepeatableTestPreProcess(pair_vec_block_life_time); + + //repeatable test with verification + vectorsub_sequence = PoolRepeatableTest(vec_rep,idx_range_test,iteration_length,location_2nd_iteration); + VerifyRepeatableTest(sub_sequence, iteration_length, location_2nd_iteration); + + //update global_index_threshold if test past, i.e. iteration_length exceed certain threshold + if (iteration_length>100){ //tunable threshold. + global_index_threshold = idx_range_test+iteration_length-(idx_range_test-location_2nd_iteration)%iteration_length; + } + return global_index_threshold; +} + + +/// main run funtion +vector SmartMemPool::Plan(vectorvec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method){ + /* + Planning, i.e. Assign Color to Vertices from raw operation sequence info. + input vector of strings, return colored vertices, + update idx_range, offset. + time complexity: O(n^2) where n is iteration_length. + */ + + vectorvec_pool_opt_info = PoolOptSeqStrToStruct(vec,idx_range); + pair,vector>pair_vec_block_life_time=PoolOptInfoToBlockLifeTime(vec_pool_opt_info,idx_range); + + //coloring normal blocks and cross-iteration blocks separately, cannot be miss-matched. + vectorvec_block_life_time1 = pair_vec_block_life_time.first; + vectorvec_block_life_time2 = pair_vec_block_life_time.second; + + //color cross-iteration blocks + vectorvertices_2 = AssignColorToVertices(vec_block_life_time2,offset,color_method); + + for (int i=0; ivertices = AssignColorToVertices(vec_block_life_time1,offset,color_method); + + //merge after coloring + vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end()); + + return vertices; +} + + ///Malloc void SmartMemPool::Malloc(void** ptr, const size_t size){ - /* - 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool. - 2. if flag=0, malloc/cudaMalloc, collect vec string - 3. if flag=1, look up table, malloc/cudaMalloc if not in the Table - 4. test repeated sequence every 100 blocks, update globeCounter. - */ - - //TODO(junzhe) Note, this is dummy here, not catter multiple GPU. - //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a. - //file<vec_raw_opt_info(&vec[location_2nd_iteration],&vec[location_2nd_iteration+iteration_length]); - if (gc == globeCounter){ - /// 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool. - - mallocFlag=1; - cout<<"switched to color-malloc"<vec_run(&vec[location],&vec[location+maxLen]); - - vectorvertices = run(vec_run, idxRange,offset,offsetCrossItr, colorMethod); + //color vertices + vectorvertices = Plan(vec_raw_opt_info,idx_range,offset,offset_cross_iteration,color_method); - //here to verify if the coloring got overlapping. TODO(junzhe) optional - //overlap_test(vertices); - - //obtain the cross-iteration duration info - int doubleRange=0; - vectorvec_double(&vec[location],&vec[location+2*maxLen]); - pair,map>pairs =cross_itr_durations(vec_double, location,maxLen,doubleRange); - Table_r2d = pairs.first; - Table_d2r = pairs.second; - - //update ptrPool - cudaMalloc(&ptrPool,offset); //poolSize or memory foot print offset. - cout<<"ptrPool is: "<second; - temp.size =vertices[i].size; - temp.offset=vertices[i].colorRange.first; - temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char)); - temp.Occupied =0; - temp.crossItr = vertices[i].crossItr; - temp.Occupied_backup =0; - //build tables for lookup. - Vec_r2Ver[vertices[i].r].second= temp; - } - } + //here to verify if the coloring got overlapping. for verify purpose only. + //OverlapVerification(vertices); - if(mallocFlag==0){ - /// 2. if flag=0, malloc/cudaMalloc - cudaMalloc(ptr, size); - allocatedPtr = *ptr; - //update load - if(loadLogFlag==1){ - if (gc>0){ - Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second); - }else{ //very first block - Table_load[gc]=make_pair(size,0); - } - } - //push_back the string for later test and run. - string tempStr1 ="Malloc "; - stringstream strm2; - strm2<second.first,Table_load.find(gc-1)->second.second+size); - } - //file<<" Condition M2, addr: "<<*ptr<second.first,Table_load.find(gc-1)->second.second+size); - } - //file<<" Condition M4, addr: "<<*ptr<second.first+size,Table_load.find(gc-1)->second.second); - } - //file<<" Condition M3, addr: "<<*ptr<vec_double(&vec[location_2nd_iteration],&vec[location_2nd_iteration+2*iteration_length]); + pair,map>pairs =GetCrossIterationBlocks(vec_double,location_2nd_iteration,iteration_length,double_range); + table_ridx_to_didx = pairs.first; + table_didx_to_ridx = pairs.second; - ///4. test repeated sequence every 100 blocks, update globeCounter. - if (((gc+1)%300==0) && (mallocFlag==0) && (globeCounter==-1)&&(gc+2>checkPoint)){ - cout<<"gc and GC before test: "<0)){ - getMaxLoad(); - loadLogFlag=0; + for (int i=0; isecond; + temp.size =vertices[i].size; + temp.offset=vertices[i].color_range.first; + temp.ptr = (void*)((char*)ptr_pool+temp.offset*sizeof(char)); + temp.occupied =0; + temp.cross_iteration = vertices[i].cross_iteration; + temp.occupied_backup =0; + //build tables for lookup. + vec_block_meta[vertices[i].r].second= temp; } - - gc++; - Table_p2s[allocatedPtr]=size; //update it for load tracking purpose. - *ptr = allocatedPtr; - ///update block_RWMF - string tempStr1 ="Malloc "; + } + /// 2. if flag=0, malloc/cudaMalloc, accumulate vec_info at the beginning iterations. + if(malloc_flag ==0){ + cudaMalloc(ptr, size); + allocated_ptr = *ptr; + //update load + if(load_flag==1){ + if (global_index>0){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second); + }else{ //very first block + table_load[global_index]=make_pair(size,0); + } + } + //push_back the string for later test and run. + string temp_str1 ="Malloc "; stringstream strm2; - strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); - stringstream strm4; - strm4<second.first,table_load.find(global_index-1)->second.second+size); + } + }else if ((vec_block_meta[lookup_idx].second.cross_iteration==1) && (vec_block_meta[lookup_idx].second.occupied==1) && (vec_block_meta[lookup_idx].second.occupied_backup ==0)) { + //condition: cross_iteration's backup + allocated_ptr = (void*)((char*)vec_block_meta[lookup_idx].second.ptr+offset_cross_iteration*sizeof(char)); + vec_block_meta[lookup_idx].second.occupied_backup=1; + table_ptr_to_ridx[allocated_ptr]=lookup_idx; + //update load + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second+size); + } + } + }else{ + //condition: size not proper or both occupied. + cudaMalloc(ptr, size); + allocated_ptr = *ptr; + //update load + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second); + } + } + } //end of loop for flag=1 + + ///4. test repeated sequence every 300 index, update global_index_threshold. + if (((global_index+1)%300==0) && (malloc_flag ==0) && (global_index_threshold==-1)&&(global_index+2>check_point)){ + global_index_threshold = Detection(vec,iteration_length,location_2nd_iteration); + check_point=check_point*2; + } + + ///get load info, when global_index == global_index+2iteration_length + if (global_index==(global_index_threshold+2*iteration_length)&& (global_index_threshold>0)){ + GetMaxLoad(); + load_flag=0; + } + + global_index++; + //update it for load tracking purpose. + table_ptr_to_size[allocated_ptr]=size; + + //update *ptr + *ptr = allocated_ptr; + + ///update block_RWMF + string temp_str1 ="Malloc "; + stringstream strm2; + strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); + stringstream strm4; + strm4<second; + size_t deallocatedSize = table_ptr_to_size.find(ptr)->second; + + /// at the begining iterations, via cudaFree, accumulate opt info. + if ((global_index_threshold==-1)||(global_indexsecond.first-deallocatedSize,Table_load.find(gc-1)->second.second); - } - /// before flag switch, for sure all free shall be done by free() - cudaFree(ptr); - }else{ - if (!(Table_p2r.find(ptr)==Table_p2r.end())){ - int resp_rIdx = Table_p2r.find(ptr)->second; - Table_p2r.erase(ptr); - - if (ptr == Vec_r2Ver[resp_rIdx].second.ptr){ - //Condition F2, from M2 - Vec_r2Ver[resp_rIdx].second.Occupied =0; //freed, able to allocate again. - //file<<" Condition F2, addr: "<0) && ((float)((char*)ptr-((char*)ptrPool+2*offsetCrossItr*sizeof(char)))<0)){ - Vec_r2Ver[resp_rIdx].second.Occupied_backup =0; - }else{ - Vec_r2Ver[resp_rIdx].second.Occupied =0; - } - } - //update load - if(loadLogFlag==1){ - Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second-deallocatedSize); - } + //update load before free + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second); + } + // before flag switch, for sure all free shall be done by free() + cudaFree(ptr); + }else{ + /// cases that no need accumulating opt info + + /// free a ptr that is in the memory pool + if (!(table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end())){ + int resp_rIdx = table_ptr_to_ridx.find(ptr)->second; + table_ptr_to_ridx.erase(ptr); + + if (ptr == vec_block_meta[resp_rIdx].second.ptr){ + vec_block_meta[resp_rIdx].second.occupied =0; //freed, able to allocate again. + }else if (ptr == (void*)((char*)vec_block_meta[resp_rIdx].second.ptr+offset_cross_iteration*sizeof(char))){ + vec_block_meta[resp_rIdx].second.occupied_backup =0; + } else{ + if (((float)((char*)ptr-((char*)ptr_pool+offset_cross_iteration*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptr_pool+2*offset_cross_iteration*sizeof(char)))<0)){ + vec_block_meta[resp_rIdx].second.occupied_backup =0; }else{ - //update load - if(loadLogFlag==1){ - Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second); - } - //file<<" Condition F3, addr: "<second.first,table_load.find(global_index-1)->second.second-deallocatedSize); + } + }else{ + /// free a ptr that is NOT in the memory pool + + //update load + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second); + } + cudaFree(ptr); } - gc++; - ///update block_RWMF - string tempStr1 ="Free "; - stringstream strm2; - strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); - stringstream strm4; - strm4<(std::chrono::system_clock::now().time_since_epoch()).count(); + stringstream strm4; + strm4<cudaLoadLog; - for (int i=0; isecond.first); - } - size_t maxCudaLoad = *max_element(cudaLoadLog.begin(),cudaLoadLog.end()); - int idxMaxCudaLoad = static_cast(distance(cudaLoadLog.begin(),max_element(cudaLoadLog.begin(),cudaLoadLog.end()))); - - vectorcolorLoadLog; - for (int i=0; isecond.second); - } - size_t maxColorLoad = *max_element(colorLoadLog.begin(),colorLoadLog.end()); - int idxMaxColorLoad = static_cast(distance(colorLoadLog.begin(),max_element(colorLoadLog.begin(),colorLoadLog.end()))); - size_t offsetCudaLoad = Table_load.find(idxMaxColorLoad)->second.first; - - maxTotalLoad = max(maxCudaLoad,maxColorLoad+offsetCudaLoad); - maxMemUsage = max(maxCudaLoad,offset+offsetCudaLoad); - memRatio = (float)maxMemUsage/(float)maxTotalLoad; - - cout<<"=============================memory usage stats print: ================================"<vec_load_log; + for (int i=0; isecond.first); + } + size_t max_cuda_load = *max_element(vec_load_log.begin(),vec_load_log.end()); + int idx_max_cuda_load = static_cast(distance(vec_load_log.begin(),max_element(vec_load_log.begin(),vec_load_log.end()))); + + vectorvec_color_load; + for (int i=0; isecond.second); + } + size_t max_color_load = *max_element(vec_color_load.begin(),vec_color_load.end()); + int idx_max_color_load = static_cast(distance(vec_color_load.begin(),max_element(vec_color_load.begin(),vec_color_load.end()))); + size_t offset_color_load = table_load.find(idx_max_color_load)->second.first; + + max_total_load = max(max_cuda_load,max_color_load+offset_color_load); + max_mem_usage = max(max_cuda_load,offset+offset_color_load); + } std::pair SmartMemPool::GetMemUsage() { - //TODO(junzhe) note here the pair is different from that of CnMemPool. - return std::make_pair(maxMemUsage, maxTotalLoad); + //note here the pair is different from that of CnMemPool. + return std::make_pair(max_mem_usage, max_total_load); } void SmartMemPool::Append(string blockInfo) { - //TODO(junzhe) add idx later - vec_block_RW.push_back(blockInfo); - vec_block_RWMF.push_back(blockInfo); + vec_block_rw.push_back(blockInfo); + vec_block_rw_mf.push_back(blockInfo); } ///SwapPool @@ -992,121 +935,86 @@ void SwapPool::Init(){ void SwapPool::PoolOpt(vector &vec_mf) { - //TODO(junzhe) redo 9/17 - - ///process vec_mf of 3itr into blocks,maxLen - //assume format of string: MF ptr size; - //onePieceMsg_pool verified - // for (int i = 0; i< vec_mf.size();i++){ - // cout<<"print mf "<onePieceMsg_poolVec_; - maxLen_mf = vec_mf.size()/3; - cout<<"maxLen_mf "<vec_pool_opt_info; + iteration_length_mf = vec_mf.size()/3; //cos input vec_mf is of 3 iteration + + //convert raw opt info into struct: PoolOptInfo for (int i = 0;i < vec_mf.size();i++){ - vector v = split(vec_mf[i], " "); - // cout<<"print mf "< v = SplitString(vec_mf[i], " "); if (v[0]=="Malloc"){ - //convert v[2] from str to size_t size_t result; stringstream convert(v[2]); - // cout<<"1"<>result)){ - result =-1; + result = -1; cout<<"error for converting size from str to int."<pairMsgVec_; + //convert into block lifetime + vectorvec_block_life_time; int i = 0; - // cout<<"before while loop"<=0 && onePieceMsg_poolVec_[i].idx =0 && onePieceMsg_poolVec_[i+1].idx =0 && vec_pool_opt_info[i].idx =0 && vec_pool_opt_info[i+1].idx (pairMsgVec_.size()); + int m = static_cast(vec_block_life_time.size()); vectorvertices; for (int i=0; i &vec_mf) { } } - vector>colorMerged = mergeSeg(vertices[i].colorOccupied); + vector>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied); - // vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset); - vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, offset); + // vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset); + vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, offset); //update of offset, largest memory footprint as well. - if (vertices[i].colorRange.second >=offset){ - offset = vertices[i].colorRange.second+1; + if (vertices[i].color_range.second >=offset){ + offset = vertices[i].color_range.second+1; } }//end of for loop. - cout<<"offset is "<second.size))){ + //pool_flag = 1 + if (pool_index < iteration_length_mf){ + if ((table_pool_meta.find(pool_index - iteration_length_mf) == table_pool_meta.end()) || (!(size == table_pool_meta.find(pool_index - iteration_length_mf)->second.size))){ //not in table of negative r_idx cudaError_t status = cudaMalloc(ptr, size); CHECK_EQ(status, cudaError_t::cudaSuccess); } else{ //in the table of negative r_idx - auto tempMeta = Table_r2v.find(pc-maxLen_mf)->second; - allocatedPtr = tempMeta.ptr; - *ptr = allocatedPtr; - Table_p2r[allocatedPtr]=pc-maxLen_mf; + auto temp_meta = table_pool_meta.find(pool_index - iteration_length_mf)->second; + allocated_ptr = temp_meta.ptr; + *ptr = allocated_ptr; + table_ptr_to_ridx[allocated_ptr]=pool_index - iteration_length_mf; } } else{ - //8 9 10 - int r_pc = pc%maxLen_mf; - if ((Table_r2v.find(r_pc) == Table_r2v.end()) || (!(size == Table_r2v.find(r_pc)->second.size))){ + //8 9 10th iteration + int r_pool_index = pool_index%iteration_length_mf; + if ((table_pool_meta.find(r_pool_index) == table_pool_meta.end()) || (!(size == table_pool_meta.find(r_pool_index)->second.size))){ //not here, should be abnormal cudaError_t status = cudaMalloc(ptr, size); CHECK_EQ(status, cudaError_t::cudaSuccess); } else{ //in the table - auto tempMeta = Table_r2v.find(r_pc)->second; - allocatedPtr = tempMeta.ptr; - *ptr = allocatedPtr; - Table_p2r[allocatedPtr]=r_pc; - + auto temp_meta = table_pool_meta.find(r_pool_index)->second; + allocated_ptr = temp_meta.ptr; + *ptr = allocated_ptr; + table_ptr_to_ridx[allocated_ptr]=r_pool_index; } } } - - pc++; + pool_index++; } void SwapPool::Free(void *ptr) { - if (poolFlag == 0){ + if (pool_flag == 0){ cudaError_t status = cudaFree(ptr); CHECK_EQ(status, cudaError_t::cudaSuccess); } else{ - if (Table_p2r.find(ptr)==Table_p2r.end()){ + if (table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end()){ cudaError_t status = cudaFree(ptr); CHECK_EQ(status, cudaError_t::cudaSuccess); } - } } @@ -1215,15 +1118,7 @@ void SwapPool::Append(string blockInfo) { } -void SwapPool::SwapOut(void* data_){ - //NA -} - -void SwapPool::SwapIn(void* data_){ - //NA -} - -void getMaxLoad (){ +void GetMaxLoad (){ //empty } From 8e8a7e15fca593bea7fed35e1073e7f1dd408877 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Sun, 20 Jan 2019 15:58:01 +0800 Subject: [PATCH 19/19] Replace the strings with struct for function Append. --- include/singa/core/common.h | 13 ++++++++ include/singa/core/device.h | 22 +++++++------- src/core/common/common.cc | 33 +++++++++------------ src/core/device/cuda_gpu.cc | 4 --- src/core/device/device.cc | 19 +++++------- src/core/device/swap_gpu.cc | 59 +++++++++++++++---------------------- 6 files changed, 68 insertions(+), 82 deletions(-) diff --git a/include/singa/core/common.h b/include/singa/core/common.h index 47c1068db0..ee6f07ce3d 100644 --- a/include/singa/core/common.h +++ b/include/singa/core/common.h @@ -53,6 +53,9 @@ typedef struct _Opencl { } Opencl; } // namespace lang class Device; +struct DeviceOptInfoToAppend; + + /// Block represent a chunk of memory (on device or host). class Block { public: @@ -97,6 +100,16 @@ class Block { std::atomic ref_count_; }; +// struct for Append purpose in device class. +struct DeviceOptInfoToAppend{ + string operation_type; + string block_ptr; + int size; + long t = (std::chrono::system_clock::now()).time_since_epoch().count(); + + DeviceOptInfoToAppend(string opt_type, string ptr,int s):operation_type(opt_type),block_ptr(ptr),size(s){} +}; + typedef struct _Context { std::mt19937 random_generator; #ifdef USE_CUDA diff --git a/include/singa/core/device.h b/include/singa/core/device.h index e9dcc1402d..7d9ed57757 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -66,7 +66,6 @@ class Device { /// Called by Tensor. void FreeBlock(Block* block); - void AppendInfo(string block_info); void* UpdateGpuPtrInfo(const Block* block_ptr); /// Return the size (bytes) of memory in use @@ -107,7 +106,7 @@ class Device { int id() const { return id_; } virtual void* UpdateGpuPtr(const Block* block_ptr) = 0; - + virtual void Append(DeviceOptInfoToAppend dev_opt_info) = 0; private: Device() {}; @@ -124,7 +123,7 @@ class Device { /// Free device memory. virtual void Free(void* ptr) = 0; virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0; - virtual void Append(string block_info) = 0; + protected: int id_ = 0; @@ -154,6 +153,7 @@ class CppCPU : public Device { std::shared_ptr host() const override { return defaultDevice;} void SetRandSeed(unsigned seed) override; + void Append(DeviceOptInfoToAppend dev_opt_info) override {} protected: void DoExec(function&& fn, int executor) override; @@ -167,7 +167,7 @@ class CppCPU : public Device { /// Free cpu memory. void Free(void* ptr) override; void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} - void Append(string block_info) override {} + void* UpdateGpuPtr(const Block* block_ptr) override {} }; @@ -188,6 +188,8 @@ class CudaGPU : public Device { void SetRandSeed(unsigned seed) override; size_t GetAllocatedMem() override; + void Append(DeviceOptInfoToAppend dev_opt_info) override {} + protected: void DoExec(function&& fn, int executor) override; @@ -201,7 +203,6 @@ class CudaGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} - void Append(string block_info) override; void* UpdateGpuPtr(const Block* block_ptr) override; private: @@ -284,6 +285,8 @@ class SwapGPU : public Device { void SetRandSeed(unsigned seed) override; size_t GetAllocatedMem() override; + //Append at every index: free, read, mutable + void Append(DeviceOptInfoToAppend dev_opt_info) override; protected: void DoExec(function&& fn, int executor) override; @@ -295,10 +298,7 @@ class SwapGPU : public Device { void* Malloc(int size) override; /// Free cpu memory. - void Free(void* ptr) override; - - //Append at every index: free, read, mutable - void Append(string block_info) override; + void Free(void* ptr) override; //append info after Malloc, as Block* is not available till Malloc() done. void AppendAfterMalloc(Block* block,void* data_ptr,int size) override; @@ -408,7 +408,7 @@ class OpenclDevice : public singa::Device { virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset = 0, int src_offset = 0) override; - + void Append(DeviceOptInfoToAppend dev_opt_info) override {} protected: /// The OpenCL device that this object represents. /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the @@ -439,7 +439,7 @@ class OpenclDevice : public singa::Device { /// This has the effect of freeing up device memory. void Free(void* ptr) override; void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} - void Append(string block_info) override {} + void* UpdateGpuPtr(const Block* block_ptr) override {} diff --git a/src/core/common/common.cc b/src/core/common/common.cc index d6e9c5a301..692c1c451f 100644 --- a/src/core/common/common.cc +++ b/src/core/common/common.cc @@ -30,15 +30,13 @@ void* Block::mutable_data() { //Append block info: opt_type, ptr, time_stamp if (ptr_device_!=nullptr){ - stringstream strm2; - strm2<AppendInfo(temp); + stringstream strm; + strm<Append(dev_opt_info); } //update ptr after swap in done, if variable is not swapped back yet as expected. @@ -56,16 +54,13 @@ const void* Block::data() const { //Append block info: opt_type, ptr, time_stamp if (ptr_device_!=nullptr){ - //Append info. - stringstream strm2; - strm2<AppendInfo(temp); + stringstream strm; + strm<Append(dev_opt_info); } //update ptr after swap in done, if variable is not swapped back yet as expected. diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc index 7ec8a9deb5..523986f4f7 100644 --- a/src/core/device/cuda_gpu.cc +++ b/src/core/device/cuda_gpu.cc @@ -123,10 +123,6 @@ void CudaGPU::Free(void* ptr) { } } -void CudaGPU::Append(string blockInfo){ - pool_->Append(blockInfo); -} - void* CudaGPU::UpdateGpuPtr(const Block* block_){ return nullptr; } diff --git a/src/core/device/device.cc b/src/core/device/device.cc index 59faddc5c6..5a1ac270ac 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -55,23 +55,18 @@ void Device::FreeBlock(Block* block) { Free(tempPtr); //append block info for free operation. - stringstream strm1; - strm1<size()); + auto t = (std::chrono::system_clock::now()).time_since_epoch().count(); + dev_opt_info.t = t; + Append(dev_opt_info); delete block; } } -void Device::AppendInfo(string blockInfo){ - Append(blockInfo); -} void* Device::UpdateGpuPtrInfo(const Block* block_){ return UpdateGpuPtr(block_); diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc index 85a4061f30..4228f6e16a 100644 --- a/src/core/device/swap_gpu.cc +++ b/src/core/device/swap_gpu.cc @@ -904,18 +904,13 @@ void SwapGPU::AppendAfterMalloc(Block* block_ptr,void* data_ptr,int size){ */ //append info - stringstream strm1; - strm1< v = SplitOptString(block_info, " "); + //convert block_ptr from string to Block* void* temp_ptr; - stringstream convert(v[1]); + stringstream convert(dev_opt_info.block_ptr); convert>>temp_ptr; auto block_ptr = static_cast(temp_ptr); - - // insert size, malloc : flag, block_, size, t; others: insert size t. - if (v.size() != 4) { - stringstream strm1; - strm1<size(); - string temp_str1 = strm1.str(); - block_info = v[0] + ' ' + v[1] + ' ' + temp_str1 + ' ' + v[2]; - } // update global load if (iteration_length < iteration_length_threshold){ - if (v[0] == "Malloc"){ + if (dev_opt_info.operation_type == "Malloc"){ if (global_load.size()>0){ global_load.push_back(global_load[global_load.size()-1]+block_ptr->size()); } else { global_load.push_back(block_ptr->size()); } - } else if (v[0] == "Free"){ + } else if (dev_opt_info.operation_type == "Free"){ global_load.push_back(global_load[global_load.size()-1]-block_ptr->size()); } else { global_load.push_back(global_load[global_load.size()-1]); @@ -1037,6 +1013,15 @@ void SwapGPU::Append(string block_info){ } //append into vec_block + stringstream strm1; + strm1<