From e7f695dc71d87fce4f6becd4ddf43f3f7d3b6b5d Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 15:11:52 +0800
Subject: [PATCH 01/19] change / into * reverse number

---
 python/singa/tensor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index 441431fc25..ae9ba5ba83 100644
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -472,9 +472,9 @@ def __idiv__(self, x):
             x (float or Tensor):
         '''
         if isinstance(x, Tensor):
-            self.data /= x.data
+            self.data *= 1/x.data
         else:
-            self.data /= float(x)
+            self.data *= 1/float(x)
         return self
 
     '''

From 2cc6ba497c5b6631b11e33c0db8f581597c505b2 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 15:29:39 +0800
Subject: [PATCH 02/19] align train.py with vc12

---
 examples/cifar10/train.py | 50 +++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index b2ab4af689..d785aa84c1 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -31,24 +31,25 @@
 import os
 import argparse
 
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
 from singa import utils
 from singa import optimizer
 from singa import device
 from singa import tensor
+from singa.proto import core_pb2
 from caffe import caffe_net
 
-import cnn
+import alexnet
 import vgg
 import resnet
 
+from datetime import datetime
+import time
 
 def load_dataset(filepath):
     print('Loading data file %s' % filepath)
     with open(filepath, 'rb') as fd:
-        try:
-            cifar10 = pickle.load(fd, encoding='latin1')
-        except TypeError:
-            cifar10 = pickle.load(fd)
+        cifar10 = pickle.load(fd)
     image = cifar10['data'].astype(dtype=np.uint8)
     image = image.reshape((-1, 3, 32, 32))
     label = np.asarray(cifar10['labels'], dtype=np.uint8)
@@ -129,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         dev = device.get_default_device()
     else:
         print('Using GPU')
-        dev = device.create_cuda_gpu()
+        dev = device.create_cuda_gpu_on(2)
 
     net.to_device(dev)
     opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
@@ -137,16 +138,27 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         opt.register(p, specs)
 
     tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
-    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
     train_x, train_y, test_x, test_y = data
     num_train_batch = train_x.shape[0] // batch_size
     num_test_batch = test_x.shape[0] // batch_size
     idx = np.arange(train_x.shape[0], dtype=np.int32)
-    for epoch in range(max_epoch):
+    fileTimeLog =open("epochTimeLog.text","a")
+    for epoch in range(1):
         np.random.shuffle(idx)
         loss, acc = 0.0, 0.0
         print('Epoch %d' % epoch)
-        for b in range(num_train_batch):
+        print(datetime.now().timetz()) # miliseconds
+        print(int(round(time.time()*1000)))
+        fileTimeLog.write('Epoch %d: ' % epoch)
+        fileTimeLog.write(str(int(round(time.time()*1000))))
+        fileTimeLog.write('\n')
+        for b in range(10): #num_train_batch):
+            print ("start of iteration %d: " %b)
+            #time.sleep(1)
+            fileTimeLog.write('iteration %d: ' % b)
+            fileTimeLog.write(str(int(round(time.time()*1000))))
+            fileTimeLog.write('\n')
             x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
             y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
             tx.copy_from_numpy(x)
@@ -164,7 +176,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         print(info)
 
         loss, acc = 0.0, 0.0
-        for b in range(num_test_batch):
+        for b in range(0):
             x = test_x[b * batch_size: (b + 1) * batch_size]
             y = test_y[b * batch_size: (b + 1) * batch_size]
             tx.copy_from_numpy(x)
@@ -175,14 +187,16 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
 
         print('test loss = %f, test accuracy = %f' %
               ((loss / num_test_batch), (acc / num_test_batch)))
+    fileTimeLog.close()
     net.save('model', 20)  # save model params into checkpoint file
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Train dcnn for cifar10')
-    parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'],
-                        default='vgg')
+    parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet', 'caffe'],
+                        default='alexnet')
     parser.add_argument('data', default='cifar-10-batches-py')
     parser.add_argument('--use_cpu', action='store_true')
+    parser.add_argument('batch_size',type=int, default=100)
     args = parser.parse_args()
     assert os.path.exists(args.data), \
         'Pls download the cifar10 dataset via "download_data.py py"'
@@ -194,22 +208,22 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         net = caffe_net.create_net(args.use_cpu)
         # for cifar10_full_train_test.prototxt
         train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
         # for cifar10_quick_train_test.prototxt
         # train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004,
         #      use_cpu=args.use_cpu)
-    elif args.model == 'cnn':
+    elif args.model == 'alexnet':
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
-        net = cnn.create_net(args.use_cpu)
+        net = alexnet.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
     elif args.model == 'vgg':
         train_x, test_x = normalize_for_vgg(train_x, test_x)
         net = vgg.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
     else:
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
         net = resnet.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
\ No newline at end of file

From 611ad3ccf15f8a281465445798257440dbe370b2 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 15:33:39 +0800
Subject: [PATCH 03/19] revert back train.py

---
 examples/cifar10/train.py | 50 ++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index d785aa84c1..b2ab4af689 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -31,25 +31,24 @@
 import os
 import argparse
 
-# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
 from singa import utils
 from singa import optimizer
 from singa import device
 from singa import tensor
-from singa.proto import core_pb2
 from caffe import caffe_net
 
-import alexnet
+import cnn
 import vgg
 import resnet
 
-from datetime import datetime
-import time
 
 def load_dataset(filepath):
     print('Loading data file %s' % filepath)
     with open(filepath, 'rb') as fd:
-        cifar10 = pickle.load(fd)
+        try:
+            cifar10 = pickle.load(fd, encoding='latin1')
+        except TypeError:
+            cifar10 = pickle.load(fd)
     image = cifar10['data'].astype(dtype=np.uint8)
     image = image.reshape((-1, 3, 32, 32))
     label = np.asarray(cifar10['labels'], dtype=np.uint8)
@@ -130,7 +129,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         dev = device.get_default_device()
     else:
         print('Using GPU')
-        dev = device.create_cuda_gpu_on(2)
+        dev = device.create_cuda_gpu()
 
     net.to_device(dev)
     opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
@@ -138,27 +137,16 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         opt.register(p, specs)
 
     tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
-    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
     train_x, train_y, test_x, test_y = data
     num_train_batch = train_x.shape[0] // batch_size
     num_test_batch = test_x.shape[0] // batch_size
     idx = np.arange(train_x.shape[0], dtype=np.int32)
-    fileTimeLog =open("epochTimeLog.text","a")
-    for epoch in range(1):
+    for epoch in range(max_epoch):
         np.random.shuffle(idx)
         loss, acc = 0.0, 0.0
         print('Epoch %d' % epoch)
-        print(datetime.now().timetz()) # miliseconds
-        print(int(round(time.time()*1000)))
-        fileTimeLog.write('Epoch %d: ' % epoch)
-        fileTimeLog.write(str(int(round(time.time()*1000))))
-        fileTimeLog.write('\n')
-        for b in range(10): #num_train_batch):
-            print ("start of iteration %d: " %b)
-            #time.sleep(1)
-            fileTimeLog.write('iteration %d: ' % b)
-            fileTimeLog.write(str(int(round(time.time()*1000))))
-            fileTimeLog.write('\n')
+        for b in range(num_train_batch):
             x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
             y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
             tx.copy_from_numpy(x)
@@ -176,7 +164,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         print(info)
 
         loss, acc = 0.0, 0.0
-        for b in range(0):
+        for b in range(num_test_batch):
             x = test_x[b * batch_size: (b + 1) * batch_size]
             y = test_y[b * batch_size: (b + 1) * batch_size]
             tx.copy_from_numpy(x)
@@ -187,16 +175,14 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
 
         print('test loss = %f, test accuracy = %f' %
               ((loss / num_test_batch), (acc / num_test_batch)))
-    fileTimeLog.close()
     net.save('model', 20)  # save model params into checkpoint file
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Train dcnn for cifar10')
-    parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet', 'caffe'],
-                        default='alexnet')
+    parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'],
+                        default='vgg')
     parser.add_argument('data', default='cifar-10-batches-py')
     parser.add_argument('--use_cpu', action='store_true')
-    parser.add_argument('batch_size',type=int, default=100)
     args = parser.parse_args()
     assert os.path.exists(args.data), \
         'Pls download the cifar10 dataset via "download_data.py py"'
@@ -208,22 +194,22 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         net = caffe_net.create_net(args.use_cpu)
         # for cifar10_full_train_test.prototxt
         train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
-              use_cpu=args.use_cpu,batch_size=args.batch_size)
+              use_cpu=args.use_cpu)
         # for cifar10_quick_train_test.prototxt
         # train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004,
         #      use_cpu=args.use_cpu)
-    elif args.model == 'alexnet':
+    elif args.model == 'cnn':
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
-        net = alexnet.create_net(args.use_cpu)
+        net = cnn.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004,
-              use_cpu=args.use_cpu,batch_size=args.batch_size)
+              use_cpu=args.use_cpu)
     elif args.model == 'vgg':
         train_x, test_x = normalize_for_vgg(train_x, test_x)
         net = vgg.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
-              use_cpu=args.use_cpu,batch_size=args.batch_size)
+              use_cpu=args.use_cpu)
     else:
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
         net = resnet.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
-              use_cpu=args.use_cpu,batch_size=args.batch_size)
\ No newline at end of file
+              use_cpu=args.use_cpu)

From 9974d5dc751a7643fbb0d548d604df85d150ec28 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 15:37:31 +0800
Subject: [PATCH 04/19] revert back train.py

---
 examples/cifar10/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index b2ab4af689..6e343540c3 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -212,4 +212,4 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
         net = resnet.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu)
\ No newline at end of file

From 7309dc0ffc7981db35af5eeb10aababe0e284d65 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 15:41:06 +0800
Subject: [PATCH 05/19] update train iteration number

---
 examples/cifar10/train.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index 6e343540c3..4a4d94fe30 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -142,11 +142,11 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
     num_train_batch = train_x.shape[0] // batch_size
     num_test_batch = test_x.shape[0] // batch_size
     idx = np.arange(train_x.shape[0], dtype=np.int32)
-    for epoch in range(max_epoch):
+    for epoch in range(1):
         np.random.shuffle(idx)
         loss, acc = 0.0, 0.0
         print('Epoch %d' % epoch)
-        for b in range(num_train_batch):
+        for b in range(20):
             x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
             y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
             tx.copy_from_numpy(x)
@@ -164,7 +164,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         print(info)
 
         loss, acc = 0.0, 0.0
-        for b in range(num_test_batch):
+        for b in range(0):
             x = test_x[b * batch_size: (b + 1) * batch_size]
             y = test_y[b * batch_size: (b + 1) * batch_size]
             tx.copy_from_numpy(x)

From 25fe79dbe17382ccad07c03a8b6956d7af0b8d51 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 16:03:50 +0800
Subject: [PATCH 06/19] change cnmem src, common.h, common.cc and the cmakelist

---
 .DS_Store                   | Bin 0 -> 8196 bytes
 CMakeLists.txt              |   2 +-
 include/singa/core/common.h |  26 +++++-----
 src/.DS_Store               | Bin 0 -> 8196 bytes
 src/CMakeLists.txt          |   1 +
 src/core/.DS_Store          | Bin 0 -> 6148 bytes
 src/core/common/common.cc   |  98 ++++++++++++++++++++++++++++++++++++
 7 files changed, 114 insertions(+), 13 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 src/.DS_Store
 create mode 100644 src/core/.DS_Store
 create mode 100644 src/core/common/common.cc

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..e74703a65a58eddc3bf6ead31e5ced542081bb7d
GIT binary patch
literal 8196
zcmeHMTWl0n82<llfinZ8rwFv`x^$&fxfIya%FTk?n^+2#*e$(qDYH9cJ8?R*?96T{
zrKXZ-)EJGI#6&b2;u|JBkr-c0yhn|f*m#Q?6W@F?z8T}?KXaxnZRvwAYG6(>=R4;=
z|37E{?|d^SXBGgkrJ&aXQ~>~^OCVoP)isJ37uSLk2`r_ANb%t2GM1B}-iMjo0`Jfe
zG7vHlG7vHlG7vKGe_(+2Y*E-E`@S3v>yUwvfg6$m@qS3qB`_7>q(uMKL6v_5AQVRc
zzfhZt24Nz=RDhEbX@m;OP(m4s!4U&xIN1|{Un;;!i8358IDB9*GX^IV^k=90CwzCn
zl*F(O83-9z$bfi!O2CCYWHX~zpWiLh^^(c2Fj7*wWa%=dC`>71C%T87(QZ%k3T}&*
zKj!yL-N*#(wBuMaZ7p-uG)4#c>Q=}0G}AWnSGgbWHErD)ZE*^=?)vvRqIts<>rXVu
zqvPWZ^}Fj5^}8A->JsDi$sNhML}SCwi3!FkYwP!QoEjNBJAQ8B!53so7`zssd|rwd
zX7jV$VypZ<5qUn7llgr;lb^ag)^V^?9ptNu^s~Eip6S@`Q6ujX4)SuB67R@5_L00H
z#(T4lRT#0|L0*y7tZc#3JfqpNy3MnO+vgc)Jkc6(oxIoTJNB6rD{O<D7hK~$g96N=
z(MDk|N9EFrHSzVgZAsp-_pY|7vX$k$VwI{65VN*t-ftTDo>5cxhWj-)YuLJJ5A_yJ
z*UVT(+SW#l3LI0Esj{_|RnePcvBg~b3K{cs!Su#>4P#S$WqFxe9lKTGSJ-i#e#K!&
z7c=SpNUS<m%h)ktZ_OK2OS8H$wprmCOS_bayv0^^yTV;1{Qxbe4BeqNGOjB<mX<Lr
zY1*akVZ2l6ElM_}u}R&}DzHxJ%Q_<?l&34r&9PR-KVT=t+=n#R>&h86O)s)*ua3p~
zg}qtV4V^JoliDa(+oz!>m6Sb&wvs4pqb=(mI1UEnU<{ssC*di$0I$MjxB_p&`|vS*
z3ZKE}@GblZKf|x^JNyBE!e7Wx!R5FDqqrVxa04cB5AMY#+=ngLiHGoB?7}pjz>|0i
zH5|emo<<ivJckeCBls9Tj?dzY_!7R1ui#~T9pA#Y@f~~@Kfy2XJNW}6(_%d=`_nak
zPp#l#B(Vk`(;_FYCDwTR-3R_7v3|2cD)gq+Yu43l+`3~|Q}fIs&Re&uCFH9{%EFh+
zv>g1!o2x-7Wi{SIsxr<jrf9xQ=!Z=S@9IQ6&V_JfcS9|8BcyHF-LyHb@)A<6;yZ8K
zuEzNi5~&<*XjD~RN-CD4yZ6xOQqn7%idI&wRrjkYMLc30{9vUvt}^AHYwsdlf;Zqj
z_=q(3CHw$C!7uO|>1_#?V<oBW7L4OMyd5{;7OcaqxD$7i?(QVzwc`Qoz=Nc|G#<r1
z?8gDjqCwglMicX-y&{g{8GH!O`x<-{pT=iMh0l=+FXE+nn(LTWn?!tVR&&Mfylpx5
z5Lu=bI@dYuT~`d0$ZV>_`M-Vf@Bi0T&S5ek10e%*8NkxE?zR@PE%w$IXYB-Cr|1$z
z_)SXmL#XnP<AnZkoN(q3Lux0;Rm7wMoRmlwD*yV2fU0P?|NG2K`osNymW&&z^*1B&
B*USI_

literal 0
HcmV?d00001

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b630497b95..7acb5feea4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,7 +89,7 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 IF (USE_CUDA)
     include(ExternalProject)
     ExternalProject_Add(cnmem
-        GIT_REPOSITORY "https://github.com/nusdbsystem/cnmem.git"
+        GIT_REPOSITORY "https://github.com/junzhezhang/cnmem.git"
         GIT_TAG "master"
         SOURCE_DIR "cnmem/"
         CONFIGURE_COMMAND "${CMAKE_COMMAND}"
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 2c6d1d8668..7ea45259bb 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -24,7 +24,7 @@
 #include <atomic>
 #include <memory>
 #include "singa/utils/logging.h"
-
+#include <string>
 #ifdef USE_CUDA
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
@@ -52,24 +52,25 @@ typedef struct _Cuda { } Cuda;
 typedef struct _Opencl { } Opencl;
 }  // namespace lang
 
+class Device;
 /// Block represent a chunk of memory (on device or host).
 class Block {
  public:
-  Block(void* ptr, size_t size, size_t offset = 0)
-      : data_(ptr), size_(size), offset_(offset) {
+  Block(void* ptr, size_t size, size_t offset = 0, Device* ptrDevice = nullptr)
+      : data_(ptr), size_(size), offset_(offset), ptrDevice_(ptrDevice) {
     ref_count_ = 1;  // std::make_shared<std::atomic<int>>(1);
   }
   // Disabled as it is not used currently.
   // Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>>
   //  ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}
-  void* mutable_data() {
-    initialized_ = true;
-    return static_cast<char*>(data_) + offset_;
-  }
-  const void* data() const {
-    CHECK(initialized_) << "Must initialize data before reading it";
-    return static_cast<char*>(data_) + offset_;
-  }
+  void* mutable_data() ;
+ 
+  const void* data() const;
+  
+  void* get_data() ;
+
+  void update_data(void* data_new) ;
+    
   size_t size() const { return size_; }
   size_t offset() const { return offset_; }
   int IncRefCount() {
@@ -89,6 +90,7 @@ class Block {
   void* data_ = nullptr;
   size_t size_ = 0;
   size_t offset_ = 0;
+  Device* ptrDevice_;
   bool initialized_ = false;
   // Disabled as it is not used currently.
   // std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;
@@ -114,4 +116,4 @@ typedef struct _Context {
 } Context;
 
 }  // namespace singa
-#endif  // SINGA_CORE_COMMON_H_
+#endif  // SINGA_CORE_COMMON_H_
\ No newline at end of file
diff --git a/src/.DS_Store b/src/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a87953d857c6cf9354df29d312f03c027a6411d3
GIT binary patch
literal 8196
zcmeHMTWl0n82<mZz?p&4Qv}+&E?p^AE-h?Z1d4#$n^+2#*rhE6y3Fp3?ZoNKvNOAF
zDK(WuqsC~wBqpNK5Z`LT6N&M~#Cz0uiH*0YG4ahO<C`&F{xfH`DU?3=q6X$9bG~!_
z^Z#@9|IRmaa%KSl+w*!8Ks5j`x&-o-R9&Npad9mwC4r@k5Gfwg<8C%>Ice&DxX3#+
zgbaiXgbaiXgbaiX{2v&gJzEsE#J(>_!#ZRjWZ;HmK)fGfbO}rZI3v-2bx`FW0SLtr
zz%SJ1qCpr7FcIL4L>i%jGL%q;VsOMj8BX?C;Fkz+MxqP{3=SU{%#6Va1^wCS{xRPj
zFd;FlLk2<y7Be6opEAgR1354jp5N11%XGbX{3|ReEnBvH1ydBJl(SQPqt1Arr+Im|
zUCW*Dd!}xrgLcw!togQ<?l+C`6kprn*q&zEMs9)o$pO>Wo$+=jZ|kmqpQD;LO0oV#
zDPA)*)zY-5A=b3JWx63Y)f8`zH^g?f?3$ittg60gfA{IJ$+J`ErXPGkCWXO^0V=La
z^1{LlEw|7izfhDsU(E39`3%*Sk?uo1YKpHZ&=2p;dZuH${YK6uOz{erQt!?<_R*Xn
z#s@Nvl^?U+6tB!^Rwi$0p3!Dmedbxi9rTO|Pqc<yC+Btej)Nw}3R{ZvyldQNP=Hx5
zIw{QMsLZZf7u|T<w)h=;@9LZ@UtPf~*Qn|cacg_#{icySHg4+P=yA=>7`ATOBLfA~
zHPe=nw6!s#5+@a9u6%t}b<NF@$WpF@`LuZ^Z+erwj<Gqux}sdIjohm6%j~31zv76a
zi<xwPBvKoxXY7QqcjOGJrCHq^*{X1jC0$BH-l9?6p>S78K0pg9Lz~r|jO)rVOG_J;
zH0@Sf8ShaB3X)A}+@~I3mDr#RW}LAx%F~tRwnzu#AFyFD_hHTTdb5U2(~Io7Y9o>3
z!rrFqhRztPOKg_w?bA?~h|8Wr+er;<r)}#VI0*)1VG^EzC*di$0I$L&xD0Q?`|vS*
z3ZKE}@GblZKf|x^JNyBE!e7Wx!IiiQYj7jh;U<h@EAGX8xF6fG2M^=D*o#R#g~NCn
zH5|b#o<SEqJckeCBls9Tj?dzY_!7R1uizzo9pA#Y@f~~@Kfy2XJNW}kigG<7`^6f+
zr`G%fNpcN7sFe&~ORmwbyAS?Ha{XqN)aXrX*KMfV+}ON(U)%gLUbS)yMda&73d0vn
zQ4s#}U8zDTW;J?@bY)B|rD?uGD2UAo@7h>2%7t`fcT+ueOGw?ayJc%s<)x%u#dqGm
zLyhufBvU!svQt%g8R=M#?rEjb<)m0PS5sBJUOk{D6!DO8@Pn25sLGUouD**f3va-C
z@DZu*OZWkPf?wb_Qrt4Ez$(()Ef~cOcsp*vZP<W~xC{4?^6n(<b>Ttm#zUmPB=+MV
z9>*cfph4;zMH6$Rz5<Tp1U`i4eHA{6PvbMB!{<nc7jgC~)pZy3CJ|p-sJcR5&bAzT
zglt8XUg;k8t_ucAWfs-q{NJ_o_y6l6*D#5Yfslb~Gk|5CeVy$zw$$5RoV8<gou*3^
x;Ws1E524CGjuZOFal-jO45=L>R}qs4a7H3osQl|60>b^DSP%DqUwk)k^*1Ld+b{qC

literal 0
HcmV?d00001

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7dd9bf7751..aa8d41f05b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -30,6 +30,7 @@ AUX_SOURCE_DIRECTORY(io io_source)
 AUX_SOURCE_DIRECTORY(io/network io_source)
 LIST(APPEND singa_sources ${io_source})
 
+AUX_SOURCE_DIRECTORY(core/common core_source)
 AUX_SOURCE_DIRECTORY(core/device core_source)
 AUX_SOURCE_DIRECTORY(core/memory core_source)
 AUX_SOURCE_DIRECTORY(core/scheduler core_source)
diff --git a/src/core/.DS_Store b/src/core/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3
zem<@ulZcFPQ@L2!n>{z**<q8>++&mCkOWA81W14cNZ<zv;LbK1Poaz?KmsK2CSc!(
z0ynLxE!0092;Krf2c+FF_Fe*7ECH>lEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ
zLs35+`xjp>T0<F0fCPF1$Cyrb|F7^5{eNG?83~ZUUlGt@xh*qZDeu<Z%US-OSsOPv
j)R!Z4KLME7ReXlK;d!wEw5GODWMKRea10D2@KpjYNUI8I

literal 0
HcmV?d00001

diff --git a/src/core/common/common.cc b/src/core/common/common.cc
new file mode 100644
index 0000000000..b232398d27
--- /dev/null
+++ b/src/core/common/common.cc
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DISABLE_WARNINGS
+
+#include "singa/core/common.h"
+#include "singa/core/device.h"
+#include <iostream>
+#include <fstream>
+#include <string>
+//TODO(junzhe) ifdef to counter verify
+///only include mutable_data() and data()
+
+namespace singa {
+
+void* Block::mutable_data() {
+    //std::cout<<"mutable_data() "<<this<<' '<<data_<<std::endl;
+    initialized_ = true;
+    if (ptrDevice_!=nullptr){
+      //Append info.
+      stringstream strm2;
+      strm2<<this;
+      string tempStr2 = strm2.str();
+      stringstream strm4;
+      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      strm4<<t2;
+      string tempStr4 = strm4.str();
+      string temp = "Mutable "+tempStr2+" "+tempStr4;   
+      ptrDevice_->AppendInfo(temp);
+    }
+    //TODO(junzhe) this should not happen, can verify and remove
+    if (data_ == nullptr) {
+      //cout<<"to sleep"<<endl;
+      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
+      cout<<"slept to get data_ updated: "<<this<<' '<<data_<<endl;
+    }
+    
+    return static_cast<char*>(data_) + offset_;
+  }
+
+
+const void* Block::data() const {
+    CHECK(initialized_) << "Must initialize data before reading it";
+    //std::cout<<"data() "<<this<<' '<<data_<<std::endl;
+    if (ptrDevice_!=nullptr){
+      //Append info.
+      stringstream strm2;
+      strm2<<this;
+      string tempStr2 = strm2.str();
+      stringstream strm4;
+      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      strm4<<t2;
+      string tempStr4 = strm4.str();
+      string temp = "Read "+tempStr2+" "+tempStr4;
+      ptrDevice_->AppendInfo(temp);
+    }
+
+    //TODO(junzhe) this should not happen, can verify and remove
+    if (data_ == nullptr) {
+      //cout<<"to sleep"<<endl;
+      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
+      cout<<"slept to get data_ updated"<<endl;
+    }
+    //TODO(junzhe) for measurement only
+    // ptrDevice_->SwapOutInfo(this);
+    // ptrDevice_->SwapInInfo(this);
+
+    return static_cast<char*>(data_) + offset_;
+  }
+
+void* Block::get_data() {
+  return data_;
+}
+
+void Block::update_data(void* data_new) {
+  data_ = data_new;
+  std::cout<<"results update_data:: "<<this<<' '<<data_<<std::endl;
+}
+
+
+}  // namespace singa
+#endif
\ No newline at end of file

From fecf34fa0a9f049309c9e9b26a86105f9aa97dc4 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 16:21:27 +0800
Subject: [PATCH 07/19] disable common.cc appendInfo, for device src done
 first.

---
 src/core/common/common.cc | 83 +++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/src/core/common/common.cc b/src/core/common/common.cc
index b232398d27..73ec0b0c6e 100644
--- a/src/core/common/common.cc
+++ b/src/core/common/common.cc
@@ -28,27 +28,28 @@
 namespace singa {
 
 void* Block::mutable_data() {
+    //TODO(junzhe) go back to enable it after device done
     //std::cout<<"mutable_data() "<<this<<' '<<data_<<std::endl;
     initialized_ = true;
-    if (ptrDevice_!=nullptr){
-      //Append info.
-      stringstream strm2;
-      strm2<<this;
-      string tempStr2 = strm2.str();
-      stringstream strm4;
-      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      strm4<<t2;
-      string tempStr4 = strm4.str();
-      string temp = "Mutable "+tempStr2+" "+tempStr4;   
-      ptrDevice_->AppendInfo(temp);
-    }
-    //TODO(junzhe) this should not happen, can verify and remove
-    if (data_ == nullptr) {
-      //cout<<"to sleep"<<endl;
-      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"slept to get data_ updated: "<<this<<' '<<data_<<endl;
-    }
+    // if (ptrDevice_!=nullptr){
+    //   //Append info.
+    //   stringstream strm2;
+    //   strm2<<this;
+    //   string tempStr2 = strm2.str();
+    //   stringstream strm4;
+    //   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    //   strm4<<t2;
+    //   string tempStr4 = strm4.str();
+    //   string temp = "Mutable "+tempStr2+" "+tempStr4;   
+    //   ptrDevice_->AppendInfo(temp);
+    // }
+    // //TODO(junzhe) this should not happen, can verify and remove
+    // if (data_ == nullptr) {
+    //   //cout<<"to sleep"<<endl;
+    //   cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
+    //   auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
+    //   cout<<"slept to get data_ updated: "<<this<<' '<<data_<<endl;
+    // }
     
     return static_cast<char*>(data_) + offset_;
   }
@@ -56,30 +57,28 @@ void* Block::mutable_data() {
 
 const void* Block::data() const {
     CHECK(initialized_) << "Must initialize data before reading it";
-    //std::cout<<"data() "<<this<<' '<<data_<<std::endl;
-    if (ptrDevice_!=nullptr){
-      //Append info.
-      stringstream strm2;
-      strm2<<this;
-      string tempStr2 = strm2.str();
-      stringstream strm4;
-      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      strm4<<t2;
-      string tempStr4 = strm4.str();
-      string temp = "Read "+tempStr2+" "+tempStr4;
-      ptrDevice_->AppendInfo(temp);
-    }
+    //TODO(junzhe) go back to enable it after device done
+    // if (ptrDevice_!=nullptr){
+    //   //Append info.
+    //   stringstream strm2;
+    //   strm2<<this;
+    //   string tempStr2 = strm2.str();
+    //   stringstream strm4;
+    //   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    //   strm4<<t2;
+    //   string tempStr4 = strm4.str();
+    //   string temp = "Read "+tempStr2+" "+tempStr4;
+    //   ptrDevice_->AppendInfo(temp);
+    // }
+
+    // //TODO(junzhe) this should not happen, can verify and remove
+    // if (data_ == nullptr) {
+    //   //cout<<"to sleep"<<endl;
+    //   cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
+    //   auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
+    //   cout<<"slept to get data_ updated"<<endl;
+    // }
 
-    //TODO(junzhe) this should not happen, can verify and remove
-    if (data_ == nullptr) {
-      //cout<<"to sleep"<<endl;
-      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"slept to get data_ updated"<<endl;
-    }
-    //TODO(junzhe) for measurement only
-    // ptrDevice_->SwapOutInfo(this);
-    // ptrDevice_->SwapInInfo(this);
 
     return static_cast<char*>(data_) + offset_;
   }

From c07dcc6c622bc0aec0e48e7b1beab63a38c83601 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 16:54:12 +0800
Subject: [PATCH 08/19] update device and memory family src

---
 include/singa/core/device.h |  171 +++++-
 include/singa/core/memory.h |  143 ++++-
 src/core/device/cuda_gpu.cc |   21 +-
 src/core/device/device.cc   |   46 +-
 src/core/device/platform.cc |    6 +-
 src/core/device/swap_gpu.cc | 1144 +++++++++++++++++++++++++++++++++++
 src/core/memory/memory.cc   | 1079 ++++++++++++++++++++++++++++++++-
 7 files changed, 2595 insertions(+), 15 deletions(-)
 create mode 100644 src/core/device/swap_gpu.cc

diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 1a960d8ae7..36b10d0d56 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -24,6 +24,7 @@
 #include <string>
 #include <functional>
 #include <memory>
+#include <map>
 
 #include "singa/singa_config.h"
 #include "singa/core/common.h"
@@ -64,6 +65,11 @@ class Device {
 
   /// Called by Tensor.
   void FreeBlock(Block* block);
+  
+  void AppendInfo(string blockInfo);
+  void* GetRealGpuPtrInfo(const Block* block_);
+  void SwapOutInfo(const Block* block_);
+  void SwapInInfo(const Block* block_);
 
   /// Return the size (bytes) of memory in use
   /// TODO(wangwei) override this function for all devices.
@@ -102,6 +108,8 @@ class Device {
 
   int id() const { return id_; }
 
+  virtual void* GetRealGpuPtr(const Block* block_) = 0;
+
  private:
   Device() {};
 
@@ -117,6 +125,11 @@ class Device {
 
   /// Free device memory.
   virtual void Free(void* ptr) = 0;
+  virtual void MakeMetaTable(Block* block,void* data_,int size) = 0;
+  virtual void Append(string blockInfo) = 0;
+  
+  virtual void SwapOut(const Block* block_) = 0;
+  virtual void SwapIn(const Block* block_) = 0;
 
  protected:
   int id_ = 0;
@@ -158,6 +171,11 @@ class CppCPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
+  void MakeMetaTable(Block* block,void* data_,int size) override {}
+  void Append(string blockInfo) override {}
+  void* GetRealGpuPtr(const Block* block_) override {}
+  void SwapOut(const Block* block_) override {}
+  void SwapIn(const Block* block_) override {}
 };
 
 
@@ -188,16 +206,159 @@ class CudaGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
+  void MakeMetaTable(Block* block,void* data_,int size) override {}
+  void Append(string blockInfo) override;
+  void* GetRealGpuPtr(const Block* block_) override;
+  void SwapOut(const Block* block_) override;
+  void SwapIn(const Block* block_) override;
 
  private:
   void Setup();
 
  private:
-	shared_ptr<DeviceMemPool> pool_;
+  shared_ptr<DeviceMemPool> pool_;
 };
 
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
 
+///SwapGPU
+struct onePieceMsg{
+    /*
+     members: [ptr, size, MallocFree, idx]
+     */
+    string ptr;
+    size_t size;
+    int MallocFree;
+    int idx;
+    double t;
+    onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+};
+
+struct BlockMeta{
+    /*
+     block Meta.
+     */
+    Block* block_ = nullptr;
+    void* data_ = nullptr;
+    void* cpu_ptr = nullptr;
+    size_t size = 0;
+    cudaEvent_t out_event; 
+    cudaEvent_t in_event;
+    cudaStream_t out_stream;
+    cudaStream_t in_stream; 
+};
+
+struct SwapBlock{
+
+    string ptr;
+    string cat;  //A1, A2, A3...
+    int name;
+    size_t size;
+    int r_idx; //out idx
+    int d_idx; //in idx
+    double r_time; // out time
+    double d_time; //in time
+    double dt; //delta t: t2'-t1'
+    double pri;  //look at here if big enough TODO(junzhe)
+    double dto; //t2-t1
+    double wdto = 0; //t2-t1 weighted by swap_load
+    double r_idx_ready; //r_idx + buffer, could be set during selection.
+    //int free = -1; //when it is freed 
+    //below as per planned.
+    int i1;
+    int i1p;
+    int i2;
+    int i2p;
+    double t1;
+    double t2;
+    double t1p;
+    double t2p;
+    SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
+    ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
+};
+/// Device able to Swap memory between Nvidia GPU and Swap
+class SwapGPU : public Device {
+ public:
+  ~SwapGPU();
+  /// Construct the device using default mem pool setting.
+  SwapGPU(int id = 0);
+  /// Construct the device given the physical device ID and memory pool.
+  SwapGPU(int id, std::shared_ptr<DeviceMemPool> pool);
+
+  void SetRandSeed(unsigned seed) override;
+  size_t GetAllocatedMem() override;
+
+ protected:
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx) override;
+
+  /// Allocate cpu memory.
+  void* Malloc(int size) override;
+
+  /// Free cpu memory.
+  void Free(void* ptr) override;
+  void MakeMetaTable(Block* block,void* data_,int size) override;
+  int swap_test(vector<string>vec_block,int &maxLen, int &location);
+  void swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
+  void swap_plan();
+  vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,double maxLoad,double memLimit,string mode);
+  vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
+  void Test_sched_switch_swap();
+  void DeploySwap();
+  void Append(string blockInfo) override;
+  void* GetRealGpuPtr(const Block* block_) override;
+  void SwapOut(const Block* block_) override;
+  void SwapIn(const Block* block_) override;
+
+  //changed to intake data_ instead
+  void SwapOut_idx(const int r_idx);
+  void SwapIn_idx(const int r_idx);
+
+ private:
+  void Setup();
+  ///Tables needed
+  //r_idx->BlockMeta
+  map<int,BlockMeta>Table_meta;
+  map<const Block*,BlockMeta>Table_block_meta; //TODO(junzhe) for measure speed only.
+  map<const Block*, int>Table_not_at_device;  //int refers to its r_idx of the block/meta
+  //map<const Block*, size_t>Table_block_size;  //Table block_ -> size TODO(junzhe) no need, can call block_->size()
+
+  //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D.
+  map<int,std::tuple<int,int,int,int>>Table_sched; // changed to with sync_r_idx
+
+
+  //vec_block
+  vector<string>vec_block; //iteration 0-3
+  vector<string>vec_block_fresh; //iteration 4 5 6
+  vector<double>global_load;
+  vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
+  vector<onePieceMsg>vec_run;
+  int asyncSwapFlag = 0; //0 for sync, 1 for async.
+  int testFlag = 0; //0 means open for test, 1 means no need test anymore.
+  int gc = 0; //global counter each time Malloc/Free, add 1.
+  int globeCounter = -1;
+  int maxLen = 0;
+  int location = 0;
+  //design requirement
+  float memLimit_ratio = 0.70; 
+  size_t smallest_block = 1<<20; //1 MB
+  int data_buffer = 4; // used to control readyIdx
+  int mutable_data_buffer = 6;
+  double maxLoad;
+  int maxIdx;
+  double total_swapInTime = 0;
+  double total_swapOutTime = 0;
+  double tempTime = 0;
+  double tempTime2 = 0;
+  double tempTime_baseline; //vec_run[0] time
+  int maxLen_threshold = 1000;
+
+ private:
+  shared_ptr<DeviceMemPool> pool_;
+};
+
 #endif  // USE_CUDA
 
 #ifdef USE_OPENCL
@@ -248,6 +409,12 @@ class OpenclDevice : public singa::Device {
   /// Converts the void pointer into a Buffer object, then deletes the object.
   /// This has the effect of freeing up device memory.
   void Free(void* ptr) override;
+  void MakeMetaTable(Block* block,void* data_,int size) override {}
+  void Append(string blockInfo) override {}
+  void* GetRealGpuPtr(const Block* block_) override {}
+  void SwapOut(const Block* block_) override {}
+  void SwapIn(const Block* block_) override {}
+
 
 private:
 
@@ -338,4 +505,4 @@ class Platform {
 
 }  // namespace singa
 
-#endif  // SINGA_CORE_DEVICE_H_
+#endif  // SINGA_CORE_DEVICE_H_
\ No newline at end of file
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index f664f95ced..cc40d0433c 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -23,6 +23,17 @@
 #include <atomic>
 #include "singa/proto/core.pb.h"
 #include "singa/singa_config.h"
+//for SmartMemPool
+#include <stdio.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <fstream>
+#include <stdlib.h>     /* malloc, free, rand */
+#include <map>
+using namespace std;
 
 #ifdef USE_CUDA
 #include "cnmem.h"
@@ -38,7 +49,10 @@ class DeviceMemPool {
  public:
   virtual void Malloc(void** ptr, const size_t size)  = 0;
   virtual void Free(void* ptr)  = 0;
-
+  virtual void Append(string blockInfo) = 0;
+  
+  virtual void SwapOut(void* data_) = 0;
+  virtual void SwapIn(void* data_) = 0;
   /// Return a pair for free and total memory managed by this pool.
   virtual std::pair<size_t, size_t> GetMemUsage() {
     return std::make_pair(0u, 0u);
@@ -60,7 +74,10 @@ class CnMemPool : public DeviceMemPool {
 
   void Malloc(void** ptr, const size_t size);
   void Free(void* ptr);
-
+    void Append(string blockInfo){}
+    
+  void SwapOut(void* data_) override {}
+  void SwapIn(void* data_) override {}
   std::pair<size_t, size_t> GetMemUsage() override;
 
   // release all memory and set cnmem manager to unintialized
@@ -85,7 +102,127 @@ class CudaMemPool : public DeviceMemPool {
  public:
   void Malloc(void** ptr, const size_t size) override;
   void Free(void* ptr) override;
+void Append(string blockInfo){}
+
+  void SwapOut(void* data_) override {}
+  void SwapIn(void* data_) override {}
+};
+
+//for SmartMemPool
+struct lookUpElement{
+    /*
+     for memory pool Malloc look-up table.
+     */
+    int r_idx;
+    int d_idx;
+    size_t size;
+    size_t offset;
+    void* ptr;
+    int Occupied; //0 is free, 1 is occupied.
+    int crossItr; 
+    int Occupied_backup; 
+};
+
+///class mem-pool SmartMemPool
+class SmartMemPool: public DeviceMemPool {
+public:
+    SmartMemPool(const MemPoolConf &conf); //constructor
+    //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
+    void Malloc(void** ptr, const size_t size);
+    void Free(void* ptr);
+    ~SmartMemPool();
+    void getMaxLoad(void);
+    std::pair<size_t, size_t> GetMemUsage() override;
+    void Append(string blockInfo);
+    
+  void SwapOut(void* data_) override {}
+  void SwapIn(void* data_) override {}
+protected:
+    void Init();
+private:
+    MemPoolConf conf_;
+    // whether the (global) memory pool has been initialized
+    bool initialized_ = false;
+    // lock on the initialized variable
+    std::mutex mtx_;
+
+    string colorMethod;
+    int mallocFlag =0; //0 for cudaMalloc, 1 for coloringMalloc
+    int gc =0; //global counter each time Malloc/Free, add 1.
+    int globeCounter=-1;
+    int loadLogFlag =1; //record when its 1.
+    void* ptrPool = NULL;
+    int idxRange = 0;
+    size_t offset = 0;
+    size_t offsetCrossItr=0; //cross iteration offset.
+    int maxLen =0;
+    int location=0;
+    vector<string> vec;
+    vector<string> vec_block_RW;
+    vector<string> vec_block_RWMF;
+    map<int,int>Table_r2d; //full duration info, cross-iteration duration.
+    map<int,int>Table_d2r;
+    //map<int,lookUpElement>Table_r2Ver;
+    vector<pair<int,lookUpElement>>Vec_r2Ver; //b. replace Table_r2Ver
+    map<int, pair<size_t,size_t>>Table_load; //gc, <cudaLoad, colorLoad>
+    map<void*,size_t>Table_p2s; //For tracking load in Free. add when allocate, delete when deallocate.
+    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
+    int checkPoint=300; //for reduce number of test.
+    size_t maxTotalLoad;
+    size_t maxMemUsage;
+    float memRatio;
+};
+
+
+//for Swap
+struct swapLookUpElement{
+    /*
+    book keep the block info and status
+     */
+    void* data_ = nullptr;
+    void* realGpuPtr = nullptr;
+    void* realCpuPtr = nullptr;
+
+    int location; //1 is at GPU, 2 is at CPU. 3 on the way C2G, 4 on the way G2C.
+    size_t size; //size may used as of now.
+};
+
+struct SwapMeta{
+    /*
+     for copy between block and info.
+     */
+    size_t swapSize;
+    void* ptr;
+    void* d_ptr; //not used for
+};
+
+class Swap : public DeviceMemPool {
+public:
+    Swap(const MemPoolConf &conf); //constructor
+    //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
+    void Malloc(void** ptr, const size_t size);
+    void Free(void* ptr);
+    ~Swap();
+    void getMaxLoad(void);
+    std::pair<size_t, size_t> GetMemUsage() override;
+    void Append(string blockInfo);
+    
+    void SwapOut(void* data_);
+    void SwapIn(void* data_);
+protected:
+    void Init();
+private:
+    MemPoolConf conf_;
+    // whether the (global) memory pool has been initialized
+    bool initialized_ = false;
+    // lock on the initialized variable
+    std::mutex mtx_; 
+    vector<string> vec_block;
+    size_t swapLimit = 1<<23; //8MB
+    map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
+    map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
 };
+
 #endif
 }  // namespace singa
-#endif  // SINGA_CORE_MEMORY_H_
+#endif  // SINGA_CORE_MEMORY_H_
\ No newline at end of file
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index f6603d3632..16de1ef5ca 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -23,6 +23,7 @@
 #include <curand.h>
 #include <chrono>
 #include <iostream>
+#include <string>
 #include "singa/core/device.h"
 #include "singa/utils/cuda_utils.h"
 namespace singa {
@@ -47,7 +48,7 @@ const int kNumCudaStream = 1;
 CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
   MemPoolConf conf;
   conf.add_device(id);
-  pool_ = std::make_shared<CnMemPool>(conf);
+  pool_ = std::make_shared<Swap>(conf);
   Setup();
 }
 
@@ -122,5 +123,21 @@ void CudaGPU::Free(void* ptr) {
   }
 }
 
+void CudaGPU::Append(string blockInfo){
+    pool_->Append(blockInfo);
+}
+
+void* CudaGPU::GetRealGpuPtr(const Block* block_){
+  return nullptr;
+}
+
+void CudaGPU::SwapOut(const Block* block_){
+  
+}
+
+void CudaGPU::SwapIn(const Block* block_){
+  
+}
+
 }  // namespace singa
-#endif  // USE_CUDA
+#endif  // USE_CUDA
\ No newline at end of file
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index cda1b9f942..b2988f3615 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -17,6 +17,9 @@
  */
 
 #include "singa/core/device.h"
+#include <chrono>
+#include <iostream>
+#include <fstream>
 
 namespace singa {
 Device::Device(int id, int num_executors)
@@ -37,7 +40,11 @@ Block* Device::NewBlock(int size) {
     << "from size_t to int. In that case, the size is too large.";
   if (size > 0) {
     void* ptr = Malloc(size);
-    return new Block(ptr, size);
+    Block* block_ = new Block(ptr, size,0,this);
+    //std::cout<<"(reference) from device.cc after, data_, block_ device: "<<ptr<<" "<<block_<<' '<<this<<std::endl;
+    MakeMetaTable(block_,ptr,size); // make table and append vec_block.
+    //cout<<"NewBlock: "<<block_<<' '<<ptr<<endl;
+    return block_;
   } else {
     return nullptr;
   }
@@ -46,11 +53,44 @@ Block* Device::NewBlock(int size) {
 // TODO(wangwei) return Block to the memory manager
 void Device::FreeBlock(Block* block) {
   if (block != nullptr) {
-    Free(block->mutable_data());
+    //TODO(junzhe) to merge it
+    auto tempPtr = block->mutable_data();
+    //cout<<"FreeBlock: "<<block<<' '<<tempPtr<<endl;
+    Free(tempPtr);
+    //cout<<"SwapGPU::Free() returned"<<endl;
+    //Free(block->mutable_data());
+    
+    //Add Append for free here.
+    stringstream strm1;
+    strm1<<block;
+    string tempStr1 = strm1.str();
+    stringstream strm4;
+    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    strm4<<t2;
+    string tempStr4 = strm4.str();
+    string blockInfo ="Free "+tempStr1+" "+tempStr4;
+    Append(blockInfo);
+
     delete block;
   }
 }
 
+void Device::AppendInfo(string blockInfo){
+  Append(blockInfo);
+}
+
+void* Device::GetRealGpuPtrInfo(const Block* block_){
+  return GetRealGpuPtr(block_);
+}
+
+void Device::SwapOutInfo(const Block* block_){
+  SwapOut(block_);
+}
+
+void Device::SwapInInfo(const Block* block_){
+  SwapIn(block_);
+}
+
 void Device::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                             CopyDirection direct, int dst_offset,
                             int src_offset) {
@@ -73,4 +113,4 @@ void Device::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
        {}, {dst});
 }
 void Device::Sync() {}
-}  // namespace singa
+}  // namespace singa
\ No newline at end of file
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
index 8ae15f8604..d64b2594c2 100644
--- a/src/core/device/platform.cc
+++ b/src/core/device/platform.cc
@@ -132,7 +132,7 @@ Platform::CreateCudaGPUsOn(const vector<int> &devices, size_t init_size) {
 
   vector<shared_ptr<Device> > ret;
   for (auto device : devices) {
-    auto dev = std::make_shared<CudaGPU>(device, pool);
+    auto dev = std::make_shared<SwapGPU>(device, pool);
     ret.push_back(dev);
   }
   return ret;
@@ -170,14 +170,12 @@ Platform::CreateOpenclDevices(const size_t num_devices) {
   }
   return (int)total_num_devices;
 }
-
 static const std::vector<std::shared_ptr<Device>>
 Platform::CreateOpenclDevices(const std::vector<int> &id) {
-
 }
 */
 #endif // USE_OPENCL
 
 }  // namespace singa
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
new file mode 100644
index 0000000000..694d1d4806
--- /dev/null
+++ b/src/core/device/swap_gpu.cc
@@ -0,0 +1,1144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <chrono>
+#include <thread>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <tuple>        // std::tuple, std::get, std::tie, std::ignore
+#include "singa/core/device.h"
+#include "singa/utils/cuda_utils.h"
+
+
+using namespace std;
+namespace singa {
+
+const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
+                                   cudaMemcpyDeviceToHost,
+                                   cudaMemcpyDeviceToDevice};
+
+///functions to be used
+///Section for structs and respective sorting function:
+// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm
+
+
+
+struct less_than_ptrIdx{
+    /*
+     sort onePieceMsg by ptr and then idx.
+     */
+    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    {
+        return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
+    }
+};
+
+
+struct oneIterMsg{
+    /*
+     members: [idx, MallocFree, size_delta]
+     */
+    size_t size_delta;// type as size_t in case size if large.
+    int MallocFree;
+    int idx;
+    oneIterMsg(size_t s, int M, int i):size_delta(s),MallocFree(M),idx(i){}
+};
+
+
+struct less_than_iterIdx{
+    /*
+     sort oneIterMsg by Idx.
+     */
+    inline bool operator() (const oneIterMsg& struct1, const oneIterMsg& struct2)
+    {
+        return (struct1.idx<struct2.idx);
+    }
+};
+
+struct less_than_lookupIdx{
+    /*
+     sort lookUpElement by idx.
+     */
+    inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2)
+    {
+        return (struct1.r_idx<struct2.r_idx);
+    }
+};
+
+
+/// string delimiter
+vector<string> swap_split(string s, string delimiter) {
+    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+    string token;
+    vector<string> res;
+    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
+        token = s.substr(pos_start, pos_end - pos_start);
+        pos_start = pos_end + delim_len;
+        res.push_back(token);
+    }
+    res.push_back(s.substr(pos_start));
+    return res;
+}
+
+///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg
+//vector of pairMsg is used in run.
+//vector of iterMsg is used in test.
+
+vector<onePieceMsg> swap_strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+    /*
+     convert vector of string into vector of onePieceMsg, sorted by ptr 
+     and then idx, and update idxRange to pieceMsgVec size.
+     format of onePieceMsg [ptr, size/-1, flag, idx, timestamp]
+     flag: 1 for malloc, -1 for free, 2 for read, 3 for layer,4 for mutable
+     version on 5/29, with equval blockInfo length: flag, block_, size, t
+     */
+    vector<onePieceMsg>onePieceMsgVec_;
+
+    for (int i=0;i<vec.size();i++) {
+      vector<string> v = swap_split(vec[i], " ");
+      int MallocFree;
+      if (v[0]=="Malloc"){
+        MallocFree = 1;
+      }else if (v[0]=="Free"){
+        MallocFree = -1;
+      }else if (v[0]=="Mutable"){
+        MallocFree = 4;
+      }else if (v[0]=="Read"){ 
+        MallocFree = 2;
+      }else if (v[0]=="Layer"){
+        MallocFree = 3;
+      }
+      //onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+      size_t result;
+      stringstream convert(v[2]);
+      if (!(convert>>result)){
+          result =-1;
+          cout<<"error for converting size from str to int."<<endl;
+      }
+      onePieceMsg tempMsg(v[1],result, MallocFree, i);
+      double tempTime;
+      stringstream convert2(v[3]);
+      convert2>>tempTime;
+      tempMsg.t =tempTime;
+      onePieceMsgVec_.push_back(tempMsg);
+    }
+ 
+    sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx());
+    idxRange = static_cast<int>(onePieceMsgVec_.size());
+
+    return onePieceMsgVec_;
+}// end of strVec_2_pieceMsgVec function
+
+
+vector<size_t> Swap_piece2rep (vector<onePieceMsg>onePieceMsgVec_){
+    vector<oneIterMsg>oneIterMsgVec_;
+    string tempStr;
+    int tempIdx=0;
+    for (int i=0;i<onePieceMsgVec_.size();i++){
+        if (onePieceMsgVec_[i].MallocFree==1){
+            //update tempStr and idx.
+            tempStr = onePieceMsgVec_[i].ptr;
+            tempIdx = onePieceMsgVec_[i].idx;
+            oneIterMsg tempMsg(onePieceMsgVec_[i].size,1,onePieceMsgVec_[i].idx);
+            oneIterMsgVec_.push_back(tempMsg);
+        } else {
+            oneIterMsg tempMsg(onePieceMsgVec_[i].idx-tempIdx,onePieceMsgVec_[i].MallocFree,onePieceMsgVec_[i].idx);
+            tempIdx = onePieceMsgVec_[i].idx;
+            oneIterMsgVec_.push_back(tempMsg);
+        }
+        //cout<<oneIterMsgVec_[i].size_delta<<' '<<oneIterMsgVec_[i].MallocFree<<' '<<oneIterMsgVec_[i].idx<<endl;
+    }
+    
+    sort(oneIterMsgVec_.begin(),oneIterMsgVec_.end(),less_than_iterIdx());
+    //only after sort then can create rep.
+    vector<size_t>rep; // vector of size_delta, name it as rep for simlisity.
+    for (int i =0; i<oneIterMsgVec_.size(); i++){
+        rep.push_back(oneIterMsgVec_[i].size_delta);
+        //cout<<rep[i]<<endl;
+    }
+    cout<<"rep size: "<<rep.size()<<endl;
+    return rep;
+}
+void repPatternDetector(vector<size_t>rep, int &maxLen, int &location, int maxLen_threshold, int gc ){
+    int idxRange = (int)rep.size();
+    int threshold = maxLen_threshold;
+    vector<pair<int,int>>maxLen_location;
+    
+    for (int i=0; i<idxRange;i++){
+        if (maxLen>threshold){
+            break;
+        }
+        for (int len=1; len<(idxRange-i);len++){
+            if (maxLen>threshold){
+                break;
+            }
+            if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLen<len)) {
+                maxLen = len;
+                location = i;
+                maxLen_location.push_back(make_pair(maxLen,location));
+                // cout<<"maxLen increased, lcoation and maxLen: ("<<location<<","<<maxLen<<")"<<endl;
+            }
+        }
+    }
+}// end of repPatternDetector
+
+struct less_than_Idx{
+    /*
+     sort onePieceMsg by ptr and then idx.
+     */
+    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    {
+        return (struct1.idx<struct2.idx);
+    }
+};
+
+
+int SwapOutTime(size_t size){
+    int ans =0; //TODO(junzhe) used to be 0.29; new param as per vgg
+    if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;}
+    return ans;
+}
+
+int SwapInTime(size_t size){
+    //yet to get the formula
+    int ans =0; //TODO(junzhe) used to be 0.13; new param as per vgg
+    if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;}
+    return ans;
+}
+
+struct less_than_dto{
+  /*
+   sort SwapBlock by dto, descending
+   */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.dto>struct2.dto);
+  }
+};
+
+struct less_than_wdto{
+  /*
+   sort SwapBlock by weighted dto, descending
+   */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.wdto>struct2.wdto);
+  }
+};
+
+struct less_than_r_idx_ready{
+  /*
+   sort SwapBlock by r_idx_ready, ascending
+   */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.r_idx_ready<struct2.r_idx_ready);
+  }
+};
+
+struct less_than_pri{
+    /*
+     sort SwapBlock by pri, descending
+     */
+    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+    {
+        return (struct1.pri>struct2.pri);
+    }
+};
+
+struct less_than_Idx_Swap{
+    /*
+     sort onePieceMsg_Swap by idx.
+     */
+    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+    {
+        return (struct1.r_idx<struct2.r_idx);
+    }
+};
+
+struct less_than_Idx_Swap_rvs{
+    /*
+     sort onePieceMsg_Swap by idx. reverse
+     */
+    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+    {
+        return (struct1.d_idx>struct2.d_idx);
+    }
+};
+
+
+pair<int,int> load_over_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx,int maxLen){
+  //input: vec_load, memLimit, range [start_idx, end_idx)
+  //return range overlimit [first_over_limit, first_below_limit)
+  int first_over_limit = start_idx;
+  int first_below_limit = end_idx;
+
+  for (int i = start_idx+maxLen; i < end_idx+maxLen; i++){
+    if (vec_load[i] > memLimit){
+      first_over_limit = i-maxLen;
+      break;
+    }
+  }
+
+  for (int i = end_idx+maxLen; i > first_over_limit+maxLen; i--){
+    if (vec_load[i] > memLimit){
+      first_below_limit = i-1-maxLen;
+      break;
+    }
+  }
+  if (first_over_limit == start_idx) first_over_limit = -1;
+  if (first_below_limit == end_idx) first_below_limit = -1;
+
+  return std::make_pair(first_over_limit, first_below_limit);
+}
+
+pair<int,int> load_below_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){
+  //input: vec_load, memLimit, range [start_idx, end_idx]
+  //return range overlimit [first_over_limit, first_below_limit)
+  int first_below_limit = maxIdx;
+  int last_below_limit = maxIdx;
+
+  for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){
+    if (vec_load[i] > memLimit){
+      first_below_limit = i+1-maxLen;
+      break;
+    }
+  }
+
+  for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){
+    if (vec_load[i] > memLimit){
+      last_below_limit = i-1-maxLen;
+      break;
+    }
+  }
+
+  return std::make_pair(first_below_limit, last_below_limit);
+}
+
+pair<double,int> load_peak(vector<double>vec_load_test,int maxLen){
+  double maxLoad_test = 0;
+  int maxIdx_test = 0;
+  for (int i = maxLen; i < maxLen*2; i++){
+    if (maxLoad_test < vec_load_test[i]){
+      maxLoad_test = vec_load_test[i];
+      maxIdx_test = i - maxLen;
+    } 
+  }
+  return std::make_pair(maxLoad_test,maxIdx_test);
+}
+
+void load_update(vector<double>& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){
+  //update load [start_idx, end_idx) by plusMinus*size
+  //if (start_idx < end_idx){
+  for (int i = start_idx+maxLen; i<end_idx+maxLen; i++){
+    vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
+  }
+  // } else {
+  //   for (int i = start_idx; i < maxLen; i++){
+  //     vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
+  //   }
+  //   for (int i = 0; i < end_idx; i++){ //TODO(junzhe) NOTE, end_idx excluded
+  //     vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
+  //   }
+  // }
+}
+
+vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,double maxLoad,double memLimit,string mode){
+  vector<SwapBlock>vec_swap_selct;
+  vector<SwapBlock>vec_swap_reject;
+  if (mode == "dto"){
+    sort(vec_swap.begin(),vec_swap.end(),less_than_dto());  
+  }
+  if (mode == "pri"){
+    sort(vec_swap.begin(),vec_swap.end(),less_than_pri());  
+  }
+  if (mode == "wdto"){
+    //TODO(junzhe) time complexity
+    for (int i = 0; i < vec_swap.size(); i++){
+      auto itm = vec_swap[i];
+      for (int j = itm.r_idx; j < itm.d_idx; j++){
+        itm.wdto += origin_load[i+maxLen] - memLimit;
+      }
+    }
+    sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); 
+  }
+
+  size_t load_swap_selct = 0;
+  if (mode != "r_idx"){  
+    for (int i =0; i<vec_swap.size(); i++){
+      if ((maxLoad-memLimit)>load_swap_selct){
+        vec_swap_selct.push_back(vec_swap[i]);
+        load_swap_selct+=vec_swap[i].size;  
+        //cout<<"Item selected: (r_idx, d_idx, dto) "<<vec_swap[i].r_idx<<"  "<<vec_swap[i].d_idx<<"  "<<static_cast<int>(vec_swap[i].dto/1000000)<<endl;
+      } else {
+        //break; TODO(junzhe) to resume break
+        vec_swap_reject.push_back(vec_swap[i]);
+      }
+    }
+  } else {
+    //TODO(junzhe) other modes not done yet
+    // sort(vec_swap.begin(),vec_swap.end(),less_than_r_idx_ready());
+    // auto temp_load = origin_load;
+    // int i = 0;
+    // while ((maxLoad-memLimit)>load_swap_selct){
+
+    // }
+    // for (int i =0; i<vec_swap.size(); i++){
+
+    // }
+
+  }
+  
+  return vec_swap_selct;
+}
+
+vector<double> SwapGPU::swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct){
+  auto vec_load_return = vec_load;
+  for (int i =0; i<vec_swap_selct.size(); i++){
+    int auto_buffer = 0;
+    auto itm = vec_swap_selct[i];
+    if (itm.cat == "A2") auto_buffer = data_buffer;
+    if (itm.cat == "A3") auto_buffer = mutable_data_buffer;
+    load_update(vec_load_return, itm.r_idx+auto_buffer, itm.d_idx, -1, itm.size, maxLen);
+  }
+  return vec_load_return;
+}
+
+void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode){
+  /*
+    update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit.
+  */ 
+  //TODO(junzhe) wordy, can merge in common part.
+  if (mode == "no-overhead"){
+    //update i1p
+    //sort by r_idx for i1p update
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+    for (int i = 0; i<vec_swap_selct.size(); i++){
+      auto itm = vec_swap_selct[i];
+      int readyIdx = 0;
+      if (itm.cat == "A1") { readyIdx = itm.r_idx; }
+      if (itm.cat == "A2") { readyIdx = itm.r_idx + data_buffer; }
+      if (itm.cat == "A3") { readyIdx = itm.r_idx + mutable_data_buffer; }
+
+      if (i > 0){
+        readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
+      }
+      itm.i1 = readyIdx;
+      itm.t1 = vec_run[readyIdx].t;
+      itm.t1p = itm.t1 + SwapOutTime(itm.size);
+      while (itm.t1p > vec_run[readyIdx].t){
+        readyIdx++;
+      }
+      itm.i1p = readyIdx;
+      vec_swap_selct[i] = itm;
+    }
+    //update i2p
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
+    for (int i =0; i<vec_swap_selct.size(); i++){
+      auto itm = vec_swap_selct[i];
+      int needIdx = itm.d_idx;
+      if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
+      itm.i2 = needIdx;
+      double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size);
+      while (prepareTime < vec_run[needIdx].t){
+        needIdx--;
+      }
+      itm.i2p = needIdx;
+      itm.t2p = prepareTime;
+      vec_swap_selct[i] = itm;
+      load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+    }
+
+    }
+  if (mode == "stick-to-limit"){
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+    for (int i = 0; i<vec_swap_selct.size(); i++){
+      auto itm = vec_swap_selct[i];
+      int readyIdx = itm.r_idx_ready;
+      cout<<"change of readyIdx: "<<readyIdx;
+      if (i > 0){
+        readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
+      }
+      cout<<"||compare with last i1p "<<readyIdx;
+      itm.i1 = readyIdx;
+      itm.t1 = vec_run[readyIdx].t;
+      itm.t1p = itm.t1 + SwapOutTime(itm.size);
+      total_swapOutTime+=SwapOutTime(itm.size);
+      while (itm.t1p > vec_run[readyIdx].t){ //TODO(junzhe) reduce time complexity.
+        readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead.
+      }
+      //get min compare with maxIdx and readyIdx.
+      readyIdx = std::min(maxIdx,readyIdx);
+      cout<<"||count swap time "<<readyIdx;
+      load_update(vec_load_temp,readyIdx+1,itm.d_idx,-1,itm.size,maxLen);
+      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      if ((tempOverLimit_.first != -1) && (tempOverLimit_.first <= readyIdx)) { 
+        load_update(vec_load_temp,tempOverLimit_.first-1,readyIdx+1,-1,itm.size,maxLen);
+        cout<<"((("<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<readyIdx<<' '<<tempOverLimit_.first<<")))";
+        readyIdx = tempOverLimit_.first -1; //TODO(junzhe) boundary
+
+      }
+      cout<<"||count over limit "<<readyIdx<<endl;
+      auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      itm.i1p = readyIdx;
+      cout<<"end: overlimit first and i1p "<<tempOverLimit_2.first<<' '<<itm.i1p<<endl;
+      vec_swap_selct[i] = itm;
+    }
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
+    cout<<"right over limit ===================="<<endl;
+    for (int i =0; i<vec_swap_selct.size(); i++){
+      auto itm = vec_swap_selct[i];
+      int needIdx = itm.d_idx;
+      cout<<"change of needIdx: "<<needIdx;
+      if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
+      cout<<"||compare with last i2p "<<needIdx;
+      itm.i2 = needIdx;
+      double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size);
+      total_swapInTime+=SwapInTime(itm.size);
+      while (prepareTime < vec_run[needIdx].t){
+        needIdx--;
+      }
+      needIdx = std::max(needIdx,maxIdx+1);
+      cout<<"||count swap in "<<needIdx;
+      itm.i2p = needIdx;
+      itm.t2p = prepareTime;
+      auto tempOverLimit_0 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      cout<<"(((before come back (right over limit): "<<tempOverLimit_0.second<<endl;
+      load_update(vec_load_temp,itm.i2p,itm.d_idx,1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      cout<<"|||after come back (right over limit): "<<tempOverLimit_.second<<endl;
+      if (tempOverLimit_.second > 0){
+        cout<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<itm.i1p<<' '<<itm.i2p<<' '<<itm.i2<<")))"<<endl;
+      }
+
+      if ((tempOverLimit_.second != -1) && (vec_run[tempOverLimit_.second].t > itm.t2p)) {
+        overhead+=(vec_run[tempOverLimit_.second].t - itm.t2p);
+        load_update(vec_load_temp,itm.i2p,tempOverLimit_.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+        itm.i2p = tempOverLimit_.second+1;
+        auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      }
+      cout<<"||count overlimit "<<itm.i2p<<endl;
+      //cout<<"after consider overlimit (right over limit): "<<tempOverLimit_2.second<<endl;
+      vec_swap_selct[i] = itm;
+    }
+  }
+  
+}
+
+
+int SwapGPU::swap_test(vector<string>vec_block,int &maxLen, int &location){
+
+  ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx.
+  int idxRange = 0;
+  vector<onePieceMsg> vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange);
+  cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
+  ///rep test
+  vector<size_t> vec_rep = Swap_piece2rep(vec_pieceMsg);
+  //int idxRange3=0; //rename TODO(junzhe)
+  //int maxLen=0, location =0;
+  repPatternDetector(vec_rep,maxLen,location,maxLen_threshold,gc);
+  cout<<"maxLen and location are: "<<maxLen<<' '<<location<<endl;
+  cout<<"test rep"<<endl;
+  //Note here location not exactly start of one iteration, 
+  //adjust to nearly start of one by restricting "Malloc"
+  int shift_counter = 0;
+  for (int i=0;i<maxLen;i++){
+    vector<string> v = swap_split(vec_block[location+i], " ");
+    if (v[0]=="Malloc"){
+      shift_counter = i; 
+      break;
+    }
+  }
+  location =location+shift_counter;
+  cout<<"shift_counter is "<<shift_counter<<endl;
+  cout<<"location changed to "<<location<<endl;
+
+  if (maxLen<maxLen_threshold) {return -1;}
+
+  return gc+maxLen-(gc-location)%maxLen;
+} 
+
+void SwapGPU::swap_plan(){
+  cout<<":::::::::::::::::::::::::start swap_plan()"<<endl;
+
+  int idxRange = 0;
+  vector<onePieceMsg> vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange);
+  cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
+  sort(vec_pieceMsg.begin(),vec_pieceMsg.end(),less_than_Idx());
+  // scale down idx, to middle iteration.
+  tempTime_baseline = vec_pieceMsg[location].t;
+  for (int i=0; i<vec_pieceMsg.size();i++){
+    vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - location - maxLen;
+    vec_pieceMsg[i].t = vec_pieceMsg[i].t - tempTime_baseline;
+  }
+  //3 iterations of vec_run and vec_load, maxIdx and maxLoad
+  vector<onePieceMsg>temp_vec_run(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]);
+  vec_run = temp_vec_run;
+  fstream file_vec_run("vec_run.csv", ios::in|ios::out|ios::app);
+  for (int i =0; i<vec_run.size();i++){
+    //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
+    file_vec_run<<i<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
+    tempTime2 = vec_run[i].t;
+  }
+  vector<double>vec_load(&global_load[location],&global_load[location+3*maxLen]);
+  origin_load = vec_load;
+  //load before swap, write in
+  fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
+  for (int i=0; i<origin_load.size(); i++){
+    file_load_origin<<origin_load[i]<<endl;
+  }
+
+  fstream file_load_current("load_current.csv", ios::in|ios::out|ios::app);
+  for (int i=0; i<maxLen; i++){
+    file_load_current<<vec_load[i]<<endl;
+  }
+  cout<<"build from scratch:::::::::::::::;"<<endl;
+  auto max_current = load_peak(vec_load,maxLen);
+  maxLoad = max_current.first;
+  maxIdx = max_current.second;
+  cout<<"------------------print max_load: (current) "<<maxLoad<<" "<<maxIdx<<endl;
+
+  //sort by ptr & idx
+  sort(vec_run.begin(),vec_run.end(),less_than_ptrIdx());
+  ///formulate swappable items.
+  cout<<"==============================print swappable items "<<maxIdx<<endl;
+  vector<SwapBlock>vec_swap;
+  size_t load_swap = 0;
+  for (int i =1; i<vec_run.size(); i++){
+    //SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
+    //ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
+    if ((vec_run[i].size >= smallest_block) && (vec_run[i-1].idx<maxIdx) && (vec_run[i].idx>maxIdx) 
+      && (vec_run[i-1].ptr ==vec_run[i].ptr) 
+      && ((vec_run[i-1].MallocFree==3) or (vec_run[i-1].MallocFree==2) or (vec_run[i-1].MallocFree==4)))
+    {
+      SwapBlock itm(vec_run[i].ptr, vec_run[i].size, vec_run[i-1].idx, vec_run[i].idx, vec_run[i-1].t, vec_run[i].t);
+      itm.dto = itm.d_time-itm.r_time;
+      itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size);
+      if (itm.dt>=0){
+        itm.pri = itm.dt * itm.size;
+      } else {
+        itm.pri = itm.dt * 1/itm.size;
+      }
+      //cat A
+      if (vec_run[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
+      if (vec_run[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
+      if (vec_run[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
+
+      vec_swap.push_back(itm);
+      load_swap+=itm.size;
+      cout<<"Items Swappable: (r_idx, d_idx, cat, MB, dt/us, PS) || "<<itm.r_idx<<' '<<itm.d_idx;
+      cout<<" ||  "<<itm.cat<<"  ||"<<(float)(itm.size)/(float)(1024*1024);
+      cout<<' ||'<<itm.dt/1000<<' '<<itm.pri<<endl;
+    } 
+  }
+  cout<<"size vec_swap: "<<vec_swap.size()<<endl;
+
+  ///load ideal, swap all vec_swap, lest possible memory by one-swap
+  auto vec_load_ideal = swap_load_ideal(vec_load,vec_swap);
+  fstream file_load_ideal("load_ideal.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_load_ideal<<vec_load_ideal[i]<<endl;
+  }
+
+  auto max_ideal = load_peak(vec_load_ideal,maxLen);
+  size_t maxLoad_ideal = max_ideal.first;
+  int maxIdx_ideal = max_ideal.second;
+  cout<<"------------------print max_load: (ideal) "<<maxLoad_ideal<<" "<<maxIdx_ideal<<endl;
+  //maxLoad_ideal = 400;
+  /// select till maxLoad_ideal, dto
+  auto vec_swap_dto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"dto");
+  cout<<"size of vec_swap_dto: "<<vec_swap_dto.size()<<endl;
+  auto vec_load_dto_ideal = swap_load_ideal(vec_load,vec_swap_dto);
+    fstream file_load_dto_ideal("load_dto_ideal.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_load_dto_ideal<<vec_load_dto_ideal[i]<<endl;
+  }
+  auto tempMax_ = load_peak(vec_load_dto_ideal,maxLen);
+  cout<<"------------------print max_load: (dto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
+
+  /// select till maxLoad_ideal, pri
+  auto vec_swap_pri = swap_select(vec_swap,maxLoad,maxLoad_ideal,"pri");
+  cout<<"size of vec_swap_pri: "<<vec_swap_dto.size()<<endl;
+  auto vec_load_pri_ideal = swap_load_ideal(vec_load,vec_swap_pri);
+    fstream file_load_pri_ideal("load_pri_ideal.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_load_pri_ideal<<vec_load_pri_ideal[i]<<endl;
+  }
+  tempMax_ = load_peak(vec_load_pri_ideal,maxLen);
+  cout<<"------------------print max_load: (pri ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
+
+  /// select till maxLoad_ideal, wdto
+  auto vec_swap_wdto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"wdto");
+  cout<<"size of vec_swap_wdto: "<<vec_swap_wdto.size()<<endl;
+  auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
+    fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_load_wdto_ideal<<vec_load_wdto_ideal[i]<<endl;
+  }
+  tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
+  cout<<"------------------print max_load: (wdto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
+
+  /// load_1 no overhead, based on pri
+  //auto vec_swap_pri = vec_swap_pri;
+  auto vec_load_pri = origin_load;
+  auto vec_load_dto = origin_load;
+  auto vec_load_wdto = origin_load;
+  string mode = "stick-to-limit";
+  double overhead_pri = 0;
+  swap_sched(vec_swap_pri, vec_load_pri,overhead_pri,450<<20, mode);
+  
+  double overhead_dto = 0;
+  swap_sched(vec_swap_dto, vec_load_dto,overhead_dto,450<<20,mode);
+  
+  double overhead_wdto = 0;
+  swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,450<<20,mode);
+  
+
+
+  fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_block10<<vec_load_pri[i]<<endl;
+  }
+  fstream file_block11("load_1_dto.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_block11<<vec_load_dto[i]<<endl;
+  }
+  fstream file_block12("load_1_wdto.csv", ios::in|ios::out|ios::app);
+  for (int i=maxLen; i<maxLen*2; i++){
+    file_block12<<vec_load_wdto[i]<<endl;
+  }
+  auto max_1 = load_peak(vec_load_pri,maxLen);
+  size_t maxLoad_1 = max_1.first;
+  int maxIdx_1 = max_1.second;
+  cout<<"------------------print max_load: (1) "<<maxLoad_1<<" "<<maxIdx_1<<endl;
+  //change back order by Idx.
+  sort(vec_run.begin(),vec_run.end(),less_than_Idx());
+  cout<<"done with swap_plan..."<<endl;
+  cout<<"load 2 overhead pri: "<<(float)(overhead_pri)/(float)(1000000)<<endl;
+  cout<<"load 2 overhead dto: "<<(float)(overhead_dto)/(float)(1000000)<<endl;
+  cout<<"load 2 overhead wdto: "<<(float)(overhead_wdto)/(float)(1000000)<<endl;
+  cout<<"total_swapOutTime: "<<(float)(total_swapOutTime)/(float)(3000000)<<endl;
+  cout<<"total_swapInTime: "<<(float)(total_swapInTime)/(float)(3000000)<<endl;
+  auto t1 = vec_run[0].t;
+  auto t2 = vec_run[maxLen].t;
+  auto t3 = vec_run[maxLen*2].t;
+  auto t4 = vec_run[maxLen*3-1].t;
+  cout<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
+  fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
+  file_time<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
+  file_time<<"iteration time spent: "<<t2-t1<<" "<<t3-t2<<" "<<t4-t3<<endl;
+  file_time<<"idx "<<0<<" "<<maxLen<<" "<<maxLen*2<<" "<<maxLen*3-1<<endl;
+  file_time<<"value "<<t1<<" "<<t2<<" "<<t3<<" "<<t4<<endl;
+}
+
+
+
+
+
+
+SwapGPU::~SwapGPU() {
+  //print out push-info TODO(junzhe) can remove
+  fstream file_block_full("vec_block_full.csv", ios::in|ios::out|ios::app);
+  for (int i =0; i<vec_block.size();i++){
+    file_block_full<<vec_block[i]<<endl;
+  }
+
+  fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
+  file_time<<"=============================="<<endl;
+  //main body
+  if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
+  if (ctx_.curand_generator)
+    CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
+#ifdef USE_CUDNN
+  if (ctx_.cudnn_handle) {
+    auto status = cudnnDestroy(ctx_.cudnn_handle);
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+  }
+#endif
+}
+const int kNumCudaStream = 1;
+
+SwapGPU::SwapGPU(int id) : Device(id, kNumCudaStream) {
+
+  MemPoolConf conf;
+  conf.add_device(id);
+  pool_ = std::make_shared<Swap>(conf);
+  Setup();
+
+}
+
+SwapGPU::SwapGPU(int id, std::shared_ptr<DeviceMemPool> pool)
+    : Device(id, kNumCudaStream) {
+  CHECK(pool != nullptr);
+  pool_ = pool;
+  Setup();
+}
+
+void SwapGPU::Setup() {
+  lang_ = kCuda;
+  ctx_.stream = NULL;  // use the default sync stream
+  // TODO(wangwei) create one handle for each steam?
+  CUDA_CHECK(cudaSetDevice(id_));
+  // use curandCreateGeneratorHost for CudaHost device
+  CURAND_CHECK(
+      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  SetRandSeed(seed);
+  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+  // TODO(wangwei) create one handle for each stream?
+  auto status = cudnnCreate(&ctx_.cudnn_handle);
+  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif  // USE_CUDNN
+}
+
+void SwapGPU::SetRandSeed(unsigned seed) {
+  CHECK(ctx_.curand_generator);
+  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
+}
+
+void SwapGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }
+
+void SwapGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                         CopyDirection direction, Context* ctx) {
+  cudaMemcpy(dst, src, nBytes, copyKind[direction]);
+  // TODO(wangwei) use async copy
+  // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
+}
+
+size_t SwapGPU::GetAllocatedMem() {
+  if (pool_ != nullptr) {
+    auto ret = pool_->GetMemUsage();
+    return ret.second - ret.first;
+  }
+  LOG(ERROR) << "The memory pool is not set";
+  return 0u;
+}
+
+/// Allocate gpu memory.
+void* SwapGPU::Malloc(int size) {
+
+  void* ptr = nullptr;
+  if (size > 0) {
+    CUDA_CHECK(cudaSetDevice(id_));
+    pool_->Malloc((void**)&ptr, size);
+    // TODO(wangwei) remove the memset.
+    CUDA_CHECK(cudaMemset(ptr, 0, size));
+  }
+  //cout<<"malloc done"<<endl;
+  return ptr;
+}
+
+/// Free gpu memory.
+void SwapGPU::Free(void* ptr) {
+
+  if (ptr != nullptr) {
+    CUDA_CHECK(cudaSetDevice(id_));
+    pool_->Free(ptr);
+  }
+
+  //cout<<"free done"<<endl; 
+}
+
+void SwapGPU::Test_sched_switch_swap(){
+  /*
+    v1: do Test_sched_switch_swap during (before) Malloc and Free.
+    swap removed to DeploySwap
+    v2: test after every index, at Append. order and index changed.
+  */
+  ///test & schedule
+  if (((gc+1)%(maxLen_threshold) == 0) && (asyncSwapFlag == 0) && (testFlag == 0)){
+    //TODO(junzhe) not lean, chances are globeCounter found more than 300 idx ago: redudant test.
+    cout<<"gc, GC and vec_len before test: "<<gc<<' '<<globeCounter<<' '<<vec_block.size()<<endl;
+    globeCounter = swap_test(vec_block,maxLen,location);
+    maxLen_threshold = std::max(maxLen_threshold,gc/10);
+    maxLen_threshold = std::min(2000,maxLen_threshold);
+    if (maxLen > maxLen_threshold) {
+      testFlag = 1;
+      cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<<endl;
+      cout<<"size of Table_sched: "<<Table_sched.size()<<endl;
+      cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
+      cout<<"impt numbers (maxLen, location, GC) "<<maxLen<<' '<<location<<' '<<globeCounter<<endl;
+      
+   }
+ }
+ ///switch flag; next idx
+ if ((gc+1) == globeCounter){
+    // swap_plan();
+    vector<double>vec_load2(&global_load[location],&global_load[location+3*maxLen]);
+    origin_load = vec_load2;
+    //load before swap, write in
+    fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
+    for (int i=0; i<origin_load.size(); i++){
+      file_load_origin<<origin_load[i]<<endl;
+    }
+    asyncSwapFlag = 1;
+    cout<<"switched flag for at "<<globeCounter<<endl;
+ }
+}
+
+void SwapGPU::MakeMetaTable(Block* block_,void* data_,int size){
+  /*
+  Append info right after Malloc; make block_ - data_ pair wise table.
+  */
+
+  //append info
+  stringstream strm1;
+  strm1<<size;
+  string tempStr1 = strm1.str();
+  stringstream strm3;
+  strm3<<block_;
+  string tempStr3 = strm3.str();
+  stringstream strm4;
+  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  strm4<<t2;
+  string tempStr4 = strm4.str();
+  string blockInfo ="Malloc "+tempStr3+" "+tempStr1+" "+tempStr4;
+  Append(blockInfo);
+
+  
+}
+
+void SwapGPU::DeploySwap(){
+   ///swap as per schedule: this version: both out and in sync
+  int r_gc = (gc-location)%maxLen; 
+  if ((asyncSwapFlag == 1) && (!(Table_sched.find(r_gc) == Table_sched.end()))){
+    cout<<"--------sched action at "<<r_gc<<endl;
+    auto swap_idx = std::get<0>(Table_sched.find(r_gc)->second);
+    auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second);
+    auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second);
+    auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second);
+    if (swap_dir == 0){ SwapOut_idx(swap_idx); 
+      cout<<"Swap Out "<<swap_idx<<endl;
+    }
+    if (swap_dir == 1){ SwapIn_idx(swap_idx); 
+      cout<<"Swap In "<<swap_idx<<endl;
+    }
+    //TODO(junzhe) verify sync what else to be done
+    if (sync_dir == 0){
+      auto last_meta = Table_meta.find(sync_idx)->second;
+      auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      cudaEventSynchronize(last_meta.in_event);
+      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
+      last_meta.block_->update_data(nullptr);
+      cout<<"to free data_"<<last_meta.data_<<endl;
+      pool_->Free(last_meta.data_);
+      last_meta.data_ = nullptr; //not really needed TODO(junzhe)
+      cout<<"sync out "<<sync_idx<<endl;
+      Table_meta.find(sync_idx)->second = last_meta;
+    }
+    if (sync_dir == 1){
+      //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe)
+      auto last_meta = Table_meta.find(sync_idx)->second;
+      auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      cudaEventSynchronize(last_meta.out_event);
+      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      Table_not_at_device.erase(last_meta.block_);
+      last_meta.block_->update_data(last_meta.data_);
+      cout<<"sync in "<<sync_idx<<endl;
+      Table_meta.find(sync_idx)->second = last_meta;
+    }
+    cout<<"-------"<<endl;
+  } 
+}
+
+void SwapGPU::Append(string blockInfo){
+  vector<string> v = swap_split(blockInfo, " ");
+  void* tempPtr;
+  stringstream convert(v[1]);
+  convert>>tempPtr;
+  auto tempBlock_ = static_cast<Block*>(tempPtr);
+  
+  // insert size, malloc : flag, block_, size, t; others: insert size t.
+  if (v.size() != 4) {
+    stringstream strm1;
+    strm1<<tempBlock_->size();
+    string tempStr1 = strm1.str();
+    blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2];
+  }
+  // update global load
+  if (maxLen < maxLen_threshold){
+    if (v[0] == "Malloc"){
+      if (global_load.size()>0){
+        global_load.push_back(global_load[global_load.size()-1]+tempBlock_->size());
+      } else {
+        global_load.push_back(tempBlock_->size());
+      }
+    } else if (v[0] == "Free"){
+      global_load.push_back(global_load[global_load.size()-1]-tempBlock_->size());
+    } else {
+      global_load.push_back(global_load[global_load.size()-1]);
+    }
+  }
+  //cout<<blockInfo<<endl;
+  //cout<<tempBlock_->size()<<endl;
+  //cout<<"load: "<<global_load[global_load.size()-1]<<" len of blockInfo and global_load "<<vec_block.size()<<' '<<global_load.size()<<endl;
+
+  //std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+  vec_block.push_back(blockInfo);
+  if (asyncSwapFlag == 1){
+    vec_block_fresh.push_back(blockInfo);
+  }
+  if ((maxLen>maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){
+    fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app);
+    for (int i =0; i<vec_block_fresh.size();i++){
+      file_block_fresh<<vec_block_fresh[i]<<endl;
+    }
+  }
+  // fstream file_block5("append.csv", ios::in|ios::out|ios::app);
+  // file_block5<<gc<<' '<<blockInfo<<' '<<(gc-1247)%612<<endl;
+
+  // update Table_meta's block_ and data_
+  if (maxLen > maxLen_threshold) {
+    //cout<<gc<<' '<<(gc-location)%maxLen<<' '<<blockInfo<<endl;
+    int r_gc = (gc-location)%maxLen;
+    if (!(Table_meta.find(r_gc)==Table_meta.end())){
+      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
+      //TODO(junzhe) verify the length change, if go in, value update
+      cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
+      Table_meta.find(r_gc)->second.block_ = tempBlock_;
+      Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data();
+    }
+  }
+  if ((maxLen>maxLen_threshold) && ((gc-location)%(maxLen) == 0)){
+    if (tempTime != 0){
+      fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
+      auto t_now = (std::chrono::system_clock::now()).time_since_epoch().count();
+      file_time<<(float)(t_now - tempTime)/(float)(1000000)<<endl;
+      
+    }
+    tempTime = (std::chrono::system_clock::now()).time_since_epoch().count();
+  }
+
+  //deploy swap at every index.
+  DeploySwap();
+
+  //test moved from start of malloc/free to end of append, only gc+1 changed
+  Test_sched_switch_swap();
+  //NOTE: this gc++ includes read/write and AppendLayer as well, in addition to malloc/free.
+  gc++;
+
+}
+
+void* SwapGPU::GetRealGpuPtr(const Block* block_){
+  //here should be not update_data()
+  auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second;
+  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  cudaEventSynchronize(reading_meta.in_event);
+  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  //cout<<"GetRealGpuPtr, overhead is: "<<t2-t1<<endl;
+  //cout<<"To update_data swap for (In) "<<Table_not_at_device.find(block_)->second<<" "<<reading_meta.data_<<" 0"<<endl;
+  //reading_meta.block_->update_data(reading_meta.data_);
+
+  //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<<Table_not_at_device.find(block_)->second<<endl;
+
+  Table_not_at_device.erase(reading_meta.block_);
+
+  return nullptr; //TODO(junzhe) attention, based on no change here.
+}
+
+void SwapGPU::SwapOut_idx(const int r_idx){
+  //cout<<"doing asynchrous swapOut of r_idx: "<<r_idx<<' '<<endl;
+  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();  
+  cudaError_t err;
+  BlockMeta meta = Table_meta.find(r_idx)->second;
+  cudaEventCreate (&meta.out_event);
+  //cout<<"right before cudaMemcpyAsync Out"<<endl;
+  err = cudaMemcpyAsync(meta.cpu_ptr,meta.data_,meta.size,cudaMemcpyDeviceToHost,meta.out_stream);
+  cudaEventRecord(meta.out_event,meta.out_stream);
+  //cout<<"right after cudaMemcpyAsync"<<endl;
+  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  cout<<"To update_data swap for (Out) "<<r_idx<<" "<<meta.block_<<" 0"<<endl;
+  Table_meta.find(r_idx)->second = meta;
+  //cout<<"time for asynchrous: "<<t2-t1<<endl;
+  //cudaEventSynchronize(event1);
+  //auto t4 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  //cout<<"time for asynchrous to complete: "<<t4-t1<<endl;
+}
+
+void SwapGPU::SwapIn_idx(const int r_idx){
+  //logic: extra meta, swap, update meta in Table
+  //TODO(junzhe) to clean up free(), make it in somewhere else.
+  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  cudaError_t err;
+  BlockMeta meta = Table_meta.find(r_idx)->second;
+  cudaEventCreate (&meta.in_event);
+  //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
+  void* ptr = nullptr;
+  pool_->Malloc((void**)&ptr, meta.size);
+  //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
+  //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
+  //void* to_rm_ptr = meta.data_;
+  meta.data_ = ptr;
+  cout<<"right before cudaMemcpyAsync In"<<endl;
+  err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
+  cudaEventRecord(meta.in_event,meta.in_stream);
+  cout<<"right after cudaMemcpyAsync"<<endl;
+  cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
+  Table_meta.find(r_idx)->second = meta;
+  //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update.
+  //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  //cout<<"time for asynchrous: "<<t2-t1<<endl;
+}
+
+void SwapGPU::SwapOut(const Block* block_){
+  if (gc < 1000 && block_->size() > 1<<20) {
+    fstream file_block5("speed.csv", ios::in|ios::out|ios::app);
+    BlockMeta meta;
+    meta.data_ = meta.block_->get_data();
+    void* tempPtr = nullptr;
+    cudaMallocHost(&tempPtr,block_->size()); //pinned memory.
+    meta.cpu_ptr = tempPtr;
+    Table_block_meta[block_] = meta;
+    auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    cudaError_t err;
+    err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_->size(),cudaMemcpyDeviceToHost);
+    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    file_block5<<"Out "<<block_->size()<<' '<<t2-t1<<endl;
+    cout<<"swap out done at gc: "<<gc<<endl;
+  }
+}
+
+void SwapGPU::SwapIn(const Block* block_){
+  if (gc < 1000 && block_->size() > 1<<20) {
+    fstream file_block5("speed.csv", ios::in|ios::out|ios::app);
+    BlockMeta meta = Table_block_meta.find(block_)->second;
+    auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    cudaError_t err;
+    err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_->size(),cudaMemcpyHostToDevice);
+    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    file_block5<<"In "<<block_->size()<<' '<<t2-t1<<endl;
+    cout<<"swap in done at gc: "<<gc<<endl;
+  }
+}
+
+
+}  // namespace singa
+#endif  // USE_CUDA
\ No newline at end of file
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index 0fb851125a..93a17a823f 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -21,6 +21,10 @@
 #include "singa/utils/logging.h"
 #include "singa/proto/core.pb.h"
 #include <iostream>
+#include <fstream> //a.
+#include <chrono>
+//for SmartMemoryPool
+using namespace std;
 
 #ifdef USE_CUDA
 
@@ -94,6 +98,7 @@ void CnMemPool::Malloc(void **ptr, const size_t size) {
 
 void CnMemPool::Free(void *ptr) {
   CHECK(initialized_) << "Cannot free the memory as the pool is not initialzied";
+  //cout<<"(normal)to free ptr "<<ptr<<endl;
   cnmemStatus_t status = cnmemFree(ptr, NULL);
   CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
       << " " << cnmemGetErrorString(status);
@@ -109,7 +114,1079 @@ void CudaMemPool::Free(void *ptr) {
   cudaError_t status = cudaFree(ptr);
   CHECK_EQ(status, cudaError_t::cudaSuccess);
 }
+
+//for SmartMemPool
+///vertex of the graph.
+class Vertex {
+public:
+    int name;
+    size_t size;
+    int r; //arrive
+    int d; //depart
+    int crossItr =0;
+    Vertex(int,size_t,int,int);
+    pair<size_t, size_t> colorRange;
+    vector<pair<size_t, size_t>> colorOccupied;
+};
+Vertex::Vertex(int n, size_t s, int r1, int d1){
+    name =n;
+    size = s;
+    r = r1;
+    d = d1;
+}//end of class Vertex
+
+
+///Section for structs and respective sorting function:
+// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm
+struct onePieceMsg{
+    /*
+     members: [ptr, size, MallocFree, idx]
+     */
+    string ptr;
+    size_t size;
+    int MallocFree;
+    int idx;
+    onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+};
+
+
+struct less_than_ptrIdx{
+    /*
+     sort onePieceMsg by ptr and then idx.
+     */
+    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    {
+        return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
+    }
+};
+
+
+struct oneIterMsg{
+    /*
+     members: [idx, MallocFree, size_delta]
+     */
+    size_t size_delta;// type as size_t in case size if large.
+    int MallocFree;
+    int idx;
+    oneIterMsg(size_t s, int M, int i):size_delta(s),MallocFree(M),idx(i){}
+};
+
+
+struct less_than_iterIdx{
+    /*
+     sort oneIterMsg by Idx.
+     */
+    inline bool operator() (const oneIterMsg& struct1, const oneIterMsg& struct2)
+    {
+        return (struct1.idx<struct2.idx);
+    }
+};
+
+
+//TODO(junzhe) to replace vertex with onePairMsg, try combine other structs as well.
+struct onePairMsg{
+    /*
+     members: [name (r_idx), size, r_idx, d_idx]
+     */
+    int name;
+    size_t size;
+    int r_idx;
+    int d_idx;
+    onePairMsg(int n,size_t s, int r,int d):name(n),size(s),r_idx(r),d_idx(d){}
+};
+
+
+struct less_than_size{
+    /*
+     sort onePairMsg by descending size.
+     */
+    inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2)
+    {
+        return (struct1.size>struct2.size);
+    }
+};
+
+struct less_than_size_rIdx{
+    /*
+     sort onePairMsg by descending size and r_idx
+     */
+    inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2)
+    {
+        return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx<struct2.r_idx)));
+    }
+};
+
+struct less_than_lookupIdx{
+    /*
+     sort lookUpElement by idx.
+     */
+    inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2)
+    {
+        return (struct1.r_idx<struct2.r_idx);
+    }
+};
+
+
+/// string delimiter
+vector<string> split(string s, string delimiter) {
+    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+    string token;
+    vector<string> res;
+    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
+        token = s.substr(pos_start, pos_end - pos_start);
+        pos_start = pos_end + delim_len;
+        res.push_back(token);
+    }
+    res.push_back(s.substr(pos_start));
+    return res;
+}
+
+///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg
+//vector of pairMsg is used in run.
+//vector of iterMsg is used in test.
+
+vector<onePieceMsg> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+    /*
+     convert vector of string into vector of onePieceMsg, sorted by ptr and then idx, and update idxRange to pieceMsgVec size.
+     */
+    vector<onePieceMsg>onePieceMsgVec_;
+    for (int i=0;i<vec.size();i++) {
+        vector<string> v = split(vec[i], " ");
+        if (v[0]=="Malloc"){
+            //convert v[2] from str to size_t
+            size_t result;
+            stringstream convert(v[2]);
+            if (!(convert>>result)){
+                result =-1;
+                cout<<"error for converting size from str to int."<<endl;
+            }
+            onePieceMsg tempMsg(v[1],result, 1, i);
+            onePieceMsgVec_.push_back(tempMsg);
+        }else if (v[0]=="Free"){
+            onePieceMsg tempMsg(v[1],-1, -1, i);
+            onePieceMsgVec_.push_back(tempMsg);
+        }else {
+            cout<<"error for process the onePriceMsg."<<endl;
+        }
+    }
+    
+    sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx());
+    idxRange = static_cast<int>(onePieceMsgVec_.size());
+
+    return onePieceMsgVec_;
+}// end of strVec_2_pieceMsgVec function
+
+
+pair<vector<onePairMsg>,vector<onePairMsg>> pieceMsgVec_2_pairOfPairMsgVec(vector<onePieceMsg>onePieceMsgVec_, int idxRange){
+    /*
+     pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks.
+     */
+    vector<onePairMsg>onePairMsgVec_1;
+    vector<onePairMsg>onePairMsgVec_2;
+    int i=0;
+    
+    //while loop processes a pair at each time, if got a pair.
+    while (i<(onePieceMsgVec_.size()-1)){
+        //condition A: start with free. do nothing.
+        if (onePieceMsgVec_[i].MallocFree==-1){
+            i+=1;
+        }
+        //condition B: start with Malloc, next item same ptr and is free.
+        if ((onePieceMsgVec_[i].MallocFree==1)&& (onePieceMsgVec_[i+1].MallocFree==-1)&&((onePieceMsgVec_[i].ptr==onePieceMsgVec_[i+1].ptr))){
+            onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,onePieceMsgVec_[i+1].idx);
+            onePairMsgVec_1.push_back(tempPair);
+            i+=2;
+        }
+        // condition C: start with Malloc, no free.
+        if ((onePieceMsgVec_[i].MallocFree==1)&&(onePieceMsgVec_[i].ptr!=onePieceMsgVec_[i+1].ptr)){
+            onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,idxRange);
+            onePairMsgVec_2.push_back(tempPair);
+            i+=1;
+        }
+    }//end of while
+    //condition D: if still left with the last item
+    if ((i<onePieceMsgVec_.size())&&(onePieceMsgVec_[i+1].MallocFree==1)){
+        onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,idxRange);
+        onePairMsgVec_2.push_back(tempPair);
+        i+=1;
+    }
+
+    //sort both pairVec
+    sort(onePairMsgVec_1.begin(),onePairMsgVec_1.end(),less_than_size_rIdx());
+    sort(onePairMsgVec_2.begin(),onePairMsgVec_2.end(),less_than_size_rIdx());
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_(onePairMsgVec_1,onePairMsgVec_2);
+    
+    return pairOfPairMsgVec_;
+}//end of pieceMsgVec_2_pairOfPairMsgVec function
+
+///Section of coloring algorithm. mergeSeg and then FFallocation when building edges of the graph.
+vector<pair<size_t, size_t>>  mergeSeg(vector<pair<size_t,size_t>> colorOccupied){
+    /*
+     version 12/9 11am -- modify to accomodate unsigned int/size_t
+     input:the collection of color ranges that is once occupied by some block during a block's life time.
+     function: merge consecutive/overlapping segments of colorOccupied
+     output: merged segments in ascending order.
+     time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of colorOccupied.
+     */
+    sort(colorOccupied.begin(), colorOccupied.end());
+    
+    if(colorOccupied.size()<=1){
+        return colorOccupied;
+    }
+    
+    int m = 0;
+    while (m<(colorOccupied.size()-1)){
+        
+        if ((colorOccupied[m].second +2)> colorOccupied[m+1].first){
+            pair<int,int>tempItem(colorOccupied[m].first,max(colorOccupied[m].second,colorOccupied[m+1].second));
+            //remove m+1 and m
+            colorOccupied.erase(colorOccupied.begin()+m+1);
+            colorOccupied.erase(colorOccupied.begin()+m);
+            //insert the combined range
+            colorOccupied.insert(colorOccupied.begin()+m,tempItem);
+        }else{
+            m+=1;
+        }
+    }//end of while loop
+    
+    //verify if mergeSeg is completed. O(n^2) optional
+//    if(colorOccupied.size()>1){
+//        for (int i=0;i<(colorOccupied.size()-1);i++){
+//            if(colorOccupied[i].second>=colorOccupied[i+1].first){
+//                cout<<"error in mergeSeg"<<endl;
+//            }
+//        }
+//    }
+    
+    return colorOccupied;
+}//end of mergeSeg function
+
+
+pair<size_t,size_t> FFallocation(vector<pair<size_t,size_t>> colorMerged,size_t size, size_t local_offset){
+    /*
+     version 12/2 4pm
+     First Fit weighted coloring
+     return a pair standing for colorRange.
+     local_offset shifts the returned colorRange, allowing multiple run().
+     local_offset not changable, whereas offset is changable.
+     */
+    // condition A: if no occupied, put after the local_offset
+    if (colorMerged.size()==0){
+        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+    }
+    
+    // condition B: able to fit before first block, after the local_offset
+    if ((size+local_offset)<(colorMerged[0].first+1)){
+        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+    }
+    
+    size_t yLocation= -1;
+    if (colorMerged.size()>1) {
+        int n = 0;
+        while (n<(colorMerged.size()-1)){
+            // condition C: able to fit in between middle blocks.
+            if ((colorMerged[n+1].first-colorMerged[n].second-1)>=size){
+                yLocation = colorMerged[n].second+1;
+                break;
+            }
+            n+=1;
+        }//end of while loop.
+        // condition D: allocate after the last block.
+        if (yLocation == -1){
+            yLocation = colorMerged[colorMerged.size()-1].second+1;
+        }
+    }// end of if loop, conditon C and D.
+    
+    // condition E: colorMeger len =1, allocate after the last block.
+    if (colorMerged.size()==1){
+        yLocation = colorMerged[0].second+1;
+    }
+    
+    if (yLocation==-1){
+        cout<<"error in FFallocation!!!"<<endl;
+    }
+    
+    return pair<size_t,size_t>(yLocation,yLocation+size-1);
+}//end of FFallocation function
+
+
+pair<size_t,size_t> BFallocation(vector<pair<size_t,size_t>> colorMerged,size_t size, size_t local_offset){
+    /*
+     version 12/11 1pm
+     Best Fit allocation, input and output same as FFallocation
+     */
+    // condition A: if no occupied, put after the local_offset
+    if (colorMerged.size()==0){
+        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+    }
+    //condition B: if size=1, able to fit before the first block
+    if ((colorMerged.size()==1)&&((size+local_offset)<(colorMerged[0].first+1))){
+        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+    }
+    
+    //condition C: else of B
+    if ((colorMerged.size()==1)&&((size+local_offset)>=(colorMerged[0].first+1))){
+        return pair<size_t,size_t>(colorMerged[0].second+1,colorMerged[0].second+size);
+    }
+    
+    //condition D and E:
+    size_t yLocation=-1;
+    pair<int, size_t>tempHole(-1,-1); // n, hole size between n and n+1
+    if (colorMerged.size()>1) {
+        int n = 0;
+        while (n<(colorMerged.size()-1)){
+            // condition C: able to fit in between middle blocks. select smallest.
+            if (((colorMerged[n+1].first-colorMerged[n].second-1)>=size)&&((colorMerged[n+1].first-colorMerged[n].second-1)<tempHole.second)){
+                tempHole.first=n;
+                tempHole.second=colorMerged[n+1].first-colorMerged[n].second-1;
+            }
+            n+=1;
+        }//end of while loop.
+        
+        if(tempHole.first==-1){
+            // condition D: allocate after the last block.
+            yLocation = colorMerged[colorMerged.size()-1].second+1;
+        }else{
+            //condition E: best fit in the smallest hole.
+            yLocation = colorMerged[tempHole.first].second+1;
+            
+        }
+    }// end of if loop, conditon D and E.
+    
+    if (yLocation==-1){
+        cout<<"error in BFallocation!"<<endl;
+    }
+    
+    return pair<size_t,size_t>(yLocation,yLocation+size-1);
+}
+
+vector<Vertex> colorSomeVertices(vector<onePairMsg> pairMsgVec_, size_t &offset,string colorMethod){
+    /*
+     color all or 1/2 vertices using mergeSeg() and FFallocation(), with update offset.
+     time complexity: O(n^2).
+     */
+    size_t local_offset = offset; //feed into FFallocation, shall never change.
+    int m = static_cast<int>(pairMsgVec_.size());
+    //init all vertices
+    vector<Vertex>vertices;
+    for (int i=0; i<m;i++){
+        Vertex tempVertex(pairMsgVec_[i].name,pairMsgVec_[i].size,pairMsgVec_[i].r_idx,pairMsgVec_[i].d_idx);
+        vertices.push_back(tempVertex);
+
+    }
+
+    int **adj;
+    adj = new int*[m]; //TODO(junzhe) should be deleted somewhere.
+    // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop.
+    for (int i=0; i<m;i++){
+        adj[i] = new int[m];
+        for (int j=0; j<m;j++){
+            if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))){
+                adj[i][j]=1;
+                if (vertices[j].colorRange.second){ //as second never be 0, if not empty.
+                    vertices[i].colorOccupied.push_back(vertices[j].colorRange);
+                }
+            }
+            else { adj[i][j]=0; }
+        }
+        
+        vector<pair<size_t,size_t>>colorMerged = mergeSeg(vertices[i].colorOccupied);
+       
+        if(colorMethod=="FF"){
+            vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset);
+            
+        }else{ //BF
+            vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, local_offset);
+        }
+
+        //update of offset, largest memory footprint as well.
+        if (vertices[i].colorRange.second >=offset){
+            offset = vertices[i].colorRange.second+1;
+        }
+    }//end of for loop.
+    
+    return vertices;
+}
+
+
+///get cross-iteration duration pairs
+pair<map<int,int>,map<int,int>> cross_itr_durations(vector<string>vec_double, int location, int maxLen, int &doubleRange){
+    
+    vector<onePieceMsg>onePieceMsgVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange);
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_2,doubleRange);
+    
+    map<int,int>Table_r2d; //full duration info, cross-iteration duration.
+    map<int,int>Table_d2r;
+    for (int i=0;i<pairOfPairMsgVec_2.first.size();i++){
+        if(pairOfPairMsgVec_2.first[i].r_idx<maxLen){
+            Table_r2d[pairOfPairMsgVec_2.first[i].r_idx] =pairOfPairMsgVec_2.first[i].d_idx%maxLen;
+            Table_d2r[pairOfPairMsgVec_2.first[i].d_idx%maxLen]=pairOfPairMsgVec_2.first[i].r_idx;
+        }
+    }
+    
+    return pair<map<int,int>,map<int,int>>(Table_r2d,Table_d2r);
+}
+
+/// main run funtion
+vector<Vertex> run(vector<string>vec, int &idxRange, size_t &offset, size_t &offsetCrossItr,string colorMethod){
+    /*
+     run function, input vector of strings, return colored vertices,
+     update idxRange, offset.
+     time complexity: O(n^2) where n is maxLen.
+     */
+    vector<onePieceMsg>onePieceMsgVec_ = strVec_2_pieceMsgVec(vec,idxRange);
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_,idxRange);
+    //1. normal blocks 2. cross-iteration blocks.
+    vector<onePairMsg>pairMsgVec_1 = pairOfPairMsgVec_.first;
+    vector<onePairMsg>pairMsgVec_2 = pairOfPairMsgVec_.second;
+  
+    vector<Vertex>vertices_2 = colorSomeVertices(pairMsgVec_2,offset,colorMethod);
+    for (int i=0; i<vertices_2.size();i++){
+      vertices_2[i].crossItr = 1;
+    }
+    offsetCrossItr = offset;
+    offset = offsetCrossItr*2;
+    vector<Vertex>vertices = colorSomeVertices(pairMsgVec_1,offset,colorMethod);
+    //merge
+    vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end());
+
+    return vertices;
+}
+
+
+///Section of test functions.
+vector<size_t> pairOfPairMsgVec_2_repSeq(pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_){
+    int counter_1M=0; int counter_1F=0; int counter_2=0;
+    vector<onePairMsg>onePairMsgVec_1 = pairOfPairMsgVec_.first;
+    vector<onePairMsg>onePairMsgVec_2 = pairOfPairMsgVec_.second;
+    vector<oneIterMsg>oneIterMsgVec_;
+    for (int i =0; i<onePairMsgVec_1.size(); i++){
+        oneIterMsg tempIterM(onePairMsgVec_1[i].size,1,onePairMsgVec_1[i].r_idx);
+        oneIterMsgVec_.push_back(tempIterM);
+        counter_1M++;
+        
+        size_t temp_s_d = static_cast<size_t>(onePairMsgVec_1[i].d_idx-onePairMsgVec_1[i].r_idx);
+        oneIterMsg tempIterF(temp_s_d,-1,onePairMsgVec_1[i].d_idx);
+        oneIterMsgVec_.push_back(tempIterF);
+        counter_1F++;
+    }
+    
+    for (int i =0; i<onePairMsgVec_2.size(); i++){
+        oneIterMsg tempIterM(onePairMsgVec_2[i].size,1,onePairMsgVec_2[i].r_idx);
+        oneIterMsgVec_.push_back(tempIterM);
+        counter_2++;
+    }
+    
+    sort(oneIterMsgVec_.begin(),oneIterMsgVec_.end(),less_than_iterIdx());
+    //only after sort then can create rep.
+    vector<size_t>rep; // vector of size_delta, name it as rep for simlisity.
+    for (int i =0; i<oneIterMsgVec_.size(); i++){
+        rep.push_back(oneIterMsgVec_[i].size_delta);
+    }
+
+    return rep;
+}//end of pairOfPairMsgVec_2_repSeq function
+
+
+vector<size_t> maxRepeatedSeg(vector<size_t>rep, int idxRange, int &maxLen, int &location){
+    /*
+     get max repeated non-overlapping Seg of a vector, return the repeated segment,
+     update maxLen, and location of where Seg starts to repeat.
+     brtue force method using equal()
+     time complexity O(n^2)
+     */
+    for (int i=0; i<idxRange;i++){
+        for (int len=1; len<(idxRange-i);len++){
+            if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLen<len)) {
+                maxLen = len;
+                location = i;
+                cout<<"maxLen increased, lcoation and maxLen: ("<<location<<","<<maxLen<<")"<<endl;
+            }
+        }
+    }
+    //TODO(junzhe) verify the subSeq returned, below poped up error in vgg.
+    vector<size_t>subSeq(&rep[location],&rep[location+maxLen]);
+    if(!(equal(rep.begin()+location,rep.begin()+maxLen-1+location,subSeq.begin()) && equal(rep.begin()+location+maxLen,rep.begin()+2*maxLen-1+location,subSeq.begin()))){
+        cout<<"error in get the maxRep"<<endl;
+    }
+    return subSeq;
+}
+
+
+void verifyAndCut (vector<size_t>subSeq, int &maxLen, int &location){
+    /*
+     to cut, in case the repeated Seg contains multiple iterations.
+     */
+    int tempMaxLen=0;
+    int tempLocation =0;
+    int tempIdxRange = maxLen;
+    
+    vector<size_t>tempSubSeq = maxRepeatedSeg(subSeq,tempIdxRange,tempMaxLen, tempLocation);
+    //TODO(junzhe), tunable threshold.
+    int threshold =50;
+    if (tempMaxLen>threshold){
+        maxLen = tempMaxLen;
+        location += tempLocation;
+        cout<<"max length get cut"<<endl;
+    }
+    cout<<tempMaxLen<<endl;
+}
+
+
+//main function of test
+int test(vector<string>vec3, int &maxLen, int &location){
+    /*
+     main function of test, returns globeCounter, which is when flag shall be switched,
+     update maxLen and location of where the repeated Seg starts.
+     */
+    cout<<"====================== test ========================="<<endl;
+    int idxRange3=0;
+    vector<onePieceMsg>onePieceMsgVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3);
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_3,idxRange3);
+    vector<size_t>rep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_);
+    
+    //get repeated sub vector.
+    vector<size_t>subSeq = maxRepeatedSeg(rep,idxRange3,maxLen,location);
+    //cout<<subSeq.size()<<endl;
+    verifyAndCut(subSeq, maxLen, location);
+    int globeCounter=-1;
+    if (maxLen>100){ //TODO(junzhe) tunable threshold.
+        cout<<"new location and maxLen: "<<location<<' '<<maxLen<<endl;
+        globeCounter = idxRange3+maxLen-(idxRange3-location)%maxLen;
+    }
+    return globeCounter;
+}
+
+///verify if coloring got overlapping
+void overlap_test(vector<Vertex> vertices){
+    size_t s = vertices.size();
+    int i,j;
+    for (i=0; i<s; i++){
+        for (j=i+1; j<s; j++){
+            if (((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d)))&& ((max(vertices[i].colorRange.first,vertices[j].colorRange.first))<(1+min(vertices[i].colorRange.second,vertices[j].colorRange.second)))){
+                cout<<"error overlapping"<<endl;
+            }
+        }
+    }
+}
+
+
+SmartMemPool::SmartMemPool(const MemPoolConf &conf){
+    //TODO(junzhe) to figure out what to do here.
+    colorMethod = "BF";
+    conf_ = conf;
+}
+
+void SmartMemPool::Init(){
+  //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
+  mtx_.lock();
+  if(!initialized_){
+    initialized_ =true;
+  }
+  mtx_.unlock();
+}
+
+
+///Malloc
+void SmartMemPool::Malloc(void** ptr, const size_t size){
+    /*
+     1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool.
+     2. if flag=0, malloc/cudaMalloc, collect vec string
+     3. if flag=1, look up table, malloc/cudaMalloc if not in the Table
+     4. test repeated sequence every 100 blocks, update globeCounter.
+     */
+
+    //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
+    //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a.
+    //file<<gc<<' '<<"Malloc"; //a.
+    if (!initialized_){
+    Init();
+  }
+
+    void* allocatedPtr = NULL; //ptr to be returned
+    
+    if (gc == globeCounter){
+        /// 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool.
+        
+        mallocFlag=1;
+        cout<<"switched to color-malloc"<<endl;
+        vector<string>vec_run(&vec[location],&vec[location+maxLen]);
+        
+        vector<Vertex>vertices = run(vec_run, idxRange,offset,offsetCrossItr, colorMethod);
+
+        //here to verify if the coloring got overlapping. TODO(junzhe) optional
+        //overlap_test(vertices);
+        
+        //obtain the cross-iteration duration info
+        int doubleRange=0;
+        vector<string>vec_double(&vec[location],&vec[location+2*maxLen]);
+        pair<map<int,int>,map<int,int>>pairs =cross_itr_durations(vec_double, location,maxLen,doubleRange);
+        Table_r2d = pairs.first;
+        Table_d2r = pairs.second;
+        
+        //update ptrPool
+        cudaMalloc(&ptrPool,offset); //poolSize or memory foot print  offset.
+        cout<<"ptrPool is: "<<ptrPool<<endl;
+
+        //b.  below 2 loops: vec_r2Ver to replace Table_r2Ver
+        for (int i=0; i<idxRange; i++){
+            lookUpElement tempElement;
+            Vec_r2Ver.push_back(make_pair(i,tempElement));
+        }
+        for (int i=0; i<vertices.size(); i++){
+            lookUpElement temp;
+            temp.r_idx =vertices[i].r;
+            temp.d_idx =Table_r2d.find(vertices[i].r)->second;
+            temp.size =vertices[i].size;
+            temp.offset=vertices[i].colorRange.first;
+            temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char));
+            temp.Occupied =0;
+            temp.crossItr = vertices[i].crossItr;
+            temp.Occupied_backup =0; 
+            //build tables for lookup.
+            Vec_r2Ver[vertices[i].r].second= temp;
+        }
+    }
+    
+    if(mallocFlag==0){
+        ///  2. if flag=0, malloc/cudaMalloc
+        cudaMalloc(ptr, size);
+        allocatedPtr = *ptr;
+        //update load
+        if(loadLogFlag==1){
+            if (gc>0){
+                Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second);
+            }else{ //very first block
+                Table_load[gc]=make_pair(size,0);
+            }
+        }
+        //push_back the string for later test and run.
+        string tempStr1 ="Malloc ";
+        stringstream strm2;
+        strm2<<allocatedPtr;
+        string tempStr2 = strm2.str();
+        stringstream strm3;
+        strm3<<size;
+        string tempStr3 = strm3.str();
+        string temp = tempStr1+tempStr2+" "+tempStr3;
+        vec.push_back(temp);
+        //file<<" Condition M1, addr: "<<*ptr<<endl;  //a.
+    }else{
+        /// 3. if flag=1, look up table.
+        int lookupIdx = (gc-location)%maxLen;
+        if ((Vec_r2Ver[lookupIdx].second.size ==size)&&(Vec_r2Ver[lookupIdx].second.Occupied*Vec_r2Ver[lookupIdx].second.Occupied_backup==0)){
+             if (Vec_r2Ver[lookupIdx].second.Occupied==0){
+                //condition M2, normal and crossItr's primary.
+                //assign ptr and mark as occupied, and add in ptr2rIdx
+                allocatedPtr = Vec_r2Ver[lookupIdx].second.ptr;
+                Vec_r2Ver[lookupIdx].second.Occupied= 1;
+                Table_p2r[allocatedPtr]=lookupIdx;                
+                //update load
+                if(loadLogFlag==1){
+                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second+size);
+                }
+                //file<<" Condition M2, addr: "<<*ptr<<endl;  //a.
+              }else if ((Vec_r2Ver[lookupIdx].second.crossItr==1) && (Vec_r2Ver[lookupIdx].second.Occupied==1) && (Vec_r2Ver[lookupIdx].second.Occupied_backup ==0)) {
+                //condition M4, crossItr's backup
+                allocatedPtr = (void*)((char*)Vec_r2Ver[lookupIdx].second.ptr+offsetCrossItr*sizeof(char));
+                Vec_r2Ver[lookupIdx].second.Occupied_backup=1;
+                Table_p2r[allocatedPtr]=lookupIdx;
+                //update load
+                if(loadLogFlag==1){
+                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second+size);
+                }
+                //file<<" Condition M4, addr: "<<*ptr<<endl;  //a.
+              }
+        }else{  //condition M3, size not proper or both occupied.
+                cudaMalloc(ptr, size);
+                allocatedPtr = *ptr;       
+                //update load
+                if(loadLogFlag==1){
+                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second);
+                }
+                //file<<" Condition M3, addr: "<<*ptr<<endl;  //a.
+        } 
+    } //end of loop for flag=1
+    
+    ///4. test repeated sequence every 100 blocks, update globeCounter.
+    if (((gc+1)%300==0) && (mallocFlag==0) && (globeCounter==-1)&&(gc+2>checkPoint)){
+        cout<<"gc and GC before test: "<<gc<<' '<<globeCounter<<endl;
+        globeCounter = test(vec,maxLen,location);
+        checkPoint=checkPoint*2;
+    }
+    
+    ///get load info, when gc == GC+2maxLen
+    if (gc==(globeCounter+2*maxLen)&& (globeCounter>0)){
+        getMaxLoad();
+        loadLogFlag=0;
+    }
+    
+    gc++;
+    Table_p2s[allocatedPtr]=size; //update it for load tracking purpose.
+    *ptr = allocatedPtr;
+    ///update block_RWMF
+    string tempStr1 ="Malloc ";
+    stringstream strm2;
+    strm2<<allocatedPtr;
+    string tempStr2 = strm2.str();
+    stringstream strm3;
+    strm3<<size;
+    string tempStr3 = strm3.str();
+    string temp = tempStr1+tempStr2+" "+tempStr3;
+    ///below time can be removed TODO(junzhe)
+    int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+    stringstream strm4;
+    strm4<<now;
+    string tempStr4 = strm4.str();
+    temp = temp+" "+tempStr4;
+    vec_block_RWMF.push_back(temp);
+}
+
+///Free
+void SmartMemPool::Free(void* ptr){
+    
+    //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a.
+    //file<<gc<<' '<<"Free"; //a.
+    
+    size_t deallocatedSize = Table_p2s.find(ptr)->second;
+    
+    if ((globeCounter==-1)||(gc<globeCounter)){
+        //push_back the string for later test and run.
+        string tempStr1 ="Free ";
+        stringstream strm2;
+        strm2<<ptr;
+        string tempStr2 = strm2.str();
+        string temp = tempStr1+tempStr2;
+        vec.push_back(temp);
+        
+        //file<<" Condition F1, addr: "<<ptr<<endl;  //a.
+        //update load before free
+        if(loadLogFlag==1){
+            Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second);
+        }
+        /// before flag switch, for sure all free shall be done by free()
+        cudaFree(ptr);
+    }else{
+        if (!(Table_p2r.find(ptr)==Table_p2r.end())){
+            int resp_rIdx = Table_p2r.find(ptr)->second;
+            Table_p2r.erase(ptr);
+            
+            if (ptr == Vec_r2Ver[resp_rIdx].second.ptr){
+              //Condition F2, from M2
+              Vec_r2Ver[resp_rIdx].second.Occupied =0; //freed, able to allocate again.
+              //file<<" Condition F2, addr: "<<ptr<<endl;  //a.
+            }else if (ptr == (void*)((char*)Vec_r2Ver[resp_rIdx].second.ptr+offsetCrossItr*sizeof(char))){
+              //Condition F4, from M4
+              Vec_r2Ver[resp_rIdx].second.Occupied_backup =0;
+              //file<<" Condition F4, addr: "<<ptr<<endl;  //a.
+            } else{
+              //Condition F5, from M2, M4 but idx switched.
+              if (((float)((char*)ptr-((char*)ptrPool+offsetCrossItr*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptrPool+2*offsetCrossItr*sizeof(char)))<0)){
+                     Vec_r2Ver[resp_rIdx].second.Occupied_backup =0;
+              }else{
+                     Vec_r2Ver[resp_rIdx].second.Occupied =0;
+              }
+            }
+            //update load
+             if(loadLogFlag==1){
+                 Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second-deallocatedSize);
+             }
+        }else{
+          //update load
+          if(loadLogFlag==1){
+              Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second);
+          }
+          //file<<" Condition F3, addr: "<<ptr<<endl;  //a.
+          cudaFree(ptr);
+        }
+            
+    }
+    gc++;
+    ///update block_RWMF
+    string tempStr1 ="Free ";
+    stringstream strm2;
+    strm2<<ptr;
+    string tempStr2 = strm2.str();
+    string temp = tempStr1+tempStr2;
+    ///below time can be removed TODO(junzhe)
+    int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+    stringstream strm4;
+    strm4<<now;
+    string tempStr4 = strm4.str();
+    temp = temp+" "+tempStr4;
+    vec_block_RWMF.push_back(temp);
+}//end of Free.
+
+
+SmartMemPool::~SmartMemPool(){
+    fstream file_block1("blockInfo_RW.text", ios::in|ios::out|ios::app);
+    fstream file_block2("blockInfo_RWMF.text", ios::in|ios::out|ios::app);
+    for (int i=0; i< vec_block_RW.size();i++){
+        file_block1<<vec_block_RW[i]<<endl;
+    }
+    for (int i=0; i< vec_block_RWMF.size();i++){
+        file_block2<<vec_block_RWMF[i]<<endl;
+    }
+    cudaFree(ptrPool);
+    //TODO(junzhe) verify what else shall be cleaned up.
+}
+
+void SmartMemPool::getMaxLoad(){
+    
+    vector<size_t>cudaLoadLog;
+    for (int i=0; i<Table_load.size();i++){
+        cudaLoadLog.push_back(Table_load.find(i)->second.first);
+    }
+    size_t maxCudaLoad = *max_element(cudaLoadLog.begin(),cudaLoadLog.end());
+    int idxMaxCudaLoad = static_cast<int>(distance(cudaLoadLog.begin(),max_element(cudaLoadLog.begin(),cudaLoadLog.end())));
+    
+    vector<size_t>colorLoadLog;
+    for (int i=0; i<Table_load.size();i++){
+        colorLoadLog.push_back(Table_load.find(i)->second.second);
+    }
+    size_t maxColorLoad = *max_element(colorLoadLog.begin(),colorLoadLog.end());
+    int idxMaxColorLoad = static_cast<int>(distance(colorLoadLog.begin(),max_element(colorLoadLog.begin(),colorLoadLog.end())));
+    size_t offsetCudaLoad = Table_load.find(idxMaxColorLoad)->second.first;
+    
+    maxTotalLoad = max(maxCudaLoad,maxColorLoad+offsetCudaLoad);
+    maxMemUsage = max(maxCudaLoad,offset+offsetCudaLoad);
+    memRatio = (float)maxMemUsage/(float)maxTotalLoad;
+    
+    cout<<"=============================memory usage stats print: ================================"<<endl;
+    cout<<"maxColorLoad vs memPoolSize: (at idx "<<idxMaxColorLoad<<")"<<endl;
+    cout<<maxColorLoad<<endl;
+    cout<<offset<<endl;
+    cout<<"maxTotalLoad vs maxCudaLoad(at idx "<<idxMaxCudaLoad<<") maxMemUsage"<<endl;
+    cout<<maxTotalLoad<<endl;
+    cout<<maxCudaLoad<<endl;
+    cout<<maxMemUsage<<endl;
+    cout<<"memRatio: "<<memRatio<<endl;   
+}
+
+std::pair<size_t, size_t> SmartMemPool::GetMemUsage() {
+  //TODO(junzhe) note here the pair is different from that of CnMemPool.
+  return std::make_pair(maxMemUsage, maxTotalLoad);
+}
+    
+void SmartMemPool::Append(string blockInfo) {
+     //TODO(junzhe) add idx later
+    vec_block_RW.push_back(blockInfo);
+    vec_block_RWMF.push_back(blockInfo);
+}
+
+///Swap:
+Swap::Swap(const MemPoolConf &conf){
+    conf_ = conf;
+}
+
+void Swap::Init(){
+  //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
+  mtx_.lock();
+  if(!initialized_){
+    initialized_ =true;
+  }
+  mtx_.unlock();
+}
+
+void Swap::Malloc(void** ptr, const size_t size){
+  //cout<<"to malloc"<<endl;
+  cudaError_t status = cudaMalloc(ptr, size);
+  CHECK_EQ(status, cudaError_t::cudaSuccess);
+  SwapMeta cpu,gpu;
+  cpu.swapSize = size;
+  gpu.swapSize = size;
+  gpu.ptr = *ptr;
+  pair<SwapMeta,SwapMeta>meta = std::make_pair(cpu, gpu);
+  Table_Meta[*ptr] = meta;
+
+ swapLookUpElement temp;
+ temp.size = size;
+ Table_id2LookUpElement[*ptr] = temp;
+  //  int i = 0;
+  // if (!(Table_id2LookUpElement.find(*ptr)==Table_id2LookUpElement.end())){
+  //     i = i + 1;
+  //     temp.data_ = *ptr +i*sizeof(char); 
+  //     while(!(Table_id2LookUpElement.find(temp.data_)==Table_id2LookUpElement.end())){
+  //       //TODO(swap) verify this loop, can simplify as well.
+  //       i = i + 1
+  //       temp.data_ = *ptr +i*sizeof(char);
+  //     }
+  // } else {
+  //     temp.data_ = *ptr;
+  // }
+  // temp.realGpuPtr = *ptr;
+  // temp.location = 1;
+  // temp.size = size;
+
+  // create before swap.
+  // if (size>swapLimit){
+  //   temp.realCpuPtr = malloc(size);
+  // }
+  // stringstream strm1;
+  // strm1<<size;
+  // string tempStr1 = strm1.str();
+  // stringstream strm2;
+  // strm2<<i;
+  // string tempStr2 = strm2.str();
+  // stringstream strm3;
+  // strm3<<data_;
+  // string tempStr3 = strm3.str();
+  // stringstream strm4;
+  // strm4<<temp.realGpuPtr;
+  // string tempStr4 = strm4.str();
+  stringstream strm1;
+  strm1<<size;
+  string tempStr1 = strm1.str();
+  stringstream strm3;
+  strm3<<*ptr;
+  string tempStr3 = strm3.str();
+  stringstream strm4;
+  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  strm4<<t2;
+  string tempStr4 = strm4.str();
+  string blockInfo ="Malloc "+tempStr3+" "+tempStr1+" "+tempStr4+" (data_&size)";
+  //string blockInfo ="Malloc: size "+tempStr1+", i "+tempStr2+", data_ "+tempStr3+",  realGpuPtr "+tempStr4;
+  vec_block.push_back(blockInfo);
+  //Table_id2LookUpElement[temp.data_]=temp;
+}
+
+void Swap::Free(void *ptr) {
+  //cout<<"to malloc"<<endl;
+  //input is real ptr
+  cudaError_t status = cudaFree(ptr);
+  CHECK_EQ(status, cudaError_t::cudaSuccess);
+  stringstream strm1;
+  strm1<<ptr;
+  string tempStr1 = strm1.str();
+  stringstream strm4;
+  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  strm4<<t2;
+  string tempStr4 = strm4.str();
+  string blockInfo ="Free "+tempStr1+" "+tempStr4;
+  vec_block.push_back(blockInfo);
+  Table_Meta.erase(ptr);
+
+  Table_id2LookUpElement.erase(ptr);
+}
+
+void Swap::Append(string blockInfo) {
+     //TODO(junzhe) add idx later
+    vec_block.push_back(blockInfo);
+
+}
+
+
+void Swap::SwapOut(void* data_){
+    printf("A. to swapOut\n");
+    auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    size_t swapSize = Table_Meta.find(data_)->second.second.swapSize;
+    Table_Meta.find(data_)->second.first.ptr = malloc(swapSize);
+    SwapMeta cpu, gpu;
+    cpu = Table_Meta.find(data_)->second.first;
+    gpu = Table_Meta.find(data_)->second.second;
+    cudaError_t err;
+    err=cudaMemcpy(cpu.ptr,gpu.ptr,gpu.swapSize,cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess)
+      {
+      fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
+      exit(EXIT_FAILURE);
+      }
+    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    fstream file_block3("blockInfo_swapOut.text", ios::in|ios::out|ios::app);
+    file_block3<<t2-t1<<" "<<swapSize<<endl;
+    printf("B. swapOut done.\n");
+    //cout<<"before free: "<<data_<<endl;
+    cudaFree(data_);
+    //cout<<"after free: "<<data_<<endl;
+}
+
+void Swap::SwapIn(void* data_){
+  printf("1. to swapIn.\n");
+  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  SwapMeta cpu, gpu;
+  cpu = Table_Meta.find(data_)->second.first;
+  gpu = Table_Meta.find(data_)->second.second;
+  gpu.ptr=nullptr;
+  cudaError_t status = cudaMalloc(&gpu.ptr, gpu.swapSize);
+  CHECK_EQ(status, cudaError_t::cudaSuccess);
+  Table_Meta.find(data_)->second.second.ptr=gpu.ptr;
+  //cout<<"after alloc:1 "<<Table_Meta.find(data_)->second.second.ptr<<endl;
+  cudaError_t err;
+  err=cudaMemcpy(gpu.ptr, cpu.ptr ,cpu.swapSize,cudaMemcpyHostToDevice);
+  printf("2. swapIn done.\n");
+  free(cpu.ptr);
+  //
+  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  fstream file_block3("blockInfo_swapIn.text", ios::in|ios::out|ios::app);
+  file_block3<<t2-t1<<" "<<gpu.swapSize<<endl;
+
+
+//below works  
+  // SwapMeta cpu, gpu;
+  // cpu.swapSize=Table_id2LookUpElement.find(data_)->second.size;
+  // cpu.ptr=malloc(cpu.swapSize);
+  // gpu=cpu;
+  // cudaMalloc(&gpu.ptr,cpu.swapSize);
+  
+  // cudaError_t err;
+  // err=cudaMemcpy(gpu.ptr, cpu.ptr ,cpu.swapSize,cudaMemcpyHostToDevice);
+  // printf("2. swapIn done.\n");
+
+///below partial copy 
+  // SwapMeta h_meta;
+  // SwapMeta* d_meta;
+  // h_meta.swapSize=Table_id2LookUpElement.find(data_)->second.size;
+  // h_meta.h_ptr=malloc(h_meta.swapSize);
+  // cudaMalloc(d_meta,sizeof(SwapMeta));
+  // cudaMemcpy(d_meta,&h_meta,sizeof(SwapMeta),cudaMemcpyHostToDevice);
+
+  // void** d_ptr;
+  // cudaMalloc()
+  // void* h_ptr=malloc(tempSize);
+
+///below is to swapIn swapped out items
+    // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    // size_t tempSize = Table_id2LookUpElement.find(data_)->second.size;
+    // void** tempPtr;
+    // cudaMalloc(tempPtr,tempSize);
+    // cout<<"1. to swapIn."<<endl;
+    // cudaError_t err;
+    // err=cudaMemcpy(*tempPtr, Table_id2LookUpElement.find(data_)->second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice);
+    // if (err != cudaSuccess)
+    //   {
+    //   fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
+    //   exit(EXIT_FAILURE);
+    //   }
+    // //cudaMemcpy(data_, Table_id2LookUpElement.find(data_)->second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice);
+    // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    // fstream file_block4("blockInfo_swapIn.text", ios::in|ios::out|ios::app);
+    // file_block4<<t2-t1<<" "<<tempSize<<endl;
+    // cudaFree(*tempPtr);
+    // free(Table_id2LookUpElement.find(data_)->second.realCpuPtr);
+    // cout<<"testing after SwapIn"<<endl;
+    // printf("2. swapIn done.\n");
+ 
+}
+
+void getMaxLoad (){
+  //empty
+}
+
+std::pair<size_t, size_t> Swap::GetMemUsage() {
+  //empty
+  return std::make_pair(0, 0);
+}
+
+Swap::~Swap(){
+  //put in notes
+    fstream file_block1("blockInfo.text", ios::in|ios::out|ios::app);
+    for (int i=0; i< vec_block.size();i++){
+        file_block1<<vec_block[i]<<endl;
+    }
+   
+}
 }
-#endif
 
 #endif
+
+#endif
\ No newline at end of file

From 8dae4d8f614a4b2d339425be5e36e6a8fc12e22d Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 13 Aug 2018 17:16:35 +0800
Subject: [PATCH 09/19] unable common.cc appendInfo

---
 src/core/common/common.cc | 76 +++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/core/common/common.cc b/src/core/common/common.cc
index 73ec0b0c6e..5a6f61268b 100644
--- a/src/core/common/common.cc
+++ b/src/core/common/common.cc
@@ -31,25 +31,25 @@ void* Block::mutable_data() {
     //TODO(junzhe) go back to enable it after device done
     //std::cout<<"mutable_data() "<<this<<' '<<data_<<std::endl;
     initialized_ = true;
-    // if (ptrDevice_!=nullptr){
-    //   //Append info.
-    //   stringstream strm2;
-    //   strm2<<this;
-    //   string tempStr2 = strm2.str();
-    //   stringstream strm4;
-    //   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    //   strm4<<t2;
-    //   string tempStr4 = strm4.str();
-    //   string temp = "Mutable "+tempStr2+" "+tempStr4;   
-    //   ptrDevice_->AppendInfo(temp);
-    // }
-    // //TODO(junzhe) this should not happen, can verify and remove
-    // if (data_ == nullptr) {
-    //   //cout<<"to sleep"<<endl;
-    //   cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-    //   auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-    //   cout<<"slept to get data_ updated: "<<this<<' '<<data_<<endl;
-    // }
+    if (ptrDevice_!=nullptr){
+      //Append info.
+      stringstream strm2;
+      strm2<<this;
+      string tempStr2 = strm2.str();
+      stringstream strm4;
+      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      strm4<<t2;
+      string tempStr4 = strm4.str();
+      string temp = "Mutable "+tempStr2+" "+tempStr4;   
+      ptrDevice_->AppendInfo(temp);
+    }
+    //TODO(junzhe) this should not happen, can verify and remove
+    if (data_ == nullptr) {
+      //cout<<"to sleep"<<endl;
+      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
+      cout<<"slept to get data_ updated: "<<this<<' '<<data_<<endl;
+    }
     
     return static_cast<char*>(data_) + offset_;
   }
@@ -58,26 +58,26 @@ void* Block::mutable_data() {
 const void* Block::data() const {
     CHECK(initialized_) << "Must initialize data before reading it";
     //TODO(junzhe) go back to enable it after device done
-    // if (ptrDevice_!=nullptr){
-    //   //Append info.
-    //   stringstream strm2;
-    //   strm2<<this;
-    //   string tempStr2 = strm2.str();
-    //   stringstream strm4;
-    //   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    //   strm4<<t2;
-    //   string tempStr4 = strm4.str();
-    //   string temp = "Read "+tempStr2+" "+tempStr4;
-    //   ptrDevice_->AppendInfo(temp);
-    // }
+    if (ptrDevice_!=nullptr){
+      //Append info.
+      stringstream strm2;
+      strm2<<this;
+      string tempStr2 = strm2.str();
+      stringstream strm4;
+      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+      strm4<<t2;
+      string tempStr4 = strm4.str();
+      string temp = "Read "+tempStr2+" "+tempStr4;
+      ptrDevice_->AppendInfo(temp);
+    }
 
-    // //TODO(junzhe) this should not happen, can verify and remove
-    // if (data_ == nullptr) {
-    //   //cout<<"to sleep"<<endl;
-    //   cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-    //   auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-    //   cout<<"slept to get data_ updated"<<endl;
-    // }
+    //TODO(junzhe) this should not happen, can verify and remove
+    if (data_ == nullptr) {
+      //cout<<"to sleep"<<endl;
+      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
+      cout<<"slept to get data_ updated"<<endl;
+    }
 
 
     return static_cast<char*>(data_) + offset_;

From 572fe4d5f23ac25a1a1a42aa00f2c861657c11c4 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 20 Aug 2018 14:37:13 +0800
Subject: [PATCH 10/19] correct swap_select()

---
 include/singa/core/device.h |   4 +-
 src/core/.DS_Store          | Bin 6148 -> 6148 bytes
 src/core/device/swap_gpu.cc | 161 ++++++++++++++++--------------------
 3 files changed, 71 insertions(+), 94 deletions(-)

diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 36b10d0d56..86c0a20b84 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -303,7 +303,7 @@ class SwapGPU : public Device {
   int swap_test(vector<string>vec_block,int &maxLen, int &location);
   void swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
   void swap_plan();
-  vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,double maxLoad,double memLimit,string mode);
+  vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode);
   vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
   void Test_sched_switch_swap();
   void DeploySwap();
@@ -332,7 +332,7 @@ class SwapGPU : public Device {
   //vec_block
   vector<string>vec_block; //iteration 0-3
   vector<string>vec_block_fresh; //iteration 4 5 6
-  vector<double>global_load;
+  vector<double>global_load; // from begining
   vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
   vector<onePieceMsg>vec_run;
   int asyncSwapFlag = 0; //0 for sync, 1 for async.
diff --git a/src/core/.DS_Store b/src/core/.DS_Store
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..018d123c47540aef56047ebb82a09e1df9135f84 100644
GIT binary patch
delta 207
zcmZoMXfc=|#>B`mF;Q%yo}wrV0|Nsi1A_nqLq0=ZPP$=ma(-^X#Kh(GAPF{xWT0>^
z5F$we#bE$NJ~!XRC8e|^nStT@w3nP<C1~;~c;yQ+AeKW6-<V#@HrYUgWivMiHwVzU
b8xy}XPv#d<WMu-%D@-;N5#H<}vWFP}v~Mn(

delta 70
zcmZoMXfc=|#>AjHu~2NHo+1YW5HK<@2yC9nSjM(_0izY;W_AvK4xj>{$am(+{342+
UKzW7)kiy9(Jj$D6L{=~Z03gE-+W-In

diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 694d1d4806..59f376e93e 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -352,21 +352,12 @@ pair<double,int> load_peak(vector<double>vec_load_test,int maxLen){
 
 void load_update(vector<double>& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){
   //update load [start_idx, end_idx) by plusMinus*size
-  //if (start_idx < end_idx){
   for (int i = start_idx+maxLen; i<end_idx+maxLen; i++){
     vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
   }
-  // } else {
-  //   for (int i = start_idx; i < maxLen; i++){
-  //     vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
-  //   }
-  //   for (int i = 0; i < end_idx; i++){ //TODO(junzhe) NOTE, end_idx excluded
-  //     vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
-  //   }
-  // }
 }
 
-vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,double maxLoad,double memLimit,string mode){
+vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode){
   vector<SwapBlock>vec_swap_selct;
   vector<SwapBlock>vec_swap_reject;
   if (mode == "dto"){
@@ -386,30 +377,14 @@ vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,double maxLoad,
     sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); 
   }
 
-  size_t load_swap_selct = 0;
-  if (mode != "r_idx"){  
-    for (int i =0; i<vec_swap.size(); i++){
-      if ((maxLoad-memLimit)>load_swap_selct){
-        vec_swap_selct.push_back(vec_swap[i]);
-        load_swap_selct+=vec_swap[i].size;  
-        //cout<<"Item selected: (r_idx, d_idx, dto) "<<vec_swap[i].r_idx<<"  "<<vec_swap[i].d_idx<<"  "<<static_cast<int>(vec_swap[i].dto/1000000)<<endl;
-      } else {
-        //break; TODO(junzhe) to resume break
-        vec_swap_reject.push_back(vec_swap[i]);
-      }
+  for (int i=0; i<vec_swap.size(); i++){
+    load_update(tempLoad,0,maxLen,-1,vec_swap[i].size,maxLen);
+    vec_swap_selct.push_back(vec_swap[i]);
+    auto max_current = load_peak(tempLoad,maxLen);
+    newMaxLoad = max_current.first;
+    if (newMaxLoad < memLimit){
+      break;
     }
-  } else {
-    //TODO(junzhe) other modes not done yet
-    // sort(vec_swap.begin(),vec_swap.end(),less_than_r_idx_ready());
-    // auto temp_load = origin_load;
-    // int i = 0;
-    // while ((maxLoad-memLimit)>load_swap_selct){
-
-    // }
-    // for (int i =0; i<vec_swap.size(); i++){
-
-    // }
-
   }
   
   return vec_swap_selct;
@@ -669,29 +644,30 @@ void SwapGPU::swap_plan(){
   cout<<"------------------print max_load: (ideal) "<<maxLoad_ideal<<" "<<maxIdx_ideal<<endl;
   //maxLoad_ideal = 400;
   /// select till maxLoad_ideal, dto
-  auto vec_swap_dto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"dto");
-  cout<<"size of vec_swap_dto: "<<vec_swap_dto.size()<<endl;
-  auto vec_load_dto_ideal = swap_load_ideal(vec_load,vec_swap_dto);
-    fstream file_load_dto_ideal("load_dto_ideal.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
-    file_load_dto_ideal<<vec_load_dto_ideal[i]<<endl;
-  }
-  auto tempMax_ = load_peak(vec_load_dto_ideal,maxLen);
-  cout<<"------------------print max_load: (dto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  /// select till maxLoad_ideal, pri
-  auto vec_swap_pri = swap_select(vec_swap,maxLoad,maxLoad_ideal,"pri");
-  cout<<"size of vec_swap_pri: "<<vec_swap_dto.size()<<endl;
-  auto vec_load_pri_ideal = swap_load_ideal(vec_load,vec_swap_pri);
-    fstream file_load_pri_ideal("load_pri_ideal.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
-    file_load_pri_ideal<<vec_load_pri_ideal[i]<<endl;
-  }
-  tempMax_ = load_peak(vec_load_pri_ideal,maxLen);
-  cout<<"------------------print max_load: (pri ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
+  // auto vec_swap_dto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"dto");
+  // cout<<"size of vec_swap_dto: "<<vec_swap_dto.size()<<endl;
+  // auto vec_load_dto_ideal = swap_load_ideal(vec_load,vec_swap_dto);
+  //   fstream file_load_dto_ideal("load_dto_ideal.csv", ios::in|ios::out|ios::app);
+  // for (int i=maxLen; i<maxLen*2; i++){
+  //   file_load_dto_ideal<<vec_load_dto_ideal[i]<<endl;
+  // }
+  // auto tempMax_ = load_peak(vec_load_dto_ideal,maxLen);
+  // cout<<"------------------print max_load: (dto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
+
+  // /// select till maxLoad_ideal, pri
+  // auto vec_swap_pri = swap_select(vec_swap,maxLoad,maxLoad_ideal,"pri");
+  // cout<<"size of vec_swap_pri: "<<vec_swap_dto.size()<<endl;
+  // auto vec_load_pri_ideal = swap_load_ideal(vec_load,vec_swap_pri);
+  //   fstream file_load_pri_ideal("load_pri_ideal.csv", ios::in|ios::out|ios::app);
+  // for (int i=maxLen; i<maxLen*2; i++){
+  //   file_load_pri_ideal<<vec_load_pri_ideal[i]<<endl;
+  // }
+  // tempMax_ = load_peak(vec_load_pri_ideal,maxLen);
+  // cout<<"------------------print max_load: (pri ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
 
   /// select till maxLoad_ideal, wdto
-  auto vec_swap_wdto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"wdto");
+  auto tempLoad = origin_load;
+  auto vec_swap_wdto = swap_select(vec_swap,tempLoad,maxLoad_ideal*1.4,"wdto");
   cout<<"size of vec_swap_wdto: "<<vec_swap_wdto.size()<<endl;
   auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
     fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
@@ -707,51 +683,52 @@ void SwapGPU::swap_plan(){
   auto vec_load_dto = origin_load;
   auto vec_load_wdto = origin_load;
   string mode = "stick-to-limit";
-  double overhead_pri = 0;
-  swap_sched(vec_swap_pri, vec_load_pri,overhead_pri,450<<20, mode);
+  // double overhead_pri = 0;
+  // swap_sched(vec_swap_pri, vec_load_pri,overhead_pri,450<<20, mode);
   
-  double overhead_dto = 0;
-  swap_sched(vec_swap_dto, vec_load_dto,overhead_dto,450<<20,mode);
+  // double overhead_dto = 0;
+  // swap_sched(vec_swap_dto, vec_load_dto,overhead_dto,450<<20,mode);
   
   double overhead_wdto = 0;
   swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,450<<20,mode);
   
 
 
-  fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
-    file_block10<<vec_load_pri[i]<<endl;
-  }
-  fstream file_block11("load_1_dto.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
-    file_block11<<vec_load_dto[i]<<endl;
-  }
-  fstream file_block12("load_1_wdto.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
-    file_block12<<vec_load_wdto[i]<<endl;
-  }
-  auto max_1 = load_peak(vec_load_pri,maxLen);
-  size_t maxLoad_1 = max_1.first;
-  int maxIdx_1 = max_1.second;
-  cout<<"------------------print max_load: (1) "<<maxLoad_1<<" "<<maxIdx_1<<endl;
-  //change back order by Idx.
-  sort(vec_run.begin(),vec_run.end(),less_than_Idx());
-  cout<<"done with swap_plan..."<<endl;
-  cout<<"load 2 overhead pri: "<<(float)(overhead_pri)/(float)(1000000)<<endl;
-  cout<<"load 2 overhead dto: "<<(float)(overhead_dto)/(float)(1000000)<<endl;
-  cout<<"load 2 overhead wdto: "<<(float)(overhead_wdto)/(float)(1000000)<<endl;
-  cout<<"total_swapOutTime: "<<(float)(total_swapOutTime)/(float)(3000000)<<endl;
-  cout<<"total_swapInTime: "<<(float)(total_swapInTime)/(float)(3000000)<<endl;
-  auto t1 = vec_run[0].t;
-  auto t2 = vec_run[maxLen].t;
-  auto t3 = vec_run[maxLen*2].t;
-  auto t4 = vec_run[maxLen*3-1].t;
-  cout<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
-  fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-  file_time<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
-  file_time<<"iteration time spent: "<<t2-t1<<" "<<t3-t2<<" "<<t4-t3<<endl;
-  file_time<<"idx "<<0<<" "<<maxLen<<" "<<maxLen*2<<" "<<maxLen*3-1<<endl;
-  file_time<<"value "<<t1<<" "<<t2<<" "<<t3<<" "<<t4<<endl;
+  // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
+  // for (int i=maxLen; i<maxLen*2; i++){
+  //   file_block10<<vec_load_pri[i]<<endl;
+  // }
+  // fstream file_block11("load_1_dto.csv", ios::in|ios::out|ios::app);
+  // for (int i=maxLen; i<maxLen*2; i++){
+  //   file_block11<<vec_load_dto[i]<<endl;
+  // }
+  // fstream file_block12("load_1_wdto.csv", ios::in|ios::out|ios::app);
+  // for (int i=maxLen; i<maxLen*2; i++){
+  //   file_block12<<vec_load_wdto[i]<<endl;
+  // }
+  //TODO(junzhe) below verification to be done later.
+  // auto max_1 = load_peak(vec_load_pri,maxLen);
+  // size_t maxLoad_1 = max_1.first;
+  // int maxIdx_1 = max_1.second;
+  // cout<<"------------------print max_load: (1) "<<maxLoad_1<<" "<<maxIdx_1<<endl;
+  // //change back order by Idx.
+  // sort(vec_run.begin(),vec_run.end(),less_than_Idx());
+  // cout<<"done with swap_plan..."<<endl;
+  // cout<<"load 2 overhead pri: "<<(float)(overhead_pri)/(float)(1000000)<<endl;
+  // cout<<"load 2 overhead dto: "<<(float)(overhead_dto)/(float)(1000000)<<endl;
+  // cout<<"load 2 overhead wdto: "<<(float)(overhead_wdto)/(float)(1000000)<<endl;
+  // cout<<"total_swapOutTime: "<<(float)(total_swapOutTime)/(float)(3000000)<<endl;
+  // cout<<"total_swapInTime: "<<(float)(total_swapInTime)/(float)(3000000)<<endl;
+  // auto t1 = vec_run[0].t;
+  // auto t2 = vec_run[maxLen].t;
+  // auto t3 = vec_run[maxLen*2].t;
+  // auto t4 = vec_run[maxLen*3-1].t;
+  // cout<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
+  // fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
+  // file_time<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
+  // file_time<<"iteration time spent: "<<t2-t1<<" "<<t3-t2<<" "<<t4-t3<<endl;
+  // file_time<<"idx "<<0<<" "<<maxLen<<" "<<maxLen*2<<" "<<maxLen*3-1<<endl;
+  // file_time<<"value "<<t1<<" "<<t2<<" "<<t3<<" "<<t4<<endl;
 }
 
 

From d2027a9150a26ccc8bce262118066a5f39aea4aa Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 20 Aug 2018 14:44:50 +0800
Subject: [PATCH 11/19] correct swap_select()

---
 src/core/device/swap_gpu.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 59f376e93e..7e31eb15dd 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -381,7 +381,7 @@ vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,vector<double>
     load_update(tempLoad,0,maxLen,-1,vec_swap[i].size,maxLen);
     vec_swap_selct.push_back(vec_swap[i]);
     auto max_current = load_peak(tempLoad,maxLen);
-    newMaxLoad = max_current.first;
+    auto newMaxLoad = max_current.first;
     if (newMaxLoad < memLimit){
       break;
     }
@@ -674,7 +674,7 @@ void SwapGPU::swap_plan(){
   for (int i=maxLen; i<maxLen*2; i++){
     file_load_wdto_ideal<<vec_load_wdto_ideal[i]<<endl;
   }
-  tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
+  auto tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
   cout<<"------------------print max_load: (wdto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
 
   /// load_1 no overhead, based on pri

From 9d84cfc3d679732aeee635c669d3a1fd109f849e Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 20 Aug 2018 14:57:43 +0800
Subject: [PATCH 12/19] enable swap_plan()

---
 src/core/device/swap_gpu.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 7e31eb15dd..19f8f08e4c 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -693,7 +693,6 @@ void SwapGPU::swap_plan(){
   swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,450<<20,mode);
   
 
-
   // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
   // for (int i=maxLen; i<maxLen*2; i++){
   //   file_block10<<vec_load_pri[i]<<endl;
@@ -867,7 +866,7 @@ void SwapGPU::Test_sched_switch_swap(){
  }
  ///switch flag; next idx
  if ((gc+1) == globeCounter){
-    // swap_plan();
+    swap_plan();
     vector<double>vec_load2(&global_load[location],&global_load[location+3*maxLen]);
     origin_load = vec_load2;
     //load before swap, write in

From ed73c3ad04ff48fa4fb28aaa21df4961b9138ba9 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Sun, 26 Aug 2018 22:40:36 +0800
Subject: [PATCH 13/19] documentation

delay swap_plan() by 3 more iterations

update train

correct swap_sched(), swap_select(),swap_plan()

correct load update in swap_select

vec_run changed to new 3 iterations

correct vec_run36 index issue

correct overhead issue, verify vec_run.t

vec_run duplicate to avoid sorting issue
---
 examples/cifar10/{cnn.py => alexnet.py} |   0
 examples/cifar10/train.py               |  46 ++--
 include/singa/core/device.h             |  45 +++-
 src/core/device/swap_gpu.cc             | 344 ++++++++++++++----------
 4 files changed, 263 insertions(+), 172 deletions(-)
 rename examples/cifar10/{cnn.py => alexnet.py} (100%)

diff --git a/examples/cifar10/cnn.py b/examples/cifar10/alexnet.py
similarity index 100%
rename from examples/cifar10/cnn.py
rename to examples/cifar10/alexnet.py
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index 4a4d94fe30..652772ba5a 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -31,24 +31,25 @@
 import os
 import argparse
 
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
 from singa import utils
 from singa import optimizer
 from singa import device
 from singa import tensor
+from singa.proto import core_pb2
 from caffe import caffe_net
 
-import cnn
+import alexnet
 import vgg
 import resnet
 
+from datetime import datetime
+import time
 
 def load_dataset(filepath):
     print('Loading data file %s' % filepath)
     with open(filepath, 'rb') as fd:
-        try:
-            cifar10 = pickle.load(fd, encoding='latin1')
-        except TypeError:
-            cifar10 = pickle.load(fd)
+        cifar10 = pickle.load(fd)
     image = cifar10['data'].astype(dtype=np.uint8)
     image = image.reshape((-1, 3, 32, 32))
     label = np.asarray(cifar10['labels'], dtype=np.uint8)
@@ -129,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         dev = device.get_default_device()
     else:
         print('Using GPU')
-        dev = device.create_cuda_gpu()
+        dev = device.create_cuda_gpu_on(1)
 
     net.to_device(dev)
     opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
@@ -137,16 +138,27 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         opt.register(p, specs)
 
     tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
-    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
     train_x, train_y, test_x, test_y = data
     num_train_batch = train_x.shape[0] // batch_size
     num_test_batch = test_x.shape[0] // batch_size
     idx = np.arange(train_x.shape[0], dtype=np.int32)
+    fileTimeLog =open("epochTimeLog.text","a")
     for epoch in range(1):
         np.random.shuffle(idx)
         loss, acc = 0.0, 0.0
         print('Epoch %d' % epoch)
-        for b in range(20):
+        print(datetime.now().timetz()) # miliseconds
+        print(int(round(time.time()*1000)))
+        fileTimeLog.write('Epoch %d: ' % epoch)
+        fileTimeLog.write(str(int(round(time.time()*1000))))
+        fileTimeLog.write('\n')
+        for b in range(15): #num_train_batch):
+            print ("start of iteration %d: " %b)
+            #time.sleep(1)
+            fileTimeLog.write('iteration %d: ' % b)
+            fileTimeLog.write(str(int(round(time.time()*1000))))
+            fileTimeLog.write('\n')
             x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
             y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
             tx.copy_from_numpy(x)
@@ -175,14 +187,16 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
 
         print('test loss = %f, test accuracy = %f' %
               ((loss / num_test_batch), (acc / num_test_batch)))
+    fileTimeLog.close()
     net.save('model', 20)  # save model params into checkpoint file
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Train dcnn for cifar10')
-    parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'],
-                        default='vgg')
+    parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet', 'caffe'],
+                        default='alexnet')
     parser.add_argument('data', default='cifar-10-batches-py')
     parser.add_argument('--use_cpu', action='store_true')
+    parser.add_argument('batch_size',type=int, default=100)
     args = parser.parse_args()
     assert os.path.exists(args.data), \
         'Pls download the cifar10 dataset via "download_data.py py"'
@@ -194,22 +208,22 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         net = caffe_net.create_net(args.use_cpu)
         # for cifar10_full_train_test.prototxt
         train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
         # for cifar10_quick_train_test.prototxt
         # train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004,
         #      use_cpu=args.use_cpu)
-    elif args.model == 'cnn':
+    elif args.model == 'alexnet':
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
-        net = cnn.create_net(args.use_cpu)
+        net = alexnet.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
     elif args.model == 'vgg':
         train_x, test_x = normalize_for_vgg(train_x, test_x)
         net = vgg.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
-              use_cpu=args.use_cpu)
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
     else:
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
         net = resnet.create_net(args.use_cpu)
         train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
-              use_cpu=args.use_cpu)
\ No newline at end of file
+              use_cpu=args.use_cpu,batch_size=args.batch_size)
\ No newline at end of file
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 86c0a20b84..8d51d61d09 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -265,14 +265,14 @@ struct SwapBlock{
     double r_idx_ready; //r_idx + buffer, could be set during selection.
     //int free = -1; //when it is freed 
     //below as per planned.
-    int i1;
-    int i1p;
-    int i2;
-    int i2p;
-    double t1;
-    double t2;
-    double t1p;
-    double t2p;
+    int i1 = 0;
+    int i1p = 0;
+    int i2 = 0;
+    int i2p = 0;
+    double t1 = 0;
+    double t2 = 0;
+    double t1p = 0;
+    double t2p = 0;
     SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
     ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
 };
@@ -299,16 +299,37 @@ class SwapGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override;
+
+  //append info after Malloc, pair.
+  void MakeMetaTable(Block* block,void* data_,int size) override; 
+
+  //test iteration, return GC
   int swap_test(vector<string>vec_block,int &maxLen, int &location);
+
+  //schedule algo
   void swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
+  
+  //entire plan, from swap_select() to swap_sched()
   void swap_plan();
+
+  //selection algo
   vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode);
+  
+  //load profile as per synchronous swap.
   vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
+  
+  //all the testing, without swap, during Append()
   void Test_sched_switch_swap();
+
+  //swap, during Append()
   void DeploySwap();
+
+  //Append at every index: malloc, free, read, mutable
   void Append(string blockInfo) override;
+
+  //in case gpu ptr wrong. TODO(junzhe) to verify if needed.
   void* GetRealGpuPtr(const Block* block_) override;
+
   void SwapOut(const Block* block_) override;
   void SwapIn(const Block* block_) override;
 
@@ -337,11 +358,13 @@ class SwapGPU : public Device {
   vector<onePieceMsg>vec_run;
   int asyncSwapFlag = 0; //0 for sync, 1 for async.
   int testFlag = 0; //0 means open for test, 1 means no need test anymore.
-  int gc = 0; //global counter each time Malloc/Free, add 1.
+  int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
   int globeCounter = -1;
   int maxLen = 0;
   int location = 0;
-  //design requirement
+  int three_more_location = 0; //location at 3 more iterations later.
+  int three_more_globeCounter = -1; //
+  //design requirement TODO(junzhe)
   float memLimit_ratio = 0.70; 
   size_t smallest_block = 1<<20; //1 MB
   int data_buffer = 4; // used to control readyIdx
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 19f8f08e4c..6768aee844 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -217,14 +217,14 @@ struct less_than_Idx{
 
 
 int SwapOutTime(size_t size){
-    int ans =0; //TODO(junzhe) used to be 0.29; new param as per vgg
+    int ans = 0; //TODO(junzhe) used to be 0.29; new param as per vgg
     if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;}
     return ans;
 }
 
 int SwapInTime(size_t size){
     //yet to get the formula
-    int ans =0; //TODO(junzhe) used to be 0.13; new param as per vgg
+    int ans = 0; //TODO(junzhe) used to be 0.13; new param as per vgg
     if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;}
     return ans;
 }
@@ -249,15 +249,15 @@ struct less_than_wdto{
   }
 };
 
-struct less_than_r_idx_ready{
-  /*
-   sort SwapBlock by r_idx_ready, ascending
-   */
-  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-  {
-    return (struct1.r_idx_ready<struct2.r_idx_ready);
-  }
-};
+// struct less_than_r_idx_ready{
+//   /*
+//    sort SwapBlock by r_idx_ready, ascending
+//    */
+//   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+//   {
+//     return (struct1.r_idx_ready<struct2.r_idx_ready);
+//   }
+// };
 
 struct less_than_pri{
     /*
@@ -315,28 +315,28 @@ pair<int,int> load_over_limit(vector<double>vec_load, size_t memLimit, int start
   return std::make_pair(first_over_limit, first_below_limit);
 }
 
-pair<int,int> load_below_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){
-  //input: vec_load, memLimit, range [start_idx, end_idx]
-  //return range overlimit [first_over_limit, first_below_limit)
-  int first_below_limit = maxIdx;
-  int last_below_limit = maxIdx;
-
-  for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){
-    if (vec_load[i] > memLimit){
-      first_below_limit = i+1-maxLen;
-      break;
-    }
-  }
-
-  for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){
-    if (vec_load[i] > memLimit){
-      last_below_limit = i-1-maxLen;
-      break;
-    }
-  }
-
-  return std::make_pair(first_below_limit, last_below_limit);
-}
+// pair<int,int> load_below_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){
+//   //input: vec_load, memLimit, range [start_idx, end_idx]
+//   //return range overlimit [first_over_limit, first_below_limit)
+//   int first_below_limit = maxIdx;
+//   int last_below_limit = maxIdx;
+
+//   for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){
+//     if (vec_load[i] > memLimit){
+//       first_below_limit = i+1-maxLen;
+//       break;
+//     }
+//   }
+
+//   for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){
+//     if (vec_load[i] > memLimit){
+//       last_below_limit = i-1-maxLen;
+//       break;
+//     }
+//   }
+
+//   return std::make_pair(first_below_limit, last_below_limit);
+// }
 
 pair<double,int> load_peak(vector<double>vec_load_test,int maxLen){
   double maxLoad_test = 0;
@@ -359,7 +359,7 @@ void load_update(vector<double>& vec_load,int start_idx, int end_idx, int plusMi
 
 vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode){
   vector<SwapBlock>vec_swap_selct;
-  vector<SwapBlock>vec_swap_reject;
+  //vector<SwapBlock>vec_swap_reject;
   if (mode == "dto"){
     sort(vec_swap.begin(),vec_swap.end(),less_than_dto());  
   }
@@ -376,16 +376,19 @@ vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,vector<double>
     }
     sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); 
   }
-
+  cout<<"===============select block one by one================="<<endl;
   for (int i=0; i<vec_swap.size(); i++){
-    load_update(tempLoad,0,maxLen,-1,vec_swap[i].size,maxLen);
+    load_update(tempLoad,vec_swap[i].r_idx_ready,vec_swap[i].d_idx,-1,vec_swap[i].size,maxLen);
     vec_swap_selct.push_back(vec_swap[i]);
+    auto tempOverLimit_ = load_over_limit(tempLoad,memLimit,0,maxLen,maxLen);
+    cout<<vec_swap[i].r_idx_ready<<","<<vec_swap[i].d_idx<<" ((("<<tempOverLimit_.first<<","<<tempOverLimit_.second<<")))"<<endl;
     auto max_current = load_peak(tempLoad,maxLen);
     auto newMaxLoad = max_current.first;
     if (newMaxLoad < memLimit){
       break;
     }
   }
+  cout<<"=============selection done================"<<endl;
   
   return vec_swap_selct;
 }
@@ -407,118 +410,139 @@ void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_loa
     update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit.
   */ 
   //TODO(junzhe) wordy, can merge in common part.
-  if (mode == "no-overhead"){
-    //update i1p
-    //sort by r_idx for i1p update
+  overhead = 0;
+  cout<<"----------------swap_sched----------------"<<endl;
+  if (mode == "stick-to-limit"){
     sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int readyIdx = 0;
-      if (itm.cat == "A1") { readyIdx = itm.r_idx; }
-      if (itm.cat == "A2") { readyIdx = itm.r_idx + data_buffer; }
-      if (itm.cat == "A3") { readyIdx = itm.r_idx + mutable_data_buffer; }
-
+      int readyIdx = itm.r_idx_ready;
+      cout<<"["<<itm.size<<"] ";
+      cout<<readyIdx;
       if (i > 0){
         readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
       }
+      cout<<" -> "<<readyIdx;
       itm.i1 = readyIdx;
-      itm.t1 = vec_run[readyIdx].t;
+      // cout<<"check t1((("<<itm.t1;
+      itm.t1 = vec_run[readyIdx+maxLen].t;
+      // cout<<" "<<readyIdx<<" "<<maxLen<<" || "<<vec_run[readyIdx+maxLen].t<<" "<<itm.t1<<")))";
       itm.t1p = itm.t1 + SwapOutTime(itm.size);
-      while (itm.t1p > vec_run[readyIdx].t){
-        readyIdx++;
+      total_swapOutTime+=SwapOutTime(itm.size);
+      while (itm.t1p > vec_run[readyIdx+maxLen].t){ //TODO(junzhe) reduce time complexity.
+        readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead.
       }
+      //get min compare with maxIdx and readyIdx.
+      readyIdx = std::min(maxIdx,readyIdx);
+      cout<<" || "<<readyIdx;
+      //TODO(junzhe) 1st should be desired.
+      load_update(vec_load_temp,readyIdx+1,itm.d_idx,-1,itm.size,maxLen);
+      // load_update(vec_load_temp,itm.r_idx_ready,itm.d_idx,-1,itm.size,maxLen);
+      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      cout<<" ((("<<tempOverLimit_.first<<','<<tempOverLimit_.second<<"))) ";
+      if ((tempOverLimit_.first != -1) && (tempOverLimit_.first <= readyIdx)) { 
+        load_update(vec_load_temp,tempOverLimit_.first-1,readyIdx+1,-1,itm.size,maxLen);
+        // cout<<" ((("<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<readyIdx<<' '<<tempOverLimit_.first<<")))";
+        readyIdx = tempOverLimit_.first - 1; //TODO(junzhe) boundary
+        overhead+=(itm.t1p-vec_run[readyIdx+maxLen].t);
+        cout<<"==== overhead added "<<itm.t1p-vec_run[readyIdx+maxLen].t<<endl;
+        cout<<"time spent "<<SwapOutTime(itm.size)<<endl;
+        cout<<"so time "<<itm.t1<<endl;
+        cout<<"eo 1 time "<<itm.t1p<<endl;
+        cout<<"eo 2 time "<<vec_run[readyIdx+maxLen].t<<endl;
+
+      }
+      cout<<" -> "<<readyIdx<<endl;   
       itm.i1p = readyIdx;
       vec_swap_selct[i] = itm;
+      // auto tempOverLimit_1 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      // cout<<"end: overlimit first and i1p "<<tempOverLimit_1.first<<' '<<itm.i1p<<endl;
+      
     }
-    //update i2p
+    cout<<"----------------sched part II-------------"<<endl;
     sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
       int needIdx = itm.d_idx;
+      cout<<needIdx;
       if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
+      cout<<" -> "<<needIdx;
       itm.i2 = needIdx;
-      double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size);
-      while (prepareTime < vec_run[needIdx].t){
+      double prepareTime = vec_run[needIdx+maxLen].t - SwapInTime(itm.size);
+      total_swapInTime+=SwapInTime(itm.size);
+      while (prepareTime < vec_run[needIdx+maxLen].t){
         needIdx--;
       }
+      needIdx = std::max(needIdx,maxIdx+1);
+      cout<<" || "<<needIdx;
       itm.i2p = needIdx;
       itm.t2p = prepareTime;
+      // auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      // cout<<"(((before come back (right over limit): "<<tempOverLimit_2.second<<endl;
+      load_update(vec_load_temp,itm.i2p,itm.d_idx,1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+      auto tempOverLimit_3 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      cout<<" ((("<<tempOverLimit_3.first<<","<<tempOverLimit_3.second<<")))";
+      // cout<<"|||after come back (right over limit): "<<tempOverLimit_3.second<<endl;
+      // if (tempOverLimit_3.second > 0){
+      //   cout<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<itm.i1p<<' '<<itm.i2p<<' '<<itm.i2<<")))"<<endl;
+      // }
+
+      if ((tempOverLimit_3.second != -1) && (vec_run[tempOverLimit_3.second+maxLen].t > itm.t2p)) {
+        overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p);
+        cout<<"==== overhead added "<<vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p<<endl;
+        load_update(vec_load_temp,itm.i2p,tempOverLimit_3.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+        itm.i2p = tempOverLimit_3.second+1;
+        auto tempOverLimit_4 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      }
+      cout<<" -> "<<itm.i2p<<endl;
+      //cout<<"after consider overlimit (right over limit): "<<tempOverLimit_4.second<<endl;
       vec_swap_selct[i] = itm;
-      load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
     }
+    cout<<":::::END OF SCHED, overhead is "<<overhead<<endl;
+  }///end of first mode.
 
-    }
-  if (mode == "stick-to-limit"){
+
+  ///this mode not really in use, for test purpose only.
+  if (mode == "no-overhead"){
+    //update i1p
+    //sort by r_idx for i1p update
     sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int readyIdx = itm.r_idx_ready;
-      cout<<"change of readyIdx: "<<readyIdx;
+      int readyIdx = 0;
+      if (itm.cat == "A1") { readyIdx = itm.r_idx; }
+      if (itm.cat == "A2") { readyIdx = itm.r_idx + data_buffer; }
+      if (itm.cat == "A3") { readyIdx = itm.r_idx + mutable_data_buffer; }
+
       if (i > 0){
         readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
       }
-      cout<<"||compare with last i1p "<<readyIdx;
       itm.i1 = readyIdx;
       itm.t1 = vec_run[readyIdx].t;
       itm.t1p = itm.t1 + SwapOutTime(itm.size);
-      total_swapOutTime+=SwapOutTime(itm.size);
-      while (itm.t1p > vec_run[readyIdx].t){ //TODO(junzhe) reduce time complexity.
-        readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead.
-      }
-      //get min compare with maxIdx and readyIdx.
-      readyIdx = std::min(maxIdx,readyIdx);
-      cout<<"||count swap time "<<readyIdx;
-      load_update(vec_load_temp,readyIdx+1,itm.d_idx,-1,itm.size,maxLen);
-      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      if ((tempOverLimit_.first != -1) && (tempOverLimit_.first <= readyIdx)) { 
-        load_update(vec_load_temp,tempOverLimit_.first-1,readyIdx+1,-1,itm.size,maxLen);
-        cout<<"((("<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<readyIdx<<' '<<tempOverLimit_.first<<")))";
-        readyIdx = tempOverLimit_.first -1; //TODO(junzhe) boundary
-
+      while (itm.t1p > vec_run[readyIdx].t){
+        readyIdx++;
       }
-      cout<<"||count over limit "<<readyIdx<<endl;
-      auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
       itm.i1p = readyIdx;
-      cout<<"end: overlimit first and i1p "<<tempOverLimit_2.first<<' '<<itm.i1p<<endl;
       vec_swap_selct[i] = itm;
     }
+    //update i2p
     sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
-    cout<<"right over limit ===================="<<endl;
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
       int needIdx = itm.d_idx;
-      cout<<"change of needIdx: "<<needIdx;
       if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
-      cout<<"||compare with last i2p "<<needIdx;
       itm.i2 = needIdx;
       double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size);
-      total_swapInTime+=SwapInTime(itm.size);
       while (prepareTime < vec_run[needIdx].t){
         needIdx--;
       }
-      needIdx = std::max(needIdx,maxIdx+1);
-      cout<<"||count swap in "<<needIdx;
       itm.i2p = needIdx;
       itm.t2p = prepareTime;
-      auto tempOverLimit_0 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      cout<<"(((before come back (right over limit): "<<tempOverLimit_0.second<<endl;
-      load_update(vec_load_temp,itm.i2p,itm.d_idx,1,itm.size,maxLen); //TODO(junzhe) range, right boundary
-      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      cout<<"|||after come back (right over limit): "<<tempOverLimit_.second<<endl;
-      if (tempOverLimit_.second > 0){
-        cout<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<itm.i1p<<' '<<itm.i2p<<' '<<itm.i2<<")))"<<endl;
-      }
-
-      if ((tempOverLimit_.second != -1) && (vec_run[tempOverLimit_.second].t > itm.t2p)) {
-        overhead+=(vec_run[tempOverLimit_.second].t - itm.t2p);
-        load_update(vec_load_temp,itm.i2p,tempOverLimit_.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
-        itm.i2p = tempOverLimit_.second+1;
-        auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      }
-      cout<<"||count overlimit "<<itm.i2p<<endl;
-      //cout<<"after consider overlimit (right over limit): "<<tempOverLimit_2.second<<endl;
       vec_swap_selct[i] = itm;
+      load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
     }
+
   }
   
 }
@@ -564,52 +588,75 @@ void SwapGPU::swap_plan(){
   cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
   sort(vec_pieceMsg.begin(),vec_pieceMsg.end(),less_than_Idx());
   // scale down idx, to middle iteration.
-  tempTime_baseline = vec_pieceMsg[location].t;
+  tempTime_baseline = vec_pieceMsg[three_more_location].t;
   for (int i=0; i<vec_pieceMsg.size();i++){
-    vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - location - maxLen;
+    vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - three_more_location - maxLen;
     vec_pieceMsg[i].t = vec_pieceMsg[i].t - tempTime_baseline;
   }
   //3 iterations of vec_run and vec_load, maxIdx and maxLoad
-  vector<onePieceMsg>temp_vec_run(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]);
+  vector<onePieceMsg>temp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]);
   vec_run = temp_vec_run;
-  fstream file_vec_run("vec_run.csv", ios::in|ios::out|ios::app);
+  fstream file_vec_run("vec_run36.csv", ios::in|ios::out|ios::app);
   for (int i =0; i<vec_run.size();i++){
     //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    file_vec_run<<i<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    tempTime2 = vec_run[i].t;
+    // file_vec_run<<i<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
+    // tempTime2 = vec_run[i].t;
+    file_vec_run<<i-maxLen<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].size<<' '<<vec_run[i].t<<endl;
   }
+
+  vector<onePieceMsg>temp_vec_run2(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]);
+  auto vec_run2 = temp_vec_run2;
+  fstream file_vec_run2("vec_run03.csv", ios::in|ios::out|ios::app);
+  for (int i =0; i<vec_run2.size();i++){
+    //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
+    // file_vec_run2<<i<<' '<<vec_run2[i].t<<' '<<vec_run2[i].t-tempTime2<<endl;
+    // tempTime2 = vec_run[i].t;
+    file_vec_run2<<i<<' '<<vec_run2[i].MallocFree<<' '<<vec_run2[i].size<<endl;
+  }
+
   vector<double>vec_load(&global_load[location],&global_load[location+3*maxLen]);
   origin_load = vec_load;
-  //load before swap, write in
-  fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
+  //3 iterations
+  fstream file_load_origin("load_origin03.csv", ios::in|ios::out|ios::app);
   for (int i=0; i<origin_load.size(); i++){
-    file_load_origin<<origin_load[i]<<endl;
+    file_load_origin<<i<<' '<<origin_load[i]<<endl;
   }
+  
+  // vector<double>vec_load2(&global_load[location+3*maxLen],&global_load[location+6*maxLen]);
+  // auto origin_load2 = vec_load2;
+  // //3 iterations
+  // fstream file_load_origin2("load_origin36.csv", ios::in|ios::out|ios::app);
+  // for (int i=0; i<origin_load2.size(); i++){
+  //   file_load_origin2<<i<<" "<<origin_load2[i]<<endl;
+  // }
 
-  fstream file_load_current("load_current.csv", ios::in|ios::out|ios::app);
-  for (int i=0; i<maxLen; i++){
-    file_load_current<<vec_load[i]<<endl;
-  }
+  // //one iteration.
+  // fstream file_load_current("load_current.csv", ios::in|ios::out|ios::app);
+  // for (int i=0; i<maxLen; i++){
+  //   file_load_current<<vec_load[i]<<endl;
+  // }
   cout<<"build from scratch:::::::::::::::;"<<endl;
   auto max_current = load_peak(vec_load,maxLen);
   maxLoad = max_current.first;
   maxIdx = max_current.second;
   cout<<"------------------print max_load: (current) "<<maxLoad<<" "<<maxIdx<<endl;
 
-  //sort by ptr & idx
-  sort(vec_run.begin(),vec_run.end(),less_than_ptrIdx());
+
+  //sort by ptr & idx, sorting the duplicate
+  auto vec_run_dup = vec_run;
+  sort(vec_run_dup.begin(),vec_run_dup.end(),less_than_ptrIdx());
   ///formulate swappable items.
-  cout<<"==============================print swappable items "<<maxIdx<<endl;
+  cout<<"==============================print swappable items, with maxIdx "<<maxIdx<<endl;
   vector<SwapBlock>vec_swap;
-  size_t load_swap = 0;
-  for (int i =1; i<vec_run.size(); i++){
+  // size_t load_swap = 0;
+  for (int i =1; i<vec_run_dup.size(); i++){
     //SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
     //ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
-    if ((vec_run[i].size >= smallest_block) && (vec_run[i-1].idx<maxIdx) && (vec_run[i].idx>maxIdx) 
-      && (vec_run[i-1].ptr ==vec_run[i].ptr) 
-      && ((vec_run[i-1].MallocFree==3) or (vec_run[i-1].MallocFree==2) or (vec_run[i-1].MallocFree==4)))
+    if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idx<maxIdx) && (vec_run_dup[i].idx>maxIdx) 
+      && (vec_run_dup[i-1].ptr ==vec_run_dup[i].ptr) 
+      && ((vec_run_dup[i-1].MallocFree==3) or (vec_run_dup[i-1].MallocFree==2) or (vec_run_dup[i-1].MallocFree==4)))
     {
-      SwapBlock itm(vec_run[i].ptr, vec_run[i].size, vec_run[i-1].idx, vec_run[i].idx, vec_run[i-1].t, vec_run[i].t);
+      SwapBlock itm(vec_run_dup[i].ptr, vec_run_dup[i].size, vec_run_dup[i-1].idx, vec_run_dup[i].idx, vec_run_dup[i-1].t, vec_run_dup[i].t);
       itm.dto = itm.d_time-itm.r_time;
       itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size);
       if (itm.dt>=0){
@@ -618,15 +665,16 @@ void SwapGPU::swap_plan(){
         itm.pri = itm.dt * 1/itm.size;
       }
       //cat A
-      if (vec_run[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
-      if (vec_run[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
-      if (vec_run[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
+      if (vec_run_dup[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
+      if (vec_run_dup[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
+      if (vec_run_dup[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
 
       vec_swap.push_back(itm);
-      load_swap+=itm.size;
+      // load_swap+=itm.size;
+      cout<<itm.size<<" ";
       cout<<"Items Swappable: (r_idx, d_idx, cat, MB, dt/us, PS) || "<<itm.r_idx<<' '<<itm.d_idx;
-      cout<<" ||  "<<itm.cat<<"  ||"<<(float)(itm.size)/(float)(1024*1024);
-      cout<<' ||'<<itm.dt/1000<<' '<<itm.pri<<endl;
+      cout<<" ||  "<<itm.cat<<"  || "<<(float)(itm.size)/(float)(1024*1024);
+      cout<<' || '<<itm.dt/1000<<' '<<itm.pri<<endl;
     } 
   }
   cout<<"size vec_swap: "<<vec_swap.size()<<endl;
@@ -667,20 +715,22 @@ void SwapGPU::swap_plan(){
 
   /// select till maxLoad_ideal, wdto
   auto tempLoad = origin_load;
-  auto vec_swap_wdto = swap_select(vec_swap,tempLoad,maxLoad_ideal*1.4,"wdto");
+  auto memLimit_wdto = 350<<20;
+  //TODO(junzhe) memLimit = maxLoad_ideal*1.4
+  auto vec_swap_wdto = swap_select(vec_swap,tempLoad,memLimit_wdto,"wdto");
   cout<<"size of vec_swap_wdto: "<<vec_swap_wdto.size()<<endl;
-  auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
-    fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
-    file_load_wdto_ideal<<vec_load_wdto_ideal[i]<<endl;
-  }
-  auto tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
-  cout<<"------------------print max_load: (wdto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
+  // auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
+    // fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
+  // for (int i=maxLen; i<maxLen*2; i++){
+    // file_load_wdto_ideal<<vec_load_wdto_ideal[i]<<endl;
+  // }
+  // auto tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
+  // cout<<"------------------print max_load: (wdto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
 
   /// load_1 no overhead, based on pri
   //auto vec_swap_pri = vec_swap_pri;
-  auto vec_load_pri = origin_load;
-  auto vec_load_dto = origin_load;
+  // auto vec_load_pri = origin_load;
+  // auto vec_load_dto = origin_load;
   auto vec_load_wdto = origin_load;
   string mode = "stick-to-limit";
   // double overhead_pri = 0;
@@ -690,7 +740,7 @@ void SwapGPU::swap_plan(){
   // swap_sched(vec_swap_dto, vec_load_dto,overhead_dto,450<<20,mode);
   
   double overhead_wdto = 0;
-  swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,450<<20,mode);
+  swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,memLimit_wdto,mode);
   
 
   // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
@@ -848,7 +898,7 @@ void SwapGPU::Test_sched_switch_swap(){
     swap removed to DeploySwap
     v2: test after every index, at Append. order and index changed.
   */
-  ///test & schedule
+  ///test iteration
   if (((gc+1)%(maxLen_threshold) == 0) && (asyncSwapFlag == 0) && (testFlag == 0)){
     //TODO(junzhe) not lean, chances are globeCounter found more than 300 idx ago: redudant test.
     cout<<"gc, GC and vec_len before test: "<<gc<<' '<<globeCounter<<' '<<vec_block.size()<<endl;
@@ -857,6 +907,8 @@ void SwapGPU::Test_sched_switch_swap(){
     maxLen_threshold = std::min(2000,maxLen_threshold);
     if (maxLen > maxLen_threshold) {
       testFlag = 1;
+      three_more_globeCounter = globeCounter + 3*maxLen;
+      three_more_location = location + 3*maxLen;
       cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<<endl;
       cout<<"size of Table_sched: "<<Table_sched.size()<<endl;
       cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
@@ -865,17 +917,19 @@ void SwapGPU::Test_sched_switch_swap(){
    }
  }
  ///switch flag; next idx
- if ((gc+1) == globeCounter){
+ if ((gc+1) == three_more_globeCounter){
     swap_plan();
-    vector<double>vec_load2(&global_load[location],&global_load[location+3*maxLen]);
-    origin_load = vec_load2;
-    //load before swap, write in
-    fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
-    for (int i=0; i<origin_load.size(); i++){
-      file_load_origin<<origin_load[i]<<endl;
-    }
     asyncSwapFlag = 1;
-    cout<<"switched flag for at "<<globeCounter<<endl;
+    // vector<double>vec_load2(&global_load[three_more_location],&global_load[three_more_location+3*maxLen]);
+    // origin_load = vec_load2;
+    // //load before swap, write in
+    // fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
+    // for (int i=0; i<origin_load.size(); i++){
+    //   file_load_origin<<origin_load[i]<<endl;
+    // }
+    cout<<"switched flag for at "<<three_more_globeCounter<<endl;
+    cout<<"maxLen is "<<maxLen<<endl;
+    cout<<"globeCounter "<<globeCounter<<endl;
  }
 }
 

From 650a4e6e96882e08867c95914ab3f3492fd5d6de Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Wed, 29 Aug 2018 15:17:05 +0800
Subject: [PATCH 14/19] impl swap_construct_tables(), swap_update_tables(),
 DeploySwap()

verified itm 5 indices in Table_sched

vec_swap_select pass by reference in swap_sched()

impl swap_update_tables(), before DeploySwap(), both at Append()

for time being, remove negative r_idx itms && git push origin vd1

handle last itr by impl sizeSqn and verification to change asyncSwapFlag back to 0
---
 include/singa/core/device.h |  33 +++--
 src/core/device/swap_gpu.cc | 235 +++++++++++++++++++++++++++---------
 src/core/memory/memory.cc   |   4 +-
 3 files changed, 200 insertions(+), 72 deletions(-)

diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 8d51d61d09..db50dd4d4a 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -300,33 +300,41 @@ class SwapGPU : public Device {
   /// Free cpu memory.
   void Free(void* ptr) override;
 
+  //Append at every index: malloc, free, read, mutable
+  void Append(string blockInfo) override;
+
   //append info after Malloc, pair.
   void MakeMetaTable(Block* block,void* data_,int size) override; 
 
+  //all the testing, without swap, during Append()
+  void Test_sched_switch_swap();
+
   //test iteration, return GC
   int swap_test(vector<string>vec_block,int &maxLen, int &location);
 
-  //schedule algo
-  void swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
-  
-  //entire plan, from swap_select() to swap_sched()
+  //entire plan, from swap_select() to swap_sched(), swap_deploy_tables()
   void swap_plan();
 
   //selection algo
   vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode);
+
+  //schedule algo
+  void swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
   
-  //load profile as per synchronous swap.
-  vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
-  
-  //all the testing, without swap, during Append()
-  void Test_sched_switch_swap();
+  //make tables Table_sched and Table_meta
+  void swap_construct_tables(vector<SwapBlock>vec_swap_selct);
+
+  //update Table_meta, during Append()
+  void swap_update_tables(Block* tempBlock_);
 
   //swap, during Append()
   void DeploySwap();
 
-  //Append at every index: malloc, free, read, mutable
-  void Append(string blockInfo) override;
 
+
+  //load profile as per synchronous swap.
+  vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
+  
   //in case gpu ptr wrong. TODO(junzhe) to verify if needed.
   void* GetRealGpuPtr(const Block* block_) override;
 
@@ -349,6 +357,7 @@ class SwapGPU : public Device {
   //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D.
   map<int,std::tuple<int,int,int,int>>Table_sched; // changed to with sync_r_idx
 
+  // vector<SwapBlock>vec_swap_selct_global;
 
   //vec_block
   vector<string>vec_block; //iteration 0-3
@@ -356,6 +365,8 @@ class SwapGPU : public Device {
   vector<double>global_load; // from begining
   vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
   vector<onePieceMsg>vec_run;
+  vector<int>opsSequence; //sequence of operations of one middle iteration
+  vector<size_t>sizeSequence; //size of all operations of one middle iteration
   int asyncSwapFlag = 0; //0 for sync, 1 for async.
   int testFlag = 0; //0 means open for test, 1 means no need test anymore.
   int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 6768aee844..1521912b9c 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -405,7 +405,7 @@ vector<double> SwapGPU::swap_load_ideal(vector<double>vec_load,vector<SwapBlock>
   return vec_load_return;
 }
 
-void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode){
+void SwapGPU::swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode){
   /*
     update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit.
   */ 
@@ -417,7 +417,7 @@ void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_loa
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
       int readyIdx = itm.r_idx_ready;
-      cout<<"["<<itm.size<<"] ";
+      cout<<itm.r_idx<<" ["<<itm.size<<"] ";
       cout<<readyIdx;
       if (i > 0){
         readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
@@ -445,11 +445,11 @@ void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_loa
         // cout<<" ((("<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<readyIdx<<' '<<tempOverLimit_.first<<")))";
         readyIdx = tempOverLimit_.first - 1; //TODO(junzhe) boundary
         overhead+=(itm.t1p-vec_run[readyIdx+maxLen].t);
-        cout<<"==== overhead added "<<itm.t1p-vec_run[readyIdx+maxLen].t<<endl;
-        cout<<"time spent "<<SwapOutTime(itm.size)<<endl;
-        cout<<"so time "<<itm.t1<<endl;
-        cout<<"eo 1 time "<<itm.t1p<<endl;
-        cout<<"eo 2 time "<<vec_run[readyIdx+maxLen].t<<endl;
+        cout<<"==== overhead added "<<itm.t1p-vec_run[readyIdx+maxLen].t<<"... ";
+        // cout<<"time spent "<<SwapOutTime(itm.size)<<endl;
+        // cout<<"so time "<<itm.t1<<endl;
+        // cout<<"eo 1 time "<<itm.t1p<<endl;
+        // cout<<"eo 2 time "<<vec_run[readyIdx+maxLen].t<<endl;
 
       }
       cout<<" -> "<<readyIdx<<endl;   
@@ -463,6 +463,7 @@ void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_loa
     sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
+      cout<<itm.r_idx<<" ["<<itm.size<<"] ";
       int needIdx = itm.d_idx;
       cout<<needIdx;
       if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
@@ -489,7 +490,7 @@ void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_loa
 
       if ((tempOverLimit_3.second != -1) && (vec_run[tempOverLimit_3.second+maxLen].t > itm.t2p)) {
         overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p);
-        cout<<"==== overhead added "<<vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p<<endl;
+        cout<<"==== overhead added "<<vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p<<"... ";
         load_update(vec_load_temp,itm.i2p,tempOverLimit_3.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
         itm.i2p = tempOverLimit_3.second+1;
         auto tempOverLimit_4 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
@@ -548,6 +549,95 @@ void SwapGPU::swap_sched(vector<SwapBlock>vec_swap_selct, vector<double>&vec_loa
 }
 
 
+void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
+  cudaStream_t stream1;
+  cudaStream_t stream2;
+  cout<<"---------------print all 1, 1', 2', 2-----------"<<endl;
+  sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+  //for each swap select, make Table_sched and Table_meta
+  // for (int i = static_cast<int>(vec_swap_selct.size()-1);i>=0; i--){
+  for (int i =0; i<vec_swap_selct.size(); i++){
+    auto itm = vec_swap_selct[i];
+    if (itm.r_idx >= 0){
+    //TODO(junzhe) for time being, remove negative r_idx itms.
+      cout<<itm.r_idx<<" || "<<itm.i1<<" "<<itm.i1p<<" "<<itm.i2p<<" "<<itm.i2<<endl;
+      //i1 swap
+      if (Table_sched.find(itm.i1) == Table_sched.end()){
+        Table_sched[itm.i1] = std::make_tuple(itm.r_idx,0,-1,-1);
+      } else {
+        std::get<0>(Table_sched.find(itm.i1)->second) = itm.r_idx;
+        std::get<1>(Table_sched.find(itm.i1)->second) = 0;
+      }
+      //i2p swap
+      if (Table_sched.find(itm.i2p) == Table_sched.end()){
+        Table_sched[itm.i2p] = std::make_tuple(itm.r_idx,1,-1,-1);      
+      } else {
+        std::get<0>(Table_sched.find(itm.i2p)->second) = itm.r_idx;
+        std::get<1>(Table_sched.find(itm.i2p)->second) = 1;
+      }
+      // i1p sync
+      if (Table_sched.find(itm.i1p) == Table_sched.end()){
+        Table_sched[itm.i1p] = std::make_tuple(-1,-1,itm.r_idx,0);
+      } else {
+        std::get<2>(Table_sched.find(itm.i1p)->second) = itm.r_idx;
+        std::get<3>(Table_sched.find(itm.i1p)->second) = 0; 
+      }
+      //i2 sync
+      if (Table_sched.find(itm.i2) == Table_sched.end()){
+        Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1);
+      } else {
+        std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx;
+        std::get<3>(Table_sched.find(itm.i1p)->second) = 1;
+      }
+
+      ///Make Table_meta
+      void* tempPtr = nullptr;
+      cudaMallocHost(&tempPtr,itm.size); //pinned memory.
+      BlockMeta meta;
+      meta.size = itm.size;
+      meta.cpu_ptr = tempPtr;
+      meta.out_stream = stream1;
+      meta.in_stream = stream2;
+      //meta.last_out_idx = vec_swap_selct[i].last_out_idx;
+      //meta.last_in_idx = vec_swap_selct[i].last_in_idx;
+      //meta.i2 = vec_swap_selct[i].i2;
+      Table_meta[itm.r_idx] = meta;
+    }
+
+  }
+  cout<<"---------------print all 1, 1', 2', 2-----------"<<endl;
+  cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
+  cout<<"size of Table_sched =================="<<Table_sched.size()<<endl;
+  cout<<"print Table_sched, idx, r_idx, sync, direction"<<endl;
+  for (int i = -500; i<maxLen; i++){
+    if (!(Table_sched.find(i) == Table_sched.end())){
+      cout<<i<<"-->";
+      cout<<std::get<0>(Table_sched.find(i)->second)<<" ";
+      cout<<std::get<1>(Table_sched.find(i)->second)<<" ";
+      cout<<std::get<2>(Table_sched.find(i)->second)<<" ";
+      cout<<std::get<3>(Table_sched.find(i)->second)<<endl;
+    }
+  }
+
+}
+
+void SwapGPU::swap_update_tables(Block* tempBlock_){
+  // update Table_meta's block_ and data_; update once atfer swap test is passed.
+  //TODO(junzhe) should not be able to update negative r_idx, as of now.
+  if (testFlag == 1) {
+    //cout<<gc<<' '<<(gc-location)%maxLen<<' '<<blockInfo<<endl;
+    int r_gc = (gc-location)%maxLen;
+    if (!(Table_meta.find(r_gc)==Table_meta.end())){
+      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
+      //TODO(junzhe) verify the length change, if go in, value update
+      // cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
+      Table_meta.find(r_gc)->second.block_ = tempBlock_;
+      Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data();
+    }
+  }
+
+}
+
 int SwapGPU::swap_test(vector<string>vec_block,int &maxLen, int &location){
 
   ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx.
@@ -593,6 +683,15 @@ void SwapGPU::swap_plan(){
     vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - three_more_location - maxLen;
     vec_pieceMsg[i].t = vec_pieceMsg[i].t - tempTime_baseline;
   }
+
+  // build opsSqn, and sizeSqn
+  // cout<<"------printing sequenc--------"<<endl;
+  vector<onePieceMsg>one_itr(&vec_pieceMsg[location+4*maxLen],&vec_pieceMsg[location+5*maxLen]);
+  for (int i =0; i<one_itr.size();i++){
+    opsSequence.push_back(one_itr[i].MallocFree);
+    sizeSequence.push_back(one_itr[i].size);
+    // cout<<one_itr[i].MallocFree<<' '<<one_itr[i].size<<endl;
+  }
   //3 iterations of vec_run and vec_load, maxIdx and maxLoad
   vector<onePieceMsg>temp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]);
   vec_run = temp_vec_run;
@@ -715,9 +814,10 @@ void SwapGPU::swap_plan(){
 
   /// select till maxLoad_ideal, wdto
   auto tempLoad = origin_load;
-  auto memLimit_wdto = 350<<20;
+  auto memLimit_wdto = 550<<20;
   //TODO(junzhe) memLimit = maxLoad_ideal*1.4
   auto vec_swap_wdto = swap_select(vec_swap,tempLoad,memLimit_wdto,"wdto");
+  // vec_swap_selct_global = vec_swap_wdto;
   cout<<"size of vec_swap_wdto: "<<vec_swap_wdto.size()<<endl;
   // auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
     // fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
@@ -741,7 +841,9 @@ void SwapGPU::swap_plan(){
   
   double overhead_wdto = 0;
   swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,memLimit_wdto,mode);
-  
+
+  swap_construct_tables(vec_swap_wdto);
+
 
   // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
   // for (int i=maxLen; i<maxLen*2; i++){
@@ -910,8 +1012,6 @@ void SwapGPU::Test_sched_switch_swap(){
       three_more_globeCounter = globeCounter + 3*maxLen;
       three_more_location = location + 3*maxLen;
       cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<<endl;
-      cout<<"size of Table_sched: "<<Table_sched.size()<<endl;
-      cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
       cout<<"impt numbers (maxLen, location, GC) "<<maxLen<<' '<<location<<' '<<globeCounter<<endl;
       
    }
@@ -956,50 +1056,57 @@ void SwapGPU::MakeMetaTable(Block* block_,void* data_,int size){
 }
 
 void SwapGPU::DeploySwap(){
-   ///swap as per schedule: this version: both out and in sync
+   ///swap and sync as per schedule.
   int r_gc = (gc-location)%maxLen; 
+
   if ((asyncSwapFlag == 1) && (!(Table_sched.find(r_gc) == Table_sched.end()))){
     cout<<"--------sched action at "<<r_gc<<endl;
     auto swap_idx = std::get<0>(Table_sched.find(r_gc)->second);
     auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second);
     auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second);
     auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second);
-    if (swap_dir == 0){ SwapOut_idx(swap_idx); 
-      cout<<"Swap Out "<<swap_idx<<endl;
+    if (swap_dir == 0){ 
+      SwapOut_idx(swap_idx); 
+      cout<<"----Swap Out "<<swap_idx<<endl;
     }
-    if (swap_dir == 1){ SwapIn_idx(swap_idx); 
-      cout<<"Swap In "<<swap_idx<<endl;
+    if (swap_dir == 1){ 
+      SwapIn_idx(swap_idx); 
+      cout<<"----Swap In "<<swap_idx<<endl;
     }
     //TODO(junzhe) verify sync what else to be done
     if (sync_dir == 0){
+      ///sync swap-out, including sync, update block's data_ to nullptr, free data_, update meta.
       auto last_meta = Table_meta.find(sync_idx)->second;
       auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
       cudaEventSynchronize(last_meta.in_event);
       auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
+
+      // Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
+
       last_meta.block_->update_data(nullptr);
-      cout<<"to free data_"<<last_meta.data_<<endl;
+      // cout<<"to free data_"<<last_meta.data_<<endl;
       pool_->Free(last_meta.data_);
       last_meta.data_ = nullptr; //not really needed TODO(junzhe)
-      cout<<"sync out "<<sync_idx<<endl;
+      cout<<"----sync out "<<sync_idx<<endl;
       Table_meta.find(sync_idx)->second = last_meta;
     }
     if (sync_dir == 1){
+      ///sync swap-in, including sync, update block's data_ to new gpu address, update meta.
       //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe)
       auto last_meta = Table_meta.find(sync_idx)->second;
       auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
       cudaEventSynchronize(last_meta.out_event);
       auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      Table_not_at_device.erase(last_meta.block_);
+      // Table_not_at_device.erase(last_meta.block_);
       last_meta.block_->update_data(last_meta.data_);
-      cout<<"sync in "<<sync_idx<<endl;
+      cout<<"----sync in "<<sync_idx<<endl;
       Table_meta.find(sync_idx)->second = last_meta;
     }
-    cout<<"-------"<<endl;
   } 
 }
 
 void SwapGPU::Append(string blockInfo){
+
   vector<string> v = swap_split(blockInfo, " ");
   void* tempPtr;
   stringstream convert(v[1]);
@@ -1013,6 +1120,7 @@ void SwapGPU::Append(string blockInfo){
     string tempStr1 = strm1.str();
     blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2];
   }
+
   // update global load
   if (maxLen < maxLen_threshold){
     if (v[0] == "Malloc"){
@@ -1027,36 +1135,29 @@ void SwapGPU::Append(string blockInfo){
       global_load.push_back(global_load[global_load.size()-1]);
     }
   }
+
+  //append into vec_block
+  vec_block.push_back(blockInfo);
+
+
   //cout<<blockInfo<<endl;
   //cout<<tempBlock_->size()<<endl;
   //cout<<"load: "<<global_load[global_load.size()-1]<<" len of blockInfo and global_load "<<vec_block.size()<<' '<<global_load.size()<<endl;
-
   //std::this_thread::sleep_for(std::chrono::milliseconds(2000));
-  vec_block.push_back(blockInfo);
-  if (asyncSwapFlag == 1){
-    vec_block_fresh.push_back(blockInfo);
-  }
-  if ((maxLen>maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){
-    fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app);
-    for (int i =0; i<vec_block_fresh.size();i++){
-      file_block_fresh<<vec_block_fresh[i]<<endl;
-    }
-  }
+  
+  // if (asyncSwapFlag == 1){
+  //   vec_block_fresh.push_back(blockInfo);
+  // }
+  // if ((maxLen>maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){
+  //   fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app);
+  //   for (int i =0; i<vec_block_fresh.size();i++){
+  //     file_block_fresh<<vec_block_fresh[i]<<endl;
+  //   }
+  // }
   // fstream file_block5("append.csv", ios::in|ios::out|ios::app);
   // file_block5<<gc<<' '<<blockInfo<<' '<<(gc-1247)%612<<endl;
 
-  // update Table_meta's block_ and data_
-  if (maxLen > maxLen_threshold) {
-    //cout<<gc<<' '<<(gc-location)%maxLen<<' '<<blockInfo<<endl;
-    int r_gc = (gc-location)%maxLen;
-    if (!(Table_meta.find(r_gc)==Table_meta.end())){
-      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
-      //TODO(junzhe) verify the length change, if go in, value update
-      cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
-      Table_meta.find(r_gc)->second.block_ = tempBlock_;
-      Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data();
-    }
-  }
+  //print time duration per iteration
   if ((maxLen>maxLen_threshold) && ((gc-location)%(maxLen) == 0)){
     if (tempTime != 0){
       fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
@@ -1067,6 +1168,18 @@ void SwapGPU::Append(string blockInfo){
     tempTime = (std::chrono::system_clock::now()).time_since_epoch().count();
   }
 
+  //check if last iteration, TODO(junzhe) further verify with MallocFree.
+  if (asyncSwapFlag == 1){
+    int r_gc = (gc-location)%maxLen;
+    if (tempBlock_->size() != sizeSequence[r_gc]){
+      asyncSwapFlag = 0;
+      cout<<"!!!! asyncSwapFlag changed back to 0"<<endl;
+    }
+  }
+
+  //update Table_meta
+  swap_update_tables(tempBlock_);
+
   //deploy swap at every index.
   DeploySwap();
 
@@ -1078,18 +1191,18 @@ void SwapGPU::Append(string blockInfo){
 }
 
 void* SwapGPU::GetRealGpuPtr(const Block* block_){
-  //here should be not update_data()
-  auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second;
-  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  cudaEventSynchronize(reading_meta.in_event);
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  //cout<<"GetRealGpuPtr, overhead is: "<<t2-t1<<endl;
-  //cout<<"To update_data swap for (In) "<<Table_not_at_device.find(block_)->second<<" "<<reading_meta.data_<<" 0"<<endl;
-  //reading_meta.block_->update_data(reading_meta.data_);
+  // //here should be not update_data()
+  // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second;
+  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  // cudaEventSynchronize(reading_meta.in_event);
+  // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  // //cout<<"GetRealGpuPtr, overhead is: "<<t2-t1<<endl;
+  // //cout<<"To update_data swap for (In) "<<Table_not_at_device.find(block_)->second<<" "<<reading_meta.data_<<" 0"<<endl;
+  // //reading_meta.block_->update_data(reading_meta.data_);
 
-  //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<<Table_not_at_device.find(block_)->second<<endl;
+  // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<<Table_not_at_device.find(block_)->second<<endl;
 
-  Table_not_at_device.erase(reading_meta.block_);
+  // Table_not_at_device.erase(reading_meta.block_);
 
   return nullptr; //TODO(junzhe) attention, based on no change here.
 }
@@ -1105,7 +1218,8 @@ void SwapGPU::SwapOut_idx(const int r_idx){
   cudaEventRecord(meta.out_event,meta.out_stream);
   //cout<<"right after cudaMemcpyAsync"<<endl;
   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  cout<<"To update_data swap for (Out) "<<r_idx<<" "<<meta.block_<<" 0"<<endl;
+  // cout<<"To update_data swap for (Out) "<<r_idx<<" "<<meta.block_<<" 0"<<endl;
+  //update meta's out_event
   Table_meta.find(r_idx)->second = meta;
   //cout<<"time for asynchrous: "<<t2-t1<<endl;
   //cudaEventSynchronize(event1);
@@ -1127,11 +1241,12 @@ void SwapGPU::SwapIn_idx(const int r_idx){
   //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
   //void* to_rm_ptr = meta.data_;
   meta.data_ = ptr;
-  cout<<"right before cudaMemcpyAsync In"<<endl;
+  // cout<<"right before cudaMemcpyAsync In"<<endl;
   err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
   cudaEventRecord(meta.in_event,meta.in_stream);
-  cout<<"right after cudaMemcpyAsync"<<endl;
-  cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
+  // cout<<"right after cudaMemcpyAsync"<<endl;
+  // cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
+  //upadte meta's new gpu addr, in_event
   Table_meta.find(r_idx)->second = meta;
   //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update.
   //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index 93a17a823f..ae8293b3db 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -98,10 +98,12 @@ void CnMemPool::Malloc(void **ptr, const size_t size) {
 
 void CnMemPool::Free(void *ptr) {
   CHECK(initialized_) << "Cannot free the memory as the pool is not initialzied";
-  //cout<<"(normal)to free ptr "<<ptr<<endl;
+  // cout<<"to free ptr "<<ptr<<endl;
   cnmemStatus_t status = cnmemFree(ptr, NULL);
+  // cout<<"done cnmemFree "<<endl;
   CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
       << " " << cnmemGetErrorString(status);
+  // cout<<"done status check"<<endl;
 }
 
 // ===========================================================================

From 479ad2abf17f8afbfb78916c38d2b5b4e621c525 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Fri, 31 Aug 2018 11:18:55 +0800
Subject: [PATCH 15/19] enable include negative r_idx into Table_sched

---
 src/core/device/swap_gpu.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 1521912b9c..6d101a23d2 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -558,7 +558,7 @@ void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
   // for (int i = static_cast<int>(vec_swap_selct.size()-1);i>=0; i--){
   for (int i =0; i<vec_swap_selct.size(); i++){
     auto itm = vec_swap_selct[i];
-    if (itm.r_idx >= 0){
+    // if (itm.r_idx >= 0){
     //TODO(junzhe) for time being, remove negative r_idx itms.
       cout<<itm.r_idx<<" || "<<itm.i1<<" "<<itm.i1p<<" "<<itm.i2p<<" "<<itm.i2<<endl;
       //i1 swap
@@ -602,7 +602,7 @@ void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
       //meta.last_in_idx = vec_swap_selct[i].last_in_idx;
       //meta.i2 = vec_swap_selct[i].i2;
       Table_meta[itm.r_idx] = meta;
-    }
+    // }
 
   }
   cout<<"---------------print all 1, 1', 2', 2-----------"<<endl;

From 383fffe6373474781b25b719ef0b112f05db9b1f Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Fri, 31 Aug 2018 14:14:09 +0800
Subject: [PATCH 16/19] cross iteration swap, last iteration, milestone

correct swap_construct_tables(), included negative r_idx for swap_update_tables() and DeploySwap()

include negative r_idx for DeploySwap()

impl GetRealGpuPtr() to swapIn nullptr Block at last iteration

impl GetRealGpuPtr(), and optimize data() and mutable_data()

impl GetRealGpuPtr(), and optimize data() and mutable_data()

verify const issue

change to return tempData instead of updating data_

without remove erasing in Table_not_at_device

milestone of last itr, at 550 MB
---
 include/singa/core/device.h |   5 +-
 src/core/common/common.cc   |   8 +-
 src/core/device/swap_gpu.cc | 156 +++++++++++++++++++++++++-----------
 3 files changed, 118 insertions(+), 51 deletions(-)

diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index db50dd4d4a..ae9dd88425 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -327,9 +327,12 @@ class SwapGPU : public Device {
   //update Table_meta, during Append()
   void swap_update_tables(Block* tempBlock_);
 
-  //swap, during Append()
+  //swap/sync during Append()
   void DeploySwap();
 
+  //exec DelpoySwap
+  void DeploySwap_exec(int r_gc);
+
 
 
   //load profile as per synchronous swap.
diff --git a/src/core/common/common.cc b/src/core/common/common.cc
index 5a6f61268b..f3d144a9c8 100644
--- a/src/core/common/common.cc
+++ b/src/core/common/common.cc
@@ -48,7 +48,9 @@ void* Block::mutable_data() {
       //cout<<"to sleep"<<endl;
       cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
       auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"slept to get data_ updated: "<<this<<' '<<data_<<endl;
+      cout<<"print returned tempData_ "<<tempData_<<endl;
+      // cout<<"slept to get data_ updated: (mutable_data) "<<this<<' '<<data_<<endl;
+      return static_cast<char*>(tempData_) + offset_;
     }
     
     return static_cast<char*>(data_) + offset_;
@@ -76,7 +78,9 @@ const void* Block::data() const {
       //cout<<"to sleep"<<endl;
       cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
       auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"slept to get data_ updated"<<endl;
+      cout<<"print returned tempData_ "<<tempData_<<endl;
+      // cout<<"slept to get data_ updated: (data) "<<this<<' '<<data_<<endl;
+      return static_cast<char*>(tempData_) + offset_;
     }
 
 
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 6d101a23d2..82a72f80b1 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -587,7 +587,7 @@ void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
         Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1);
       } else {
         std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx;
-        std::get<3>(Table_sched.find(itm.i1p)->second) = 1;
+        std::get<3>(Table_sched.find(itm.i2)->second) = 1;
       }
 
       ///Make Table_meta
@@ -605,7 +605,7 @@ void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
     // }
 
   }
-  cout<<"---------------print all 1, 1', 2', 2-----------"<<endl;
+  cout<<"---------------print all 1, 1', 2', 2-----------DONE"<<endl;
   cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
   cout<<"size of Table_sched =================="<<Table_sched.size()<<endl;
   cout<<"print Table_sched, idx, r_idx, sync, direction"<<endl;
@@ -623,9 +623,10 @@ void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
 
 void SwapGPU::swap_update_tables(Block* tempBlock_){
   // update Table_meta's block_ and data_; update once atfer swap test is passed.
-  //TODO(junzhe) should not be able to update negative r_idx, as of now.
+  // enable to update negative r_idx. 
+  // it's safe in below procedure, as r_gc and r_gc_n should never be the same.
   if (testFlag == 1) {
-    //cout<<gc<<' '<<(gc-location)%maxLen<<' '<<blockInfo<<endl;
+    //update positive r_idx
     int r_gc = (gc-location)%maxLen;
     if (!(Table_meta.find(r_gc)==Table_meta.end())){
       //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
@@ -634,6 +635,16 @@ void SwapGPU::swap_update_tables(Block* tempBlock_){
       Table_meta.find(r_gc)->second.block_ = tempBlock_;
       Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data();
     }
+
+    //update negative r_idx
+    int r_gc_n = r_gc - maxLen;
+    if (!(Table_meta.find(r_gc_n)==Table_meta.end())){
+      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
+      //TODO(junzhe) verify the length change, if go in, value update
+      // cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
+      Table_meta.find(r_gc_n)->second.block_ = tempBlock_;
+      Table_meta.find(r_gc_n)->second.data_ = tempBlock_->get_data();
+    }
   }
 
 }
@@ -1058,51 +1069,68 @@ void SwapGPU::MakeMetaTable(Block* block_,void* data_,int size){
 void SwapGPU::DeploySwap(){
    ///swap and sync as per schedule.
   int r_gc = (gc-location)%maxLen; 
+  int r_gc_n = r_gc - maxLen;
 
-  if ((asyncSwapFlag == 1) && (!(Table_sched.find(r_gc) == Table_sched.end()))){
-    cout<<"--------sched action at "<<r_gc<<endl;
-    auto swap_idx = std::get<0>(Table_sched.find(r_gc)->second);
-    auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second);
-    auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second);
-    auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second);
-    if (swap_dir == 0){ 
-      SwapOut_idx(swap_idx); 
-      cout<<"----Swap Out "<<swap_idx<<endl;
-    }
-    if (swap_dir == 1){ 
-      SwapIn_idx(swap_idx); 
-      cout<<"----Swap In "<<swap_idx<<endl;
+  if (asyncSwapFlag == 1){
+    if ((gc < three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) {
+      cout<<"condition A"<<endl;
+      DeploySwap_exec(r_gc_n);
     }
-    //TODO(junzhe) verify sync what else to be done
-    if (sync_dir == 0){
-      ///sync swap-out, including sync, update block's data_ to nullptr, free data_, update meta.
-      auto last_meta = Table_meta.find(sync_idx)->second;
-      auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      cudaEventSynchronize(last_meta.in_event);
-      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-
-      // Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
-
-      last_meta.block_->update_data(nullptr);
-      // cout<<"to free data_"<<last_meta.data_<<endl;
-      pool_->Free(last_meta.data_);
-      last_meta.data_ = nullptr; //not really needed TODO(junzhe)
-      cout<<"----sync out "<<sync_idx<<endl;
-      Table_meta.find(sync_idx)->second = last_meta;
+    if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) {
+      cout<<"condition B"<<endl;
+      DeploySwap_exec(r_gc_n);
     }
-    if (sync_dir == 1){
-      ///sync swap-in, including sync, update block's data_ to new gpu address, update meta.
-      //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe)
-      auto last_meta = Table_meta.find(sync_idx)->second;
-      auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      cudaEventSynchronize(last_meta.out_event);
-      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      // Table_not_at_device.erase(last_meta.block_);
-      last_meta.block_->update_data(last_meta.data_);
-      cout<<"----sync in "<<sync_idx<<endl;
-      Table_meta.find(sync_idx)->second = last_meta;
+    if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc) == Table_sched.end()))) {
+      cout<<"condition C"<<endl;
+      DeploySwap_exec(r_gc);
     }
-  } 
+  }
+}
+
+
+void SwapGPU::DeploySwap_exec(int r_gc){
+  cout<<"--------sched action at "<<r_gc<<endl;
+  auto swap_idx = std::get<0>(Table_sched.find(r_gc)->second);
+  auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second);
+  auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second);
+  auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second);
+  if (swap_dir == 0){ 
+    SwapOut_idx(swap_idx); 
+    cout<<"----Swap Out "<<swap_idx<<endl;
+  }
+  if (swap_dir == 1){ 
+    SwapIn_idx(swap_idx); 
+    cout<<"----Swap In "<<swap_idx<<endl;
+  }
+  //TODO(junzhe) verify sync what else to be done
+  if (sync_dir == 0){
+    ///sync swap-out, including sync, update block's data_ to nullptr, free data_, update meta.
+    auto last_meta = Table_meta.find(sync_idx)->second;
+    auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    cudaEventSynchronize(last_meta.in_event);
+    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+
+    Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
+
+    last_meta.block_->update_data(nullptr);
+    // cout<<"to free data_"<<last_meta.data_<<endl;
+    pool_->Free(last_meta.data_);
+    last_meta.data_ = nullptr; //not really needed TODO(junzhe)
+    cout<<"----sync out "<<sync_idx<<endl;
+    Table_meta.find(sync_idx)->second = last_meta;
+  }
+  if (sync_dir == 1){
+    ///sync swap-in, including sync, update block's data_ to new gpu address, update meta.
+    //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe)
+    auto last_meta = Table_meta.find(sync_idx)->second;
+    auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    cudaEventSynchronize(last_meta.out_event);
+    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+    Table_not_at_device.erase(last_meta.block_);
+    last_meta.block_->update_data(last_meta.data_);
+    cout<<"----sync in "<<sync_idx<<endl;
+    Table_meta.find(sync_idx)->second = last_meta;
+  }
 }
 
 void SwapGPU::Append(string blockInfo){
@@ -1191,6 +1219,37 @@ void SwapGPU::Append(string blockInfo){
 }
 
 void* SwapGPU::GetRealGpuPtr(const Block* block_){
+  // in case that block is at host memory, swapIn ad hoc.
+  auto r_idx = Table_not_at_device.find(block_)->second;
+
+  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  cudaError_t err;
+  BlockMeta meta = Table_meta.find(r_idx)->second;
+  cudaEventCreate (&meta.in_event);
+  //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
+  void* ptr = nullptr;
+  pool_->Malloc((void**)&ptr, meta.size);
+  //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
+  //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
+  //void* to_rm_ptr = meta.data_;
+  meta.data_ = ptr;
+  // cout<<"right before cudaMemcpyAsync In"<<endl;
+  err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
+  cudaEventRecord(meta.in_event,meta.in_stream);
+  // cout<<"right after cudaMemcpyAsync"<<endl;
+  // cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
+  //upadte meta's new gpu addr, in_event
+  
+
+  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  cudaEventSynchronize(meta.out_event);
+  // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
+  // Table_not_at_device.erase(block_);
+  // last_meta.block_->update_data(last_meta.data_);
+  // cout<<"----sync in "<<sync_idx<<endl;
+  // Table_meta.find(sync_idx)->second = last_meta;
+  Table_meta.find(r_idx)->second = meta;
+
   // //here should be not update_data()
   // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second;
   // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
@@ -1199,12 +1258,13 @@ void* SwapGPU::GetRealGpuPtr(const Block* block_){
   // //cout<<"GetRealGpuPtr, overhead is: "<<t2-t1<<endl;
   // //cout<<"To update_data swap for (In) "<<Table_not_at_device.find(block_)->second<<" "<<reading_meta.data_<<" 0"<<endl;
   // //reading_meta.block_->update_data(reading_meta.data_);
-
   // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<<Table_not_at_device.find(block_)->second<<endl;
-
   // Table_not_at_device.erase(reading_meta.block_);
+  // block_->update_data(static_cast<Block*>(ptr));
+  
+  cout<<"print ptr from function GetRealGpuPtr() "<<ptr<<endl;
 
-  return nullptr; //TODO(junzhe) attention, based on no change here.
+  return ptr; //TODO(junzhe) attention, based on no change here.
 }
 
 void SwapGPU::SwapOut_idx(const int r_idx){

From 2fb3f027c0e4b4ab59c5a3d5eba25f1634d6e450 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Mon, 10 Sep 2018 16:06:56 +0800
Subject: [PATCH 17/19] vd2: swap+pool new class of pool: SwapPool important
 APIs: PoolOpt(), Malloc(), Free() PoolOpt() takes in M/F sequences including
 those induced by swapping cross-iteration variables and last iteration case
 solved.

record down MF after swap done, for one iteration
---
 examples/cifar10/train.py   |   4 +-
 include/singa/core/device.h |   1 +
 include/singa/core/memory.h |  36 ++-
 src/core/device/cuda_gpu.cc |   2 +-
 src/core/device/platform.cc |   2 +-
 src/core/device/swap_gpu.cc |  89 ++++++-
 src/core/memory/memory.cc   | 467 ++++++++++++++++++++----------------
 7 files changed, 376 insertions(+), 225 deletions(-)

diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index 652772ba5a..861bd65ac0 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -130,7 +130,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         dev = device.get_default_device()
     else:
         print('Using GPU')
-        dev = device.create_cuda_gpu_on(1)
+        dev = device.create_cuda_gpu_on(0)
 
     net.to_device(dev)
     opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
@@ -153,7 +153,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
         fileTimeLog.write('Epoch %d: ' % epoch)
         fileTimeLog.write(str(int(round(time.time()*1000))))
         fileTimeLog.write('\n')
-        for b in range(15): #num_train_batch):
+        for b in range(20): #num_train_batch):
             print ("start of iteration %d: " %b)
             #time.sleep(1)
             fileTimeLog.write('iteration %d: ' % b)
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index ae9dd88425..96bc8e9c41 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -365,6 +365,7 @@ class SwapGPU : public Device {
   //vec_block
   vector<string>vec_block; //iteration 0-3
   vector<string>vec_block_fresh; //iteration 4 5 6
+  vector<string>vec_block_mf; //itr 8 9 10
   vector<double>global_load; // from begining
   vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
   vector<onePieceMsg>vec_run;
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index cc40d0433c..343b4449de 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -50,6 +50,8 @@ class DeviceMemPool {
   virtual void Malloc(void** ptr, const size_t size)  = 0;
   virtual void Free(void* ptr)  = 0;
   virtual void Append(string blockInfo) = 0;
+
+  virtual void PoolOpt(vector<string> &vec_mf) = 0;
   
   virtual void SwapOut(void* data_) = 0;
   virtual void SwapIn(void* data_) = 0;
@@ -74,7 +76,9 @@ class CnMemPool : public DeviceMemPool {
 
   void Malloc(void** ptr, const size_t size);
   void Free(void* ptr);
-    void Append(string blockInfo){}
+  void Append(string blockInfo){}
+
+  void PoolOpt(vector<string> &vec_mf) override {}
     
   void SwapOut(void* data_) override {}
   void SwapIn(void* data_) override {}
@@ -102,7 +106,9 @@ class CudaMemPool : public DeviceMemPool {
  public:
   void Malloc(void** ptr, const size_t size) override;
   void Free(void* ptr) override;
-void Append(string blockInfo){}
+  void Append(string blockInfo){}
+
+  void PoolOpt(vector<string> &vec_mf) override {}
 
   void SwapOut(void* data_) override {}
   void SwapIn(void* data_) override {}
@@ -134,9 +140,11 @@ class SmartMemPool: public DeviceMemPool {
     void getMaxLoad(void);
     std::pair<size_t, size_t> GetMemUsage() override;
     void Append(string blockInfo);
+
+    void PoolOpt(vector<string> &vec_mf) override {}
     
-  void SwapOut(void* data_) override {}
-  void SwapIn(void* data_) override {}
+    void SwapOut(void* data_) override {}
+    void SwapIn(void* data_) override {}
 protected:
     void Init();
 private:
@@ -196,19 +204,22 @@ struct SwapMeta{
     void* d_ptr; //not used for
 };
 
-class Swap : public DeviceMemPool {
+class SwapPool : public DeviceMemPool {
 public:
-    Swap(const MemPoolConf &conf); //constructor
+    SwapPool(const MemPoolConf &conf); //constructor
     //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
     void Malloc(void** ptr, const size_t size);
     void Free(void* ptr);
-    ~Swap();
+    ~SwapPool();
     void getMaxLoad(void);
     std::pair<size_t, size_t> GetMemUsage() override;
     void Append(string blockInfo);
     
     void SwapOut(void* data_);
     void SwapIn(void* data_);
+
+    //PoolOpt() construct pool based on MF info after Swap constructed.
+    void PoolOpt(vector<string> &vec_mf);
 protected:
     void Init();
 private:
@@ -219,8 +230,15 @@ class Swap : public DeviceMemPool {
     std::mutex mtx_; 
     vector<string> vec_block;
     size_t swapLimit = 1<<23; //8MB
-    map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
-    map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
+    int poolFlag = 0;
+    int pc = 0;
+    int maxLen_mf = 0;
+    void* ptrPool = nullptr;
+    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
+    map<int,lookUpElement>Table_r2v; //r-> vertex
+    vector<pair<int,lookUpElement>>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe)
+    // map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
+    // map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
 };
 
 #endif
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 16de1ef5ca..52d4b4fb02 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -48,7 +48,7 @@ const int kNumCudaStream = 1;
 CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
   MemPoolConf conf;
   conf.add_device(id);
-  pool_ = std::make_shared<Swap>(conf);
+  pool_ = std::make_shared<CnMemPool>(conf);
   Setup();
 }
 
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
index d64b2594c2..48b2a94520 100644
--- a/src/core/device/platform.cc
+++ b/src/core/device/platform.cc
@@ -128,7 +128,7 @@ Platform::CreateCudaGPUsOn(const vector<int> &devices, size_t init_size) {
     conf.add_device(device);
     CHECK_LE(bytes, Platform::GetGPUMemSize(device).first);
   }
-  auto pool = std::make_shared<CnMemPool>(conf);
+  auto pool = std::make_shared<SwapPool>(conf);
 
   vector<shared_ptr<Device> > ret;
   for (auto device : devices) {
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 82a72f80b1..cd4f1fdbd9 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -42,7 +42,6 @@ const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
 
 ///functions to be used
 ///Section for structs and respective sorting function:
-// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm
 
 
 
@@ -924,7 +923,8 @@ SwapGPU::SwapGPU(int id) : Device(id, kNumCudaStream) {
 
   MemPoolConf conf;
   conf.add_device(id);
-  pool_ = std::make_shared<Swap>(conf);
+  //TODO(junzhe) note that it has been <Swap> for building SwapGPU, which doesnt matter.
+  pool_ = std::make_shared<SwapPool>(conf); 
   Setup();
 
 }
@@ -987,6 +987,26 @@ void* SwapGPU::Malloc(int size) {
   if (size > 0) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Malloc((void**)&ptr, size);
+
+    ///append vec_block_mf
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+      && ((gc - maxLen) >= three_more_globeCounter)){
+      string tempStr1 ="Malloc ";
+      stringstream strm2;
+      strm2<<ptr;
+      string tempStr2 = strm2.str();
+      stringstream strm3;
+      strm3<<size;
+      string tempStr3 = strm3.str();
+      string temp = tempStr1+tempStr2+" "+tempStr3;
+      vec_block_mf.push_back(temp);
+    }
+    //record mf semantics after swap plan done
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"Malloc "<<ptr<<" "<<size;
+      file_mf_one_itr<<endl;
+    }
     // TODO(wangwei) remove the memset.
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
@@ -1000,6 +1020,21 @@ void SwapGPU::Free(void* ptr) {
   if (ptr != nullptr) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Free(ptr);
+    ///append vec_block_mf
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+      && ((gc - maxLen) >= three_more_globeCounter)){
+      string tempStr1 ="Free ";
+      stringstream strm2;
+      strm2<<ptr;
+      string tempStr2 = strm2.str();
+      string temp = tempStr1+tempStr2;
+      vec_block_mf.push_back(temp);
+    }
+
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"Free "<<ptr<<endl;
+    }
   }
 
   //cout<<"free done"<<endl; 
@@ -1115,6 +1150,21 @@ void SwapGPU::DeploySwap_exec(int r_gc){
     last_meta.block_->update_data(nullptr);
     // cout<<"to free data_"<<last_meta.data_<<endl;
     pool_->Free(last_meta.data_);
+    ///append vec_block_mf
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+      && ((gc - maxLen) >= three_more_globeCounter)){
+      string tempStr1 ="Free ";
+      stringstream strm2;
+      strm2<<last_meta.data_;
+      string tempStr2 = strm2.str();
+      string temp = tempStr1+tempStr2;
+      vec_block_mf.push_back(temp);
+    }
+
+    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"Free "<<last_meta.data_<<" SwapOut(Sync)"<<endl;
+    }
     last_meta.data_ = nullptr; //not really needed TODO(junzhe)
     cout<<"----sync out "<<sync_idx<<endl;
     Table_meta.find(sync_idx)->second = last_meta;
@@ -1213,8 +1263,24 @@ void SwapGPU::Append(string blockInfo){
 
   //test moved from start of malloc/free to end of append, only gc+1 changed
   Test_sched_switch_swap();
-  //NOTE: this gc++ includes read/write and AppendLayer as well, in addition to malloc/free.
+  //NOTE: this gc includes read/write and AppendLayer as well, in addition to malloc/free.
   gc++;
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) == three_more_globeCounter)){
+    cout<<"==================to call PoolOpt"<<endl;
+    fstream file_mf_8910("mf_8910.csv", ios::in|ios::out|ios::app);
+    for (int i = 0; i< vec_block_mf.size();i++){
+      file_mf_8910<<vec_block_mf[i]<<endl;
+    }
+    cout<<"len of vec_block_mf: "<<vec_block_mf.size()<<endl;
+    pool_->PoolOpt(vec_block_mf);
+    cout<<"==================to call PoolOpt done"<<endl;
+  }
+
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) 
+    && ((gc - three_more_globeCounter)%maxLen == 0)){
+      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+      file_mf_one_itr<<"-----new itr------"<<endl;
+  }
 
 }
 
@@ -1297,6 +1363,23 @@ void SwapGPU::SwapIn_idx(const int r_idx){
   //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
   void* ptr = nullptr;
   pool_->Malloc((void**)&ptr, meta.size);
+  ///append vec_block_mf
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
+    && ((gc - maxLen) >= three_more_globeCounter)){
+    string tempStr1 ="Malloc ";
+    stringstream strm2;
+    strm2<<ptr;
+    string tempStr2 = strm2.str();
+    stringstream strm3;
+    strm3<<meta.size;
+    string tempStr3 = strm3.str();
+    string temp = tempStr1+tempStr2+" "+tempStr3;
+    vec_block_mf.push_back(temp);
+  }
+  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
+    file_mf_one_itr<<"Malloc "<<ptr<<" "<<meta.size<<" swapIn"<<endl;
+  }
   //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
   //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
   //void* to_rm_ptr = meta.data_;
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index ae8293b3db..8c7bc1c057 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -139,8 +139,8 @@ Vertex::Vertex(int n, size_t s, int r1, int d1){
 
 
 ///Section for structs and respective sorting function:
-// onePieceMsg, onePairMsg, oneIterMsg, version 11/30 3pm
-struct onePieceMsg{
+// onePieceMsg_pool, onePairMsg, oneIterMsg, version 11/30 3pm
+struct onePieceMsg_pool{
     /*
      members: [ptr, size, MallocFree, idx]
      */
@@ -148,15 +148,15 @@ struct onePieceMsg{
     size_t size;
     int MallocFree;
     int idx;
-    onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+    onePieceMsg_pool(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
 };
 
 
 struct less_than_ptrIdx{
     /*
-     sort onePieceMsg by ptr and then idx.
+     sort onePieceMsg_pool by ptr and then idx.
      */
-    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    inline bool operator() (const onePieceMsg_pool& struct1, const onePieceMsg_pool& struct2)
     {
         return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
     }
@@ -247,11 +247,11 @@ vector<string> split(string s, string delimiter) {
 //vector of pairMsg is used in run.
 //vector of iterMsg is used in test.
 
-vector<onePieceMsg> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+vector<onePieceMsg_pool> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
     /*
-     convert vector of string into vector of onePieceMsg, sorted by ptr and then idx, and update idxRange to pieceMsgVec size.
+     convert vector of string into vector of onePieceMsg_pool, sorted by ptr and then idx, and update idxRange to pieceMsgVec size.
      */
-    vector<onePieceMsg>onePieceMsgVec_;
+    vector<onePieceMsg_pool>onePieceMsg_poolVec_;
     for (int i=0;i<vec.size();i++) {
         vector<string> v = split(vec[i], " ");
         if (v[0]=="Malloc"){
@@ -262,24 +262,24 @@ vector<onePieceMsg> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
                 result =-1;
                 cout<<"error for converting size from str to int."<<endl;
             }
-            onePieceMsg tempMsg(v[1],result, 1, i);
-            onePieceMsgVec_.push_back(tempMsg);
+            onePieceMsg_pool tempMsg(v[1],result, 1, i);
+            onePieceMsg_poolVec_.push_back(tempMsg);
         }else if (v[0]=="Free"){
-            onePieceMsg tempMsg(v[1],-1, -1, i);
-            onePieceMsgVec_.push_back(tempMsg);
+            onePieceMsg_pool tempMsg(v[1],-1, -1, i);
+            onePieceMsg_poolVec_.push_back(tempMsg);
         }else {
             cout<<"error for process the onePriceMsg."<<endl;
         }
     }
     
-    sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx());
-    idxRange = static_cast<int>(onePieceMsgVec_.size());
+    sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx());
+    idxRange = static_cast<int>(onePieceMsg_poolVec_.size());
 
-    return onePieceMsgVec_;
+    return onePieceMsg_poolVec_;
 }// end of strVec_2_pieceMsgVec function
 
 
-pair<vector<onePairMsg>,vector<onePairMsg>> pieceMsgVec_2_pairOfPairMsgVec(vector<onePieceMsg>onePieceMsgVec_, int idxRange){
+pair<vector<onePairMsg>,vector<onePairMsg>> pieceMsgVec_2_pairOfPairMsgVec(vector<onePieceMsg_pool>onePieceMsg_poolVec_, int idxRange){
     /*
      pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks.
      */
@@ -288,27 +288,27 @@ pair<vector<onePairMsg>,vector<onePairMsg>> pieceMsgVec_2_pairOfPairMsgVec(vecto
     int i=0;
     
     //while loop processes a pair at each time, if got a pair.
-    while (i<(onePieceMsgVec_.size()-1)){
+    while (i<(onePieceMsg_poolVec_.size()-1)){
         //condition A: start with free. do nothing.
-        if (onePieceMsgVec_[i].MallocFree==-1){
+        if (onePieceMsg_poolVec_[i].MallocFree==-1){
             i+=1;
         }
         //condition B: start with Malloc, next item same ptr and is free.
-        if ((onePieceMsgVec_[i].MallocFree==1)&& (onePieceMsgVec_[i+1].MallocFree==-1)&&((onePieceMsgVec_[i].ptr==onePieceMsgVec_[i+1].ptr))){
-            onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,onePieceMsgVec_[i+1].idx);
+        if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)&&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){
+            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx);
             onePairMsgVec_1.push_back(tempPair);
             i+=2;
         }
         // condition C: start with Malloc, no free.
-        if ((onePieceMsgVec_[i].MallocFree==1)&&(onePieceMsgVec_[i].ptr!=onePieceMsgVec_[i+1].ptr)){
-            onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,idxRange);
+        if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){
+            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
             onePairMsgVec_2.push_back(tempPair);
             i+=1;
         }
     }//end of while
     //condition D: if still left with the last item
-    if ((i<onePieceMsgVec_.size())&&(onePieceMsgVec_[i+1].MallocFree==1)){
-        onePairMsg tempPair(onePieceMsgVec_[i].idx,onePieceMsgVec_[i].size,onePieceMsgVec_[i].idx,idxRange);
+    if ((i<onePieceMsg_poolVec_.size())&&(onePieceMsg_poolVec_[i+1].MallocFree==1)){
+        onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
         onePairMsgVec_2.push_back(tempPair);
         i+=1;
     }
@@ -514,8 +514,8 @@ vector<Vertex> colorSomeVertices(vector<onePairMsg> pairMsgVec_, size_t &offset,
 ///get cross-iteration duration pairs
 pair<map<int,int>,map<int,int>> cross_itr_durations(vector<string>vec_double, int location, int maxLen, int &doubleRange){
     
-    vector<onePieceMsg>onePieceMsgVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_2,doubleRange);
+    vector<onePieceMsg_pool>onePieceMsg_poolVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange);
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_2,doubleRange);
     
     map<int,int>Table_r2d; //full duration info, cross-iteration duration.
     map<int,int>Table_d2r;
@@ -536,8 +536,8 @@ vector<Vertex> run(vector<string>vec, int &idxRange, size_t &offset, size_t &off
      update idxRange, offset.
      time complexity: O(n^2) where n is maxLen.
      */
-    vector<onePieceMsg>onePieceMsgVec_ = strVec_2_pieceMsgVec(vec,idxRange);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_,idxRange);
+    vector<onePieceMsg_pool>onePieceMsg_poolVec_ = strVec_2_pieceMsgVec(vec,idxRange);
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_,idxRange);
     //1. normal blocks 2. cross-iteration blocks.
     vector<onePairMsg>pairMsgVec_1 = pairOfPairMsgVec_.first;
     vector<onePairMsg>pairMsgVec_2 = pairOfPairMsgVec_.second;
@@ -643,8 +643,8 @@ int test(vector<string>vec3, int &maxLen, int &location){
      */
     cout<<"====================== test ========================="<<endl;
     int idxRange3=0;
-    vector<onePieceMsg>onePieceMsgVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsgVec_3,idxRange3);
+    vector<onePieceMsg_pool>onePieceMsg_poolVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3);
+    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_3,idxRange3);
     vector<size_t>rep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_);
     
     //get repeated sub vector.
@@ -976,13 +976,13 @@ void SmartMemPool::Append(string blockInfo) {
     vec_block_RWMF.push_back(blockInfo);
 }
 
-///Swap:
-Swap::Swap(const MemPoolConf &conf){
+///SwapPool
+SwapPool::SwapPool(const MemPoolConf &conf){
     conf_ = conf;
 }
 
-void Swap::Init(){
-  //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
+void SwapPool::Init(){
+
   mtx_.lock();
   if(!initialized_){
     initialized_ =true;
@@ -990,203 +990,252 @@ void Swap::Init(){
   mtx_.unlock();
 }
 
-void Swap::Malloc(void** ptr, const size_t size){
-  //cout<<"to malloc"<<endl;
-  cudaError_t status = cudaMalloc(ptr, size);
-  CHECK_EQ(status, cudaError_t::cudaSuccess);
-  SwapMeta cpu,gpu;
-  cpu.swapSize = size;
-  gpu.swapSize = size;
-  gpu.ptr = *ptr;
-  pair<SwapMeta,SwapMeta>meta = std::make_pair(cpu, gpu);
-  Table_Meta[*ptr] = meta;
-
- swapLookUpElement temp;
- temp.size = size;
- Table_id2LookUpElement[*ptr] = temp;
-  //  int i = 0;
-  // if (!(Table_id2LookUpElement.find(*ptr)==Table_id2LookUpElement.end())){
-  //     i = i + 1;
-  //     temp.data_ = *ptr +i*sizeof(char); 
-  //     while(!(Table_id2LookUpElement.find(temp.data_)==Table_id2LookUpElement.end())){
-  //       //TODO(swap) verify this loop, can simplify as well.
-  //       i = i + 1
-  //       temp.data_ = *ptr +i*sizeof(char);
-  //     }
-  // } else {
-  //     temp.data_ = *ptr;
-  // }
-  // temp.realGpuPtr = *ptr;
-  // temp.location = 1;
-  // temp.size = size;
 
-  // create before swap.
-  // if (size>swapLimit){
-  //   temp.realCpuPtr = malloc(size);
+void SwapPool::PoolOpt(vector<string> &vec_mf) {
+  //TODO(junzhe) redo 9/17
+
+  ///process vec_mf of 3itr into blocks,maxLen
+  //assume format of string: MF ptr size;
+  //onePieceMsg_pool verified
+  // for (int i = 0; i< vec_mf.size();i++){
+  //   cout<<"print mf "<<i<<' '<<vec_mf[i]<<endl;
   // }
-  // stringstream strm1;
-  // strm1<<size;
-  // string tempStr1 = strm1.str();
-  // stringstream strm2;
-  // strm2<<i;
-  // string tempStr2 = strm2.str();
-  // stringstream strm3;
-  // strm3<<data_;
-  // string tempStr3 = strm3.str();
-  // stringstream strm4;
-  // strm4<<temp.realGpuPtr;
-  // string tempStr4 = strm4.str();
-  stringstream strm1;
-  strm1<<size;
-  string tempStr1 = strm1.str();
-  stringstream strm3;
-  strm3<<*ptr;
-  string tempStr3 = strm3.str();
-  stringstream strm4;
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  strm4<<t2;
-  string tempStr4 = strm4.str();
-  string blockInfo ="Malloc "+tempStr3+" "+tempStr1+" "+tempStr4+" (data_&size)";
-  //string blockInfo ="Malloc: size "+tempStr1+", i "+tempStr2+", data_ "+tempStr3+",  realGpuPtr "+tempStr4;
-  vec_block.push_back(blockInfo);
-  //Table_id2LookUpElement[temp.data_]=temp;
-}
+  // cout<<"===================print done"<<endl;
+  vector<onePieceMsg_pool>onePieceMsg_poolVec_;
+  maxLen_mf = vec_mf.size()/3;
+  cout<<"maxLen_mf "<<maxLen_mf<<endl;
+  cout<<"len of vec_mf "<<vec_mf.size()<<endl;
+  for (int i = 0;i < vec_mf.size();i++){
+    vector<string> v = split(vec_mf[i], " ");
+    // cout<<"print mf "<<i<<' '<<vec_mf[i]<<endl;
+    // cout<<"||"<<v[0]<<"||"<<v[1]<<"||"<<endl;
+
+    if (v[0]=="Malloc"){
+      //convert v[2] from str to size_t
+      size_t result;
+      stringstream convert(v[2]);
+      // cout<<"1"<<endl;
+      if (!(convert>>result)){
+        result =-1;
+        cout<<"error for converting size from str to int."<<endl;
+      }
+      // cout<<"2"<<endl;
+      onePieceMsg_pool tempMsg(v[1],result, 1, i-maxLen_mf);
+      // cout<<"3"<<endl;
+      onePieceMsg_poolVec_.push_back(tempMsg);
+      // cout<<"4"<<endl;
+    }else if (v[0]=="Free"){
+      // cout<<"1"<<endl;
+      onePieceMsg_pool tempMsg(v[1],-1, -1, i-maxLen_mf);
+      // cout<<"2"<<endl;
+      onePieceMsg_poolVec_.push_back(tempMsg);
+      // cout<<"3"<<endl;
+    }else {
+      cout<<"error for process the onePriceMsg."<<endl;
+      // cout<<i<<" "<<v[0]<<"||"<<vec_mf[i]<<endl;
+    }
+  }
+  sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx());
+  // cout<<"===================print done(2nd loop)"<<endl;
+  // for (int i = 0;i < vec_mf.size();i++){
 
-void Swap::Free(void *ptr) {
-  //cout<<"to malloc"<<endl;
-  //input is real ptr
-  cudaError_t status = cudaFree(ptr);
-  CHECK_EQ(status, cudaError_t::cudaSuccess);
-  stringstream strm1;
-  strm1<<ptr;
-  string tempStr1 = strm1.str();
-  stringstream strm4;
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  strm4<<t2;
-  string tempStr4 = strm4.str();
-  string blockInfo ="Free "+tempStr1+" "+tempStr4;
-  vec_block.push_back(blockInfo);
-  Table_Meta.erase(ptr);
-
-  Table_id2LookUpElement.erase(ptr);
-}
+  
+  //pair
+  vector<onePairMsg>pairMsgVec_;
+  int i = 0;
+  // cout<<"before while loop"<<endl;
 
-void Swap::Append(string blockInfo) {
-     //TODO(junzhe) add idx later
-    vec_block.push_back(blockInfo);
+  while (i<(onePieceMsg_poolVec_.size()-1)){
+    
+    // cout<<i<<" before 1st if"<<endl;
+    if (onePieceMsg_poolVec_[i].MallocFree==-1){
+      //condition A: start with free. do nothing.
+      i+=1;
+      // cout<<i<<" condition A"<<endl;
+    } else {
+        if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)
+          &&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){
+          //condition B: start with Malloc, next item same ptr and is free.
+          if ((onePieceMsg_poolVec_[i].idx >=0 && onePieceMsg_poolVec_[i].idx <maxLen_mf)
+            ||(onePieceMsg_poolVec_[i+1].idx >=0 && onePieceMsg_poolVec_[i+1].idx <maxLen_mf)){
+            //condition B1: at least one of the index in range [0,maxLen_mf]
+            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx);
+            pairMsgVec_.push_back(tempPair); 
+            // cout<<i<<" condition B"<<endl;
+          }
+          i+=2; //no matter in the middle iteration or not, plus 2.
+        } else {
+          //condiction C: not one pair, single one.
+          i+=1;
+          // cout<<" condiction C"<<endl;
+          //       if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){
+          //     // onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
+          //     // onePairMsgVec_2.push_back(tempPair);
+          //     i+=1;
+          //     cout<<i<<" condition C"<<endl;
+          // }
+        }
+    } 
+  }
+  sort(pairMsgVec_.begin(),pairMsgVec_.end(),less_than_size_rIdx());
+  // cout<<"after while loop"<<endl;
+  cout<<"size of pairs "<<pairMsgVec_.size()<<endl;
+
+
+  ///get E, V of the blocks， coloring
+  //V
+  int m = static_cast<int>(pairMsgVec_.size());
+  vector<Vertex>vertices;
+  for (int i=0; i<m;i++){
+    Vertex tempVertex(pairMsgVec_[i].name,pairMsgVec_[i].size,pairMsgVec_[i].r_idx,pairMsgVec_[i].d_idx);
+    vertices.push_back(tempVertex);
+  }
+
+  //E and coloring
+  
+  int offset = 0;
+  int **adj;
+  adj = new int*[m]; //TODO(junzhe) should be deleted somewhere.
+  // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop.
+  for (int i=0; i<m;i++){
+    adj[i] = new int[m];
+    for (int j=0; j<m;j++){
+      if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))
+        || (min(vertices[i].d,vertices[j].d)<0 && 
+        min(vertices[i].d,vertices[j].d)+2*maxLen_mf< max(vertices[i].r,vertices[j].r))){
+        //TODO(junzhe) verify after ||
+        adj[i][j]=1;
+        if (vertices[j].colorRange.second){ //as second never be 0, if not empty.
+          vertices[i].colorOccupied.push_back(vertices[j].colorRange);
+        }
+      }
+      else { 
+        adj[i][j]=0; 
+      }
+    }
+    
+    vector<pair<size_t,size_t>>colorMerged = mergeSeg(vertices[i].colorOccupied);
+
+    // vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset);
+    vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, offset);
 
+    //update of offset, largest memory footprint as well.
+    if (vertices[i].colorRange.second >=offset){
+      offset = vertices[i].colorRange.second+1;
+    }
+  }//end of for loop.
+  cout<<"offset is "<<offset<<endl;
+  cout<<"===================to print vertices"<<endl;
+  cout<<"a, d, size, color["<<endl;
+  for (int i=0; i<m;i++){
+    cout<<"("<<vertices[i].r<<","<<vertices[i].d<<") "<<vertices[i].size<<" "<<vertices[i].colorRange.first<<endl;
+  }
+  cout<<"===================to print vertices done"<<endl;
+
+  ///Make pool, and make table
+  cudaMalloc(&ptrPool,offset); //poolSize or memory foot print  offset.
+  cout<<"ptrPool is: "<<ptrPool<<endl;
+
+  for (int i=0; i<vertices.size();i++){
+    lookUpElement temp;
+    temp.r_idx = vertices[i].r;
+    temp.d_idx = vertices[i].d;
+    temp.size = vertices[i].size;
+    temp.offset = vertices[i].colorRange.first;
+    temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char));
+    temp.Occupied = 0;
+    Table_r2v[vertices[i].r] = temp;
+  }
+  cout<<"===================Table_r2v done"<<endl;
+  poolFlag = 1;
+    
 }
 
+void SwapPool::Malloc(void** ptr, const size_t size){
+  
+  void* allocatedPtr =nullptr;
+  
+  if (poolFlag == 0) {
+    cudaError_t status = cudaMalloc(ptr, size);
+    CHECK_EQ(status, cudaError_t::cudaSuccess);
+  } else {
+    //POOLFLAG = 1 
+    if (pc < maxLen_mf){
+      if ((Table_r2v.find(pc-maxLen_mf) == Table_r2v.end()) || (!(size == Table_r2v.find(pc-maxLen_mf)->second.size))){
+        //not in table of negative r_idx
+        cudaError_t status = cudaMalloc(ptr, size);
+        CHECK_EQ(status, cudaError_t::cudaSuccess);
+      } else{
+        //in the table of negative r_idx
+        auto tempMeta = Table_r2v.find(pc-maxLen_mf)->second;
+        allocatedPtr = tempMeta.ptr;
+        *ptr = allocatedPtr;
+        Table_p2r[allocatedPtr]=pc-maxLen_mf; 
+
+      }
+    } else{
+      //8 9 10
+      int r_pc = pc%maxLen_mf;
+      if ((Table_r2v.find(r_pc) == Table_r2v.end()) || (!(size == Table_r2v.find(r_pc)->second.size))){
+        //not here, should be abnormal
+        cudaError_t status = cudaMalloc(ptr, size);
+        CHECK_EQ(status, cudaError_t::cudaSuccess);
+      } else{
+        //in the table
+        auto tempMeta = Table_r2v.find(r_pc)->second;
+        allocatedPtr = tempMeta.ptr;
+        *ptr = allocatedPtr;
+        Table_p2r[allocatedPtr]=r_pc; 
 
-void Swap::SwapOut(void* data_){
-    printf("A. to swapOut\n");
-    auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    size_t swapSize = Table_Meta.find(data_)->second.second.swapSize;
-    Table_Meta.find(data_)->second.first.ptr = malloc(swapSize);
-    SwapMeta cpu, gpu;
-    cpu = Table_Meta.find(data_)->second.first;
-    gpu = Table_Meta.find(data_)->second.second;
-    cudaError_t err;
-    err=cudaMemcpy(cpu.ptr,gpu.ptr,gpu.swapSize,cudaMemcpyDeviceToHost);
-    if (err != cudaSuccess)
-      {
-      fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
-      exit(EXIT_FAILURE);
       }
-    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    fstream file_block3("blockInfo_swapOut.text", ios::in|ios::out|ios::app);
-    file_block3<<t2-t1<<" "<<swapSize<<endl;
-    printf("B. swapOut done.\n");
-    //cout<<"before free: "<<data_<<endl;
-    cudaFree(data_);
-    //cout<<"after free: "<<data_<<endl;
+    }
+  }
+
+
+    pc++;     
+  }
+
+
+void SwapPool::Free(void *ptr) {
+  if (poolFlag == 0){
+    cudaError_t status = cudaFree(ptr);
+    CHECK_EQ(status, cudaError_t::cudaSuccess);
+  } else{
+    if (Table_p2r.find(ptr)==Table_p2r.end()){
+      cudaError_t status = cudaFree(ptr);
+      CHECK_EQ(status, cudaError_t::cudaSuccess);
+    }
+    
+  }
+
 }
 
-void Swap::SwapIn(void* data_){
-  printf("1. to swapIn.\n");
-  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  SwapMeta cpu, gpu;
-  cpu = Table_Meta.find(data_)->second.first;
-  gpu = Table_Meta.find(data_)->second.second;
-  gpu.ptr=nullptr;
-  cudaError_t status = cudaMalloc(&gpu.ptr, gpu.swapSize);
-  CHECK_EQ(status, cudaError_t::cudaSuccess);
-  Table_Meta.find(data_)->second.second.ptr=gpu.ptr;
-  //cout<<"after alloc:1 "<<Table_Meta.find(data_)->second.second.ptr<<endl;
-  cudaError_t err;
-  err=cudaMemcpy(gpu.ptr, cpu.ptr ,cpu.swapSize,cudaMemcpyHostToDevice);
-  printf("2. swapIn done.\n");
-  free(cpu.ptr);
-  //
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  fstream file_block3("blockInfo_swapIn.text", ios::in|ios::out|ios::app);
-  file_block3<<t2-t1<<" "<<gpu.swapSize<<endl;
-
-
-//below works  
-  // SwapMeta cpu, gpu;
-  // cpu.swapSize=Table_id2LookUpElement.find(data_)->second.size;
-  // cpu.ptr=malloc(cpu.swapSize);
-  // gpu=cpu;
-  // cudaMalloc(&gpu.ptr,cpu.swapSize);
-  
-  // cudaError_t err;
-  // err=cudaMemcpy(gpu.ptr, cpu.ptr ,cpu.swapSize,cudaMemcpyHostToDevice);
-  // printf("2. swapIn done.\n");
-
-///below partial copy 
-  // SwapMeta h_meta;
-  // SwapMeta* d_meta;
-  // h_meta.swapSize=Table_id2LookUpElement.find(data_)->second.size;
-  // h_meta.h_ptr=malloc(h_meta.swapSize);
-  // cudaMalloc(d_meta,sizeof(SwapMeta));
-  // cudaMemcpy(d_meta,&h_meta,sizeof(SwapMeta),cudaMemcpyHostToDevice);
-
-  // void** d_ptr;
-  // cudaMalloc()
-  // void* h_ptr=malloc(tempSize);
-
-///below is to swapIn swapped out items
-    // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    // size_t tempSize = Table_id2LookUpElement.find(data_)->second.size;
-    // void** tempPtr;
-    // cudaMalloc(tempPtr,tempSize);
-    // cout<<"1. to swapIn."<<endl;
-    // cudaError_t err;
-    // err=cudaMemcpy(*tempPtr, Table_id2LookUpElement.find(data_)->second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice);
-    // if (err != cudaSuccess)
-    //   {
-    //   fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
-    //   exit(EXIT_FAILURE);
-    //   }
-    // //cudaMemcpy(data_, Table_id2LookUpElement.find(data_)->second.realCpuPtr ,Table_id2LookUpElement.find(data_)->second.size,cudaMemcpyHostToDevice);
-    // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    // fstream file_block4("blockInfo_swapIn.text", ios::in|ios::out|ios::app);
-    // file_block4<<t2-t1<<" "<<tempSize<<endl;
-    // cudaFree(*tempPtr);
-    // free(Table_id2LookUpElement.find(data_)->second.realCpuPtr);
-    // cout<<"testing after SwapIn"<<endl;
-    // printf("2. swapIn done.\n");
- 
+void SwapPool::Append(string blockInfo) {
+  //NA
+}
+
+
+void SwapPool::SwapOut(void* data_){
+  //NA
+}
+
+void SwapPool::SwapIn(void* data_){
+  //NA 
 }
 
 void getMaxLoad (){
   //empty
 }
 
-std::pair<size_t, size_t> Swap::GetMemUsage() {
+std::pair<size_t, size_t> SwapPool::GetMemUsage() {
   //empty
   return std::make_pair(0, 0);
 }
 
-Swap::~Swap(){
-  //put in notes
-    fstream file_block1("blockInfo.text", ios::in|ios::out|ios::app);
-    for (int i=0; i< vec_block.size();i++){
-        file_block1<<vec_block[i]<<endl;
-    }
-   
+SwapPool::~SwapPool(){
+  //NA  
 }
+
 }
 
 #endif

From 0f3722d30c5942ef4b0b484bee80bf3833524e02 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Wed, 31 Oct 2018 14:08:53 +0800
Subject: [PATCH 18/19] add documentation

---
 .DS_Store                               |  Bin 8196 -> 0 bytes
 CMakeLists.txt                          |    2 +-
 examples/cifar10/{alexnet.py => cnn.py} |    0
 examples/cifar10/train.py               |    2 +-
 include/singa/core/common.h             |    6 +-
 include/singa/core/device.h             |  215 ++--
 include/singa/core/memory.h             |  209 ++-
 src/.DS_Store                           |  Bin 8196 -> 0 bytes
 src/core/.DS_Store                      |  Bin 6148 -> 0 bytes
 src/core/common/common.cc               |   51 +-
 src/core/device/cuda_gpu.cc             |    9 +-
 src/core/device/device.cc               |   27 +-
 src/core/device/swap_gpu.cc             | 1487 +++++++++-------------
 src/core/memory/memory.cc               | 1543 +++++++++++------------
 14 files changed, 1562 insertions(+), 1989 deletions(-)
 delete mode 100644 .DS_Store
 rename examples/cifar10/{alexnet.py => cnn.py} (100%)
 delete mode 100644 src/.DS_Store
 delete mode 100644 src/core/.DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index e74703a65a58eddc3bf6ead31e5ced542081bb7d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
zcmeHMTWl0n82<llfinZ8rwFv`x^$&fxfIya%FTk?n^+2#*e$(qDYH9cJ8?R*?96T{
zrKXZ-)EJGI#6&b2;u|JBkr-c0yhn|f*m#Q?6W@F?z8T}?KXaxnZRvwAYG6(>=R4;=
z|37E{?|d^SXBGgkrJ&aXQ~>~^OCVoP)isJ37uSLk2`r_ANb%t2GM1B}-iMjo0`Jfe
zG7vHlG7vHlG7vKGe_(+2Y*E-E`@S3v>yUwvfg6$m@qS3qB`_7>q(uMKL6v_5AQVRc
zzfhZt24Nz=RDhEbX@m;OP(m4s!4U&xIN1|{Un;;!i8358IDB9*GX^IV^k=90CwzCn
zl*F(O83-9z$bfi!O2CCYWHX~zpWiLh^^(c2Fj7*wWa%=dC`>71C%T87(QZ%k3T}&*
zKj!yL-N*#(wBuMaZ7p-uG)4#c>Q=}0G}AWnSGgbWHErD)ZE*^=?)vvRqIts<>rXVu
zqvPWZ^}Fj5^}8A->JsDi$sNhML}SCwi3!FkYwP!QoEjNBJAQ8B!53so7`zssd|rwd
zX7jV$VypZ<5qUn7llgr;lb^ag)^V^?9ptNu^s~Eip6S@`Q6ujX4)SuB67R@5_L00H
z#(T4lRT#0|L0*y7tZc#3JfqpNy3MnO+vgc)Jkc6(oxIoTJNB6rD{O<D7hK~$g96N=
z(MDk|N9EFrHSzVgZAsp-_pY|7vX$k$VwI{65VN*t-ftTDo>5cxhWj-)YuLJJ5A_yJ
z*UVT(+SW#l3LI0Esj{_|RnePcvBg~b3K{cs!Su#>4P#S$WqFxe9lKTGSJ-i#e#K!&
z7c=SpNUS<m%h)ktZ_OK2OS8H$wprmCOS_bayv0^^yTV;1{Qxbe4BeqNGOjB<mX<Lr
zY1*akVZ2l6ElM_}u}R&}DzHxJ%Q_<?l&34r&9PR-KVT=t+=n#R>&h86O)s)*ua3p~
zg}qtV4V^JoliDa(+oz!>m6Sb&wvs4pqb=(mI1UEnU<{ssC*di$0I$MjxB_p&`|vS*
z3ZKE}@GblZKf|x^JNyBE!e7Wx!R5FDqqrVxa04cB5AMY#+=ngLiHGoB?7}pjz>|0i
zH5|emo<<ivJckeCBls9Tj?dzY_!7R1ui#~T9pA#Y@f~~@Kfy2XJNW}6(_%d=`_nak
zPp#l#B(Vk`(;_FYCDwTR-3R_7v3|2cD)gq+Yu43l+`3~|Q}fIs&Re&uCFH9{%EFh+
zv>g1!o2x-7Wi{SIsxr<jrf9xQ=!Z=S@9IQ6&V_JfcS9|8BcyHF-LyHb@)A<6;yZ8K
zuEzNi5~&<*XjD~RN-CD4yZ6xOQqn7%idI&wRrjkYMLc30{9vUvt}^AHYwsdlf;Zqj
z_=q(3CHw$C!7uO|>1_#?V<oBW7L4OMyd5{;7OcaqxD$7i?(QVzwc`Qoz=Nc|G#<r1
z?8gDjqCwglMicX-y&{g{8GH!O`x<-{pT=iMh0l=+FXE+nn(LTWn?!tVR&&Mfylpx5
z5Lu=bI@dYuT~`d0$ZV>_`M-Vf@Bi0T&S5ek10e%*8NkxE?zR@PE%w$IXYB-Cr|1$z
z_)SXmL#XnP<AnZkoN(q3Lux0;Rm7wMoRmlwD*yV2fU0P?|NG2K`osNymW&&z^*1B&
B*USI_

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7acb5feea4..b630497b95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,7 +89,7 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 IF (USE_CUDA)
     include(ExternalProject)
     ExternalProject_Add(cnmem
-        GIT_REPOSITORY "https://github.com/junzhezhang/cnmem.git"
+        GIT_REPOSITORY "https://github.com/nusdbsystem/cnmem.git"
         GIT_TAG "master"
         SOURCE_DIR "cnmem/"
         CONFIGURE_COMMAND "${CMAKE_COMMAND}"
diff --git a/examples/cifar10/alexnet.py b/examples/cifar10/cnn.py
similarity index 100%
rename from examples/cifar10/alexnet.py
rename to examples/cifar10/cnn.py
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index 861bd65ac0..9b07991fb4 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -39,7 +39,7 @@
 from singa.proto import core_pb2
 from caffe import caffe_net
 
-import alexnet
+import cnn
 import vgg
 import resnet
 
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 7ea45259bb..47c1068db0 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -56,8 +56,8 @@ class Device;
 /// Block represent a chunk of memory (on device or host).
 class Block {
  public:
-  Block(void* ptr, size_t size, size_t offset = 0, Device* ptrDevice = nullptr)
-      : data_(ptr), size_(size), offset_(offset), ptrDevice_(ptrDevice) {
+  Block(void* ptr, size_t size, size_t offset = 0, Device* ptr_device = nullptr)
+      : data_(ptr), size_(size), offset_(offset), ptr_device_(ptr_device) {
     ref_count_ = 1;  // std::make_shared<std::atomic<int>>(1);
   }
   // Disabled as it is not used currently.
@@ -90,7 +90,7 @@ class Block {
   void* data_ = nullptr;
   size_t size_ = 0;
   size_t offset_ = 0;
-  Device* ptrDevice_;
+  Device* ptr_device_;
   bool initialized_ = false;
   // Disabled as it is not used currently.
   // std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 96bc8e9c41..e9dcc1402d 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -66,10 +66,8 @@ class Device {
   /// Called by Tensor.
   void FreeBlock(Block* block);
   
-  void AppendInfo(string blockInfo);
-  void* GetRealGpuPtrInfo(const Block* block_);
-  void SwapOutInfo(const Block* block_);
-  void SwapInInfo(const Block* block_);
+  void AppendInfo(string block_info);
+  void* UpdateGpuPtrInfo(const Block* block_ptr);
 
   /// Return the size (bytes) of memory in use
   /// TODO(wangwei) override this function for all devices.
@@ -108,7 +106,7 @@ class Device {
 
   int id() const { return id_; }
 
-  virtual void* GetRealGpuPtr(const Block* block_) = 0;
+  virtual void* UpdateGpuPtr(const Block* block_ptr) = 0;
 
  private:
   Device() {};
@@ -125,11 +123,8 @@ class Device {
 
   /// Free device memory.
   virtual void Free(void* ptr) = 0;
-  virtual void MakeMetaTable(Block* block,void* data_,int size) = 0;
-  virtual void Append(string blockInfo) = 0;
-  
-  virtual void SwapOut(const Block* block_) = 0;
-  virtual void SwapIn(const Block* block_) = 0;
+  virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0;
+  virtual void Append(string block_info) = 0;
 
  protected:
   int id_ = 0;
@@ -171,11 +166,10 @@ class CppCPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override {}
-  void Append(string blockInfo) override {}
-  void* GetRealGpuPtr(const Block* block_) override {}
-  void SwapOut(const Block* block_) override {}
-  void SwapIn(const Block* block_) override {}
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
+  void Append(string block_info) override {}
+  void* UpdateGpuPtr(const Block* block_ptr) override {}
+
 };
 
 
@@ -206,11 +200,9 @@ class CudaGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override {}
-  void Append(string blockInfo) override;
-  void* GetRealGpuPtr(const Block* block_) override;
-  void SwapOut(const Block* block_) override;
-  void SwapIn(const Block* block_) override;
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
+  void Append(string block_info) override;
+  void* UpdateGpuPtr(const Block* block_ptr) override;
 
  private:
   void Setup();
@@ -222,21 +214,21 @@ class CudaGPU : public Device {
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
 
 ///SwapGPU
-struct onePieceMsg{
+struct DeviceOptInfo{
     /*
-     members: [ptr, size, MallocFree, idx]
+     members: [ptr, size, operation_type, idx]
      */
     string ptr;
     size_t size;
-    int MallocFree;
+    int operation_type;
     int idx;
     double t;
-    onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+    DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
 };
 
 struct BlockMeta{
     /*
-     block Meta.
+     meta of swapping memory blocks
      */
     Block* block_ = nullptr;
     void* data_ = nullptr;
@@ -249,34 +241,39 @@ struct BlockMeta{
 };
 
 struct SwapBlock{
-
+    /*
+    meta of candidate blocks
+    */
     string ptr;
-    string cat;  //A1, A2, A3...
+    string cat; //sub category of the candidate blocks, read-read, write-read, etc.
     int name;
     size_t size;
+    //index of last read/write before swap out, and first read/write after swap in
     int r_idx; //out idx
     int d_idx; //in idx
+    //index of last read/write before swap out, and first read/write after swap in
     double r_time; // out time
     double d_time; //in time
-    double dt; //delta t: t2'-t1'
-    double pri;  //look at here if big enough TODO(junzhe)
-    double dto; //t2-t1
-    double wdto = 0; //t2-t1 weighted by swap_load
-    double r_idx_ready; //r_idx + buffer, could be set during selection.
-    //int free = -1; //when it is freed 
-    //below as per planned.
-    int i1 = 0;
-    int i1p = 0;
-    int i2 = 0;
-    int i2p = 0;
-    double t1 = 0;
-    double t2 = 0;
-    double t1p = 0;
-    double t2p = 0;
-    SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
-    ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
+    double DOA; //Duation of Absence
+    double AOA;  //Area of Absence
+    double DOA_origin; //t2-t1, DOA without taking out time spent
+    double WDOA = 0; //weighted DOA
+    double majority_voting = 0;
+    int r_idx_ready; //r_idx + buffer
+
+    //below are index and time for scheduling
+    int idx_out_start  = 0;
+    int idx_out_end = 0;
+    int idx_in_end = 0;
+    int idx_in_start = 0;
+    double t_out_start = 0;
+    double t_out_end = 0;
+    double t_in_end  = 0;
+    double t_in_start = 0;
+    SwapBlock(string p, size_t s, int idx_out_start, int idx_in_end, double t_out_start, double t_in_end): 
+    ptr(p), size(s), r_idx(idx_out_start),d_idx(idx_in_end),r_time(t_out_start), d_time(t_in_end) {}
 };
-/// Device able to Swap memory between Nvidia GPU and Swap
+/// Device able to Swap memory between Nvidia GPU and CPU
 class SwapGPU : public Device {
  public:
   ~SwapGPU();
@@ -300,98 +297,92 @@ class SwapGPU : public Device {
   /// Free cpu memory.
   void Free(void* ptr) override;
 
-  //Append at every index: malloc, free, read, mutable
-  void Append(string blockInfo) override;
+  //Append at every index: free, read, mutable
+  void Append(string block_info) override;
 
-  //append info after Malloc, pair.
-  void MakeMetaTable(Block* block,void* data_,int size) override; 
+  //append info after Malloc, as Block* is not available till Malloc() done.
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override; 
 
-  //all the testing, without swap, during Append()
-  void Test_sched_switch_swap();
+  //Detection and Plan
+  void DetectionPlan();
 
   //test iteration, return GC
-  int swap_test(vector<string>vec_block,int &maxLen, int &location);
+  int Detection(vector<string>vec_block,int &iteration_length, int &location_of_2nd_iteration);
 
-  //entire plan, from swap_select() to swap_sched(), swap_deploy_tables()
-  void swap_plan();
+  //entire plan, from SelectBlock() to Scheduling(), BuildMetaTables()
+  void Plan();
 
-  //selection algo
-  vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode);
+  //block selection algo
+  vector<SwapBlock> SelectBlock(vector<SwapBlock>vec_swap,vector<double> temp_load,double mem_limit,string mode);
 
   //schedule algo
-  void swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
+  void Scheduling(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double mem_limit,string mode);
   
-  //make tables Table_sched and Table_meta
-  void swap_construct_tables(vector<SwapBlock>vec_swap_selct);
+  //make tables table_sched and table_meta
+  void BuildMetaTables(vector<SwapBlock>vec_swap_selct);
 
-  //update Table_meta, during Append()
-  void swap_update_tables(Block* tempBlock_);
+  //update table_meta, during Append()
+  void UpdateMetaTables(Block* block_ptr);
 
   //swap/sync during Append()
   void DeploySwap();
 
   //exec DelpoySwap
-  void DeploySwap_exec(int r_gc);
-
-
+  void DeploySwapExec(int relative_counter);
 
   //load profile as per synchronous swap.
-  vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
+  vector<double> GetIdealLoad(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
   
-  //in case gpu ptr wrong. TODO(junzhe) to verify if needed.
-  void* GetRealGpuPtr(const Block* block_) override;
+  //in case gpu ptr wrong, updated it after swap_in ad hoc
+  void* UpdateGpuPtr(const Block* block_ptr) override;
 
-  void SwapOut(const Block* block_) override;
-  void SwapIn(const Block* block_) override;
+  //Swap Synchronous, for early iterations
+  void SwapOutSynchronous(const Block* block_ptr);
+  void SwapInSynchronous(const Block* block_ptr);
 
-  //changed to intake data_ instead
-  void SwapOut_idx(const int r_idx);
-  void SwapIn_idx(const int r_idx);
+  //Swap asynchronous, for middle iteraions
+  void SwapOut(const int idx);
+  void SwapIn(const int idx);
 
  private:
   void Setup();
-  ///Tables needed
-  //r_idx->BlockMeta
-  map<int,BlockMeta>Table_meta;
-  map<const Block*,BlockMeta>Table_block_meta; //TODO(junzhe) for measure speed only.
-  map<const Block*, int>Table_not_at_device;  //int refers to its r_idx of the block/meta
-  //map<const Block*, size_t>Table_block_size;  //Table block_ -> size TODO(junzhe) no need, can call block_->size()
-
-  //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D.
-  map<int,std::tuple<int,int,int,int>>Table_sched; // changed to with sync_r_idx
 
-  // vector<SwapBlock>vec_swap_selct_global;
+  map<int,BlockMeta>table_meta;
+  map<const Block*,BlockMeta>table_block_meta; //for measure speed only.
+  map<const Block*, int>table_not_at_device;  //int refers to its r_idx of the block/meta
+  map<int,std::tuple<int,int,int,int>>table_sched; // changed to with sync_r_idx
 
   //vec_block
-  vector<string>vec_block; //iteration 0-3
-  vector<string>vec_block_fresh; //iteration 4 5 6
-  vector<string>vec_block_mf; //itr 8 9 10
-  vector<double>global_load; // from begining
-  vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
-  vector<onePieceMsg>vec_run;
-  vector<int>opsSequence; //sequence of operations of one middle iteration
-  vector<size_t>sizeSequence; //size of all operations of one middle iteration
-  int asyncSwapFlag = 0; //0 for sync, 1 for async.
-  int testFlag = 0; //0 means open for test, 1 means no need test anymore.
-  int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
-  int globeCounter = -1;
-  int maxLen = 0;
-  int location = 0;
-  int three_more_location = 0; //location at 3 more iterations later.
-  int three_more_globeCounter = -1; //
-  //design requirement TODO(junzhe)
-  float memLimit_ratio = 0.70; 
+  vector<string>vec_block; //iterations for Detection, i.e. detect iterations.
+  vector<string>vec_block_fresh; //iterations that are used for Planning,
+  vector<string>vec_block_mf; //iterations used to construct pool
+  vector<double>global_load; // load from begining
+  vector<double>origin_load; //3 iteration load, for planning.
+  vector<DeviceOptInfo>vec_run;
+  vector<int>operation_sequence; //sequence of operations of one middle iteration
+  vector<size_t>size_sequence; //size of all operations of one middle iteration
+
+  int async_swap_flag = 0; //0 for sync, 1 for async.
+  int past_test_flag = 0; //0 means need to test, 1 means no need test anymore.
+  int global_index = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
+  int global_index_threshold = -1;
+  int iteration_length = 0;
+  int location_of_2nd_iteration = 0; //index of start of 2nd iteration
+  int location_of_5th_iteration = 0; //index of start of 5th iteration
+  int three_more_iteration_global_index_threshold = -1;
+
+  //design specs
+  float mem_limit_ratio = 0.70; 
   size_t smallest_block = 1<<20; //1 MB
   int data_buffer = 4; // used to control readyIdx
   int mutable_data_buffer = 6;
-  double maxLoad;
-  int maxIdx;
-  double total_swapInTime = 0;
-  double total_swapOutTime = 0;
-  double tempTime = 0;
-  double tempTime2 = 0;
-  double tempTime_baseline; //vec_run[0] time
-  int maxLen_threshold = 1000;
+  double max_load;
+  int max_idx;
+  double total_swap_in_time = 0;
+  double total_swap_out_time = 0;
+  double temp_time = 0;
+  double temp_time_baseline; //vec_run[0] time
+  int iteration_length_threshold = 1000;
 
  private:
   shared_ptr<DeviceMemPool> pool_;
@@ -447,11 +438,9 @@ class OpenclDevice : public singa::Device {
   /// Converts the void pointer into a Buffer object, then deletes the object.
   /// This has the effect of freeing up device memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override {}
-  void Append(string blockInfo) override {}
-  void* GetRealGpuPtr(const Block* block_) override {}
-  void SwapOut(const Block* block_) override {}
-  void SwapIn(const Block* block_) override {}
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
+  void Append(string block_info) override {}
+  void* UpdateGpuPtr(const Block* block_ptr) override {}
 
 
 private:
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index 343b4449de..b3dfd672ec 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -53,8 +53,6 @@ class DeviceMemPool {
 
   virtual void PoolOpt(vector<string> &vec_mf) = 0;
   
-  virtual void SwapOut(void* data_) = 0;
-  virtual void SwapIn(void* data_) = 0;
   /// Return a pair for free and total memory managed by this pool.
   virtual std::pair<size_t, size_t> GetMemUsage() {
     return std::make_pair(0u, 0u);
@@ -80,8 +78,6 @@ class CnMemPool : public DeviceMemPool {
 
   void PoolOpt(vector<string> &vec_mf) override {}
     
-  void SwapOut(void* data_) override {}
-  void SwapIn(void* data_) override {}
   std::pair<size_t, size_t> GetMemUsage() override;
 
   // release all memory and set cnmem manager to unintialized
@@ -110,135 +106,116 @@ class CudaMemPool : public DeviceMemPool {
 
   void PoolOpt(vector<string> &vec_mf) override {}
 
-  void SwapOut(void* data_) override {}
-  void SwapIn(void* data_) override {}
 };
 
-//for SmartMemPool
-struct lookUpElement{
-    /*
-     for memory pool Malloc look-up table.
-     */
-    int r_idx;
-    int d_idx;
-    size_t size;
-    size_t offset;
-    void* ptr;
-    int Occupied; //0 is free, 1 is occupied.
-    int crossItr; 
-    int Occupied_backup; 
+//for SmartMemPool and SwapPool
+struct PoolBlockMeta{
+  /*
+   for memory pool Malloc look-up table.
+   */
+  int r_idx;
+  int d_idx;
+  size_t size;
+  size_t offset;
+  void* ptr;
+  int occupied; //0 is free, 1 is occupied.
+  int cross_iteration; 
+  int occupied_backup; 
+};
+
+///struct Vertex
+struct Vertex{
+  int name;
+  size_t size;
+  int r; //arrive
+  int d; //depart
+  int cross_iteration =0;
+  pair<size_t, size_t> color_range;
+  vector<pair<size_t, size_t>> vec_color_preoccupied;
+  Vertex(int n, size_t s, int r1, int d1):name(n),size(s),r(r1),d(d1){}
+
 };
 
-///class mem-pool SmartMemPool
+
+///SmartMemPool
 class SmartMemPool: public DeviceMemPool {
 public:
-    SmartMemPool(const MemPoolConf &conf); //constructor
-    //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
-    void Malloc(void** ptr, const size_t size);
-    void Free(void* ptr);
-    ~SmartMemPool();
-    void getMaxLoad(void);
-    std::pair<size_t, size_t> GetMemUsage() override;
-    void Append(string blockInfo);
-
-    void PoolOpt(vector<string> &vec_mf) override {}
-    
-    void SwapOut(void* data_) override {}
-    void SwapIn(void* data_) override {}
-protected:
-    void Init();
-private:
-    MemPoolConf conf_;
-    // whether the (global) memory pool has been initialized
-    bool initialized_ = false;
-    // lock on the initialized variable
-    std::mutex mtx_;
-
-    string colorMethod;
-    int mallocFlag =0; //0 for cudaMalloc, 1 for coloringMalloc
-    int gc =0; //global counter each time Malloc/Free, add 1.
-    int globeCounter=-1;
-    int loadLogFlag =1; //record when its 1.
-    void* ptrPool = NULL;
-    int idxRange = 0;
-    size_t offset = 0;
-    size_t offsetCrossItr=0; //cross iteration offset.
-    int maxLen =0;
-    int location=0;
-    vector<string> vec;
-    vector<string> vec_block_RW;
-    vector<string> vec_block_RWMF;
-    map<int,int>Table_r2d; //full duration info, cross-iteration duration.
-    map<int,int>Table_d2r;
-    //map<int,lookUpElement>Table_r2Ver;
-    vector<pair<int,lookUpElement>>Vec_r2Ver; //b. replace Table_r2Ver
-    map<int, pair<size_t,size_t>>Table_load; //gc, <cudaLoad, colorLoad>
-    map<void*,size_t>Table_p2s; //For tracking load in Free. add when allocate, delete when deallocate.
-    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
-    int checkPoint=300; //for reduce number of test.
-    size_t maxTotalLoad;
-    size_t maxMemUsage;
-    float memRatio;
-};
+  SmartMemPool(const MemPoolConf &conf); //constructor
+  void Malloc(void** ptr, const size_t size);
+  void Free(void* ptr);
+  ~SmartMemPool();
+  std::pair<size_t, size_t> GetMemUsage() override;
+  void GetMaxLoad(void);
+  void Append(string blockInfo);
+  vector<Vertex> Plan(vector<string>vec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method);
+  int Detection(vector<string>vec_string_test, int &iteration_length, int &location_2nd_iteration);
 
+  void PoolOpt(vector<string> &vec_mf) override {}
 
-//for Swap
-struct swapLookUpElement{
-    /*
-    book keep the block info and status
-     */
-    void* data_ = nullptr;
-    void* realGpuPtr = nullptr;
-    void* realCpuPtr = nullptr;
+protected:
+  void Init();
+private:
+  MemPoolConf conf_;
+  // whether the (global) memory pool has been initialized
+  bool initialized_ = false;
+  // lock on the initialized variable
+  std::mutex mtx_;
 
-    int location; //1 is at GPU, 2 is at CPU. 3 on the way C2G, 4 on the way G2C.
-    size_t size; //size may used as of now.
+  string color_method;
+  int malloc_flag = 0; //0 for cudaMalloc, 1 for coloringMalloc
+  int global_index = 0; //global counter each time Malloc/Free, add 1.
+  int global_index_threshold = -1;
+  int load_flag = 1; //record load at 1
+  void* ptr_pool = NULL;
+  int idx_range = 0;
+  size_t offset = 0;
+  size_t offset_cross_iteration = 0; //cross iteration offset.
+  int iteration_length = 0;
+  int location_2nd_iteration = 0;
+  vector<string> vec;
+  vector<string> vec_block_rw; //read write only opt info
+  vector<string> vec_block_rw_mf; //read write, malloc, free opt info
+  map<int,int>table_ridx_to_didx; //table match from r_idx to d_idx
+  map<int,int>table_didx_to_ridx; //table match from d_idx to r_idx
+
+  vector<pair<int,PoolBlockMeta>>vec_block_meta; //vec of block meta, index in the vector refering to the r_idx
+  map<int, pair<size_t,size_t>>table_load; //global_index, <cudaLoad, colorLoad>
+  map<void*,size_t>table_ptr_to_size; //for tracking load in Free. add when allocate, delete when deallocate.
+  map<void*,int>table_ptr_to_ridx; //ptr for arrival idx, for look up Table during free
+  int check_point = 300; //for reduce number of test.
+  size_t max_total_load;
+  size_t max_mem_usage;
 };
 
-struct SwapMeta{
-    /*
-     for copy between block and info.
-     */
-    size_t swapSize;
-    void* ptr;
-    void* d_ptr; //not used for
-};
 
+///SwapPool 
 class SwapPool : public DeviceMemPool {
 public:
-    SwapPool(const MemPoolConf &conf); //constructor
-    //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
-    void Malloc(void** ptr, const size_t size);
-    void Free(void* ptr);
-    ~SwapPool();
-    void getMaxLoad(void);
-    std::pair<size_t, size_t> GetMemUsage() override;
-    void Append(string blockInfo);
-    
-    void SwapOut(void* data_);
-    void SwapIn(void* data_);
+  SwapPool(const MemPoolConf &conf); //constructor
+  void Malloc(void** ptr, const size_t size);
+  void Free(void* ptr);
+  ~SwapPool();
+  std::pair<size_t, size_t> GetMemUsage() override;
+  void Append(string blockInfo);
 
-    //PoolOpt() construct pool based on MF info after Swap constructed.
-    void PoolOpt(vector<string> &vec_mf);
+  //PoolOpt() construct pool based on MF info after Swap constructed.
+  void PoolOpt(vector<string> &vec_mf);
 protected:
-    void Init();
+  void Init();
 private:
-    MemPoolConf conf_;
-    // whether the (global) memory pool has been initialized
-    bool initialized_ = false;
-    // lock on the initialized variable
-    std::mutex mtx_; 
-    vector<string> vec_block;
-    size_t swapLimit = 1<<23; //8MB
-    int poolFlag = 0;
-    int pc = 0;
-    int maxLen_mf = 0;
-    void* ptrPool = nullptr;
-    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
-    map<int,lookUpElement>Table_r2v; //r-> vertex
-    vector<pair<int,lookUpElement>>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe)
-    // map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
-    // map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
+  MemPoolConf conf_;
+  // whether the (global) memory pool has been initialized
+  bool initialized_ = false;
+  // lock on the initialized variable
+  std::mutex mtx_; 
+
+  vector<string> vec_block;
+  int pool_flag = 0;
+  int pool_index = 0; //like global counter in device class
+  int iteration_length_mf = 0; //max length of malloc free operation sequences.
+  void* ptr_pool = nullptr;
+  map<void*,int>table_ptr_to_ridx; //map ptr to arrival idx, for look up Table during free
+  map<int,PoolBlockMeta>table_pool_meta; //table of pool block meta, key with r_idx
 };
 
 #endif
diff --git a/src/.DS_Store b/src/.DS_Store
deleted file mode 100644
index a87953d857c6cf9354df29d312f03c027a6411d3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
zcmeHMTWl0n82<mZz?p&4Qv}+&E?p^AE-h?Z1d4#$n^+2#*rhE6y3Fp3?ZoNKvNOAF
zDK(WuqsC~wBqpNK5Z`LT6N&M~#Cz0uiH*0YG4ahO<C`&F{xfH`DU?3=q6X$9bG~!_
z^Z#@9|IRmaa%KSl+w*!8Ks5j`x&-o-R9&Npad9mwC4r@k5Gfwg<8C%>Ice&DxX3#+
zgbaiXgbaiXgbaiX{2v&gJzEsE#J(>_!#ZRjWZ;HmK)fGfbO}rZI3v-2bx`FW0SLtr
zz%SJ1qCpr7FcIL4L>i%jGL%q;VsOMj8BX?C;Fkz+MxqP{3=SU{%#6Va1^wCS{xRPj
zFd;FlLk2<y7Be6opEAgR1354jp5N11%XGbX{3|ReEnBvH1ydBJl(SQPqt1Arr+Im|
zUCW*Dd!}xrgLcw!togQ<?l+C`6kprn*q&zEMs9)o$pO>Wo$+=jZ|kmqpQD;LO0oV#
zDPA)*)zY-5A=b3JWx63Y)f8`zH^g?f?3$ittg60gfA{IJ$+J`ErXPGkCWXO^0V=La
z^1{LlEw|7izfhDsU(E39`3%*Sk?uo1YKpHZ&=2p;dZuH${YK6uOz{erQt!?<_R*Xn
z#s@Nvl^?U+6tB!^Rwi$0p3!Dmedbxi9rTO|Pqc<yC+Btej)Nw}3R{ZvyldQNP=Hx5
zIw{QMsLZZf7u|T<w)h=;@9LZ@UtPf~*Qn|cacg_#{icySHg4+P=yA=>7`ATOBLfA~
zHPe=nw6!s#5+@a9u6%t}b<NF@$WpF@`LuZ^Z+erwj<Gqux}sdIjohm6%j~31zv76a
zi<xwPBvKoxXY7QqcjOGJrCHq^*{X1jC0$BH-l9?6p>S78K0pg9Lz~r|jO)rVOG_J;
zH0@Sf8ShaB3X)A}+@~I3mDr#RW}LAx%F~tRwnzu#AFyFD_hHTTdb5U2(~Io7Y9o>3
z!rrFqhRztPOKg_w?bA?~h|8Wr+er;<r)}#VI0*)1VG^EzC*di$0I$L&xD0Q?`|vS*
z3ZKE}@GblZKf|x^JNyBE!e7Wx!IiiQYj7jh;U<h@EAGX8xF6fG2M^=D*o#R#g~NCn
zH5|b#o<SEqJckeCBls9Tj?dzY_!7R1uizzo9pA#Y@f~~@Kfy2XJNW}kigG<7`^6f+
zr`G%fNpcN7sFe&~ORmwbyAS?Ha{XqN)aXrX*KMfV+}ON(U)%gLUbS)yMda&73d0vn
zQ4s#}U8zDTW;J?@bY)B|rD?uGD2UAo@7h>2%7t`fcT+ueOGw?ayJc%s<)x%u#dqGm
zLyhufBvU!svQt%g8R=M#?rEjb<)m0PS5sBJUOk{D6!DO8@Pn25sLGUouD**f3va-C
z@DZu*OZWkPf?wb_Qrt4Ez$(()Ef~cOcsp*vZP<W~xC{4?^6n(<b>Ttm#zUmPB=+MV
z9>*cfph4;zMH6$Rz5<Tp1U`i4eHA{6PvbMB!{<nc7jgC~)pZy3CJ|p-sJcR5&bAzT
zglt8XUg;k8t_ucAWfs-q{NJ_o_y6l6*D#5Yfslb~Gk|5CeVy$zw$$5RoV8<gou*3^
x;Ws1E524CGjuZOFal-jO45=L>R}qs4a7H3osQl|60>b^DSP%DqUwk)k^*1Ld+b{qC

diff --git a/src/core/.DS_Store b/src/core/.DS_Store
deleted file mode 100644
index 018d123c47540aef56047ebb82a09e1df9135f84..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKK~BUl476c|7IE1l$9!Rb5Ut7!_5(<*1P4m@z~1``zQ!B)1v9omDnwk6(1PqL
z87H>WWU3~yi0JM(Y(+LAvPK~)m4>jpX*vt%6;N%ANgf~Ww)=h>4~>CNv5RxRkX|Mk
z@wDf^u#Z#M4acWm<mLJG(w6keK74CG-2W+lPVt-NoRWcLAQ?yoe#QXKY|8q|G3#U?
z8At{`7~uX;P>3~fcC@Vnm8AecUZYK*GnZgY39Ny$BP<a0P@souq!{Yq=r7?{17}AM
z7megY<IV5Mi`v^Ueld5^>X>yhkPIvtIQQm;=l>0UnP!r|49Qk9kPQ4Y26WZ!yB!u4
zXX}^c@vJQ<w<r|)%~T-J7ncAuxQ|?$(d~sg{A%Fr$WhpD;lR8I7$I3D1HZt)8>^2g
AjsO4v

diff --git a/src/core/common/common.cc b/src/core/common/common.cc
index f3d144a9c8..d6e9c5a301 100644
--- a/src/core/common/common.cc
+++ b/src/core/common/common.cc
@@ -22,34 +22,28 @@
 #include <iostream>
 #include <fstream>
 #include <string>
-//TODO(junzhe) ifdef to counter verify
-///only include mutable_data() and data()
 
 namespace singa {
 
 void* Block::mutable_data() {
-    //TODO(junzhe) go back to enable it after device done
-    //std::cout<<"mutable_data() "<<this<<' '<<data_<<std::endl;
     initialized_ = true;
-    if (ptrDevice_!=nullptr){
-      //Append info.
+
+    //Append block info: opt_type, ptr, time_stamp
+    if (ptr_device_!=nullptr){
       stringstream strm2;
       strm2<<this;
-      string tempStr2 = strm2.str();
+      string temp_str2 = strm2.str();
       stringstream strm4;
       auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
       strm4<<t2;
-      string tempStr4 = strm4.str();
-      string temp = "Mutable "+tempStr2+" "+tempStr4;   
-      ptrDevice_->AppendInfo(temp);
+      string temp_str4 = strm4.str();
+      string temp = "Mutable "+temp_str2+" "+temp_str4;   
+      ptr_device_->AppendInfo(temp);
     }
-    //TODO(junzhe) this should not happen, can verify and remove
+
+    //update ptr after swap in done, if variable is not swapped back yet as expected.
     if (data_ == nullptr) {
-      //cout<<"to sleep"<<endl;
-      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"print returned tempData_ "<<tempData_<<endl;
-      // cout<<"slept to get data_ updated: (mutable_data) "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptr_device_->UpdateGpuPtrInfo(this);
       return static_cast<char*>(tempData_) + offset_;
     }
     
@@ -59,41 +53,38 @@ void* Block::mutable_data() {
 
 const void* Block::data() const {
     CHECK(initialized_) << "Must initialize data before reading it";
-    //TODO(junzhe) go back to enable it after device done
-    if (ptrDevice_!=nullptr){
+
+    //Append block info: opt_type, ptr, time_stamp
+    if (ptr_device_!=nullptr){
       //Append info.
       stringstream strm2;
       strm2<<this;
-      string tempStr2 = strm2.str();
+      string temp_str2 = strm2.str();
       stringstream strm4;
       auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
       strm4<<t2;
-      string tempStr4 = strm4.str();
-      string temp = "Read "+tempStr2+" "+tempStr4;
-      ptrDevice_->AppendInfo(temp);
+      string temp_str4 = strm4.str();
+      string temp = "Read "+temp_str2+" "+temp_str4;
+      ptr_device_->AppendInfo(temp);
     }
 
-    //TODO(junzhe) this should not happen, can verify and remove
+    //update ptr after swap in done, if variable is not swapped back yet as expected.
     if (data_ == nullptr) {
-      //cout<<"to sleep"<<endl;
-      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"print returned tempData_ "<<tempData_<<endl;
-      // cout<<"slept to get data_ updated: (data) "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptr_device_->UpdateGpuPtrInfo(this);
       return static_cast<char*>(tempData_) + offset_;
     }
 
-
     return static_cast<char*>(data_) + offset_;
   }
 
 void* Block::get_data() {
+  //get data without calling data(), to avoid append block info.
   return data_;
 }
 
 void Block::update_data(void* data_new) {
+  //update data_, after the swap in completes.
   data_ = data_new;
-  std::cout<<"results update_data:: "<<this<<' '<<data_<<std::endl;
 }
 
 
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 52d4b4fb02..7ec8a9deb5 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -127,17 +127,10 @@ void CudaGPU::Append(string blockInfo){
     pool_->Append(blockInfo);
 }
 
-void* CudaGPU::GetRealGpuPtr(const Block* block_){
+void* CudaGPU::UpdateGpuPtr(const Block* block_){
   return nullptr;
 }
 
-void CudaGPU::SwapOut(const Block* block_){
-  
-}
-
-void CudaGPU::SwapIn(const Block* block_){
-  
-}
 
 }  // namespace singa
 #endif  // USE_CUDA
\ No newline at end of file
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index b2988f3615..59faddc5c6 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -41,9 +41,7 @@ Block* Device::NewBlock(int size) {
   if (size > 0) {
     void* ptr = Malloc(size);
     Block* block_ = new Block(ptr, size,0,this);
-    //std::cout<<"(reference) from device.cc after, data_, block_ device: "<<ptr<<" "<<block_<<' '<<this<<std::endl;
-    MakeMetaTable(block_,ptr,size); // make table and append vec_block.
-    //cout<<"NewBlock: "<<block_<<' '<<ptr<<endl;
+    AppendAfterMalloc(block_,ptr,size); // make table and append vec_block.
     return block_;
   } else {
     return nullptr;
@@ -53,22 +51,18 @@ Block* Device::NewBlock(int size) {
 // TODO(wangwei) return Block to the memory manager
 void Device::FreeBlock(Block* block) {
   if (block != nullptr) {
-    //TODO(junzhe) to merge it
     auto tempPtr = block->mutable_data();
-    //cout<<"FreeBlock: "<<block<<' '<<tempPtr<<endl;
     Free(tempPtr);
-    //cout<<"SwapGPU::Free() returned"<<endl;
-    //Free(block->mutable_data());
     
-    //Add Append for free here.
+    //append block info for free operation.
     stringstream strm1;
     strm1<<block;
-    string tempStr1 = strm1.str();
+    string temp_str1 = strm1.str();
     stringstream strm4;
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
     strm4<<t2;
-    string tempStr4 = strm4.str();
-    string blockInfo ="Free "+tempStr1+" "+tempStr4;
+    string temp_str4 = strm4.str();
+    string blockInfo ="Free "+temp_str1+" "+temp_str4;
     Append(blockInfo);
 
     delete block;
@@ -79,17 +73,10 @@ void Device::AppendInfo(string blockInfo){
   Append(blockInfo);
 }
 
-void* Device::GetRealGpuPtrInfo(const Block* block_){
-  return GetRealGpuPtr(block_);
+void* Device::UpdateGpuPtrInfo(const Block* block_){
+  return UpdateGpuPtr(block_);
 }
 
-void Device::SwapOutInfo(const Block* block_){
-  SwapOut(block_);
-}
-
-void Device::SwapInInfo(const Block* block_){
-  SwapIn(block_);
-}
 
 void Device::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                             CopyDirection direct, int dst_offset,
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index cd4f1fdbd9..85a4061f30 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -40,175 +40,162 @@ const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
                                    cudaMemcpyDeviceToHost,
                                    cudaMemcpyDeviceToDevice};
 
-///functions to be used
-///Section for structs and respective sorting function:
-
-
-
-struct less_than_ptrIdx{
+struct sort_by_ptr_idx_ascending{
     /*
-     sort onePieceMsg by ptr and then idx.
+     sort DeviceOptInfo by ptr and then idx.
      */
-    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    inline bool operator() (const DeviceOptInfo& struct1, const DeviceOptInfo& struct2)
     {
         return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
     }
 };
 
 
-struct oneIterMsg{
+struct DeviceOptSimplifiedInfo{
     /*
-     members: [idx, MallocFree, size_delta]
+     members: [idx, operation_type, size_delta]
      */
-    size_t size_delta;// type as size_t in case size if large.
-    int MallocFree;
+    size_t size_delta; //size if Malloc, else: delta to last index
+    int operation_type;
     int idx;
-    oneIterMsg(size_t s, int M, int i):size_delta(s),MallocFree(M),idx(i){}
+    DeviceOptSimplifiedInfo(size_t s, int M, int i):size_delta(s),operation_type(M),idx(i){}
 };
 
 
-struct less_than_iterIdx{
+struct sort_by_DeviceOptSimplifiedInfo_idx_ascending{
     /*
-     sort oneIterMsg by Idx.
+     sort DeviceOptSimplifiedInfo by Idx.
      */
-    inline bool operator() (const oneIterMsg& struct1, const oneIterMsg& struct2)
+    inline bool operator() (const DeviceOptSimplifiedInfo& struct1, const DeviceOptSimplifiedInfo& struct2)
     {
         return (struct1.idx<struct2.idx);
     }
 };
 
-struct less_than_lookupIdx{
-    /*
-     sort lookUpElement by idx.
-     */
-    inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2)
-    {
-        return (struct1.r_idx<struct2.r_idx);
-    }
-};
 
+vector<string> SplitOptString(string s, string delimiter) {
+  // string delimiter
+  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+  string token;
+  vector<string> res;
+  while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
+    token = s.substr(pos_start, pos_end - pos_start);
+    pos_start = pos_end + delim_len;
+    res.push_back(token);
+  }
+  res.push_back(s.substr(pos_start));
 
-/// string delimiter
-vector<string> swap_split(string s, string delimiter) {
-    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
-    string token;
-    vector<string> res;
-    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
-        token = s.substr(pos_start, pos_end - pos_start);
-        pos_start = pos_end + delim_len;
-        res.push_back(token);
-    }
-    res.push_back(s.substr(pos_start));
-    return res;
+  return res;
 }
 
-///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg
-//vector of pairMsg is used in run.
-//vector of iterMsg is used in test.
 
-vector<onePieceMsg> swap_strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+vector<DeviceOptInfo> DeviceOptSeqStrToStruct(vector<string> vec, int &idx_range){
     /*
-     convert vector of string into vector of onePieceMsg, sorted by ptr 
-     and then idx, and update idxRange to pieceMsgVec size.
-     format of onePieceMsg [ptr, size/-1, flag, idx, timestamp]
+     convert vector of string into vector of DeviceOptInfo, sorted by ptr 
+     and then idx, and update idx_range to pieceMsgVec size.
+     format of DeviceOptInfo [ptr, size/-1, flag, idx, timestamp]
      flag: 1 for malloc, -1 for free, 2 for read, 3 for layer,4 for mutable
-     version on 5/29, with equval blockInfo length: flag, block_, size, t
-     */
-    vector<onePieceMsg>onePieceMsgVec_;
+    */
+    vector<DeviceOptInfo>vec_opt_info;
 
     for (int i=0;i<vec.size();i++) {
-      vector<string> v = swap_split(vec[i], " ");
-      int MallocFree;
+      vector<string> v = SplitOptString(vec[i], " ");
+      int operation_type;
       if (v[0]=="Malloc"){
-        MallocFree = 1;
+        operation_type = 1;
       }else if (v[0]=="Free"){
-        MallocFree = -1;
+        operation_type = -1;
       }else if (v[0]=="Mutable"){
-        MallocFree = 4;
+        operation_type = 4;
       }else if (v[0]=="Read"){ 
-        MallocFree = 2;
+        operation_type = 2;
       }else if (v[0]=="Layer"){
-        MallocFree = 3;
+        operation_type = 3;
       }
-      //onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+      //DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
       size_t result;
       stringstream convert(v[2]);
       if (!(convert>>result)){
-          result =-1;
-          cout<<"error for converting size from str to int."<<endl;
+        result =-1;
+        cout<<"error for converting size from str to int."<<endl;
       }
-      onePieceMsg tempMsg(v[1],result, MallocFree, i);
-      double tempTime;
+      DeviceOptInfo itm(v[1],result, operation_type, i);
+      double temp_time;
       stringstream convert2(v[3]);
-      convert2>>tempTime;
-      tempMsg.t =tempTime;
-      onePieceMsgVec_.push_back(tempMsg);
+      convert2>>temp_time;
+      itm.t =temp_time;
+      vec_opt_info.push_back(itm);
     }
  
-    sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx());
-    idxRange = static_cast<int>(onePieceMsgVec_.size());
-
-    return onePieceMsgVec_;
-}// end of strVec_2_pieceMsgVec function
-
-
-vector<size_t> Swap_piece2rep (vector<onePieceMsg>onePieceMsgVec_){
-    vector<oneIterMsg>oneIterMsgVec_;
-    string tempStr;
-    int tempIdx=0;
-    for (int i=0;i<onePieceMsgVec_.size();i++){
-        if (onePieceMsgVec_[i].MallocFree==1){
-            //update tempStr and idx.
-            tempStr = onePieceMsgVec_[i].ptr;
-            tempIdx = onePieceMsgVec_[i].idx;
-            oneIterMsg tempMsg(onePieceMsgVec_[i].size,1,onePieceMsgVec_[i].idx);
-            oneIterMsgVec_.push_back(tempMsg);
-        } else {
-            oneIterMsg tempMsg(onePieceMsgVec_[i].idx-tempIdx,onePieceMsgVec_[i].MallocFree,onePieceMsgVec_[i].idx);
-            tempIdx = onePieceMsgVec_[i].idx;
-            oneIterMsgVec_.push_back(tempMsg);
-        }
-        //cout<<oneIterMsgVec_[i].size_delta<<' '<<oneIterMsgVec_[i].MallocFree<<' '<<oneIterMsgVec_[i].idx<<endl;
+    sort(vec_opt_info.begin(),vec_opt_info.end(),sort_by_ptr_idx_ascending());
+    idx_range = static_cast<int>(vec_opt_info.size());
+
+    return vec_opt_info;
+}
+
+
+vector<size_t> DeviceOptSeqRepeatableTestPreProcess(vector<DeviceOptInfo>vec_opt_info){
+  /*
+  pre process Device Operation Sequence Struct info for repeatable test,
+  return a vector of int for fast detection.
+  */
+  vector<DeviceOptSimplifiedInfo>vec_opt_simplified_info;
+  string temp_str;
+  int temp_idx=0;
+  for (int i=0;i<vec_opt_info.size();i++){
+    if (vec_opt_info[i].operation_type==1){
+      //update temp_str and idx.
+      temp_str = vec_opt_info[i].ptr;
+      temp_idx = vec_opt_info[i].idx;
+      DeviceOptSimplifiedInfo itm(vec_opt_info[i].size,1,vec_opt_info[i].idx);
+      vec_opt_simplified_info.push_back(itm);
+    } else {
+      DeviceOptSimplifiedInfo itm(vec_opt_info[i].idx-temp_idx,vec_opt_info[i].operation_type,vec_opt_info[i].idx);
+      temp_idx = vec_opt_info[i].idx;
+      vec_opt_simplified_info.push_back(itm);
     }
+  }
     
-    sort(oneIterMsgVec_.begin(),oneIterMsgVec_.end(),less_than_iterIdx());
-    //only after sort then can create rep.
-    vector<size_t>rep; // vector of size_delta, name it as rep for simlisity.
-    for (int i =0; i<oneIterMsgVec_.size(); i++){
-        rep.push_back(oneIterMsgVec_[i].size_delta);
-        //cout<<rep[i]<<endl;
-    }
-    cout<<"rep size: "<<rep.size()<<endl;
-    return rep;
+  sort(vec_opt_simplified_info.begin(),vec_opt_simplified_info.end(),sort_by_DeviceOptSimplifiedInfo_idx_ascending());
+  //only after sort then can create vec_rep.
+  vector<size_t>vec_rep; // vector of size_delta, name it as vec_rep for simlisity.
+  for (int i =0; i<vec_opt_simplified_info.size(); i++){
+    vec_rep.push_back(vec_opt_simplified_info[i].size_delta);
+  }
+  return vec_rep;
 }
-void repPatternDetector(vector<size_t>rep, int &maxLen, int &location, int maxLen_threshold, int gc ){
-    int idxRange = (int)rep.size();
-    int threshold = maxLen_threshold;
-    vector<pair<int,int>>maxLen_location;
-    
-    for (int i=0; i<idxRange;i++){
-        if (maxLen>threshold){
-            break;
-        }
-        for (int len=1; len<(idxRange-i);len++){
-            if (maxLen>threshold){
-                break;
-            }
-            if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLen<len)) {
-                maxLen = len;
-                location = i;
-                maxLen_location.push_back(make_pair(maxLen,location));
-                // cout<<"maxLen increased, lcoation and maxLen: ("<<location<<","<<maxLen<<")"<<endl;
-            }
-        }
+void RepeatableTest(vector<size_t>rep, int &iteration_length, int &location_of_2nd_iteration, int iteration_length_threshold, int global_index ){
+  /*
+  repeatable test, input vector of int, 
+  in-place update max_legth (length of iteration) 
+  and location_of_2nd_iteration (where 2nd iteration starts)
+  */
+  int idx_range = (int)rep.size();
+  int threshold = iteration_length_threshold;
+  vector<pair<int,int>>iteration_length_location_of_2nd_iteration;
+  
+  for (int i=0; i<idx_range;i++){
+    if (iteration_length>threshold){
+      break;
+    }
+    for (int len=1; len<(idx_range-i);len++){
+      if (iteration_length>threshold){
+        break;
+      }
+      if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(iteration_length<len)) {
+        iteration_length = len;
+        location_of_2nd_iteration = i;
+        iteration_length_location_of_2nd_iteration.push_back(make_pair(iteration_length,location_of_2nd_iteration));
+      }
     }
-}// end of repPatternDetector
+  }
+}
 
-struct less_than_Idx{
+struct sort_by_idx_ascending{
     /*
-     sort onePieceMsg by ptr and then idx.
+     sort DeviceOptInfo by ptr and then idx.
      */
-    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    inline bool operator() (const DeviceOptInfo& struct1, const DeviceOptInfo& struct2)
     {
         return (struct1.idx<struct2.idx);
     }
@@ -216,331 +203,328 @@ struct less_than_Idx{
 
 
 int SwapOutTime(size_t size){
-    int ans = 0; //TODO(junzhe) used to be 0.29; new param as per vgg
-    if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;}
-    return ans;
+  int ans = 0; 
+  //measured in 16 PCIe, pinned memory.
+  if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;}
+  return ans;
 }
 
 int SwapInTime(size_t size){
-    //yet to get the formula
-    int ans = 0; //TODO(junzhe) used to be 0.13; new param as per vgg
-    if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;}
-    return ans;
+  int ans = 0; 
+  //measured as per ncra ~ ncrd, 16 PCIe, pinned memory.
+  if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;}
+  return ans;
 }
 
-struct less_than_dto{
+struct sort_by_DOA_origin_descending{
   /*
-   sort SwapBlock by dto, descending
-   */
+  sort SwapBlock by DOA_origin, descending
+  */
   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
   {
-    return (struct1.dto>struct2.dto);
+    return (struct1.DOA_origin>struct2.DOA_origin);
   }
 };
 
-struct less_than_wdto{
+struct sort_by_WDOA_descending{
   /*
-   sort SwapBlock by weighted dto, descending
+  sort SwapBlock by weighted DOA_origin, descending
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.WDOA>struct2.WDOA);
+  }
+};
+
+struct sort_by_AOA_descending{
+  /*
+   sort SwapBlock by pri, descending
    */
   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
   {
-    return (struct1.wdto>struct2.wdto);
+    return (struct1.AOA>struct2.AOA);
   }
 };
 
-// struct less_than_r_idx_ready{
-//   /*
-//    sort SwapBlock by r_idx_ready, ascending
-//    */
-//   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-//   {
-//     return (struct1.r_idx_ready<struct2.r_idx_ready);
-//   }
-// };
-
-struct less_than_pri{
-    /*
-     sort SwapBlock by pri, descending
-     */
-    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-    {
-        return (struct1.pri>struct2.pri);
-    }
+struct sort_by_idx_ascending_swap{
+  /*
+  sort DeviceOptInfo_Swap by idx.
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.r_idx<struct2.r_idx);
+  }
 };
 
-struct less_than_Idx_Swap{
-    /*
-     sort onePieceMsg_Swap by idx.
-     */
-    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-    {
-        return (struct1.r_idx<struct2.r_idx);
-    }
+struct sort_by_idx_descending_swap{
+  /*
+  sort DeviceOptInfo_Swap by idx. reverse
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.d_idx>struct2.d_idx);
+  }
 };
 
-struct less_than_Idx_Swap_rvs{
-    /*
-     sort onePieceMsg_Swap by idx. reverse
-     */
-    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-    {
-        return (struct1.d_idx>struct2.d_idx);
-    }
+struct sort_by_majority_voting_ascending{
+  /*
+  sort majority voting, ascending
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.majority_voting<struct2.majority_voting);
+  }
 };
 
 
-pair<int,int> load_over_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx,int maxLen){
-  //input: vec_load, memLimit, range [start_idx, end_idx)
-  //return range overlimit [first_over_limit, first_below_limit)
+pair<int,int> GetOptIdxAboveLoadLimit(vector<double>vec_load, size_t mem_limit, int start_idx, int end_idx,int iteration_length){
+  /*
+  get operation index (range) that above the load limit.
+  input: vec_load, mem_limit, range [start_idx, end_idx)
+  return range overlimit [first_over_limit, first_below_limit)
+  */
   int first_over_limit = start_idx;
   int first_below_limit = end_idx;
 
-  for (int i = start_idx+maxLen; i < end_idx+maxLen; i++){
-    if (vec_load[i] > memLimit){
-      first_over_limit = i-maxLen;
+  for (int i = start_idx+iteration_length; i < end_idx+iteration_length; i++){
+    if (vec_load[i] > mem_limit){
+      first_over_limit = i-iteration_length;
       break;
     }
   }
 
-  for (int i = end_idx+maxLen; i > first_over_limit+maxLen; i--){
-    if (vec_load[i] > memLimit){
-      first_below_limit = i-1-maxLen;
+  for (int i = end_idx+iteration_length; i > first_over_limit+iteration_length; i--){
+    if (vec_load[i] > mem_limit){
+      first_below_limit = i-1-iteration_length;
       break;
     }
   }
+
   if (first_over_limit == start_idx) first_over_limit = -1;
+  
   if (first_below_limit == end_idx) first_below_limit = -1;
 
   return std::make_pair(first_over_limit, first_below_limit);
 }
 
-// pair<int,int> load_below_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){
-//   //input: vec_load, memLimit, range [start_idx, end_idx]
-//   //return range overlimit [first_over_limit, first_below_limit)
-//   int first_below_limit = maxIdx;
-//   int last_below_limit = maxIdx;
-
-//   for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){
-//     if (vec_load[i] > memLimit){
-//       first_below_limit = i+1-maxLen;
-//       break;
-//     }
-//   }
-
-//   for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){
-//     if (vec_load[i] > memLimit){
-//       last_below_limit = i-1-maxLen;
-//       break;
-//     }
-//   }
-
-//   return std::make_pair(first_below_limit, last_below_limit);
-// }
-
-pair<double,int> load_peak(vector<double>vec_load_test,int maxLen){
-  double maxLoad_test = 0;
-  int maxIdx_test = 0;
-  for (int i = maxLen; i < maxLen*2; i++){
-    if (maxLoad_test < vec_load_test[i]){
-      maxLoad_test = vec_load_test[i];
-      maxIdx_test = i - maxLen;
+
+pair<double,int> GetLoadPeak(vector<double>vec_load_test,int iteration_length){
+  /*
+  return value and index of load peak
+  */
+  double max_load_test = 0;
+  int max_idx_test = 0;
+  for (int i = iteration_length; i < iteration_length*2; i++){
+    if (max_load_test < vec_load_test[i]){
+      max_load_test = vec_load_test[i];
+      max_idx_test = i - iteration_length;
     } 
   }
-  return std::make_pair(maxLoad_test,maxIdx_test);
+  return std::make_pair(max_load_test,max_idx_test);
 }
 
-void load_update(vector<double>& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){
-  //update load [start_idx, end_idx) by plusMinus*size
-  for (int i = start_idx+maxLen; i<end_idx+maxLen; i++){
-    vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
+void UpdateLoad(vector<double>& vec_load,int start_idx, int end_idx, int plus_minus, size_t size,int iteration_length){
+  /*
+  update load [start_idx, end_idx) by plus_minus*size
+  */
+  for (int i = start_idx+iteration_length; i<end_idx+iteration_length; i++){
+    vec_load[i] = vec_load[i] + static_cast<double>(size) * plus_minus;
   }
 }
 
-vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode){
+
+///define SwapGPU member functions
+vector<SwapBlock> SwapGPU::SelectBlock(vector<SwapBlock>vec_swap,vector<double> temp_load,double mem_limit,string mode){
   vector<SwapBlock>vec_swap_selct;
-  //vector<SwapBlock>vec_swap_reject;
-  if (mode == "dto"){
-    sort(vec_swap.begin(),vec_swap.end(),less_than_dto());  
+  /*
+  select swapping blocks based on a cetain priority score or BO score;
+  with load updated
+  */
+  if (mode == "DOA_origin"){
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_DOA_origin_descending());  
   }
-  if (mode == "pri"){
-    sort(vec_swap.begin(),vec_swap.end(),less_than_pri());  
+
+  if (mode == "AOA"){
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_AOA_descending());  
   }
-  if (mode == "wdto"){
-    //TODO(junzhe) time complexity
+
+  if (mode == "WDOA"){
     for (int i = 0; i < vec_swap.size(); i++){
       auto itm = vec_swap[i];
       for (int j = itm.r_idx; j < itm.d_idx; j++){
-        itm.wdto += origin_load[i+maxLen] - memLimit;
+        itm.WDOA += origin_load[i+iteration_length] - mem_limit;
       }
     }
-    sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); 
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_WDOA_descending()); 
   }
-  cout<<"===============select block one by one================="<<endl;
+
+  if (mode == "majority_voting"){
+    //add order for DOA
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_DOA_origin_descending()); 
+    for (int i = 0; i < vec_swap.size();i++){
+      vec_swap[i].majority_voting+=i;
+    }
+    //add order for AOA
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_AOA_descending()); 
+    for (int i = 0; i < vec_swap.size();i++){
+      vec_swap[i].majority_voting+=i;
+    }
+    //add order for WDOA
+    for (int i = 0; i < vec_swap.size(); i++){
+      auto itm = vec_swap[i];
+      for (int j = itm.r_idx; j < itm.d_idx; j++){
+        itm.WDOA += origin_load[i+iteration_length] - mem_limit;
+      }
+    }
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_WDOA_descending()); 
+    for (int i = 0; i < vec_swap.size();i++){
+      vec_swap[i].majority_voting+=i;
+    }
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_majority_voting_ascending()); 
+  }
+
+
+
+  //select block one by one till updated peak load is no larger than limit.
   for (int i=0; i<vec_swap.size(); i++){
-    load_update(tempLoad,vec_swap[i].r_idx_ready,vec_swap[i].d_idx,-1,vec_swap[i].size,maxLen);
+    UpdateLoad(temp_load,vec_swap[i].r_idx_ready,vec_swap[i].d_idx,-1,vec_swap[i].size,iteration_length);
     vec_swap_selct.push_back(vec_swap[i]);
-    auto tempOverLimit_ = load_over_limit(tempLoad,memLimit,0,maxLen,maxLen);
-    cout<<vec_swap[i].r_idx_ready<<","<<vec_swap[i].d_idx<<" ((("<<tempOverLimit_.first<<","<<tempOverLimit_.second<<")))"<<endl;
-    auto max_current = load_peak(tempLoad,maxLen);
-    auto newMaxLoad = max_current.first;
-    if (newMaxLoad < memLimit){
+    auto temp_over_limit_ = GetOptIdxAboveLoadLimit(temp_load,mem_limit,0,iteration_length,iteration_length);
+    auto max_current = GetLoadPeak(temp_load,iteration_length);
+    auto newmax_load = max_current.first;
+    if (newmax_load < mem_limit){
       break;
     }
   }
-  cout<<"=============selection done================"<<endl;
   
   return vec_swap_selct;
 }
 
-vector<double> SwapGPU::swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct){
+vector<double> SwapGPU::GetIdealLoad(vector<double>vec_load,vector<SwapBlock> vec_swap_selct){
+  /*
+  get load_ideal, which is equivalent to load by synchronous swapping.
+  */
   auto vec_load_return = vec_load;
   for (int i =0; i<vec_swap_selct.size(); i++){
     int auto_buffer = 0;
     auto itm = vec_swap_selct[i];
     if (itm.cat == "A2") auto_buffer = data_buffer;
     if (itm.cat == "A3") auto_buffer = mutable_data_buffer;
-    load_update(vec_load_return, itm.r_idx+auto_buffer, itm.d_idx, -1, itm.size, maxLen);
+    UpdateLoad(vec_load_return, itm.r_idx+auto_buffer, itm.d_idx, -1, itm.size, iteration_length);
   }
   return vec_load_return;
 }
 
-void SwapGPU::swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode){
+void SwapGPU::Scheduling(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double mem_limit,string mode){
   /*
-    update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit.
+  Swap Scheduling algo
+  update idx_out_end, idx_in_start 
+  compute overhead time 
+  mode selection: no overhead or stick to limit.
   */ 
-  //TODO(junzhe) wordy, can merge in common part.
+
   overhead = 0;
-  cout<<"----------------swap_sched----------------"<<endl;
+
+  /// mode that stick to the mem_limit
   if (mode == "stick-to-limit"){
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_ascending_swap()); 
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int readyIdx = itm.r_idx_ready;
-      cout<<itm.r_idx<<" ["<<itm.size<<"] ";
-      cout<<readyIdx;
+      int ready_idx = itm.r_idx_ready;
+
       if (i > 0){
-        readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
+        ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end);
       }
-      cout<<" -> "<<readyIdx;
-      itm.i1 = readyIdx;
-      // cout<<"check t1((("<<itm.t1;
-      itm.t1 = vec_run[readyIdx+maxLen].t;
-      // cout<<" "<<readyIdx<<" "<<maxLen<<" || "<<vec_run[readyIdx+maxLen].t<<" "<<itm.t1<<")))";
-      itm.t1p = itm.t1 + SwapOutTime(itm.size);
-      total_swapOutTime+=SwapOutTime(itm.size);
-      while (itm.t1p > vec_run[readyIdx+maxLen].t){ //TODO(junzhe) reduce time complexity.
-        readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead.
+
+      itm.idx_out_start = ready_idx;
+      itm.t_out_start = vec_run[ready_idx+iteration_length].t;
+      itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size);
+      total_swap_out_time+=SwapOutTime(itm.size);
+      while (itm.t_out_end > vec_run[ready_idx+iteration_length].t){ 
+        //ready means when able to finish swapOut, w/ or w/o overhead.
+        ready_idx++; 
       }
-      //get min compare with maxIdx and readyIdx.
-      readyIdx = std::min(maxIdx,readyIdx);
-      cout<<" || "<<readyIdx;
-      //TODO(junzhe) 1st should be desired.
-      load_update(vec_load_temp,readyIdx+1,itm.d_idx,-1,itm.size,maxLen);
-      // load_update(vec_load_temp,itm.r_idx_ready,itm.d_idx,-1,itm.size,maxLen);
-      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      cout<<" ((("<<tempOverLimit_.first<<','<<tempOverLimit_.second<<"))) ";
-      if ((tempOverLimit_.first != -1) && (tempOverLimit_.first <= readyIdx)) { 
-        load_update(vec_load_temp,tempOverLimit_.first-1,readyIdx+1,-1,itm.size,maxLen);
-        // cout<<" ((("<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<readyIdx<<' '<<tempOverLimit_.first<<")))";
-        readyIdx = tempOverLimit_.first - 1; //TODO(junzhe) boundary
-        overhead+=(itm.t1p-vec_run[readyIdx+maxLen].t);
-        cout<<"==== overhead added "<<itm.t1p-vec_run[readyIdx+maxLen].t<<"... ";
-        // cout<<"time spent "<<SwapOutTime(itm.size)<<endl;
-        // cout<<"so time "<<itm.t1<<endl;
-        // cout<<"eo 1 time "<<itm.t1p<<endl;
-        // cout<<"eo 2 time "<<vec_run[readyIdx+maxLen].t<<endl;
 
+      //get min compare with max_idx and ready_idx.
+      ready_idx = std::min(max_idx,ready_idx);
+      UpdateLoad(vec_load_temp,ready_idx+1,itm.d_idx,-1,itm.size,iteration_length);
+      auto temp_over_limit_ = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length);
+      if ((temp_over_limit_.first != -1) && (temp_over_limit_.first <= ready_idx)) { 
+        UpdateLoad(vec_load_temp,temp_over_limit_.first-1,ready_idx+1,-1,itm.size,iteration_length);
+        ready_idx = temp_over_limit_.first - 1; 
+        overhead+=(itm.t_out_end-vec_run[ready_idx+iteration_length].t);
       }
-      cout<<" -> "<<readyIdx<<endl;   
-      itm.i1p = readyIdx;
-      vec_swap_selct[i] = itm;
-      // auto tempOverLimit_1 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      // cout<<"end: overlimit first and i1p "<<tempOverLimit_1.first<<' '<<itm.i1p<<endl;
-      
+      itm.idx_out_end = ready_idx;
+      vec_swap_selct[i] = itm; 
     }
-    cout<<"----------------sched part II-------------"<<endl;
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
+
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_descending_swap());
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      cout<<itm.r_idx<<" ["<<itm.size<<"] ";
-      int needIdx = itm.d_idx;
-      cout<<needIdx;
-      if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
-      cout<<" -> "<<needIdx;
-      itm.i2 = needIdx;
-      double prepareTime = vec_run[needIdx+maxLen].t - SwapInTime(itm.size);
-      total_swapInTime+=SwapInTime(itm.size);
-      while (prepareTime < vec_run[needIdx+maxLen].t){
-        needIdx--;
+      int need_idx = itm.d_idx;
+      if (i > 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); }
+      itm.idx_in_end = need_idx;
+      double prepareTime = vec_run[need_idx+iteration_length].t - SwapInTime(itm.size);
+      total_swap_in_time+=SwapInTime(itm.size);
+      while (prepareTime < vec_run[need_idx+iteration_length].t){
+        need_idx--;
       }
-      needIdx = std::max(needIdx,maxIdx+1);
-      cout<<" || "<<needIdx;
-      itm.i2p = needIdx;
-      itm.t2p = prepareTime;
-      // auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      // cout<<"(((before come back (right over limit): "<<tempOverLimit_2.second<<endl;
-      load_update(vec_load_temp,itm.i2p,itm.d_idx,1,itm.size,maxLen); //TODO(junzhe) range, right boundary
-      auto tempOverLimit_3 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      cout<<" ((("<<tempOverLimit_3.first<<","<<tempOverLimit_3.second<<")))";
-      // cout<<"|||after come back (right over limit): "<<tempOverLimit_3.second<<endl;
-      // if (tempOverLimit_3.second > 0){
-      //   cout<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<itm.i1p<<' '<<itm.i2p<<' '<<itm.i2<<")))"<<endl;
-      // }
-
-      if ((tempOverLimit_3.second != -1) && (vec_run[tempOverLimit_3.second+maxLen].t > itm.t2p)) {
-        overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p);
-        cout<<"==== overhead added "<<vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p<<"... ";
-        load_update(vec_load_temp,itm.i2p,tempOverLimit_3.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
-        itm.i2p = tempOverLimit_3.second+1;
-        auto tempOverLimit_4 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      need_idx = std::max(need_idx,max_idx+1);
+      itm.idx_in_start = need_idx;
+      itm.t_in_start = prepareTime;
+     UpdateLoad(vec_load_temp,itm.idx_in_start,itm.d_idx,1,itm.size,iteration_length); 
+      auto temp_over_limit_3 = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length);
+
+      if ((temp_over_limit_3.second != -1) && (vec_run[temp_over_limit_3.second+iteration_length].t > itm.t_in_start)) {
+        overhead+=(vec_run[temp_over_limit_3.second+iteration_length].t - itm.t_in_start);
+        UpdateLoad(vec_load_temp,itm.idx_in_start,temp_over_limit_3.second+1,-1,itm.size,iteration_length);
+        itm.idx_in_start = temp_over_limit_3.second+1;
+        auto temp_over_limit_4 = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length);
       }
-      cout<<" -> "<<itm.i2p<<endl;
-      //cout<<"after consider overlimit (right over limit): "<<tempOverLimit_4.second<<endl;
       vec_swap_selct[i] = itm;
     }
-    cout<<":::::END OF SCHED, overhead is "<<overhead<<endl;
   }///end of first mode.
 
 
-  ///this mode not really in use, for test purpose only.
+  ///mode that incurs zero overhead
   if (mode == "no-overhead"){
-    //update i1p
-    //sort by r_idx for i1p update
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+    //update idx_out_end
+    //sort by r_idx for idx_out_end update
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_ascending_swap()); 
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int readyIdx = 0;
-      if (itm.cat == "A1") { readyIdx = itm.r_idx; }
-      if (itm.cat == "A2") { readyIdx = itm.r_idx + data_buffer; }
-      if (itm.cat == "A3") { readyIdx = itm.r_idx + mutable_data_buffer; }
+      int ready_idx = 0;
+      if (itm.cat == "A1") { ready_idx = itm.r_idx; }
+      if (itm.cat == "A2") { ready_idx = itm.r_idx + data_buffer; }
+      if (itm.cat == "A3") { ready_idx = itm.r_idx + mutable_data_buffer; }
 
       if (i > 0){
-        readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
+        ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end);
       }
-      itm.i1 = readyIdx;
-      itm.t1 = vec_run[readyIdx].t;
-      itm.t1p = itm.t1 + SwapOutTime(itm.size);
-      while (itm.t1p > vec_run[readyIdx].t){
-        readyIdx++;
+      itm.idx_out_start = ready_idx;
+      itm.t_out_start = vec_run[ready_idx].t;
+      itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size);
+      while (itm.t_out_end > vec_run[ready_idx].t){
+        ready_idx++;
       }
-      itm.i1p = readyIdx;
+      itm.idx_out_end = ready_idx;
       vec_swap_selct[i] = itm;
     }
-    //update i2p
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
+    //update idx_in_start
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_descending_swap());
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int needIdx = itm.d_idx;
-      if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
-      itm.i2 = needIdx;
-      double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size);
-      while (prepareTime < vec_run[needIdx].t){
-        needIdx--;
+      int need_idx = itm.d_idx;
+      if (i > 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); }
+      itm.idx_in_end = need_idx;
+      double prepareTime = vec_run[need_idx].t - SwapInTime(itm.size);
+      while (prepareTime < vec_run[need_idx].t){
+        need_idx--;
       }
-      itm.i2p = needIdx;
-      itm.t2p = prepareTime;
+      itm.idx_in_start = need_idx;
+      itm.t_in_start = prepareTime;
       vec_swap_selct[i] = itm;
-      load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+      UpdateLoad(vec_load_temp,itm.idx_out_end,itm.idx_in_start+1,-1,itm.size,iteration_length);
     }
 
   }
@@ -548,365 +532,219 @@ void SwapGPU::swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_lo
 }
 
 
-void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
+void SwapGPU::BuildMetaTables(vector<SwapBlock>vec_swap_selct){
+  /*
+  construct tables: table_sched, and table_meta
+  */
   cudaStream_t stream1;
   cudaStream_t stream2;
-  cout<<"---------------print all 1, 1', 2', 2-----------"<<endl;
-  sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
-  //for each swap select, make Table_sched and Table_meta
+  sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_ascending_swap()); 
+  //for each swap select, make table_sched and table_meta
   // for (int i = static_cast<int>(vec_swap_selct.size()-1);i>=0; i--){
   for (int i =0; i<vec_swap_selct.size(); i++){
     auto itm = vec_swap_selct[i];
-    // if (itm.r_idx >= 0){
-    //TODO(junzhe) for time being, remove negative r_idx itms.
-      cout<<itm.r_idx<<" || "<<itm.i1<<" "<<itm.i1p<<" "<<itm.i2p<<" "<<itm.i2<<endl;
-      //i1 swap
-      if (Table_sched.find(itm.i1) == Table_sched.end()){
-        Table_sched[itm.i1] = std::make_tuple(itm.r_idx,0,-1,-1);
-      } else {
-        std::get<0>(Table_sched.find(itm.i1)->second) = itm.r_idx;
-        std::get<1>(Table_sched.find(itm.i1)->second) = 0;
-      }
-      //i2p swap
-      if (Table_sched.find(itm.i2p) == Table_sched.end()){
-        Table_sched[itm.i2p] = std::make_tuple(itm.r_idx,1,-1,-1);      
-      } else {
-        std::get<0>(Table_sched.find(itm.i2p)->second) = itm.r_idx;
-        std::get<1>(Table_sched.find(itm.i2p)->second) = 1;
-      }
-      // i1p sync
-      if (Table_sched.find(itm.i1p) == Table_sched.end()){
-        Table_sched[itm.i1p] = std::make_tuple(-1,-1,itm.r_idx,0);
-      } else {
-        std::get<2>(Table_sched.find(itm.i1p)->second) = itm.r_idx;
-        std::get<3>(Table_sched.find(itm.i1p)->second) = 0; 
-      }
-      //i2 sync
-      if (Table_sched.find(itm.i2) == Table_sched.end()){
-        Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1);
-      } else {
-        std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx;
-        std::get<3>(Table_sched.find(itm.i2)->second) = 1;
-      }
 
-      ///Make Table_meta
-      void* tempPtr = nullptr;
-      cudaMallocHost(&tempPtr,itm.size); //pinned memory.
-      BlockMeta meta;
-      meta.size = itm.size;
-      meta.cpu_ptr = tempPtr;
-      meta.out_stream = stream1;
-      meta.in_stream = stream2;
-      //meta.last_out_idx = vec_swap_selct[i].last_out_idx;
-      //meta.last_in_idx = vec_swap_selct[i].last_in_idx;
-      //meta.i2 = vec_swap_selct[i].i2;
-      Table_meta[itm.r_idx] = meta;
-    // }
-
-  }
-  cout<<"---------------print all 1, 1', 2', 2-----------DONE"<<endl;
-  cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
-  cout<<"size of Table_sched =================="<<Table_sched.size()<<endl;
-  cout<<"print Table_sched, idx, r_idx, sync, direction"<<endl;
-  for (int i = -500; i<maxLen; i++){
-    if (!(Table_sched.find(i) == Table_sched.end())){
-      cout<<i<<"-->";
-      cout<<std::get<0>(Table_sched.find(i)->second)<<" ";
-      cout<<std::get<1>(Table_sched.find(i)->second)<<" ";
-      cout<<std::get<2>(Table_sched.find(i)->second)<<" ";
-      cout<<std::get<3>(Table_sched.find(i)->second)<<endl;
+    if (table_sched.find(itm.idx_out_start) == table_sched.end()){
+      table_sched[itm.idx_out_start] = std::make_tuple(itm.r_idx,0,-1,-1);
+    } else {
+      std::get<0>(table_sched.find(itm.idx_out_start)->second) = itm.r_idx;
+      std::get<1>(table_sched.find(itm.idx_out_start)->second) = 0;
     }
+    //idx_in_start swap
+    if (table_sched.find(itm.idx_in_start) == table_sched.end()){
+      table_sched[itm.idx_in_start] = std::make_tuple(itm.r_idx,1,-1,-1);      
+    } else {
+      std::get<0>(table_sched.find(itm.idx_in_start)->second) = itm.r_idx;
+      std::get<1>(table_sched.find(itm.idx_in_start)->second) = 1;
+    }
+    // idx_out_end sync
+    if (table_sched.find(itm.idx_out_end) == table_sched.end()){
+      table_sched[itm.idx_out_end] = std::make_tuple(-1,-1,itm.r_idx,0);
+    } else {
+      std::get<2>(table_sched.find(itm.idx_out_end)->second) = itm.r_idx;
+      std::get<3>(table_sched.find(itm.idx_out_end)->second) = 0; 
+    }
+    //i2 sync
+    if (table_sched.find(itm.idx_in_end) == table_sched.end()){
+      table_sched[itm.idx_in_end] = std::make_tuple(-1,-1,itm.r_idx,1);
+    } else {
+      std::get<2>(table_sched.find(itm.idx_in_end)->second) = itm.r_idx;
+      std::get<3>(table_sched.find(itm.idx_in_end)->second) = 1;
+    }
+
+    ///Make table_meta
+    void* temp_ptr = nullptr;
+    cudaMallocHost(&temp_ptr,itm.size); //pinned memory.
+    BlockMeta meta;
+    meta.size = itm.size;
+    meta.cpu_ptr = temp_ptr;
+    meta.out_stream = stream1;
+    meta.in_stream = stream2;
+    table_meta[itm.r_idx] = meta;
   }
 
 }
 
-void SwapGPU::swap_update_tables(Block* tempBlock_){
-  // update Table_meta's block_ and data_; update once atfer swap test is passed.
-  // enable to update negative r_idx. 
-  // it's safe in below procedure, as r_gc and r_gc_n should never be the same.
-  if (testFlag == 1) {
+void SwapGPU::UpdateMetaTables(Block* block_ptr){
+  /*
+  update table_meta's block_ and data_; update once atfer swap test is passed.
+  enable to update negative r_idx. 
+  it's safe in below procedure, as r_global_index and relative_counter should never be the same.
+  */
+
+  if (past_test_flag == 1) {
     //update positive r_idx
-    int r_gc = (gc-location)%maxLen;
-    if (!(Table_meta.find(r_gc)==Table_meta.end())){
-      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
-      //TODO(junzhe) verify the length change, if go in, value update
-      // cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
-      Table_meta.find(r_gc)->second.block_ = tempBlock_;
-      Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data();
+    int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length;
+    if (!(table_meta.find(r_global_index)==table_meta.end())){
+     table_meta.find(r_global_index)->second.block_ = block_ptr;
+      table_meta.find(r_global_index)->second.data_ = block_ptr->get_data();
     }
 
     //update negative r_idx
-    int r_gc_n = r_gc - maxLen;
-    if (!(Table_meta.find(r_gc_n)==Table_meta.end())){
-      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
-      //TODO(junzhe) verify the length change, if go in, value update
-      // cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
-      Table_meta.find(r_gc_n)->second.block_ = tempBlock_;
-      Table_meta.find(r_gc_n)->second.data_ = tempBlock_->get_data();
+    int relative_counter = r_global_index - iteration_length;
+    if (!(table_meta.find(relative_counter)==table_meta.end())){
+      table_meta.find(relative_counter)->second.block_ = block_ptr;
+      table_meta.find(relative_counter)->second.data_ = block_ptr->get_data();
     }
   }
 
 }
 
-int SwapGPU::swap_test(vector<string>vec_block,int &maxLen, int &location){
+int SwapGPU::Detection(vector<string>vec_block,int &iteration_length, int &location_of_2nd_iteration){
+  /*
+  test repeatability, detect iteration, and return global_index_threshold.
+  */
+
+  ///vec_str (vec_block) to vec_opt_info, sort by ptr and idx.
+  int idx_range = 0; 
+  vector<DeviceOptInfo> vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range);
 
-  ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx.
-  int idxRange = 0;
-  vector<onePieceMsg> vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange);
-  cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
   ///rep test
-  vector<size_t> vec_rep = Swap_piece2rep(vec_pieceMsg);
-  //int idxRange3=0; //rename TODO(junzhe)
-  //int maxLen=0, location =0;
-  repPatternDetector(vec_rep,maxLen,location,maxLen_threshold,gc);
-  cout<<"maxLen and location are: "<<maxLen<<' '<<location<<endl;
-  cout<<"test rep"<<endl;
-  //Note here location not exactly start of one iteration, 
+  vector<size_t> vec_rep = DeviceOptSeqRepeatableTestPreProcess(vec_opt_info);
+  RepeatableTest(vec_rep,iteration_length,location_of_2nd_iteration,iteration_length_threshold,global_index);
+
+  //Note here location_of_2nd_iteration not exactly start of one iteration, 
   //adjust to nearly start of one by restricting "Malloc"
   int shift_counter = 0;
-  for (int i=0;i<maxLen;i++){
-    vector<string> v = swap_split(vec_block[location+i], " ");
+  for (int i=0;i<iteration_length;i++){
+    vector<string> v = SplitOptString(vec_block[location_of_2nd_iteration+i], " ");
     if (v[0]=="Malloc"){
       shift_counter = i; 
       break;
     }
   }
-  location =location+shift_counter;
-  cout<<"shift_counter is "<<shift_counter<<endl;
-  cout<<"location changed to "<<location<<endl;
+  location_of_2nd_iteration =location_of_2nd_iteration+shift_counter;
 
-  if (maxLen<maxLen_threshold) {return -1;}
+  if (iteration_length<iteration_length_threshold) {return -1;}
 
-  return gc+maxLen-(gc-location)%maxLen;
+  return global_index+iteration_length-(global_index-location_of_2nd_iteration)%iteration_length;
 } 
 
-void SwapGPU::swap_plan(){
-  cout<<":::::::::::::::::::::::::start swap_plan()"<<endl;
+void SwapGPU::Plan(){
+  /*
+  major stream of functions: from make candidate blocks, selection swaps, make tables, etc.
+  */
 
-  int idxRange = 0;
-  vector<onePieceMsg> vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange);
-  cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
-  sort(vec_pieceMsg.begin(),vec_pieceMsg.end(),less_than_Idx());
+  int idx_range = 0;
+  vector<DeviceOptInfo> vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range);
+  sort(vec_opt_info.begin(),vec_opt_info.end(),sort_by_idx_ascending());
+  
   // scale down idx, to middle iteration.
-  tempTime_baseline = vec_pieceMsg[three_more_location].t;
-  for (int i=0; i<vec_pieceMsg.size();i++){
-    vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - three_more_location - maxLen;
-    vec_pieceMsg[i].t = vec_pieceMsg[i].t - tempTime_baseline;
+  temp_time_baseline = vec_opt_info[location_of_5th_iteration].t;
+  for (int i=0; i<vec_opt_info.size();i++){
+    vec_opt_info[i].idx = vec_opt_info[i].idx - location_of_5th_iteration - iteration_length;
+    vec_opt_info[i].t = vec_opt_info[i].t - temp_time_baseline;
   }
 
   // build opsSqn, and sizeSqn
-  // cout<<"------printing sequenc--------"<<endl;
-  vector<onePieceMsg>one_itr(&vec_pieceMsg[location+4*maxLen],&vec_pieceMsg[location+5*maxLen]);
+  vector<DeviceOptInfo>one_itr(&vec_opt_info[location_of_2nd_iteration+4*iteration_length],&vec_opt_info[location_of_2nd_iteration+5*iteration_length]);
   for (int i =0; i<one_itr.size();i++){
-    opsSequence.push_back(one_itr[i].MallocFree);
-    sizeSequence.push_back(one_itr[i].size);
-    // cout<<one_itr[i].MallocFree<<' '<<one_itr[i].size<<endl;
+    operation_sequence.push_back(one_itr[i].operation_type);
+    size_sequence.push_back(one_itr[i].size);
   }
-  //3 iterations of vec_run and vec_load, maxIdx and maxLoad
-  vector<onePieceMsg>temp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]);
+  
+  //3 iterations of vec_run and vec_load, max_idx and max_load
+  vector<DeviceOptInfo>temp_vec_run(&vec_opt_info[location_of_2nd_iteration+3*iteration_length],&vec_opt_info[location_of_2nd_iteration+6*iteration_length]);
   vec_run = temp_vec_run;
-  fstream file_vec_run("vec_run36.csv", ios::in|ios::out|ios::app);
-  for (int i =0; i<vec_run.size();i++){
-    //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    // file_vec_run<<i<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    // tempTime2 = vec_run[i].t;
-    file_vec_run<<i-maxLen<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].size<<' '<<vec_run[i].t<<endl;
-  }
 
-  vector<onePieceMsg>temp_vec_run2(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]);
+  vector<DeviceOptInfo>temp_vec_run2(&vec_opt_info[location_of_2nd_iteration],&vec_opt_info[location_of_2nd_iteration+3*iteration_length]);
   auto vec_run2 = temp_vec_run2;
-  fstream file_vec_run2("vec_run03.csv", ios::in|ios::out|ios::app);
-  for (int i =0; i<vec_run2.size();i++){
-    //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    // file_vec_run2<<i<<' '<<vec_run2[i].t<<' '<<vec_run2[i].t-tempTime2<<endl;
-    // tempTime2 = vec_run[i].t;
-    file_vec_run2<<i<<' '<<vec_run2[i].MallocFree<<' '<<vec_run2[i].size<<endl;
-  }
 
-  vector<double>vec_load(&global_load[location],&global_load[location+3*maxLen]);
+
+  vector<double>vec_load(&global_load[location_of_2nd_iteration],&global_load[location_of_2nd_iteration+3*iteration_length]);
   origin_load = vec_load;
-  //3 iterations
-  fstream file_load_origin("load_origin03.csv", ios::in|ios::out|ios::app);
-  for (int i=0; i<origin_load.size(); i++){
-    file_load_origin<<i<<' '<<origin_load[i]<<endl;
-  }
-  
-  // vector<double>vec_load2(&global_load[location+3*maxLen],&global_load[location+6*maxLen]);
-  // auto origin_load2 = vec_load2;
-  // //3 iterations
-  // fstream file_load_origin2("load_origin36.csv", ios::in|ios::out|ios::app);
-  // for (int i=0; i<origin_load2.size(); i++){
-  //   file_load_origin2<<i<<" "<<origin_load2[i]<<endl;
-  // }
-
-  // //one iteration.
-  // fstream file_load_current("load_current.csv", ios::in|ios::out|ios::app);
-  // for (int i=0; i<maxLen; i++){
-  //   file_load_current<<vec_load[i]<<endl;
-  // }
-  cout<<"build from scratch:::::::::::::::;"<<endl;
-  auto max_current = load_peak(vec_load,maxLen);
-  maxLoad = max_current.first;
-  maxIdx = max_current.second;
-  cout<<"------------------print max_load: (current) "<<maxLoad<<" "<<maxIdx<<endl;
 
+  auto max_current = GetLoadPeak(vec_load,iteration_length);
+  max_load = max_current.first;
+  max_idx = max_current.second;
 
   //sort by ptr & idx, sorting the duplicate
   auto vec_run_dup = vec_run;
-  sort(vec_run_dup.begin(),vec_run_dup.end(),less_than_ptrIdx());
+  sort(vec_run_dup.begin(),vec_run_dup.end(),sort_by_ptr_idx_ascending());
+  
   ///formulate swappable items.
-  cout<<"==============================print swappable items, with maxIdx "<<maxIdx<<endl;
   vector<SwapBlock>vec_swap;
-  // size_t load_swap = 0;
+
   for (int i =1; i<vec_run_dup.size(); i++){
-    //SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
-    //ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
-    if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idx<maxIdx) && (vec_run_dup[i].idx>maxIdx) 
+    //SwapBlock(string p, size_t s, int idx_out_start, int i2, double t1, double t2): 
+    //ptr(p), size(s), r_idx(idx_out_start),d_idx(i2),r_time(t1), d_time(t2) {}
+    if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idx<max_idx) && (vec_run_dup[i].idx>max_idx) 
       && (vec_run_dup[i-1].ptr ==vec_run_dup[i].ptr) 
-      && ((vec_run_dup[i-1].MallocFree==3) or (vec_run_dup[i-1].MallocFree==2) or (vec_run_dup[i-1].MallocFree==4)))
+      && ((vec_run_dup[i-1].operation_type==3) or (vec_run_dup[i-1].operation_type==2) or (vec_run_dup[i-1].operation_type==4)))
     {
       SwapBlock itm(vec_run_dup[i].ptr, vec_run_dup[i].size, vec_run_dup[i-1].idx, vec_run_dup[i].idx, vec_run_dup[i-1].t, vec_run_dup[i].t);
-      itm.dto = itm.d_time-itm.r_time;
-      itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size);
-      if (itm.dt>=0){
-        itm.pri = itm.dt * itm.size;
+      itm.DOA_origin = itm.d_time-itm.r_time;
+      itm.DOA = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size);
+      if (itm.DOA>=0){
+        itm.AOA = itm.DOA * itm.size;
       } else {
-        itm.pri = itm.dt * 1/itm.size;
+        itm.AOA = itm.DOA * 1/itm.size;
       }
       //cat A
-      if (vec_run_dup[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
-      if (vec_run_dup[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
-      if (vec_run_dup[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
+      if (vec_run_dup[i-1].operation_type == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
+      if (vec_run_dup[i-1].operation_type == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
+      if (vec_run_dup[i-1].operation_type == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
 
       vec_swap.push_back(itm);
-      // load_swap+=itm.size;
-      cout<<itm.size<<" ";
-      cout<<"Items Swappable: (r_idx, d_idx, cat, MB, dt/us, PS) || "<<itm.r_idx<<' '<<itm.d_idx;
-      cout<<" ||  "<<itm.cat<<"  || "<<(float)(itm.size)/(float)(1024*1024);
-      cout<<' || '<<itm.dt/1000<<' '<<itm.pri<<endl;
     } 
   }
-  cout<<"size vec_swap: "<<vec_swap.size()<<endl;
 
-  ///load ideal, swap all vec_swap, lest possible memory by one-swap
-  auto vec_load_ideal = swap_load_ideal(vec_load,vec_swap);
+  ///load ideal, swap all vec_swap, lest possible memory by one-swap, for data collection only.
+  auto vec_load_ideal = GetIdealLoad(vec_load,vec_swap);
   fstream file_load_ideal("load_ideal.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
+  for (int i=iteration_length; i<iteration_length*2; i++){
     file_load_ideal<<vec_load_ideal[i]<<endl;
   }
 
-  auto max_ideal = load_peak(vec_load_ideal,maxLen);
-  size_t maxLoad_ideal = max_ideal.first;
-  int maxIdx_ideal = max_ideal.second;
-  cout<<"------------------print max_load: (ideal) "<<maxLoad_ideal<<" "<<maxIdx_ideal<<endl;
-  //maxLoad_ideal = 400;
-  /// select till maxLoad_ideal, dto
-  // auto vec_swap_dto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"dto");
-  // cout<<"size of vec_swap_dto: "<<vec_swap_dto.size()<<endl;
-  // auto vec_load_dto_ideal = swap_load_ideal(vec_load,vec_swap_dto);
-  //   fstream file_load_dto_ideal("load_dto_ideal.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_load_dto_ideal<<vec_load_dto_ideal[i]<<endl;
-  // }
-  // auto tempMax_ = load_peak(vec_load_dto_ideal,maxLen);
-  // cout<<"------------------print max_load: (dto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  // /// select till maxLoad_ideal, pri
-  // auto vec_swap_pri = swap_select(vec_swap,maxLoad,maxLoad_ideal,"pri");
-  // cout<<"size of vec_swap_pri: "<<vec_swap_dto.size()<<endl;
-  // auto vec_load_pri_ideal = swap_load_ideal(vec_load,vec_swap_pri);
-  //   fstream file_load_pri_ideal("load_pri_ideal.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_load_pri_ideal<<vec_load_pri_ideal[i]<<endl;
-  // }
-  // tempMax_ = load_peak(vec_load_pri_ideal,maxLen);
-  // cout<<"------------------print max_load: (pri ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  /// select till maxLoad_ideal, wdto
-  auto tempLoad = origin_load;
-  auto memLimit_wdto = 550<<20;
-  //TODO(junzhe) memLimit = maxLoad_ideal*1.4
-  auto vec_swap_wdto = swap_select(vec_swap,tempLoad,memLimit_wdto,"wdto");
-  // vec_swap_selct_global = vec_swap_wdto;
-  cout<<"size of vec_swap_wdto: "<<vec_swap_wdto.size()<<endl;
-  // auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
-    // fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-    // file_load_wdto_ideal<<vec_load_wdto_ideal[i]<<endl;
-  // }
-  // auto tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
-  // cout<<"------------------print max_load: (wdto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  /// load_1 no overhead, based on pri
-  //auto vec_swap_pri = vec_swap_pri;
-  // auto vec_load_pri = origin_load;
-  // auto vec_load_dto = origin_load;
-  auto vec_load_wdto = origin_load;
+  auto max_ideal = GetLoadPeak(vec_load_ideal,iteration_length);
+  size_t max_load_ideal = max_ideal.first;
+  int max_idx_ideal = max_ideal.second;
+
+  /// majority voting, can specify mode here, can specify load_limit
+  auto temp_load = origin_load;
+  auto mem_limit_majority_voting = 550<<20;
+  auto vec_swap_majority_voting = SelectBlock(vec_swap,temp_load,mem_limit_majority_voting,"majority_voting");
+  // vec_swap_selct_global = vec_swap_majority_voting;
+
+  auto vec_load_WDOA = origin_load;
   string mode = "stick-to-limit";
-  // double overhead_pri = 0;
-  // swap_sched(vec_swap_pri, vec_load_pri,overhead_pri,450<<20, mode);
-  
-  // double overhead_dto = 0;
-  // swap_sched(vec_swap_dto, vec_load_dto,overhead_dto,450<<20,mode);
-  
-  double overhead_wdto = 0;
-  swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,memLimit_wdto,mode);
-
-  swap_construct_tables(vec_swap_wdto);
-
-
-  // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_block10<<vec_load_pri[i]<<endl;
-  // }
-  // fstream file_block11("load_1_dto.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_block11<<vec_load_dto[i]<<endl;
-  // }
-  // fstream file_block12("load_1_wdto.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_block12<<vec_load_wdto[i]<<endl;
-  // }
-  //TODO(junzhe) below verification to be done later.
-  // auto max_1 = load_peak(vec_load_pri,maxLen);
-  // size_t maxLoad_1 = max_1.first;
-  // int maxIdx_1 = max_1.second;
-  // cout<<"------------------print max_load: (1) "<<maxLoad_1<<" "<<maxIdx_1<<endl;
-  // //change back order by Idx.
-  // sort(vec_run.begin(),vec_run.end(),less_than_Idx());
-  // cout<<"done with swap_plan..."<<endl;
-  // cout<<"load 2 overhead pri: "<<(float)(overhead_pri)/(float)(1000000)<<endl;
-  // cout<<"load 2 overhead dto: "<<(float)(overhead_dto)/(float)(1000000)<<endl;
-  // cout<<"load 2 overhead wdto: "<<(float)(overhead_wdto)/(float)(1000000)<<endl;
-  // cout<<"total_swapOutTime: "<<(float)(total_swapOutTime)/(float)(3000000)<<endl;
-  // cout<<"total_swapInTime: "<<(float)(total_swapInTime)/(float)(3000000)<<endl;
-  // auto t1 = vec_run[0].t;
-  // auto t2 = vec_run[maxLen].t;
-  // auto t3 = vec_run[maxLen*2].t;
-  // auto t4 = vec_run[maxLen*3-1].t;
-  // cout<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
-  // fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-  // file_time<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
-  // file_time<<"iteration time spent: "<<t2-t1<<" "<<t3-t2<<" "<<t4-t3<<endl;
-  // file_time<<"idx "<<0<<" "<<maxLen<<" "<<maxLen*2<<" "<<maxLen*3-1<<endl;
-  // file_time<<"value "<<t1<<" "<<t2<<" "<<t3<<" "<<t4<<endl;
-}
 
+  double overhead_WDOA = 0;
+  Scheduling(vec_swap_majority_voting, vec_load_WDOA,overhead_WDOA,mem_limit_majority_voting,mode);
 
+  BuildMetaTables(vec_swap_majority_voting);
 
+}
 
 
 
 SwapGPU::~SwapGPU() {
-  //print out push-info TODO(junzhe) can remove
+  //print out push-info
   fstream file_block_full("vec_block_full.csv", ios::in|ios::out|ios::app);
   for (int i =0; i<vec_block.size();i++){
     file_block_full<<vec_block[i]<<endl;
   }
 
-  fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-  file_time<<"=============================="<<endl;
-  //main body
   if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
   if (ctx_.curand_generator)
     CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
@@ -923,7 +761,6 @@ SwapGPU::SwapGPU(int id) : Device(id, kNumCudaStream) {
 
   MemPoolConf conf;
   conf.add_device(id);
-  //TODO(junzhe) note that it has been <Swap> for building SwapGPU, which doesnt matter.
   pool_ = std::make_shared<SwapPool>(conf); 
   Setup();
 
@@ -988,21 +825,21 @@ void* SwapGPU::Malloc(int size) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Malloc((void**)&ptr, size);
 
-    ///append vec_block_mf
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-      && ((gc - maxLen) >= three_more_globeCounter)){
-      string tempStr1 ="Malloc ";
+    ///append vec_block_mf:for swap & pool
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+      && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+      string temp_str1 ="Malloc ";
       stringstream strm2;
       strm2<<ptr;
-      string tempStr2 = strm2.str();
+      string temp_str2 = strm2.str();
       stringstream strm3;
       strm3<<size;
-      string tempStr3 = strm3.str();
-      string temp = tempStr1+tempStr2+" "+tempStr3;
+      string temp_str3 = strm3.str();
+      string temp = temp_str1+temp_str2+" "+temp_str3;
       vec_block_mf.push_back(temp);
     }
     //record mf semantics after swap plan done
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
       fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
       file_mf_one_itr<<"Malloc "<<ptr<<" "<<size;
       file_mf_one_itr<<endl;
@@ -1010,7 +847,6 @@ void* SwapGPU::Malloc(int size) {
     // TODO(wangwei) remove the memset.
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
-  //cout<<"malloc done"<<endl;
   return ptr;
 }
 
@@ -1020,413 +856,318 @@ void SwapGPU::Free(void* ptr) {
   if (ptr != nullptr) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Free(ptr);
-    ///append vec_block_mf
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-      && ((gc - maxLen) >= three_more_globeCounter)){
-      string tempStr1 ="Free ";
+    ///append vec_block_mf: for swap & pool
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+      && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+      string temp_str1 ="Free ";
       stringstream strm2;
       strm2<<ptr;
-      string tempStr2 = strm2.str();
-      string temp = tempStr1+tempStr2;
+      string temp_str2 = strm2.str();
+      string temp = temp_str1+temp_str2;
       vec_block_mf.push_back(temp);
     }
 
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
       fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
       file_mf_one_itr<<"Free "<<ptr<<endl;
     }
   }
 
-  //cout<<"free done"<<endl; 
 }
 
-void SwapGPU::Test_sched_switch_swap(){
+void SwapGPU::DetectionPlan(){
   /*
-    v1: do Test_sched_switch_swap during (before) Malloc and Free.
-    swap removed to DeploySwap
-    v2: test after every index, at Append. order and index changed.
+    test after every index, at Append. order and index changed.
   */
   ///test iteration
-  if (((gc+1)%(maxLen_threshold) == 0) && (asyncSwapFlag == 0) && (testFlag == 0)){
-    //TODO(junzhe) not lean, chances are globeCounter found more than 300 idx ago: redudant test.
-    cout<<"gc, GC and vec_len before test: "<<gc<<' '<<globeCounter<<' '<<vec_block.size()<<endl;
-    globeCounter = swap_test(vec_block,maxLen,location);
-    maxLen_threshold = std::max(maxLen_threshold,gc/10);
-    maxLen_threshold = std::min(2000,maxLen_threshold);
-    if (maxLen > maxLen_threshold) {
-      testFlag = 1;
-      three_more_globeCounter = globeCounter + 3*maxLen;
-      three_more_location = location + 3*maxLen;
-      cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<<endl;
-      cout<<"impt numbers (maxLen, location, GC) "<<maxLen<<' '<<location<<' '<<globeCounter<<endl;
-      
+  if (((global_index+1)%(iteration_length_threshold) == 0) && (async_swap_flag == 0) && (past_test_flag == 0)){
+    global_index_threshold = Detection(vec_block,iteration_length,location_of_2nd_iteration);
+    iteration_length_threshold = std::max(iteration_length_threshold,global_index/10);
+    iteration_length_threshold = std::min(2000,iteration_length_threshold);
+    if (iteration_length > iteration_length_threshold) {
+      past_test_flag = 1;
+      three_more_iteration_global_index_threshold = global_index_threshold + 3*iteration_length;
+      location_of_5th_iteration = location_of_2nd_iteration + 3*iteration_length;      
    }
  }
  ///switch flag; next idx
- if ((gc+1) == three_more_globeCounter){
-    swap_plan();
-    asyncSwapFlag = 1;
-    // vector<double>vec_load2(&global_load[three_more_location],&global_load[three_more_location+3*maxLen]);
-    // origin_load = vec_load2;
-    // //load before swap, write in
-    // fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
-    // for (int i=0; i<origin_load.size(); i++){
-    //   file_load_origin<<origin_load[i]<<endl;
-    // }
-    cout<<"switched flag for at "<<three_more_globeCounter<<endl;
-    cout<<"maxLen is "<<maxLen<<endl;
-    cout<<"globeCounter "<<globeCounter<<endl;
+ if ((global_index+1) == three_more_iteration_global_index_threshold){
+    Plan();
+    async_swap_flag = 1;
  }
 }
 
-void SwapGPU::MakeMetaTable(Block* block_,void* data_,int size){
+void SwapGPU::AppendAfterMalloc(Block* block_ptr,void* data_ptr,int size){
   /*
-  Append info right after Malloc; make block_ - data_ pair wise table.
+  Append info right after Malloc; make block_ptr - data_ptr pair wise table.
+  as Block* is not available till Malloc() done.
   */
 
   //append info
   stringstream strm1;
   strm1<<size;
-  string tempStr1 = strm1.str();
+  string temp_str1 = strm1.str();
   stringstream strm3;
-  strm3<<block_;
-  string tempStr3 = strm3.str();
+  strm3<<block_ptr;
+  string temp_str3 = strm3.str();
   stringstream strm4;
   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
   strm4<<t2;
-  string tempStr4 = strm4.str();
-  string blockInfo ="Malloc "+tempStr3+" "+tempStr1+" "+tempStr4;
-  Append(blockInfo);
-
+  string temp_str4 = strm4.str();
+  string block_info ="Malloc "+temp_str3+" "+temp_str1+" "+temp_str4;
+  Append(block_info);
   
 }
 
 void SwapGPU::DeploySwap(){
-   ///swap and sync as per schedule.
-  int r_gc = (gc-location)%maxLen; 
-  int r_gc_n = r_gc - maxLen;
-
-  if (asyncSwapFlag == 1){
-    if ((gc < three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) {
-      cout<<"condition A"<<endl;
-      DeploySwap_exec(r_gc_n);
+  /*
+  swap and sync as per schedule, at every index, by calling DeploySwapExec()
+  */
+
+  int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length; 
+  int r_global_index_n = r_global_index - iteration_length;
+
+  if (async_swap_flag == 1){
+    if ((global_index < three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index_n) == table_sched.end()))) {
+      DeploySwapExec(r_global_index_n);
     }
-    if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) {
-      cout<<"condition B"<<endl;
-      DeploySwap_exec(r_gc_n);
+    if ((global_index >= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index_n) == table_sched.end()))) {
+      DeploySwapExec(r_global_index_n);
     }
-    if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc) == Table_sched.end()))) {
-      cout<<"condition C"<<endl;
-      DeploySwap_exec(r_gc);
+    if ((global_index >= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index) == table_sched.end()))) {
+      DeploySwapExec(r_global_index);
     }
   }
 }
 
 
-void SwapGPU::DeploySwap_exec(int r_gc){
-  cout<<"--------sched action at "<<r_gc<<endl;
-  auto swap_idx = std::get<0>(Table_sched.find(r_gc)->second);
-  auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second);
-  auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second);
-  auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second);
+void SwapGPU::DeploySwapExec(int r_global_index){
+  //execute DeploySwap 
+  auto swap_idx = std::get<0>(table_sched.find(r_global_index)->second);
+  auto swap_dir = std::get<1>(table_sched.find(r_global_index)->second);
+  auto sync_idx = std::get<2>(table_sched.find(r_global_index)->second);
+  auto sync_dir = std::get<3>(table_sched.find(r_global_index)->second);
   if (swap_dir == 0){ 
-    SwapOut_idx(swap_idx); 
-    cout<<"----Swap Out "<<swap_idx<<endl;
+    SwapOut(swap_idx); 
   }
   if (swap_dir == 1){ 
-    SwapIn_idx(swap_idx); 
-    cout<<"----Swap In "<<swap_idx<<endl;
+    SwapIn(swap_idx); 
   }
-  //TODO(junzhe) verify sync what else to be done
   if (sync_dir == 0){
     ///sync swap-out, including sync, update block's data_ to nullptr, free data_, update meta.
-    auto last_meta = Table_meta.find(sync_idx)->second;
+    auto last_meta = table_meta.find(sync_idx)->second;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaEventSynchronize(last_meta.in_event);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
 
-    Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
+    table_not_at_device[last_meta.block_] = sync_idx;
 
     last_meta.block_->update_data(nullptr);
-    // cout<<"to free data_"<<last_meta.data_<<endl;
     pool_->Free(last_meta.data_);
     ///append vec_block_mf
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-      && ((gc - maxLen) >= three_more_globeCounter)){
-      string tempStr1 ="Free ";
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+      && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+      string temp_str1 ="Free ";
       stringstream strm2;
       strm2<<last_meta.data_;
-      string tempStr2 = strm2.str();
-      string temp = tempStr1+tempStr2;
+      string temp_str2 = strm2.str();
+      string temp = temp_str1+temp_str2;
       vec_block_mf.push_back(temp);
     }
 
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
       fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
       file_mf_one_itr<<"Free "<<last_meta.data_<<" SwapOut(Sync)"<<endl;
     }
-    last_meta.data_ = nullptr; //not really needed TODO(junzhe)
-    cout<<"----sync out "<<sync_idx<<endl;
-    Table_meta.find(sync_idx)->second = last_meta;
+    last_meta.data_ = nullptr;
+    table_meta.find(sync_idx)->second = last_meta;
   }
   if (sync_dir == 1){
     ///sync swap-in, including sync, update block's data_ to new gpu address, update meta.
-    //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe)
-    auto last_meta = Table_meta.find(sync_idx)->second;
+    auto last_meta = table_meta.find(sync_idx)->second;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaEventSynchronize(last_meta.out_event);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    Table_not_at_device.erase(last_meta.block_);
+    table_not_at_device.erase(last_meta.block_);
     last_meta.block_->update_data(last_meta.data_);
-    cout<<"----sync in "<<sync_idx<<endl;
-    Table_meta.find(sync_idx)->second = last_meta;
+    table_meta.find(sync_idx)->second = last_meta;
   }
 }
 
-void SwapGPU::Append(string blockInfo){
+void SwapGPU::Append(string block_info){
+  /*
+  Append Operation block info after each operation
+  Meantime execute following operations:
+    insert size for non-malloc operations
+    update global memory load
+    control swap flag on and off
+    update table_meta and table_sched
+    deploy swap at every index.
+    test moved from start of malloc/free to end of append, only global_index+1 changed
+    call PoolOpt to Construct Pool
+  */
 
-  vector<string> v = swap_split(blockInfo, " ");
-  void* tempPtr;
+  vector<string> v = SplitOptString(block_info, " ");
+  void* temp_ptr;
   stringstream convert(v[1]);
-  convert>>tempPtr;
-  auto tempBlock_ = static_cast<Block*>(tempPtr);
+  convert>>temp_ptr;
+  auto block_ptr = static_cast<Block*>(temp_ptr);
   
   // insert size, malloc : flag, block_, size, t; others: insert size t.
   if (v.size() != 4) {
     stringstream strm1;
-    strm1<<tempBlock_->size();
-    string tempStr1 = strm1.str();
-    blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2];
+    strm1<<block_ptr->size();
+    string temp_str1 = strm1.str();
+    block_info = v[0] + ' ' + v[1] + ' ' + temp_str1 + ' ' + v[2];
   }
 
   // update global load
-  if (maxLen < maxLen_threshold){
+  if (iteration_length < iteration_length_threshold){
     if (v[0] == "Malloc"){
       if (global_load.size()>0){
-        global_load.push_back(global_load[global_load.size()-1]+tempBlock_->size());
+        global_load.push_back(global_load[global_load.size()-1]+block_ptr->size());
       } else {
-        global_load.push_back(tempBlock_->size());
+        global_load.push_back(block_ptr->size());
       }
     } else if (v[0] == "Free"){
-      global_load.push_back(global_load[global_load.size()-1]-tempBlock_->size());
+      global_load.push_back(global_load[global_load.size()-1]-block_ptr->size());
     } else {
       global_load.push_back(global_load[global_load.size()-1]);
     }
   }
 
   //append into vec_block
-  vec_block.push_back(blockInfo);
-
-
-  //cout<<blockInfo<<endl;
-  //cout<<tempBlock_->size()<<endl;
-  //cout<<"load: "<<global_load[global_load.size()-1]<<" len of blockInfo and global_load "<<vec_block.size()<<' '<<global_load.size()<<endl;
-  //std::this_thread::sleep_for(std::chrono::milliseconds(2000));
-  
-  // if (asyncSwapFlag == 1){
-  //   vec_block_fresh.push_back(blockInfo);
-  // }
-  // if ((maxLen>maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){
-  //   fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app);
-  //   for (int i =0; i<vec_block_fresh.size();i++){
-  //     file_block_fresh<<vec_block_fresh[i]<<endl;
-  //   }
-  // }
-  // fstream file_block5("append.csv", ios::in|ios::out|ios::app);
-  // file_block5<<gc<<' '<<blockInfo<<' '<<(gc-1247)%612<<endl;
-
-  //print time duration per iteration
-  if ((maxLen>maxLen_threshold) && ((gc-location)%(maxLen) == 0)){
-    if (tempTime != 0){
-      fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-      auto t_now = (std::chrono::system_clock::now()).time_since_epoch().count();
-      file_time<<(float)(t_now - tempTime)/(float)(1000000)<<endl;
-      
+  vec_block.push_back(block_info);
+
+  //change swap flag on and off
+  if (async_swap_flag == 1){
+    int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length;
+    if (block_ptr->size() != size_sequence[r_global_index]){
+      async_swap_flag = 0;
+      cout<<"!!!! async_swap_flag changed back to 0"<<endl;
     }
-    tempTime = (std::chrono::system_clock::now()).time_since_epoch().count();
   }
 
-  //check if last iteration, TODO(junzhe) further verify with MallocFree.
-  if (asyncSwapFlag == 1){
-    int r_gc = (gc-location)%maxLen;
-    if (tempBlock_->size() != sizeSequence[r_gc]){
-      asyncSwapFlag = 0;
-      cout<<"!!!! asyncSwapFlag changed back to 0"<<endl;
-    }
-  }
-
-  //update Table_meta
-  swap_update_tables(tempBlock_);
+  //update table_meta and table_sched
+  UpdateMetaTables(block_ptr);
 
   //deploy swap at every index.
   DeploySwap();
 
-  //test moved from start of malloc/free to end of append, only gc+1 changed
-  Test_sched_switch_swap();
-  //NOTE: this gc includes read/write and AppendLayer as well, in addition to malloc/free.
-  gc++;
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) == three_more_globeCounter)){
-    cout<<"==================to call PoolOpt"<<endl;
-    fstream file_mf_8910("mf_8910.csv", ios::in|ios::out|ios::app);
-    for (int i = 0; i< vec_block_mf.size();i++){
-      file_mf_8910<<vec_block_mf[i]<<endl;
-    }
-    cout<<"len of vec_block_mf: "<<vec_block_mf.size()<<endl;
-    pool_->PoolOpt(vec_block_mf);
-    cout<<"==================to call PoolOpt done"<<endl;
-  }
+  //test moved from start of malloc/free to end of append, only global_index+1 changed
+  DetectionPlan();
 
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) 
-    && ((gc - three_more_globeCounter)%maxLen == 0)){
-      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
-      file_mf_one_itr<<"-----new itr------"<<endl;
+  //NOTE: this global_index includes read/write and AppendLayer as well, in addition to malloc/free.
+  global_index++;
+
+  //call PoolOpt to Construct Pool
+  if ((async_swap_flag == 1) && ((global_index - 4 * iteration_length) == three_more_iteration_global_index_threshold)){
+    pool_->PoolOpt(vec_block_mf);
   }
 
 }
 
-void* SwapGPU::GetRealGpuPtr(const Block* block_){
-  // in case that block is at host memory, swapIn ad hoc.
-  auto r_idx = Table_not_at_device.find(block_)->second;
-
-  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+void* SwapGPU::UpdateGpuPtr(const Block* block_ptr){
+  /*
+  in case that block is not at device memory, swapIn ad hoc.
+  used in block class to update ptr after swap in done, if variable is not swapped back yet as expected.
+  */ 
+  auto r_idx = table_not_at_device.find(block_ptr)->second;
   cudaError_t err;
-  BlockMeta meta = Table_meta.find(r_idx)->second;
+  BlockMeta meta = table_meta.find(r_idx)->second;
   cudaEventCreate (&meta.in_event);
-  //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
   void* ptr = nullptr;
   pool_->Malloc((void**)&ptr, meta.size);
-  //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
-  //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
-  //void* to_rm_ptr = meta.data_;
   meta.data_ = ptr;
-  // cout<<"right before cudaMemcpyAsync In"<<endl;
   err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
   cudaEventRecord(meta.in_event,meta.in_stream);
-  // cout<<"right after cudaMemcpyAsync"<<endl;
-  // cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
-  //upadte meta's new gpu addr, in_event
-  
-
-  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
   cudaEventSynchronize(meta.out_event);
-  // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // Table_not_at_device.erase(block_);
-  // last_meta.block_->update_data(last_meta.data_);
-  // cout<<"----sync in "<<sync_idx<<endl;
-  // Table_meta.find(sync_idx)->second = last_meta;
-  Table_meta.find(r_idx)->second = meta;
-
-  // //here should be not update_data()
-  // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second;
-  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // cudaEventSynchronize(reading_meta.in_event);
-  // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // //cout<<"GetRealGpuPtr, overhead is: "<<t2-t1<<endl;
-  // //cout<<"To update_data swap for (In) "<<Table_not_at_device.find(block_)->second<<" "<<reading_meta.data_<<" 0"<<endl;
-  // //reading_meta.block_->update_data(reading_meta.data_);
-  // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<<Table_not_at_device.find(block_)->second<<endl;
-  // Table_not_at_device.erase(reading_meta.block_);
-  // block_->update_data(static_cast<Block*>(ptr));
-  
-  cout<<"print ptr from function GetRealGpuPtr() "<<ptr<<endl;
+  table_meta.find(r_idx)->second = meta;
 
-  return ptr; //TODO(junzhe) attention, based on no change here.
+  return ptr;
 }
 
-void SwapGPU::SwapOut_idx(const int r_idx){
-  //cout<<"doing asynchrous swapOut of r_idx: "<<r_idx<<' '<<endl;
-  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();  
+void SwapGPU::SwapOut(const int idx){
+  /*
+  memory copy asynchronously GPU -> CPU, and update meta.
+  */
   cudaError_t err;
-  BlockMeta meta = Table_meta.find(r_idx)->second;
+  BlockMeta meta = table_meta.find(idx)->second;
   cudaEventCreate (&meta.out_event);
-  //cout<<"right before cudaMemcpyAsync Out"<<endl;
   err = cudaMemcpyAsync(meta.cpu_ptr,meta.data_,meta.size,cudaMemcpyDeviceToHost,meta.out_stream);
   cudaEventRecord(meta.out_event,meta.out_stream);
-  //cout<<"right after cudaMemcpyAsync"<<endl;
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // cout<<"To update_data swap for (Out) "<<r_idx<<" "<<meta.block_<<" 0"<<endl;
-  //update meta's out_event
-  Table_meta.find(r_idx)->second = meta;
-  //cout<<"time for asynchrous: "<<t2-t1<<endl;
-  //cudaEventSynchronize(event1);
-  //auto t4 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  //cout<<"time for asynchrous to complete: "<<t4-t1<<endl;
+  table_meta.find(idx)->second = meta;
 }
 
-void SwapGPU::SwapIn_idx(const int r_idx){
-  //logic: extra meta, swap, update meta in Table
-  //TODO(junzhe) to clean up free(), make it in somewhere else.
-  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+void SwapGPU::SwapIn(const int idx){
+  /*
+  memory copy asynchronously CPU -> GPU, and update meta.
+  */
+
   cudaError_t err;
-  BlockMeta meta = Table_meta.find(r_idx)->second;
+  BlockMeta meta = table_meta.find(idx)->second;
   cudaEventCreate (&meta.in_event);
-  //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
   void* ptr = nullptr;
   pool_->Malloc((void**)&ptr, meta.size);
+
   ///append vec_block_mf
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-    && ((gc - maxLen) >= three_more_globeCounter)){
-    string tempStr1 ="Malloc ";
+  if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+    && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+    string temp_str1 ="Malloc ";
     stringstream strm2;
     strm2<<ptr;
-    string tempStr2 = strm2.str();
+    string temp_str2 = strm2.str();
     stringstream strm3;
     strm3<<meta.size;
-    string tempStr3 = strm3.str();
-    string temp = tempStr1+tempStr2+" "+tempStr3;
+    string temp_str3 = strm3.str();
+    string temp = temp_str1+temp_str2+" "+temp_str3;
     vec_block_mf.push_back(temp);
   }
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+  if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
     fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
     file_mf_one_itr<<"Malloc "<<ptr<<" "<<meta.size<<" swapIn"<<endl;
   }
-  //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
-  //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
-  //void* to_rm_ptr = meta.data_;
+
   meta.data_ = ptr;
-  // cout<<"right before cudaMemcpyAsync In"<<endl;
   err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
   cudaEventRecord(meta.in_event,meta.in_stream);
-  // cout<<"right after cudaMemcpyAsync"<<endl;
-  // cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
-  //upadte meta's new gpu addr, in_event
-  Table_meta.find(r_idx)->second = meta;
-  //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update.
-  //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  //cout<<"time for asynchrous: "<<t2-t1<<endl;
+  table_meta.find(idx)->second = meta;
 }
 
-void SwapGPU::SwapOut(const Block* block_){
-  if (gc < 1000 && block_->size() > 1<<20) {
+void SwapGPU::SwapOutSynchronous(const Block* block_ptr){
+  /*
+  for synchronous swap, collect speed info
+  */
+  if (global_index < 1000 && block_ptr->size() > 1<<20) {
     fstream file_block5("speed.csv", ios::in|ios::out|ios::app);
     BlockMeta meta;
     meta.data_ = meta.block_->get_data();
-    void* tempPtr = nullptr;
-    cudaMallocHost(&tempPtr,block_->size()); //pinned memory.
-    meta.cpu_ptr = tempPtr;
-    Table_block_meta[block_] = meta;
+    void* temp_ptr = nullptr;
+    cudaMallocHost(&temp_ptr,block_ptr->size()); //pinned memory.
+    meta.cpu_ptr = temp_ptr;
+    table_block_meta[block_ptr] = meta;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaError_t err;
-    err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_->size(),cudaMemcpyDeviceToHost);
+    err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_ptr->size(),cudaMemcpyDeviceToHost);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    file_block5<<"Out "<<block_->size()<<' '<<t2-t1<<endl;
-    cout<<"swap out done at gc: "<<gc<<endl;
+    file_block5<<"Out "<<block_ptr->size()<<' '<<t2-t1<<endl;
   }
 }
 
-void SwapGPU::SwapIn(const Block* block_){
-  if (gc < 1000 && block_->size() > 1<<20) {
+void SwapGPU::SwapInSynchronous(const Block* block_ptr){
+  /*
+  for synchronous swap, collect speed info
+  */
+  if (global_index < 1000 && block_ptr->size() > 1<<20) {
     fstream file_block5("speed.csv", ios::in|ios::out|ios::app);
-    BlockMeta meta = Table_block_meta.find(block_)->second;
+    BlockMeta meta = table_block_meta.find(block_ptr)->second;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaError_t err;
-    err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_->size(),cudaMemcpyHostToDevice);
+    err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_ptr->size(),cudaMemcpyHostToDevice);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    file_block5<<"In "<<block_->size()<<' '<<t2-t1<<endl;
-    cout<<"swap in done at gc: "<<gc<<endl;
+    file_block5<<"In "<<block_ptr->size()<<' '<<t2-t1<<endl;
   }
 }
 
-
 }  // namespace singa
 #endif  // USE_CUDA
\ No newline at end of file
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index 8c7bc1c057..1bdb1190a5 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -118,75 +118,55 @@ void CudaMemPool::Free(void *ptr) {
 }
 
 //for SmartMemPool
-///vertex of the graph.
-class Vertex {
-public:
-    int name;
-    size_t size;
-    int r; //arrive
-    int d; //depart
-    int crossItr =0;
-    Vertex(int,size_t,int,int);
-    pair<size_t, size_t> colorRange;
-    vector<pair<size_t, size_t>> colorOccupied;
-};
-Vertex::Vertex(int n, size_t s, int r1, int d1){
-    name =n;
-    size = s;
-    r = r1;
-    d = d1;
-}//end of class Vertex
-
 
 ///Section for structs and respective sorting function:
-// onePieceMsg_pool, onePairMsg, oneIterMsg, version 11/30 3pm
-struct onePieceMsg_pool{
+// PoolOptInfo, PoolBlockLifeTime, PoolOptSimplifiedInfo
+struct PoolOptInfo{
     /*
-     members: [ptr, size, MallocFree, idx]
+     members: [ptr, size, operation_type, idx]
      */
     string ptr;
     size_t size;
-    int MallocFree;
+    int operation_type;
     int idx;
-    onePieceMsg_pool(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+    PoolOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
 };
 
 
-struct less_than_ptrIdx{
-    /*
-     sort onePieceMsg_pool by ptr and then idx.
-     */
-    inline bool operator() (const onePieceMsg_pool& struct1, const onePieceMsg_pool& struct2)
-    {
-        return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
-    }
+struct sort_by_ptr_idx_ascending{
+  /*
+   sort PoolOptInfo by ptr and then idx.
+   */
+  inline bool operator() (const PoolOptInfo& struct1, const PoolOptInfo& struct2)
+  {
+    return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
+  }
 };
 
 
-struct oneIterMsg{
+struct PoolOptSimplifiedInfo{
     /*
-     members: [idx, MallocFree, size_delta]
+     members: [idx, operation_type, size_delta]
      */
     size_t size_delta;// type as size_t in case size if large.
-    int MallocFree;
+    int operation_type;
     int idx;
-    oneIterMsg(size_t s, int M, int i):size_delta(s),MallocFree(M),idx(i){}
+    PoolOptSimplifiedInfo(size_t s, int M, int i):size_delta(s),operation_type(M),idx(i){}
 };
 
 
-struct less_than_iterIdx{
-    /*
-     sort oneIterMsg by Idx.
-     */
-    inline bool operator() (const oneIterMsg& struct1, const oneIterMsg& struct2)
-    {
-        return (struct1.idx<struct2.idx);
-    }
+struct sort_by_itr_idx_ascending{
+  /*
+   sort PoolOptSimplifiedInfo by Idx.
+   */
+  inline bool operator() (const PoolOptSimplifiedInfo& struct1, const PoolOptSimplifiedInfo& struct2)
+  {
+    return (struct1.idx<struct2.idx);
+  }
 };
 
 
-//TODO(junzhe) to replace vertex with onePairMsg, try combine other structs as well.
-struct onePairMsg{
+struct PoolBlockLifeTime{
     /*
      members: [name (r_idx), size, r_idx, d_idx]
      */
@@ -194,66 +174,54 @@ struct onePairMsg{
     size_t size;
     int r_idx;
     int d_idx;
-    onePairMsg(int n,size_t s, int r,int d):name(n),size(s),r_idx(r),d_idx(d){}
+    PoolBlockLifeTime(int n,size_t s, int r,int d):name(n),size(s),r_idx(r),d_idx(d){}
 };
 
 
-struct less_than_size{
-    /*
-     sort onePairMsg by descending size.
-     */
-    inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2)
-    {
-        return (struct1.size>struct2.size);
-    }
-};
-
-struct less_than_size_rIdx{
-    /*
-     sort onePairMsg by descending size and r_idx
-     */
-    inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2)
-    {
-        return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx<struct2.r_idx)));
-    }
+struct sort_by_size_descending{
+  /*
+  sort PoolBlockLifeTime by descending size.
+  */
+  inline bool operator() (const PoolBlockLifeTime& struct1, const PoolBlockLifeTime& struct2)
+  {
+    return (struct1.size>struct2.size);
+  }
 };
 
-struct less_than_lookupIdx{
-    /*
-     sort lookUpElement by idx.
-     */
-    inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2)
-    {
-        return (struct1.r_idx<struct2.r_idx);
-    }
+struct sort_by_size_r_idx_descending{
+  /*
+  sort PoolBlockLifeTime by descending size and r_idx
+  */
+  inline bool operator() (const PoolBlockLifeTime& struct1, const PoolBlockLifeTime& struct2)
+  {
+    return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx<struct2.r_idx)));
+  }
 };
 
 
-/// string delimiter
-vector<string> split(string s, string delimiter) {
-    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
-    string token;
-    vector<string> res;
-    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
-        token = s.substr(pos_start, pos_end - pos_start);
-        pos_start = pos_end + delim_len;
-        res.push_back(token);
-    }
-    res.push_back(s.substr(pos_start));
-    return res;
+vector<string> SplitString(string s, string delimiter) {
+  /// string delimiter
+  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+  string token;
+  vector<string> res;
+  while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
+    token = s.substr(pos_start, pos_end - pos_start);
+    pos_start = pos_end + delim_len;
+    res.push_back(token);
+  }
+  res.push_back(s.substr(pos_start));
+  return res;
 }
 
-///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg
-//vector of pairMsg is used in run.
-//vector of iterMsg is used in test.
 
-vector<onePieceMsg_pool> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+vector<PoolOptInfo> PoolOptSeqStrToStruct(vector<string> vec, int &idx_range){
     /*
-     convert vector of string into vector of onePieceMsg_pool, sorted by ptr and then idx, and update idxRange to pieceMsgVec size.
+     convert vector of string into vector of PoolOptInfo, 
+     sorted by ptr and then idx, and update idx_range to pieceMsgVec size.
      */
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_;
+    vector<PoolOptInfo>vec_pool_opt_info;
     for (int i=0;i<vec.size();i++) {
-        vector<string> v = split(vec[i], " ");
+        vector<string> v = SplitString(vec[i], " ");
         if (v[0]=="Malloc"){
             //convert v[2] from str to size_t
             size_t result;
@@ -262,410 +230,346 @@ vector<onePieceMsg_pool> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange)
                 result =-1;
                 cout<<"error for converting size from str to int."<<endl;
             }
-            onePieceMsg_pool tempMsg(v[1],result, 1, i);
-            onePieceMsg_poolVec_.push_back(tempMsg);
+            PoolOptInfo tempMsg(v[1],result, 1, i);
+            vec_pool_opt_info.push_back(tempMsg);
         }else if (v[0]=="Free"){
-            onePieceMsg_pool tempMsg(v[1],-1, -1, i);
-            onePieceMsg_poolVec_.push_back(tempMsg);
+            PoolOptInfo tempMsg(v[1],-1, -1, i);
+            vec_pool_opt_info.push_back(tempMsg);
         }else {
             cout<<"error for process the onePriceMsg."<<endl;
         }
     }
     
-    sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx());
-    idxRange = static_cast<int>(onePieceMsg_poolVec_.size());
+    sort(vec_pool_opt_info.begin(),vec_pool_opt_info.end(),sort_by_ptr_idx_ascending());
+    idx_range = static_cast<int>(vec_pool_opt_info.size());
 
-    return onePieceMsg_poolVec_;
-}// end of strVec_2_pieceMsgVec function
+    return vec_pool_opt_info;
+}
 
 
-pair<vector<onePairMsg>,vector<onePairMsg>> pieceMsgVec_2_pairOfPairMsgVec(vector<onePieceMsg_pool>onePieceMsg_poolVec_, int idxRange){
-    /*
-     pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks.
-     */
-    vector<onePairMsg>onePairMsgVec_1;
-    vector<onePairMsg>onePairMsgVec_2;
-    int i=0;
-    
-    //while loop processes a pair at each time, if got a pair.
-    while (i<(onePieceMsg_poolVec_.size()-1)){
-        //condition A: start with free. do nothing.
-        if (onePieceMsg_poolVec_[i].MallocFree==-1){
-            i+=1;
-        }
-        //condition B: start with Malloc, next item same ptr and is free.
-        if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)&&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){
-            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx);
-            onePairMsgVec_1.push_back(tempPair);
-            i+=2;
-        }
-        // condition C: start with Malloc, no free.
-        if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){
-            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
-            onePairMsgVec_2.push_back(tempPair);
-            i+=1;
-        }
-    }//end of while
-    //condition D: if still left with the last item
-    if ((i<onePieceMsg_poolVec_.size())&&(onePieceMsg_poolVec_[i+1].MallocFree==1)){
-        onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
-        onePairMsgVec_2.push_back(tempPair);
+pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>> PoolOptInfoToBlockLifeTime(vector<PoolOptInfo>vec_pool_opt_info, int idx_range){
+  /*
+  convert vector of opt info into vector of block life time
+  return a pair of vectors: 1. normal blocks 2. cross-iteration blocks.
+  */
+  vector<PoolBlockLifeTime>vec_block_life_time1;
+  vector<PoolBlockLifeTime>vec_block_life_time2;
+  int i=0;
+  
+  //while loop processes a pair at each time, if got a pair.
+  while (i<(vec_pool_opt_info.size()-1)){
+    //condition: start with free. do nothing.
+    if (vec_pool_opt_info[i].operation_type==-1){
         i+=1;
     }
-
-    //sort both pairVec
-    sort(onePairMsgVec_1.begin(),onePairMsgVec_1.end(),less_than_size_rIdx());
-    sort(onePairMsgVec_2.begin(),onePairMsgVec_2.end(),less_than_size_rIdx());
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_(onePairMsgVec_1,onePairMsgVec_2);
-    
-    return pairOfPairMsgVec_;
-}//end of pieceMsgVec_2_pairOfPairMsgVec function
-
-///Section of coloring algorithm. mergeSeg and then FFallocation when building edges of the graph.
-vector<pair<size_t, size_t>>  mergeSeg(vector<pair<size_t,size_t>> colorOccupied){
-    /*
-     version 12/9 11am -- modify to accomodate unsigned int/size_t
-     input:the collection of color ranges that is once occupied by some block during a block's life time.
-     function: merge consecutive/overlapping segments of colorOccupied
-     output: merged segments in ascending order.
-     time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of colorOccupied.
-     */
-    sort(colorOccupied.begin(), colorOccupied.end());
-    
-    if(colorOccupied.size()<=1){
-        return colorOccupied;
-    }
-    
-    int m = 0;
-    while (m<(colorOccupied.size()-1)){
-        
-        if ((colorOccupied[m].second +2)> colorOccupied[m+1].first){
-            pair<int,int>tempItem(colorOccupied[m].first,max(colorOccupied[m].second,colorOccupied[m+1].second));
-            //remove m+1 and m
-            colorOccupied.erase(colorOccupied.begin()+m+1);
-            colorOccupied.erase(colorOccupied.begin()+m);
-            //insert the combined range
-            colorOccupied.insert(colorOccupied.begin()+m,tempItem);
-        }else{
-            m+=1;
-        }
-    }//end of while loop
-    
-    //verify if mergeSeg is completed. O(n^2) optional
-//    if(colorOccupied.size()>1){
-//        for (int i=0;i<(colorOccupied.size()-1);i++){
-//            if(colorOccupied[i].second>=colorOccupied[i+1].first){
-//                cout<<"error in mergeSeg"<<endl;
-//            }
-//        }
-//    }
-    
-    return colorOccupied;
-}//end of mergeSeg function
-
-
-pair<size_t,size_t> FFallocation(vector<pair<size_t,size_t>> colorMerged,size_t size, size_t local_offset){
-    /*
-     version 12/2 4pm
-     First Fit weighted coloring
-     return a pair standing for colorRange.
-     local_offset shifts the returned colorRange, allowing multiple run().
-     local_offset not changable, whereas offset is changable.
-     */
-    // condition A: if no occupied, put after the local_offset
-    if (colorMerged.size()==0){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
-    }
-    
-    // condition B: able to fit before first block, after the local_offset
-    if ((size+local_offset)<(colorMerged[0].first+1)){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+    //condition: start with Malloc, next item same ptr and is free.
+    if ((vec_pool_opt_info[i].operation_type==1)&& (vec_pool_opt_info[i+1].operation_type==-1)&&((vec_pool_opt_info[i].ptr==vec_pool_opt_info[i+1].ptr))){
+      PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,vec_pool_opt_info[i+1].idx);
+      vec_block_life_time1.push_back(temp_block_life_time);
+      i+=2;
     }
-    
-    size_t yLocation= -1;
-    if (colorMerged.size()>1) {
-        int n = 0;
-        while (n<(colorMerged.size()-1)){
-            // condition C: able to fit in between middle blocks.
-            if ((colorMerged[n+1].first-colorMerged[n].second-1)>=size){
-                yLocation = colorMerged[n].second+1;
-                break;
-            }
-            n+=1;
-        }//end of while loop.
-        // condition D: allocate after the last block.
-        if (yLocation == -1){
-            yLocation = colorMerged[colorMerged.size()-1].second+1;
-        }
-    }// end of if loop, conditon C and D.
-    
-    // condition E: colorMeger len =1, allocate after the last block.
-    if (colorMerged.size()==1){
-        yLocation = colorMerged[0].second+1;
-    }
-    
-    if (yLocation==-1){
-        cout<<"error in FFallocation!!!"<<endl;
+    // condition: start with Malloc, no free.
+    if ((vec_pool_opt_info[i].operation_type==1)&&(vec_pool_opt_info[i].ptr!=vec_pool_opt_info[i+1].ptr)){
+      PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,idx_range);
+      vec_block_life_time2.push_back(temp_block_life_time);
+      i+=1;
     }
-    
-    return pair<size_t,size_t>(yLocation,yLocation+size-1);
-}//end of FFallocation function
+  }//end of while
+  //condition: if still left with the last item
+  if ((i<vec_pool_opt_info.size())&&(vec_pool_opt_info[i+1].operation_type==1)){
+    PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,idx_range);
+    vec_block_life_time2.push_back(temp_block_life_time);
+    i+=1;
+  }
 
+  //sort both pair
+  sort(vec_block_life_time1.begin(),vec_block_life_time1.end(),sort_by_size_r_idx_descending());
+  sort(vec_block_life_time2.begin(),vec_block_life_time2.end(),sort_by_size_r_idx_descending());
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time(vec_block_life_time1,vec_block_life_time2);
+  
+  return pair_vec_block_life_time;
+}
 
-pair<size_t,size_t> BFallocation(vector<pair<size_t,size_t>> colorMerged,size_t size, size_t local_offset){
-    /*
-     version 12/11 1pm
-     Best Fit allocation, input and output same as FFallocation
-     */
-    // condition A: if no occupied, put after the local_offset
-    if (colorMerged.size()==0){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
-    }
-    //condition B: if size=1, able to fit before the first block
-    if ((colorMerged.size()==1)&&((size+local_offset)<(colorMerged[0].first+1))){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
-    }
-    
-    //condition C: else of B
-    if ((colorMerged.size()==1)&&((size+local_offset)>=(colorMerged[0].first+1))){
-        return pair<size_t,size_t>(colorMerged[0].second+1,colorMerged[0].second+size);
-    }
-    
-    //condition D and E:
-    size_t yLocation=-1;
-    pair<int, size_t>tempHole(-1,-1); // n, hole size between n and n+1
-    if (colorMerged.size()>1) {
-        int n = 0;
-        while (n<(colorMerged.size()-1)){
-            // condition C: able to fit in between middle blocks. select smallest.
-            if (((colorMerged[n+1].first-colorMerged[n].second-1)>=size)&&((colorMerged[n+1].first-colorMerged[n].second-1)<tempHole.second)){
-                tempHole.first=n;
-                tempHole.second=colorMerged[n+1].first-colorMerged[n].second-1;
-            }
-            n+=1;
-        }//end of while loop.
-        
-        if(tempHole.first==-1){
-            // condition D: allocate after the last block.
-            yLocation = colorMerged[colorMerged.size()-1].second+1;
-        }else{
-            //condition E: best fit in the smallest hole.
-            yLocation = colorMerged[tempHole.first].second+1;
-            
-        }
-    }// end of if loop, conditon D and E.
-    
-    if (yLocation==-1){
-        cout<<"error in BFallocation!"<<endl;
+///Section implementing coloring algorithm.
+vector<pair<size_t, size_t>>  MergeColoredSegments(vector<pair<size_t,size_t>> vec_color_preoccupied){
+  /*
+  merge consecutive/overlapping segments of vec_color_preoccupied
+  input:the collection of color ranges that is once occupied by some block during a block's life time.
+  output: merged segments in ascending order.
+  time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of vec_color_preoccupied.
+  */
+  sort(vec_color_preoccupied.begin(), vec_color_preoccupied.end());
+  
+  if(vec_color_preoccupied.size()<=1){
+    return vec_color_preoccupied;
+  }
+  
+  int m = 0;
+  while (m<(vec_color_preoccupied.size()-1)){
+    if ((vec_color_preoccupied[m].second +2)> vec_color_preoccupied[m+1].first){
+      pair<int,int>tempItem(vec_color_preoccupied[m].first,max(vec_color_preoccupied[m].second,vec_color_preoccupied[m+1].second));
+      //remove m+1 and m
+      vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m+1);
+      vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m);
+      //insert the combined range
+      vec_color_preoccupied.insert(vec_color_preoccupied.begin()+m,tempItem);
+    }else{
+        m+=1;
     }
-    
-    return pair<size_t,size_t>(yLocation,yLocation+size-1);
+  }//end of while loop
+   
+  return vec_color_preoccupied;
 }
 
-vector<Vertex> colorSomeVertices(vector<onePairMsg> pairMsgVec_, size_t &offset,string colorMethod){
-    /*
-     color all or 1/2 vertices using mergeSeg() and FFallocation(), with update offset.
-     time complexity: O(n^2).
-     */
-    size_t local_offset = offset; //feed into FFallocation, shall never change.
-    int m = static_cast<int>(pairMsgVec_.size());
-    //init all vertices
-    vector<Vertex>vertices;
-    for (int i=0; i<m;i++){
-        Vertex tempVertex(pairMsgVec_[i].name,pairMsgVec_[i].size,pairMsgVec_[i].r_idx,pairMsgVec_[i].d_idx);
-        vertices.push_back(tempVertex);
 
+pair<size_t,size_t> FirstFitAllocation(vector<pair<size_t,size_t>> vec_color_merged,size_t size, size_t local_offset){
+  /*
+   First Fit weighted coloring
+   return a pair standing for color_range.
+   local_offset shifts the returned color_range, allowing multiple Plan().
+   local_offset not changable, whereas offset is changable.
+   */
+  // condition: if no occupied, put after the local_offset
+  if (vec_color_merged.size()==0){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  
+  // condition: able to fit before first block, after the local_offset
+  if ((size+local_offset)<(vec_color_merged[0].first+1)){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  
+  size_t y_location= -1;
+  if (vec_color_merged.size()>1) {
+    int n = 0;
+    while (n<(vec_color_merged.size()-1)){
+      // condition: able to fit in between middle blocks.
+      if ((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size){
+        y_location = vec_color_merged[n].second+1;
+        break;
+      }
+      n+=1;
+    }//end of while loop.
+    // condition: allocate after the last block.
+    if (y_location == -1){
+      y_location = vec_color_merged[vec_color_merged.size()-1].second+1;
     }
+  }// end of if loop, conditon C and D.
+  
+  // condition: colorMeger len =1, allocate after the last block.
+  if (vec_color_merged.size()==1){
+    y_location = vec_color_merged[0].second+1;
+  }
+  
+  if (y_location==-1){
+    cout<<"error in FirstFitAllocation!!!"<<endl;
+  }
+  
+  return pair<size_t,size_t>(y_location,y_location+size-1);
+}
 
-    int **adj;
-    adj = new int*[m]; //TODO(junzhe) should be deleted somewhere.
-    // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop.
-    for (int i=0; i<m;i++){
-        adj[i] = new int[m];
-        for (int j=0; j<m;j++){
-            if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))){
-                adj[i][j]=1;
-                if (vertices[j].colorRange.second){ //as second never be 0, if not empty.
-                    vertices[i].colorOccupied.push_back(vertices[j].colorRange);
-                }
-            }
-            else { adj[i][j]=0; }
-        }
-        
-        vector<pair<size_t,size_t>>colorMerged = mergeSeg(vertices[i].colorOccupied);
-       
-        if(colorMethod=="FF"){
-            vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset);
-            
-        }else{ //BF
-            vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, local_offset);
-        }
 
-        //update of offset, largest memory footprint as well.
-        if (vertices[i].colorRange.second >=offset){
-            offset = vertices[i].colorRange.second+1;
-        }
-    }//end of for loop.
+pair<size_t,size_t> BestFitAllocation(vector<pair<size_t,size_t>> vec_color_merged,size_t size, size_t local_offset){
+  /*
+   Best Fit allocation, input and output same as FirstFitAllocation
+  */
+  // condition: if no occupied, put after the local_offset
+  if (vec_color_merged.size()==0){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  //condition: if size=1, able to fit before the first block
+  if ((vec_color_merged.size()==1)&&((size+local_offset)<(vec_color_merged[0].first+1))){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  
+  //condition: lese of second condition
+  if ((vec_color_merged.size()==1)&&((size+local_offset)>=(vec_color_merged[0].first+1))){
+    return pair<size_t,size_t>(vec_color_merged[0].second+1,vec_color_merged[0].second+size);
+  }
+  
+  size_t y_location=-1;
+  pair<int, size_t>temp_hole(-1,-1); // n, hole size between n and n+1
+  if (vec_color_merged.size()>1) {
+    int n = 0;
+    while (n<(vec_color_merged.size()-1)){
+      // condition: able to fit in between middle blocks. select smallest.
+      if (((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size)&&((vec_color_merged[n+1].first-vec_color_merged[n].second-1)<temp_hole.second)){
+        temp_hole.first=n;
+        temp_hole.second=vec_color_merged[n+1].first-vec_color_merged[n].second-1;
+      }
+      n+=1;
+    }//end of while loop.
     
-    return vertices;
+    if(temp_hole.first==-1){
+      // condition: allocate after the last block.
+      y_location = vec_color_merged[vec_color_merged.size()-1].second+1;
+    }else{
+      //condition: best fit in the smallest hole.
+      y_location = vec_color_merged[temp_hole.first].second+1;       
+    }
+  }// end of if loop, conditon D and E.
+  
+  if (y_location==-1){
+    cout<<"error in BestFitAllocation!"<<endl;
+  }
+  
+  return pair<size_t,size_t>(y_location,y_location+size-1);
 }
 
+vector<Vertex> AssignColorToVertices(vector<PoolBlockLifeTime> vec_block_life_time, size_t &offset,string color_method){
+  /*
+   color all or 1/2 vertices using MergeColoredSegments() and FirstFitAllocation(), with updated offset.
+   time complexity: O(n^2).
+  */
+  size_t local_offset = offset; //feed into FirstFitAllocation, shall never change.
+  int m = static_cast<int>(vec_block_life_time.size());
+  //init all vertices
+  vector<Vertex>vertices;
+  for (int i=0; i<m;i++){
+    Vertex temp_vertex(vec_block_life_time[i].name,vec_block_life_time[i].size,vec_block_life_time[i].r_idx,vec_block_life_time[i].d_idx);
+    vertices.push_back(temp_vertex);
+  }
 
-///get cross-iteration duration pairs
-pair<map<int,int>,map<int,int>> cross_itr_durations(vector<string>vec_double, int location, int maxLen, int &doubleRange){
-    
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_2,doubleRange);
-    
-    map<int,int>Table_r2d; //full duration info, cross-iteration duration.
-    map<int,int>Table_d2r;
-    for (int i=0;i<pairOfPairMsgVec_2.first.size();i++){
-        if(pairOfPairMsgVec_2.first[i].r_idx<maxLen){
-            Table_r2d[pairOfPairMsgVec_2.first[i].r_idx] =pairOfPairMsgVec_2.first[i].d_idx%maxLen;
-            Table_d2r[pairOfPairMsgVec_2.first[i].d_idx%maxLen]=pairOfPairMsgVec_2.first[i].r_idx;
+  int **adj;
+  adj = new int*[m];
+  // build edges with values 1 and 0; combine with mergeSeg and FirstFitAllocation in the loop.
+  for (int i=0; i<m;i++){
+    adj[i] = new int[m];
+    for (int j=0; j<m;j++){
+      if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))){
+        adj[i][j]=1;
+        if (vertices[j].color_range.second){ //as second never be 0, if not empty.
+          vertices[i].vec_color_preoccupied.push_back(vertices[j].color_range);
         }
+      }
+      else { adj[i][j]=0; }
     }
     
-    return pair<map<int,int>,map<int,int>>(Table_r2d,Table_d2r);
-}
-
-/// main run funtion
-vector<Vertex> run(vector<string>vec, int &idxRange, size_t &offset, size_t &offsetCrossItr,string colorMethod){
-    /*
-     run function, input vector of strings, return colored vertices,
-     update idxRange, offset.
-     time complexity: O(n^2) where n is maxLen.
-     */
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_ = strVec_2_pieceMsgVec(vec,idxRange);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_,idxRange);
-    //1. normal blocks 2. cross-iteration blocks.
-    vector<onePairMsg>pairMsgVec_1 = pairOfPairMsgVec_.first;
-    vector<onePairMsg>pairMsgVec_2 = pairOfPairMsgVec_.second;
-  
-    vector<Vertex>vertices_2 = colorSomeVertices(pairMsgVec_2,offset,colorMethod);
-    for (int i=0; i<vertices_2.size();i++){
-      vertices_2[i].crossItr = 1;
+    vector<pair<size_t,size_t>>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied);
+   
+    if(color_method=="FF"){
+      vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset);
+        
+    }else{ //BF
+      vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, local_offset);
     }
-    offsetCrossItr = offset;
-    offset = offsetCrossItr*2;
-    vector<Vertex>vertices = colorSomeVertices(pairMsgVec_1,offset,colorMethod);
-    //merge
-    vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end());
 
-    return vertices;
+    //update of offset, largest memory footprint as well.
+    if (vertices[i].color_range.second >=offset){
+      offset = vertices[i].color_range.second+1;
+    }
+  }//end of for loop.
+  
+  return vertices;
 }
 
 
-///Section of test functions.
-vector<size_t> pairOfPairMsgVec_2_repSeq(pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_){
-    int counter_1M=0; int counter_1F=0; int counter_2=0;
-    vector<onePairMsg>onePairMsgVec_1 = pairOfPairMsgVec_.first;
-    vector<onePairMsg>onePairMsgVec_2 = pairOfPairMsgVec_.second;
-    vector<oneIterMsg>oneIterMsgVec_;
-    for (int i =0; i<onePairMsgVec_1.size(); i++){
-        oneIterMsg tempIterM(onePairMsgVec_1[i].size,1,onePairMsgVec_1[i].r_idx);
-        oneIterMsgVec_.push_back(tempIterM);
-        counter_1M++;
-        
-        size_t temp_s_d = static_cast<size_t>(onePairMsgVec_1[i].d_idx-onePairMsgVec_1[i].r_idx);
-        oneIterMsg tempIterF(temp_s_d,-1,onePairMsgVec_1[i].d_idx);
-        oneIterMsgVec_.push_back(tempIterF);
-        counter_1F++;
-    }
-    
-    for (int i =0; i<onePairMsgVec_2.size(); i++){
-        oneIterMsg tempIterM(onePairMsgVec_2[i].size,1,onePairMsgVec_2[i].r_idx);
-        oneIterMsgVec_.push_back(tempIterM);
-        counter_2++;
-    }
-    
-    sort(oneIterMsgVec_.begin(),oneIterMsgVec_.end(),less_than_iterIdx());
-    //only after sort then can create rep.
-    vector<size_t>rep; // vector of size_delta, name it as rep for simlisity.
-    for (int i =0; i<oneIterMsgVec_.size(); i++){
-        rep.push_back(oneIterMsgVec_[i].size_delta);
+pair<map<int,int>,map<int,int>> GetCrossIterationBlocks(vector<string>vec_double, int location_2nd_iteration, int iteration_length, int &double_range){
+  ///get cross-iteration duration blocks
+  vector<PoolOptInfo>vec_pool_opt_info2 = PoolOptSeqStrToStruct(vec_double,double_range);
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time2=PoolOptInfoToBlockLifeTime(vec_pool_opt_info2,double_range);
+  
+  map<int,int>table_ridx_to_didx; //full duration info, cross-iteration duration.
+  map<int,int>table_didx_to_ridx;
+  for (int i=0;i<pair_vec_block_life_time2.first.size();i++){
+    if(pair_vec_block_life_time2.first[i].r_idx<iteration_length){
+      table_ridx_to_didx[pair_vec_block_life_time2.first[i].r_idx] =pair_vec_block_life_time2.first[i].d_idx%iteration_length;
+      table_didx_to_ridx[pair_vec_block_life_time2.first[i].d_idx%iteration_length]=pair_vec_block_life_time2.first[i].r_idx;
     }
+  }
+  
+  return pair<map<int,int>,map<int,int>>(table_ridx_to_didx,table_didx_to_ridx);
+}
 
-    return rep;
-}//end of pairOfPairMsgVec_2_repSeq function
 
+///Section of test functions.
+vector<size_t> PoolOptSeqRepeatableTestPreProcess(pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time){
+  /*
+  pre process pair of vector of block life time info, for ease of repeatable test.
+  */
+  vector<PoolBlockLifeTime>vec_block_life_time1 = pair_vec_block_life_time.first;
+  vector<PoolBlockLifeTime>vec_block_life_time2 = pair_vec_block_life_time.second;
+  vector<PoolOptSimplifiedInfo>vec_pool_opt_simplified_info;
+
+  //process Malloc and Free pair, i.e. normal blocks
+  for (int i =0; i<vec_block_life_time1.size(); i++){
+    PoolOptSimplifiedInfo tempIterM(vec_block_life_time1[i].size,1,vec_block_life_time1[i].r_idx);
+    vec_pool_opt_simplified_info.push_back(tempIterM);
+    size_t temp_s_d = static_cast<size_t>(vec_block_life_time1[i].d_idx-vec_block_life_time1[i].r_idx);
+    PoolOptSimplifiedInfo tempIterF(temp_s_d,-1,vec_block_life_time1[i].d_idx);
+    vec_pool_opt_simplified_info.push_back(tempIterF);
+  }
+  
+  //process Malloc-only blocks, i.e. cross-iteration blocks
+  for (int i =0; i<vec_block_life_time2.size(); i++){
+    PoolOptSimplifiedInfo tempIterM(vec_block_life_time2[i].size,1,vec_block_life_time2[i].r_idx);
+    vec_pool_opt_simplified_info.push_back(tempIterM);
+  }
+  
+  //sort then can create vec_rep.
+  sort(vec_pool_opt_simplified_info.begin(),vec_pool_opt_simplified_info.end(),sort_by_itr_idx_ascending());
+  vector<size_t>vec_rep; // vector of size_delta, name it as vec_rep for simlisity.
+  for (int i =0; i<vec_pool_opt_simplified_info.size(); i++){
+    vec_rep.push_back(vec_pool_opt_simplified_info[i].size_delta);
+  }
 
-vector<size_t> maxRepeatedSeg(vector<size_t>rep, int idxRange, int &maxLen, int &location){
-    /*
-     get max repeated non-overlapping Seg of a vector, return the repeated segment,
-     update maxLen, and location of where Seg starts to repeat.
-     brtue force method using equal()
-     time complexity O(n^2)
-     */
-    for (int i=0; i<idxRange;i++){
-        for (int len=1; len<(idxRange-i);len++){
-            if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLen<len)) {
-                maxLen = len;
-                location = i;
-                cout<<"maxLen increased, lcoation and maxLen: ("<<location<<","<<maxLen<<")"<<endl;
-            }
-        }
-    }
-    //TODO(junzhe) verify the subSeq returned, below poped up error in vgg.
-    vector<size_t>subSeq(&rep[location],&rep[location+maxLen]);
-    if(!(equal(rep.begin()+location,rep.begin()+maxLen-1+location,subSeq.begin()) && equal(rep.begin()+location+maxLen,rep.begin()+2*maxLen-1+location,subSeq.begin()))){
-        cout<<"error in get the maxRep"<<endl;
-    }
-    return subSeq;
+  return vec_rep;
 }
 
 
-void verifyAndCut (vector<size_t>subSeq, int &maxLen, int &location){
-    /*
-     to cut, in case the repeated Seg contains multiple iterations.
-     */
-    int tempMaxLen=0;
-    int tempLocation =0;
-    int tempIdxRange = maxLen;
-    
-    vector<size_t>tempSubSeq = maxRepeatedSeg(subSeq,tempIdxRange,tempMaxLen, tempLocation);
-    //TODO(junzhe), tunable threshold.
-    int threshold =50;
-    if (tempMaxLen>threshold){
-        maxLen = tempMaxLen;
-        location += tempLocation;
-        cout<<"max length get cut"<<endl;
+vector<size_t> PoolRepeatableTest(vector<size_t>rep, int idx_range, int &iteration_length, int &location_2nd_iteration){
+  /*
+  get max repeated non-overlapping Seg of a vector, return the repeated segment,
+  update iteration_length, and location_2nd_iteration of where Seg starts to repeat.
+  brtue force method using equal()
+  time complexity O(n^2)
+  */
+  for (int i=0; i<idx_range;i++){
+    for (int len=1; len<(idx_range-i);len++){
+      if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(iteration_length<len)) {
+        iteration_length = len;
+        location_2nd_iteration = i;
+      }
     }
-    cout<<tempMaxLen<<endl;
+  }
+  //obtain sub_sequence based on iteration_length and location_2nd_iteration
+  vector<size_t>sub_sequence(&rep[location_2nd_iteration],&rep[location_2nd_iteration+iteration_length]);
+  if(!(equal(rep.begin()+location_2nd_iteration,rep.begin()+iteration_length-1+location_2nd_iteration,sub_sequence.begin()) && equal(rep.begin()+location_2nd_iteration+iteration_length,rep.begin()+2*iteration_length-1+location_2nd_iteration,sub_sequence.begin()))){
+    cout<<"error in get the maxRep"<<endl;
+  }
+
+  return sub_sequence;
 }
 
 
-//main function of test
-int test(vector<string>vec3, int &maxLen, int &location){
+void VerifyRepeatableTest(vector<size_t>sub_sequence, int &iteration_length, int &location_2nd_iteration){
     /*
-     main function of test, returns globeCounter, which is when flag shall be switched,
-     update maxLen and location of where the repeated Seg starts.
-     */
-    cout<<"====================== test ========================="<<endl;
-    int idxRange3=0;
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_3,idxRange3);
-    vector<size_t>rep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_);
+     to cut, in case the repeated Segment returned by PoolRepeatableTest contains multiple iterations.
+    */
+    int temp_iteration_length = 0;
+    int temp_location_2nd_iteration = 0;
+    int temp_idx_range = iteration_length;
+    
+    //verify by testing its subsequence again
+    vector<size_t>tempsub_sequence = PoolRepeatableTest(sub_sequence,temp_idx_range,temp_iteration_length, temp_location_2nd_iteration);
     
-    //get repeated sub vector.
-    vector<size_t>subSeq = maxRepeatedSeg(rep,idxRange3,maxLen,location);
-    //cout<<subSeq.size()<<endl;
-    verifyAndCut(subSeq, maxLen, location);
-    int globeCounter=-1;
-    if (maxLen>100){ //TODO(junzhe) tunable threshold.
-        cout<<"new location and maxLen: "<<location<<' '<<maxLen<<endl;
-        globeCounter = idxRange3+maxLen-(idxRange3-location)%maxLen;
+    //tunable threshold.
+    int threshold = 50;
+    
+    if (temp_iteration_length>threshold){
+        iteration_length = temp_iteration_length;
+        location_2nd_iteration += temp_location_2nd_iteration;
     }
-    return globeCounter;
 }
 
+
 ///verify if coloring got overlapping
-void overlap_test(vector<Vertex> vertices){
+void OverlapVerification(vector<Vertex> vertices){
     size_t s = vertices.size();
     int i,j;
     for (i=0; i<s; i++){
         for (j=i+1; j<s; j++){
-            if (((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d)))&& ((max(vertices[i].colorRange.first,vertices[j].colorRange.first))<(1+min(vertices[i].colorRange.second,vertices[j].colorRange.second)))){
+            if (((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d)))&& ((max(vertices[i].color_range.first,vertices[j].color_range.first))<(1+min(vertices[i].color_range.second,vertices[j].color_range.second)))){
                 cout<<"error overlapping"<<endl;
             }
         }
@@ -674,13 +578,11 @@ void overlap_test(vector<Vertex> vertices){
 
 
 SmartMemPool::SmartMemPool(const MemPoolConf &conf){
-    //TODO(junzhe) to figure out what to do here.
-    colorMethod = "BF";
+    color_method = "BF";
     conf_ = conf;
 }
 
 void SmartMemPool::Init(){
-  //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
   mtx_.lock();
   if(!initialized_){
     initialized_ =true;
@@ -689,291 +591,332 @@ void SmartMemPool::Init(){
 }
 
 
+
+int SmartMemPool::Detection(vector<string>vec_string_test, int &iteration_length, int &location_2nd_iteration){
+  /*
+  Testing repeatability from raw operation sequence
+  returns global_index_threshold, which is when flag shall be switched,
+  update iteration_length and location_2nd_iteration of where the repeated Seg starts.
+  */
+  int idx_range_test=0;
+  vector<PoolOptInfo>vec_pool_opt_info3 = PoolOptSeqStrToStruct(vec_string_test,idx_range_test);
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time = PoolOptInfoToBlockLifeTime(vec_pool_opt_info3,idx_range_test);
+  vector<size_t>vec_rep = PoolOptSeqRepeatableTestPreProcess(pair_vec_block_life_time);
+  
+  //repeatable test with verification
+  vector<size_t>sub_sequence = PoolRepeatableTest(vec_rep,idx_range_test,iteration_length,location_2nd_iteration);
+  VerifyRepeatableTest(sub_sequence, iteration_length, location_2nd_iteration);
+  
+  //update global_index_threshold if test past, i.e. iteration_length exceed certain threshold
+  if (iteration_length>100){ //tunable threshold.
+    global_index_threshold = idx_range_test+iteration_length-(idx_range_test-location_2nd_iteration)%iteration_length;
+  }
+  return global_index_threshold;
+}
+
+
+/// main run funtion
+vector<Vertex> SmartMemPool::Plan(vector<string>vec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method){
+  /*
+  Planning, i.e. Assign Color to Vertices from raw operation sequence info.
+  input vector of strings, return colored vertices,
+  update idx_range, offset.
+  time complexity: O(n^2) where n is iteration_length.
+  */
+
+  vector<PoolOptInfo>vec_pool_opt_info = PoolOptSeqStrToStruct(vec,idx_range);
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time=PoolOptInfoToBlockLifeTime(vec_pool_opt_info,idx_range);
+  
+  //coloring normal blocks and cross-iteration blocks separately, cannot be miss-matched.
+  vector<PoolBlockLifeTime>vec_block_life_time1 = pair_vec_block_life_time.first;
+  vector<PoolBlockLifeTime>vec_block_life_time2 = pair_vec_block_life_time.second;
+
+  //color cross-iteration blocks
+  vector<Vertex>vertices_2 = AssignColorToVertices(vec_block_life_time2,offset,color_method);
+
+  for (int i=0; i<vertices_2.size();i++){
+    vertices_2[i].cross_iteration = 1;
+  }
+  //update offset
+  offset_cross_iteration = offset;
+  offset = offset_cross_iteration*2;
+  //color normal blocks
+  vector<Vertex>vertices = AssignColorToVertices(vec_block_life_time1,offset,color_method);
+  
+  //merge after coloring
+  vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end());
+
+  return vertices;
+}
+
+
 ///Malloc
 void SmartMemPool::Malloc(void** ptr, const size_t size){
-    /*
-     1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool.
-     2. if flag=0, malloc/cudaMalloc, collect vec string
-     3. if flag=1, look up table, malloc/cudaMalloc if not in the Table
-     4. test repeated sequence every 100 blocks, update globeCounter.
-     */
-
-    //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
-    //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a.
-    //file<<gc<<' '<<"Malloc"; //a.
-    if (!initialized_){
+  /*
+   1. switch flag when global_index == global_index_threshold, construct lookup table and malloc the whole pool.
+   2. if flag=0, malloc/cudaMalloc, collect vec string
+   3. if flag=1, look up table, malloc/cudaMalloc if not in the Table
+   4. test repeated sequence every 100 blocks, update global_index_threshold.
+   */
+
+  if (!initialized_){
     Init();
   }
 
-    void* allocatedPtr = NULL; //ptr to be returned
+  void* allocated_ptr = NULL; //ptr to be returned
+
+  /// 1. switch flag when global_index == global_index_threshold, construct lookup table and malloc the whole pool.    
+  if (global_index == global_index_threshold){
+
+    malloc_flag = 1;
+    vector<string>vec_raw_opt_info(&vec[location_2nd_iteration],&vec[location_2nd_iteration+iteration_length]);
     
-    if (gc == globeCounter){
-        /// 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool.
-        
-        mallocFlag=1;
-        cout<<"switched to color-malloc"<<endl;
-        vector<string>vec_run(&vec[location],&vec[location+maxLen]);
-        
-        vector<Vertex>vertices = run(vec_run, idxRange,offset,offsetCrossItr, colorMethod);
+    //color vertices
+    vector<Vertex>vertices = Plan(vec_raw_opt_info,idx_range,offset,offset_cross_iteration,color_method);
 
-        //here to verify if the coloring got overlapping. TODO(junzhe) optional
-        //overlap_test(vertices);
-        
-        //obtain the cross-iteration duration info
-        int doubleRange=0;
-        vector<string>vec_double(&vec[location],&vec[location+2*maxLen]);
-        pair<map<int,int>,map<int,int>>pairs =cross_itr_durations(vec_double, location,maxLen,doubleRange);
-        Table_r2d = pairs.first;
-        Table_d2r = pairs.second;
-        
-        //update ptrPool
-        cudaMalloc(&ptrPool,offset); //poolSize or memory foot print  offset.
-        cout<<"ptrPool is: "<<ptrPool<<endl;
-
-        //b.  below 2 loops: vec_r2Ver to replace Table_r2Ver
-        for (int i=0; i<idxRange; i++){
-            lookUpElement tempElement;
-            Vec_r2Ver.push_back(make_pair(i,tempElement));
-        }
-        for (int i=0; i<vertices.size(); i++){
-            lookUpElement temp;
-            temp.r_idx =vertices[i].r;
-            temp.d_idx =Table_r2d.find(vertices[i].r)->second;
-            temp.size =vertices[i].size;
-            temp.offset=vertices[i].colorRange.first;
-            temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char));
-            temp.Occupied =0;
-            temp.crossItr = vertices[i].crossItr;
-            temp.Occupied_backup =0; 
-            //build tables for lookup.
-            Vec_r2Ver[vertices[i].r].second= temp;
-        }
-    }
+    //here to verify if the coloring got overlapping. for verify purpose only.
+    //OverlapVerification(vertices);
     
-    if(mallocFlag==0){
-        ///  2. if flag=0, malloc/cudaMalloc
-        cudaMalloc(ptr, size);
-        allocatedPtr = *ptr;
-        //update load
-        if(loadLogFlag==1){
-            if (gc>0){
-                Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second);
-            }else{ //very first block
-                Table_load[gc]=make_pair(size,0);
-            }
-        }
-        //push_back the string for later test and run.
-        string tempStr1 ="Malloc ";
-        stringstream strm2;
-        strm2<<allocatedPtr;
-        string tempStr2 = strm2.str();
-        stringstream strm3;
-        strm3<<size;
-        string tempStr3 = strm3.str();
-        string temp = tempStr1+tempStr2+" "+tempStr3;
-        vec.push_back(temp);
-        //file<<" Condition M1, addr: "<<*ptr<<endl;  //a.
-    }else{
-        /// 3. if flag=1, look up table.
-        int lookupIdx = (gc-location)%maxLen;
-        if ((Vec_r2Ver[lookupIdx].second.size ==size)&&(Vec_r2Ver[lookupIdx].second.Occupied*Vec_r2Ver[lookupIdx].second.Occupied_backup==0)){
-             if (Vec_r2Ver[lookupIdx].second.Occupied==0){
-                //condition M2, normal and crossItr's primary.
-                //assign ptr and mark as occupied, and add in ptr2rIdx
-                allocatedPtr = Vec_r2Ver[lookupIdx].second.ptr;
-                Vec_r2Ver[lookupIdx].second.Occupied= 1;
-                Table_p2r[allocatedPtr]=lookupIdx;                
-                //update load
-                if(loadLogFlag==1){
-                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second+size);
-                }
-                //file<<" Condition M2, addr: "<<*ptr<<endl;  //a.
-              }else if ((Vec_r2Ver[lookupIdx].second.crossItr==1) && (Vec_r2Ver[lookupIdx].second.Occupied==1) && (Vec_r2Ver[lookupIdx].second.Occupied_backup ==0)) {
-                //condition M4, crossItr's backup
-                allocatedPtr = (void*)((char*)Vec_r2Ver[lookupIdx].second.ptr+offsetCrossItr*sizeof(char));
-                Vec_r2Ver[lookupIdx].second.Occupied_backup=1;
-                Table_p2r[allocatedPtr]=lookupIdx;
-                //update load
-                if(loadLogFlag==1){
-                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second+size);
-                }
-                //file<<" Condition M4, addr: "<<*ptr<<endl;  //a.
-              }
-        }else{  //condition M3, size not proper or both occupied.
-                cudaMalloc(ptr, size);
-                allocatedPtr = *ptr;       
-                //update load
-                if(loadLogFlag==1){
-                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second);
-                }
-                //file<<" Condition M3, addr: "<<*ptr<<endl;  //a.
-        } 
-    } //end of loop for flag=1
+    //obtain the cross-iteration duration info
+    int double_range=0;
+    vector<string>vec_double(&vec[location_2nd_iteration],&vec[location_2nd_iteration+2*iteration_length]);
+    pair<map<int,int>,map<int,int>>pairs =GetCrossIterationBlocks(vec_double,location_2nd_iteration,iteration_length,double_range);
+    table_ridx_to_didx = pairs.first;
+    table_didx_to_ridx = pairs.second;
     
-    ///4. test repeated sequence every 100 blocks, update globeCounter.
-    if (((gc+1)%300==0) && (mallocFlag==0) && (globeCounter==-1)&&(gc+2>checkPoint)){
-        cout<<"gc and GC before test: "<<gc<<' '<<globeCounter<<endl;
-        globeCounter = test(vec,maxLen,location);
-        checkPoint=checkPoint*2;
+    //make pool
+    cudaMalloc(&ptr_pool,offset); //poolSize or memory foot print  offset.
+
+    //make vec_block_meta for lookup purpose after pool is constructed
+    for (int i=0; i<idx_range; i++){
+        PoolBlockMeta tempElement;
+        vec_block_meta.push_back(make_pair(i,tempElement));
     }
-    
-    ///get load info, when gc == GC+2maxLen
-    if (gc==(globeCounter+2*maxLen)&& (globeCounter>0)){
-        getMaxLoad();
-        loadLogFlag=0;
+    for (int i=0; i<vertices.size(); i++){
+        PoolBlockMeta temp;
+        temp.r_idx =vertices[i].r;
+        temp.d_idx =table_ridx_to_didx.find(vertices[i].r)->second;
+        temp.size =vertices[i].size;
+        temp.offset=vertices[i].color_range.first;
+        temp.ptr = (void*)((char*)ptr_pool+temp.offset*sizeof(char));
+        temp.occupied =0;
+        temp.cross_iteration = vertices[i].cross_iteration;
+        temp.occupied_backup =0; 
+        //build tables for lookup.
+        vec_block_meta[vertices[i].r].second= temp;
     }
-    
-    gc++;
-    Table_p2s[allocatedPtr]=size; //update it for load tracking purpose.
-    *ptr = allocatedPtr;
-    ///update block_RWMF
-    string tempStr1 ="Malloc ";
+  }
+  ///  2. if flag=0, malloc/cudaMalloc, accumulate vec_info at the beginning iterations.
+  if(malloc_flag ==0){    
+    cudaMalloc(ptr, size);
+    allocated_ptr = *ptr;
+    //update load
+    if(load_flag==1){
+      if (global_index>0){
+        table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second);
+      }else{ //very first block
+        table_load[global_index]=make_pair(size,0);
+      }
+    }
+    //push_back the string for later test and run.
+    string temp_str1 ="Malloc ";
     stringstream strm2;
-    strm2<<allocatedPtr;
-    string tempStr2 = strm2.str();
+    strm2<<allocated_ptr;
+    string temp_str2 = strm2.str();
     stringstream strm3;
     strm3<<size;
-    string tempStr3 = strm3.str();
-    string temp = tempStr1+tempStr2+" "+tempStr3;
-    ///below time can be removed TODO(junzhe)
-    int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    stringstream strm4;
-    strm4<<now;
-    string tempStr4 = strm4.str();
-    temp = temp+" "+tempStr4;
-    vec_block_RWMF.push_back(temp);
+    string temp_str3 = strm3.str();
+    string temp = temp_str1+temp_str2+" "+temp_str3;
+    vec.push_back(temp);
+  }else{
+
+    /// 3. if flag=1, look up table.
+    int lookup_idx = (global_index-location_2nd_iteration)%iteration_length;
+    if ((vec_block_meta[lookup_idx].second.size ==size)&&(vec_block_meta[lookup_idx].second.occupied*vec_block_meta[lookup_idx].second.occupied_backup==0)){
+     if (vec_block_meta[lookup_idx].second.occupied==0){
+        //condition: normal and cross_iteration's primary.
+        //assign ptr and mark as occupied, and add in ptr2rIdx
+        allocated_ptr = vec_block_meta[lookup_idx].second.ptr;
+        vec_block_meta[lookup_idx].second.occupied= 1;
+        table_ptr_to_ridx[allocated_ptr]=lookup_idx;                
+        //update load
+        if(load_flag==1){
+          table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second+size);
+        }
+      }else if ((vec_block_meta[lookup_idx].second.cross_iteration==1) && (vec_block_meta[lookup_idx].second.occupied==1) && (vec_block_meta[lookup_idx].second.occupied_backup ==0)) {
+        //condition: cross_iteration's backup
+        allocated_ptr = (void*)((char*)vec_block_meta[lookup_idx].second.ptr+offset_cross_iteration*sizeof(char));
+        vec_block_meta[lookup_idx].second.occupied_backup=1;
+        table_ptr_to_ridx[allocated_ptr]=lookup_idx;
+        //update load
+        if(load_flag==1){
+          table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second+size);
+        }
+      }
+    }else{  
+      //condition: size not proper or both occupied.
+      cudaMalloc(ptr, size);
+      allocated_ptr = *ptr;       
+      //update load
+      if(load_flag==1){
+        table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second);
+      }
+    } 
+  } //end of loop for flag=1
+    
+  ///4. test repeated sequence every 300 index, update global_index_threshold.
+  if (((global_index+1)%300==0) && (malloc_flag ==0) && (global_index_threshold==-1)&&(global_index+2>check_point)){
+    global_index_threshold = Detection(vec,iteration_length,location_2nd_iteration);
+    check_point=check_point*2;
+  }
+    
+  ///get load info, when global_index == global_index+2iteration_length
+  if (global_index==(global_index_threshold+2*iteration_length)&& (global_index_threshold>0)){
+    GetMaxLoad();
+    load_flag=0;
+  }
+    
+  global_index++;
+  //update it for load tracking purpose.
+  table_ptr_to_size[allocated_ptr]=size; 
+ 
+  //update *ptr
+  *ptr = allocated_ptr;
+  
+  ///update block_RWMF
+  string temp_str1 ="Malloc ";
+  stringstream strm2;
+  strm2<<allocated_ptr;
+  string temp_str2 = strm2.str();
+  stringstream strm3;
+  strm3<<size;
+  string temp_str3 = strm3.str();
+  string temp = temp_str1+temp_str2+" "+temp_str3;
+  int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+  stringstream strm4;
+  strm4<<now;
+  string temp_str4 = strm4.str();
+  temp = temp+" "+temp_str4;
+  vec_block_rw_mf.push_back(temp);
 }
 
 ///Free
 void SmartMemPool::Free(void* ptr){
     
-    //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a.
-    //file<<gc<<' '<<"Free"; //a.
-    
-    size_t deallocatedSize = Table_p2s.find(ptr)->second;
+  size_t deallocatedSize = table_ptr_to_size.find(ptr)->second;
+  
+  /// at the begining iterations, via cudaFree, accumulate opt info.  
+  if ((global_index_threshold==-1)||(global_index<global_index_threshold)){
+    //push_back the string for later test and run.
+    string temp_str1 ="Free ";
+    stringstream strm2;
+    strm2<<ptr;
+    string temp_str2 = strm2.str();
+    string temp = temp_str1+temp_str2;
+    vec.push_back(temp);
     
-    if ((globeCounter==-1)||(gc<globeCounter)){
-        //push_back the string for later test and run.
-        string tempStr1 ="Free ";
-        stringstream strm2;
-        strm2<<ptr;
-        string tempStr2 = strm2.str();
-        string temp = tempStr1+tempStr2;
-        vec.push_back(temp);
-        
-        //file<<" Condition F1, addr: "<<ptr<<endl;  //a.
-        //update load before free
-        if(loadLogFlag==1){
-            Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second);
-        }
-        /// before flag switch, for sure all free shall be done by free()
-        cudaFree(ptr);
-    }else{
-        if (!(Table_p2r.find(ptr)==Table_p2r.end())){
-            int resp_rIdx = Table_p2r.find(ptr)->second;
-            Table_p2r.erase(ptr);
-            
-            if (ptr == Vec_r2Ver[resp_rIdx].second.ptr){
-              //Condition F2, from M2
-              Vec_r2Ver[resp_rIdx].second.Occupied =0; //freed, able to allocate again.
-              //file<<" Condition F2, addr: "<<ptr<<endl;  //a.
-            }else if (ptr == (void*)((char*)Vec_r2Ver[resp_rIdx].second.ptr+offsetCrossItr*sizeof(char))){
-              //Condition F4, from M4
-              Vec_r2Ver[resp_rIdx].second.Occupied_backup =0;
-              //file<<" Condition F4, addr: "<<ptr<<endl;  //a.
-            } else{
-              //Condition F5, from M2, M4 but idx switched.
-              if (((float)((char*)ptr-((char*)ptrPool+offsetCrossItr*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptrPool+2*offsetCrossItr*sizeof(char)))<0)){
-                     Vec_r2Ver[resp_rIdx].second.Occupied_backup =0;
-              }else{
-                     Vec_r2Ver[resp_rIdx].second.Occupied =0;
-              }
-            }
-            //update load
-             if(loadLogFlag==1){
-                 Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second-deallocatedSize);
-             }
+    //update load before free
+    if(load_flag==1){
+      table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second);
+    }
+    // before flag switch, for sure all free shall be done by free()
+    cudaFree(ptr);
+  }else{
+    /// cases that no need accumulating opt info
+
+    /// free a ptr that is in the memory pool
+    if (!(table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end())){
+      int resp_rIdx = table_ptr_to_ridx.find(ptr)->second;
+      table_ptr_to_ridx.erase(ptr);
+      
+      if (ptr == vec_block_meta[resp_rIdx].second.ptr){
+        vec_block_meta[resp_rIdx].second.occupied =0; //freed, able to allocate again.
+      }else if (ptr == (void*)((char*)vec_block_meta[resp_rIdx].second.ptr+offset_cross_iteration*sizeof(char))){
+        vec_block_meta[resp_rIdx].second.occupied_backup =0;
+      } else{
+        if (((float)((char*)ptr-((char*)ptr_pool+offset_cross_iteration*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptr_pool+2*offset_cross_iteration*sizeof(char)))<0)){
+          vec_block_meta[resp_rIdx].second.occupied_backup =0;
         }else{
-          //update load
-          if(loadLogFlag==1){
-              Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second);
-          }
-          //file<<" Condition F3, addr: "<<ptr<<endl;  //a.
-          cudaFree(ptr);
+          vec_block_meta[resp_rIdx].second.occupied =0;
         }
-            
+      }
+      //update load
+       if(load_flag==1){
+           table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second-deallocatedSize);
+       }
+    }else{
+      /// free a ptr that is NOT in the memory pool
+      
+      //update load
+      if(load_flag==1){
+          table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second);
+      }
+      cudaFree(ptr);
     }
-    gc++;
-    ///update block_RWMF
-    string tempStr1 ="Free ";
-    stringstream strm2;
-    strm2<<ptr;
-    string tempStr2 = strm2.str();
-    string temp = tempStr1+tempStr2;
-    ///below time can be removed TODO(junzhe)
-    int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    stringstream strm4;
-    strm4<<now;
-    string tempStr4 = strm4.str();
-    temp = temp+" "+tempStr4;
-    vec_block_RWMF.push_back(temp);
+            
+  }
+
+  global_index++;
+
+  ///update block_RWMF
+  string temp_str1 ="Free ";
+  stringstream strm2;
+  strm2<<ptr;
+  string temp_str2 = strm2.str();
+  string temp = temp_str1+temp_str2;
+  int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+  stringstream strm4;
+  strm4<<now;
+  string temp_str4 = strm4.str();
+  temp = temp+" "+temp_str4;
+  vec_block_rw_mf.push_back(temp);
 }//end of Free.
 
 
 SmartMemPool::~SmartMemPool(){
-    fstream file_block1("blockInfo_RW.text", ios::in|ios::out|ios::app);
-    fstream file_block2("blockInfo_RWMF.text", ios::in|ios::out|ios::app);
-    for (int i=0; i< vec_block_RW.size();i++){
-        file_block1<<vec_block_RW[i]<<endl;
-    }
-    for (int i=0; i< vec_block_RWMF.size();i++){
-        file_block2<<vec_block_RWMF[i]<<endl;
-    }
-    cudaFree(ptrPool);
-    //TODO(junzhe) verify what else shall be cleaned up.
+
+  fstream file_block1("blockInfo_RW.text", ios::in|ios::out|ios::app);
+  fstream file_block2("blockInfo_RWMF.text", ios::in|ios::out|ios::app);
+  for (int i=0; i< vec_block_rw.size();i++){
+    file_block1<<vec_block_rw[i]<<endl;
+  }
+  for (int i=0; i< vec_block_rw_mf.size();i++){
+    file_block2<<vec_block_rw_mf[i]<<endl;
+  }
+  cudaFree(ptr_pool);
+
 }
 
-void SmartMemPool::getMaxLoad(){
+void SmartMemPool::GetMaxLoad(){
     
-    vector<size_t>cudaLoadLog;
-    for (int i=0; i<Table_load.size();i++){
-        cudaLoadLog.push_back(Table_load.find(i)->second.first);
-    }
-    size_t maxCudaLoad = *max_element(cudaLoadLog.begin(),cudaLoadLog.end());
-    int idxMaxCudaLoad = static_cast<int>(distance(cudaLoadLog.begin(),max_element(cudaLoadLog.begin(),cudaLoadLog.end())));
-    
-    vector<size_t>colorLoadLog;
-    for (int i=0; i<Table_load.size();i++){
-        colorLoadLog.push_back(Table_load.find(i)->second.second);
-    }
-    size_t maxColorLoad = *max_element(colorLoadLog.begin(),colorLoadLog.end());
-    int idxMaxColorLoad = static_cast<int>(distance(colorLoadLog.begin(),max_element(colorLoadLog.begin(),colorLoadLog.end())));
-    size_t offsetCudaLoad = Table_load.find(idxMaxColorLoad)->second.first;
-    
-    maxTotalLoad = max(maxCudaLoad,maxColorLoad+offsetCudaLoad);
-    maxMemUsage = max(maxCudaLoad,offset+offsetCudaLoad);
-    memRatio = (float)maxMemUsage/(float)maxTotalLoad;
-    
-    cout<<"=============================memory usage stats print: ================================"<<endl;
-    cout<<"maxColorLoad vs memPoolSize: (at idx "<<idxMaxColorLoad<<")"<<endl;
-    cout<<maxColorLoad<<endl;
-    cout<<offset<<endl;
-    cout<<"maxTotalLoad vs maxCudaLoad(at idx "<<idxMaxCudaLoad<<") maxMemUsage"<<endl;
-    cout<<maxTotalLoad<<endl;
-    cout<<maxCudaLoad<<endl;
-    cout<<maxMemUsage<<endl;
-    cout<<"memRatio: "<<memRatio<<endl;   
+  vector<size_t>vec_load_log;
+  for (int i=0; i<table_load.size();i++){
+      vec_load_log.push_back(table_load.find(i)->second.first);
+  }
+  size_t max_cuda_load = *max_element(vec_load_log.begin(),vec_load_log.end());
+  int idx_max_cuda_load = static_cast<int>(distance(vec_load_log.begin(),max_element(vec_load_log.begin(),vec_load_log.end())));
+  
+  vector<size_t>vec_color_load;
+  for (int i=0; i<table_load.size();i++){
+      vec_color_load.push_back(table_load.find(i)->second.second);
+  }
+  size_t max_color_load = *max_element(vec_color_load.begin(),vec_color_load.end());
+  int idx_max_color_load = static_cast<int>(distance(vec_color_load.begin(),max_element(vec_color_load.begin(),vec_color_load.end())));
+  size_t offset_color_load = table_load.find(idx_max_color_load)->second.first;
+  
+  max_total_load = max(max_cuda_load,max_color_load+offset_color_load);
+  max_mem_usage = max(max_cuda_load,offset+offset_color_load);
+  
 }
 
 std::pair<size_t, size_t> SmartMemPool::GetMemUsage() {
-  //TODO(junzhe) note here the pair is different from that of CnMemPool.
-  return std::make_pair(maxMemUsage, maxTotalLoad);
+  //note here the pair is different from that of CnMemPool.
+  return std::make_pair(max_mem_usage, max_total_load);
 }
     
 void SmartMemPool::Append(string blockInfo) {
-     //TODO(junzhe) add idx later
-    vec_block_RW.push_back(blockInfo);
-    vec_block_RWMF.push_back(blockInfo);
+  vec_block_rw.push_back(blockInfo);
+  vec_block_rw_mf.push_back(blockInfo);
 }
 
 ///SwapPool
@@ -992,121 +935,86 @@ void SwapPool::Init(){
 
 
 void SwapPool::PoolOpt(vector<string> &vec_mf) {
-  //TODO(junzhe) redo 9/17
-
-  ///process vec_mf of 3itr into blocks,maxLen
-  //assume format of string: MF ptr size;
-  //onePieceMsg_pool verified
-  // for (int i = 0; i< vec_mf.size();i++){
-  //   cout<<"print mf "<<i<<' '<<vec_mf[i]<<endl;
-  // }
-  // cout<<"===================print done"<<endl;
-  vector<onePieceMsg_pool>onePieceMsg_poolVec_;
-  maxLen_mf = vec_mf.size()/3;
-  cout<<"maxLen_mf "<<maxLen_mf<<endl;
-  cout<<"len of vec_mf "<<vec_mf.size()<<endl;
+
+  vector<PoolOptInfo>vec_pool_opt_info;
+  iteration_length_mf = vec_mf.size()/3; //cos input vec_mf is of 3 iteration 
+
+  //convert raw opt info into struct: PoolOptInfo
   for (int i = 0;i < vec_mf.size();i++){
-    vector<string> v = split(vec_mf[i], " ");
-    // cout<<"print mf "<<i<<' '<<vec_mf[i]<<endl;
-    // cout<<"||"<<v[0]<<"||"<<v[1]<<"||"<<endl;
+    vector<string> v = SplitString(vec_mf[i], " ");
 
     if (v[0]=="Malloc"){
-      //convert v[2] from str to size_t
       size_t result;
       stringstream convert(v[2]);
-      // cout<<"1"<<endl;
       if (!(convert>>result)){
-        result =-1;
+        result = -1;
         cout<<"error for converting size from str to int."<<endl;
       }
-      // cout<<"2"<<endl;
-      onePieceMsg_pool tempMsg(v[1],result, 1, i-maxLen_mf);
-      // cout<<"3"<<endl;
-      onePieceMsg_poolVec_.push_back(tempMsg);
-      // cout<<"4"<<endl;
+      PoolOptInfo tempMsg(v[1],result, 1, i-iteration_length_mf);
+      vec_pool_opt_info.push_back(tempMsg);
     }else if (v[0]=="Free"){
-      // cout<<"1"<<endl;
-      onePieceMsg_pool tempMsg(v[1],-1, -1, i-maxLen_mf);
-      // cout<<"2"<<endl;
-      onePieceMsg_poolVec_.push_back(tempMsg);
-      // cout<<"3"<<endl;
+      PoolOptInfo tempMsg(v[1],-1, -1, i-iteration_length_mf);
+      vec_pool_opt_info.push_back(tempMsg);
     }else {
       cout<<"error for process the onePriceMsg."<<endl;
-      // cout<<i<<" "<<v[0]<<"||"<<vec_mf[i]<<endl;
     }
   }
-  sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx());
-  // cout<<"===================print done(2nd loop)"<<endl;
-  // for (int i = 0;i < vec_mf.size();i++){
-
+  //sort by ptr and then idx
+  sort(vec_pool_opt_info.begin(),vec_pool_opt_info.end(),sort_by_ptr_idx_ascending());
   
-  //pair
-  vector<onePairMsg>pairMsgVec_;
+  //convert into block lifetime
+  vector<PoolBlockLifeTime>vec_block_life_time;
   int i = 0;
-  // cout<<"before while loop"<<endl;
 
-  while (i<(onePieceMsg_poolVec_.size()-1)){
+  while (i<(vec_pool_opt_info.size()-1)){
     
-    // cout<<i<<" before 1st if"<<endl;
-    if (onePieceMsg_poolVec_[i].MallocFree==-1){
-      //condition A: start with free. do nothing.
+    if (vec_pool_opt_info[i].operation_type==-1){
+      //condition: start with free. do nothing.
       i+=1;
-      // cout<<i<<" condition A"<<endl;
     } else {
-        if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)
-          &&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){
-          //condition B: start with Malloc, next item same ptr and is free.
-          if ((onePieceMsg_poolVec_[i].idx >=0 && onePieceMsg_poolVec_[i].idx <maxLen_mf)
-            ||(onePieceMsg_poolVec_[i+1].idx >=0 && onePieceMsg_poolVec_[i+1].idx <maxLen_mf)){
-            //condition B1: at least one of the index in range [0,maxLen_mf]
-            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx);
-            pairMsgVec_.push_back(tempPair); 
-            // cout<<i<<" condition B"<<endl;
-          }
-          i+=2; //no matter in the middle iteration or not, plus 2.
-        } else {
-          //condiction C: not one pair, single one.
-          i+=1;
-          // cout<<" condiction C"<<endl;
-          //       if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){
-          //     // onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
-          //     // onePairMsgVec_2.push_back(tempPair);
-          //     i+=1;
-          //     cout<<i<<" condition C"<<endl;
-          // }
+      if ((vec_pool_opt_info[i].operation_type==1)&& (vec_pool_opt_info[i+1].operation_type==-1)
+        &&((vec_pool_opt_info[i].ptr==vec_pool_opt_info[i+1].ptr))){
+        //condition: start with Malloc, next item same ptr and is free.
+        if ((vec_pool_opt_info[i].idx >=0 && vec_pool_opt_info[i].idx <iteration_length_mf)
+          ||(vec_pool_opt_info[i+1].idx >=0 && vec_pool_opt_info[i+1].idx <iteration_length_mf)){
+          //condition: at least one of the index in range [0,iteration_length_mf]
+          PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,vec_pool_opt_info[i+1].idx);
+          vec_block_life_time.push_back(temp_block_life_time); 
         }
+        i+=2; //no matter in the middle iteration or not, plus 2.
+      } else {
+        //condiction: not one pair, Malloc-only block, no free..
+        i+=1;
+      }
     } 
   }
-  sort(pairMsgVec_.begin(),pairMsgVec_.end(),less_than_size_rIdx());
-  // cout<<"after while loop"<<endl;
-  cout<<"size of pairs "<<pairMsgVec_.size()<<endl;
+  sort(vec_block_life_time.begin(),vec_block_life_time.end(),sort_by_size_r_idx_descending());
 
 
   ///get E, V of the blocks， coloring
   //V
-  int m = static_cast<int>(pairMsgVec_.size());
+  int m = static_cast<int>(vec_block_life_time.size());
   vector<Vertex>vertices;
   for (int i=0; i<m;i++){
-    Vertex tempVertex(pairMsgVec_[i].name,pairMsgVec_[i].size,pairMsgVec_[i].r_idx,pairMsgVec_[i].d_idx);
-    vertices.push_back(tempVertex);
+    Vertex temp_vertex(vec_block_life_time[i].name,vec_block_life_time[i].size,vec_block_life_time[i].r_idx,vec_block_life_time[i].d_idx);
+    vertices.push_back(temp_vertex);
   }
 
-  //E and coloring
-  
+  //E and coloring  
   int offset = 0;
   int **adj;
-  adj = new int*[m]; //TODO(junzhe) should be deleted somewhere.
-  // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop.
+  adj = new int*[m];
+
+  // build edges with values 1 and 0; combine with mergeSeg and FirstFitAllocation in the loop.
   for (int i=0; i<m;i++){
     adj[i] = new int[m];
     for (int j=0; j<m;j++){
       if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))
         || (min(vertices[i].d,vertices[j].d)<0 && 
-        min(vertices[i].d,vertices[j].d)+2*maxLen_mf< max(vertices[i].r,vertices[j].r))){
-        //TODO(junzhe) verify after ||
+        min(vertices[i].d,vertices[j].d)+2*iteration_length_mf< max(vertices[i].r,vertices[j].r))){
         adj[i][j]=1;
-        if (vertices[j].colorRange.second){ //as second never be 0, if not empty.
-          vertices[i].colorOccupied.push_back(vertices[j].colorRange);
+        if (vertices[j].color_range.second){ //as second never be 0, if not empty.
+          vertices[i].vec_color_preoccupied.push_back(vertices[j].color_range);
         }
       }
       else { 
@@ -1114,98 +1022,93 @@ void SwapPool::PoolOpt(vector<string> &vec_mf) {
       }
     }
     
-    vector<pair<size_t,size_t>>colorMerged = mergeSeg(vertices[i].colorOccupied);
+    vector<pair<size_t,size_t>>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied);
 
-    // vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset);
-    vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, offset);
+    // vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset);
+    vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, offset);
 
     //update of offset, largest memory footprint as well.
-    if (vertices[i].colorRange.second >=offset){
-      offset = vertices[i].colorRange.second+1;
+    if (vertices[i].color_range.second >=offset){
+      offset = vertices[i].color_range.second+1;
     }
   }//end of for loop.
-  cout<<"offset is "<<offset<<endl;
-  cout<<"===================to print vertices"<<endl;
-  cout<<"a, d, size, color["<<endl;
+
+  //delete adj, the edges
   for (int i=0; i<m;i++){
-    cout<<"("<<vertices[i].r<<","<<vertices[i].d<<") "<<vertices[i].size<<" "<<vertices[i].colorRange.first<<endl;
+    delete[] adj[i]; 
   }
-  cout<<"===================to print vertices done"<<endl;
+  delete[] adj;
 
-  ///Make pool, and make table
-  cudaMalloc(&ptrPool,offset); //poolSize or memory foot print  offset.
-  cout<<"ptrPool is: "<<ptrPool<<endl;
+  //make pool
+  cudaMalloc(&ptr_pool,offset); //poolSize or memory foot print  offset.
 
+  //make table
   for (int i=0; i<vertices.size();i++){
-    lookUpElement temp;
-    temp.r_idx = vertices[i].r;
-    temp.d_idx = vertices[i].d;
-    temp.size = vertices[i].size;
-    temp.offset = vertices[i].colorRange.first;
-    temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char));
-    temp.Occupied = 0;
-    Table_r2v[vertices[i].r] = temp;
+    PoolBlockMeta itm;
+    itm.r_idx = vertices[i].r;
+    itm.d_idx = vertices[i].d;
+    itm.size = vertices[i].size;
+    itm.offset = vertices[i].color_range.first;
+    itm.ptr = (void*)((char*)ptr_pool+itm.offset*sizeof(char));
+    itm.occupied = 0;
+    table_pool_meta[vertices[i].r] = itm;
   }
-  cout<<"===================Table_r2v done"<<endl;
-  poolFlag = 1;
+  pool_flag = 1;
     
 }
 
 void SwapPool::Malloc(void** ptr, const size_t size){
   
-  void* allocatedPtr =nullptr;
+  void* allocated_ptr =nullptr;
   
-  if (poolFlag == 0) {
+  if (pool_flag == 0) {
     cudaError_t status = cudaMalloc(ptr, size);
     CHECK_EQ(status, cudaError_t::cudaSuccess);
   } else {
-    //POOLFLAG = 1 
-    if (pc < maxLen_mf){
-      if ((Table_r2v.find(pc-maxLen_mf) == Table_r2v.end()) || (!(size == Table_r2v.find(pc-maxLen_mf)->second.size))){
+    //pool_flag = 1 
+    if (pool_index < iteration_length_mf){
+      if ((table_pool_meta.find(pool_index - iteration_length_mf) == table_pool_meta.end()) || (!(size == table_pool_meta.find(pool_index - iteration_length_mf)->second.size))){
         //not in table of negative r_idx
         cudaError_t status = cudaMalloc(ptr, size);
         CHECK_EQ(status, cudaError_t::cudaSuccess);
       } else{
         //in the table of negative r_idx
-        auto tempMeta = Table_r2v.find(pc-maxLen_mf)->second;
-        allocatedPtr = tempMeta.ptr;
-        *ptr = allocatedPtr;
-        Table_p2r[allocatedPtr]=pc-maxLen_mf; 
+        auto temp_meta = table_pool_meta.find(pool_index - iteration_length_mf)->second;
+        allocated_ptr = temp_meta.ptr;
+        *ptr = allocated_ptr;
+        table_ptr_to_ridx[allocated_ptr]=pool_index - iteration_length_mf; 
 
       }
     } else{
-      //8 9 10
-      int r_pc = pc%maxLen_mf;
-      if ((Table_r2v.find(r_pc) == Table_r2v.end()) || (!(size == Table_r2v.find(r_pc)->second.size))){
+      //8 9 10th iteration
+      int r_pool_index = pool_index%iteration_length_mf;
+      if ((table_pool_meta.find(r_pool_index) == table_pool_meta.end()) || (!(size == table_pool_meta.find(r_pool_index)->second.size))){
         //not here, should be abnormal
         cudaError_t status = cudaMalloc(ptr, size);
         CHECK_EQ(status, cudaError_t::cudaSuccess);
       } else{
         //in the table
-        auto tempMeta = Table_r2v.find(r_pc)->second;
-        allocatedPtr = tempMeta.ptr;
-        *ptr = allocatedPtr;
-        Table_p2r[allocatedPtr]=r_pc; 
-
+        auto temp_meta = table_pool_meta.find(r_pool_index)->second;
+        allocated_ptr = temp_meta.ptr;
+        *ptr = allocated_ptr;
+        table_ptr_to_ridx[allocated_ptr]=r_pool_index; 
       }
     }
   }
 
-
-    pc++;     
+    pool_index++;     
   }
 
 
 void SwapPool::Free(void *ptr) {
-  if (poolFlag == 0){
+  if (pool_flag == 0){
     cudaError_t status = cudaFree(ptr);
     CHECK_EQ(status, cudaError_t::cudaSuccess);
   } else{
-    if (Table_p2r.find(ptr)==Table_p2r.end()){
+    if (table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end()){
       cudaError_t status = cudaFree(ptr);
       CHECK_EQ(status, cudaError_t::cudaSuccess);
     }
-    
   }
 
 }
@@ -1215,15 +1118,7 @@ void SwapPool::Append(string blockInfo) {
 }
 
 
-void SwapPool::SwapOut(void* data_){
-  //NA
-}
-
-void SwapPool::SwapIn(void* data_){
-  //NA 
-}
-
-void getMaxLoad (){
+void GetMaxLoad (){
   //empty
 }
 

From 8e8a7e15fca593bea7fed35e1073e7f1dd408877 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Sun, 20 Jan 2019 15:58:01 +0800
Subject: [PATCH 19/19] Replace the strings with struct for function Append.

---
 include/singa/core/common.h | 13 ++++++++
 include/singa/core/device.h | 22 +++++++-------
 src/core/common/common.cc   | 33 +++++++++------------
 src/core/device/cuda_gpu.cc |  4 ---
 src/core/device/device.cc   | 19 +++++-------
 src/core/device/swap_gpu.cc | 59 +++++++++++++++----------------------
 6 files changed, 68 insertions(+), 82 deletions(-)

diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 47c1068db0..ee6f07ce3d 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -53,6 +53,9 @@ typedef struct _Opencl { } Opencl;
 }  // namespace lang
 
 class Device;
+struct DeviceOptInfoToAppend;
+
+
 /// Block represent a chunk of memory (on device or host).
 class Block {
  public:
@@ -97,6 +100,16 @@ class Block {
   std::atomic<int> ref_count_;
 };
 
+// struct for Append purpose in device class.
+struct DeviceOptInfoToAppend{
+    string operation_type;
+    string block_ptr;
+    int size;
+    long t = (std::chrono::system_clock::now()).time_since_epoch().count();
+
+    DeviceOptInfoToAppend(string opt_type, string ptr,int s):operation_type(opt_type),block_ptr(ptr),size(s){}
+};
+
 typedef struct _Context {
   std::mt19937 random_generator;
 #ifdef USE_CUDA
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index e9dcc1402d..7d9ed57757 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -66,7 +66,6 @@ class Device {
   /// Called by Tensor.
   void FreeBlock(Block* block);
   
-  void AppendInfo(string block_info);
   void* UpdateGpuPtrInfo(const Block* block_ptr);
 
   /// Return the size (bytes) of memory in use
@@ -107,7 +106,7 @@ class Device {
   int id() const { return id_; }
 
   virtual void* UpdateGpuPtr(const Block* block_ptr) = 0;
-
+  virtual void Append(DeviceOptInfoToAppend dev_opt_info) = 0;
  private:
   Device() {};
 
@@ -124,7 +123,7 @@ class Device {
   /// Free device memory.
   virtual void Free(void* ptr) = 0;
   virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0;
-  virtual void Append(string block_info) = 0;
+  
 
  protected:
   int id_ = 0;
@@ -154,6 +153,7 @@ class CppCPU : public Device {
 
   std::shared_ptr<Device> host() const override { return defaultDevice;}
   void SetRandSeed(unsigned seed) override;
+  void Append(DeviceOptInfoToAppend dev_opt_info) override {}
 
  protected:
   void DoExec(function<void(Context*)>&& fn, int executor) override;
@@ -167,7 +167,7 @@ class CppCPU : public Device {
   /// Free cpu memory.
   void Free(void* ptr) override;
   void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
-  void Append(string block_info) override {}
+
   void* UpdateGpuPtr(const Block* block_ptr) override {}
 
 };
@@ -188,6 +188,8 @@ class CudaGPU : public Device {
 
   void SetRandSeed(unsigned seed) override;
   size_t GetAllocatedMem() override;
+  void Append(DeviceOptInfoToAppend dev_opt_info) override {}
+
 
  protected:
   void DoExec(function<void(Context*)>&& fn, int executor) override;
@@ -201,7 +203,6 @@ class CudaGPU : public Device {
   /// Free cpu memory.
   void Free(void* ptr) override;
   void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
-  void Append(string block_info) override;
   void* UpdateGpuPtr(const Block* block_ptr) override;
 
  private:
@@ -284,6 +285,8 @@ class SwapGPU : public Device {
 
   void SetRandSeed(unsigned seed) override;
   size_t GetAllocatedMem() override;
+  //Append at every index: free, read, mutable
+  void Append(DeviceOptInfoToAppend dev_opt_info) override;
 
  protected:
   void DoExec(function<void(Context*)>&& fn, int executor) override;
@@ -295,10 +298,7 @@ class SwapGPU : public Device {
   void* Malloc(int size) override;
 
   /// Free cpu memory.
-  void Free(void* ptr) override;
-
-  //Append at every index: free, read, mutable
-  void Append(string block_info) override;
+  void Free(void* ptr) override; 
 
   //append info after Malloc, as Block* is not available till Malloc() done.
   void AppendAfterMalloc(Block* block,void* data_ptr,int size) override; 
@@ -408,7 +408,7 @@ class OpenclDevice : public singa::Device {
   virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                       CopyDirection direction, int dst_offset = 0,
                       int src_offset = 0) override;
-
+  void Append(DeviceOptInfoToAppend dev_opt_info) override {}
 protected:
   /// The OpenCL device that this object represents.
   /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
@@ -439,7 +439,7 @@ class OpenclDevice : public singa::Device {
   /// This has the effect of freeing up device memory.
   void Free(void* ptr) override;
   void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
-  void Append(string block_info) override {}
+  
   void* UpdateGpuPtr(const Block* block_ptr) override {}
 
 
diff --git a/src/core/common/common.cc b/src/core/common/common.cc
index d6e9c5a301..692c1c451f 100644
--- a/src/core/common/common.cc
+++ b/src/core/common/common.cc
@@ -30,15 +30,13 @@ void* Block::mutable_data() {
 
     //Append block info: opt_type, ptr, time_stamp
     if (ptr_device_!=nullptr){
-      stringstream strm2;
-      strm2<<this;
-      string temp_str2 = strm2.str();
-      stringstream strm4;
-      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      strm4<<t2;
-      string temp_str4 = strm4.str();
-      string temp = "Mutable "+temp_str2+" "+temp_str4;   
-      ptr_device_->AppendInfo(temp);
+      stringstream strm;
+      strm<<this;
+      string temp_str = strm.str();
+      DeviceOptInfoToAppend dev_opt_info("Mutable", temp_str,size());
+      auto t = (std::chrono::system_clock::now()).time_since_epoch().count();
+      dev_opt_info.t = t;
+      ptr_device_->Append(dev_opt_info);
     }
 
     //update ptr after swap in done, if variable is not swapped back yet as expected.
@@ -56,16 +54,13 @@ const void* Block::data() const {
 
     //Append block info: opt_type, ptr, time_stamp
     if (ptr_device_!=nullptr){
-      //Append info.
-      stringstream strm2;
-      strm2<<this;
-      string temp_str2 = strm2.str();
-      stringstream strm4;
-      auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-      strm4<<t2;
-      string temp_str4 = strm4.str();
-      string temp = "Read "+temp_str2+" "+temp_str4;
-      ptr_device_->AppendInfo(temp);
+      stringstream strm;
+      strm<<this;
+      string temp_str = strm.str();
+      DeviceOptInfoToAppend dev_opt_info("Read", temp_str,size());
+      auto t = (std::chrono::system_clock::now()).time_since_epoch().count();
+      dev_opt_info.t = t;
+      ptr_device_->Append(dev_opt_info);
     }
 
     //update ptr after swap in done, if variable is not swapped back yet as expected.
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 7ec8a9deb5..523986f4f7 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -123,10 +123,6 @@ void CudaGPU::Free(void* ptr) {
   }
 }
 
-void CudaGPU::Append(string blockInfo){
-    pool_->Append(blockInfo);
-}
-
 void* CudaGPU::UpdateGpuPtr(const Block* block_){
   return nullptr;
 }
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 59faddc5c6..5a1ac270ac 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -55,23 +55,18 @@ void Device::FreeBlock(Block* block) {
     Free(tempPtr);
     
     //append block info for free operation.
-    stringstream strm1;
-    strm1<<block;
-    string temp_str1 = strm1.str();
-    stringstream strm4;
-    auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    strm4<<t2;
-    string temp_str4 = strm4.str();
-    string blockInfo ="Free "+temp_str1+" "+temp_str4;
-    Append(blockInfo);
+    stringstream strm;
+    strm<<block;
+    string temp_str = strm.str();
+    DeviceOptInfoToAppend dev_opt_info("Free", temp_str,block->size());
+    auto t = (std::chrono::system_clock::now()).time_since_epoch().count();
+    dev_opt_info.t = t;
+    Append(dev_opt_info);
 
     delete block;
   }
 }
 
-void Device::AppendInfo(string blockInfo){
-  Append(blockInfo);
-}
 
 void* Device::UpdateGpuPtrInfo(const Block* block_){
   return UpdateGpuPtr(block_);
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index 85a4061f30..4228f6e16a 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -904,18 +904,13 @@ void SwapGPU::AppendAfterMalloc(Block* block_ptr,void* data_ptr,int size){
   */
 
   //append info
-  stringstream strm1;
-  strm1<<size;
-  string temp_str1 = strm1.str();
-  stringstream strm3;
-  strm3<<block_ptr;
-  string temp_str3 = strm3.str();
-  stringstream strm4;
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  strm4<<t2;
-  string temp_str4 = strm4.str();
-  string block_info ="Malloc "+temp_str3+" "+temp_str1+" "+temp_str4;
-  Append(block_info);
+  stringstream strm;
+  strm<<block_ptr;
+  string temp_str = strm.str();
+  DeviceOptInfoToAppend dev_opt_info("Malloc", temp_str,size);
+  auto t = (std::chrono::system_clock::now()).time_since_epoch().count();
+  dev_opt_info.t = t;
+  Append(dev_opt_info);
   
 }
 
@@ -994,42 +989,23 @@ void SwapGPU::DeploySwapExec(int r_global_index){
   }
 }
 
-void SwapGPU::Append(string block_info){
-  /*
-  Append Operation block info after each operation
-  Meantime execute following operations:
-    insert size for non-malloc operations
-    update global memory load
-    control swap flag on and off
-    update table_meta and table_sched
-    deploy swap at every index.
-    test moved from start of malloc/free to end of append, only global_index+1 changed
-    call PoolOpt to Construct Pool
-  */
+void SwapGPU::Append(DeviceOptInfoToAppend dev_opt_info){
 
-  vector<string> v = SplitOptString(block_info, " ");
+  //convert block_ptr from string to Block*
   void* temp_ptr;
-  stringstream convert(v[1]);
+  stringstream convert(dev_opt_info.block_ptr);
   convert>>temp_ptr;
   auto block_ptr = static_cast<Block*>(temp_ptr);
-  
-  // insert size, malloc : flag, block_, size, t; others: insert size t.
-  if (v.size() != 4) {
-    stringstream strm1;
-    strm1<<block_ptr->size();
-    string temp_str1 = strm1.str();
-    block_info = v[0] + ' ' + v[1] + ' ' + temp_str1 + ' ' + v[2];
-  }
 
   // update global load
   if (iteration_length < iteration_length_threshold){
-    if (v[0] == "Malloc"){
+    if (dev_opt_info.operation_type == "Malloc"){
       if (global_load.size()>0){
         global_load.push_back(global_load[global_load.size()-1]+block_ptr->size());
       } else {
         global_load.push_back(block_ptr->size());
       }
-    } else if (v[0] == "Free"){
+    } else if (dev_opt_info.operation_type  == "Free"){
       global_load.push_back(global_load[global_load.size()-1]-block_ptr->size());
     } else {
       global_load.push_back(global_load[global_load.size()-1]);
@@ -1037,6 +1013,15 @@ void SwapGPU::Append(string block_info){
   }
 
   //append into vec_block
+  stringstream strm1;
+  strm1<<dev_opt_info.size;
+  string temp_str1 = strm1.str();
+  stringstream strm4;
+  strm4<<dev_opt_info.t;
+  string temp_str4 = strm4.str();
+  string block_info = dev_opt_info.operation_type + " " + dev_opt_info.block_ptr + " " +
+  temp_str1 + " " + temp_str4;
+  //cout<<"1 "<<block_info<<endl;
   vec_block.push_back(block_info);
 
   //change swap flag on and off
@@ -1067,6 +1052,8 @@ void SwapGPU::Append(string block_info){
 
 }
 
+
+
 void* SwapGPU::UpdateGpuPtr(const Block* block_ptr){
   /*
   in case that block is not at device memory, swapIn ad hoc.