apache · Nov 29, 2024
diff --git a/‎examples/cnn_ms/msmlp/model.py
Lines changed: 27 additions & 5 deletions b/‎examples/cnn_ms/msmlp/model.py
Lines changed: 27 additions & 5 deletions
diff --git a/‎examples/cnn_ms/train_ms_model.py
Lines changed: 552 additions & 0 deletions b/‎examples/cnn_ms/train_ms_model.py
Lines changed: 552 additions & 0 deletions
diff --git a/‎examples/healthcare/Hematologic_Disease/ClassDemo.py
Lines changed: 270 additions & 0 deletions b/‎examples/healthcare/Hematologic_Disease/ClassDemo.py
Lines changed: 270 additions & 0 deletions
diff --git a/‎examples/healthcare/Hematologic_Disease/Readme.md
Lines changed: 45 additions & 0 deletions b/‎examples/healthcare/Hematologic_Disease/Readme.md
Lines changed: 45 additions & 0 deletions
diff --git a/‎examples/healthcare/Hematologic_Disease/transforms.py
Lines changed: 166 additions & 0 deletions b/‎examples/healthcare/Hematologic_Disease/transforms.py
Lines changed: 166 additions & 0 deletions
diff --git a/‎examples/healthcare/Malaria_Detection/README.md
Lines changed: 44 additions & 0 deletions b/‎examples/healthcare/Malaria_Detection/README.md
Lines changed: 44 additions & 0 deletions
diff --git a/‎examples/healthcare/Malaria_Detection/data/malaria.py
Lines changed: 122 additions & 0 deletions b/‎examples/healthcare/Malaria_Detection/data/malaria.py
Lines changed: 122 additions & 0 deletions
diff --git a/‎examples/healthcare/Malaria_Detection/model/cnn.py
Lines changed: 94 additions & 0 deletions b/‎examples/healthcare/Malaria_Detection/model/cnn.py
Lines changed: 94 additions & 0 deletions
diff --git a/‎examples/healthcare/Malaria_Detection/model/mlp.py
Lines changed: 85 additions & 0 deletions b/‎examples/healthcare/Malaria_Detection/model/mlp.py
Lines changed: 85 additions & 0 deletions
diff --git a/‎examples/healthcare/Malaria_Detection/run.sh
Lines changed: 20 additions & 0 deletions b/‎examples/healthcare/Malaria_Detection/run.sh
Lines changed: 20 additions & 0 deletions
diff --git a/‎examples/healthcare/Malaria_Detection/train_cnn.py
Lines changed: 294 additions & 0 deletions b/‎examples/healthcare/Malaria_Detection/train_cnn.py
Lines changed: 294 additions & 0 deletions
diff --git a/‎examples/healthcare/application/Malaria_Detection/train_cnn.py
Lines changed: 318 additions & 0 deletions b/‎examples/healthcare/application/Malaria_Detection/train_cnn.py
Lines changed: 318 additions & 0 deletions
diff --git a/‎examples/healthcare/application/TED_CT_Detection/README.md
Lines changed: 11 additions & 0 deletions b/‎examples/healthcare/application/TED_CT_Detection/README.md
Lines changed: 11 additions & 0 deletions
diff --git a/‎examples/healthcare/application/TED_CT_Detection/model.py
Lines changed: 119 additions & 0 deletions b/‎examples/healthcare/application/TED_CT_Detection/model.py
Lines changed: 119 additions & 0 deletions
diff --git a/‎examples/healthcare/application/TED_CT_Detection/train.py
Lines changed: 191 additions & 0 deletions b/‎examples/healthcare/application/TED_CT_Detection/train.py
Lines changed: 191 additions & 0 deletions
diff --git a/‎examples/healthcare/data/bloodmnist.py
Lines changed: 240 additions & 0 deletions b/‎examples/healthcare/data/bloodmnist.py
Lines changed: 240 additions & 0 deletions
diff --git a/‎examples/healthcare/data/malaria.py
Lines changed: 122 additions & 0 deletions b/‎examples/healthcare/data/malaria.py
Lines changed: 122 additions & 0 deletions
diff --git a/‎examples/healthcare/models/malaria_net.py
Lines changed: 146 additions & 0 deletions b/‎examples/healthcare/models/malaria_net.py
Lines changed: 146 additions & 0 deletions
diff --git a/‎examples/malaria_cnn/train_cnn.py
Lines changed: 294 additions & 0 deletions b/‎examples/malaria_cnn/train_cnn.py
Lines changed: 294 additions & 0 deletions
diff --git a/‎examples/msmodel_mlp/native.py
Lines changed: 135 additions & 0 deletions b/‎examples/msmodel_mlp/native.py
Lines changed: 135 additions & 0 deletions
diff --git a/‎examples/trans/README.md
Lines changed: 7 additions & 2 deletions b/‎examples/trans/README.md
Lines changed: 7 additions & 2 deletions
diff --git a/‎examples/trans/data.py
Lines changed: 2 additions & 2 deletions b/‎examples/trans/data.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/trans/run.sh
Lines changed: 1 addition & 1 deletion b/‎examples/trans/run.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/trans/train.py
Lines changed: 2 additions & 3 deletions b/‎examples/trans/train.py
Lines changed: 2 additions & 3 deletions
@@ -32,6 +32,7 @@
 
 singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
 
+
 #### self-defined loss begin
 
 ### from autograd.py
@@ -62,11 +63,13 @@ def backward(self, dy=1.0):
         dx *= dy
         return dx
 
+
 def se_loss(x):
     # assert x.shape == t.shape, "input and target shape different: %s, %s" % (
     #     x.shape, t.shape)
     return SumError()(x)[0]
 
+
 ### from layer.py
 class SumErrorLayer(Layer):
     """
@@ -79,6 +82,7 @@ def __init__(self):
     def forward(self, x):
         return se_loss(x)
 
+
 #### self-defined loss end
 
 class MSMLP(model.Model):
@@ -92,7 +96,6 @@ def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
         self.linear1 = layer.Linear(perceptron_size)
         self.linear2 = layer.Linear(num_classes)
         self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
-
         self.sum_error = SumErrorLayer()
 
     def forward(self, inputs):
@@ -101,12 +104,24 @@ def forward(self, inputs):
         y = self.linear2(y)
         return y
 
-    def train_one_batch(self, x, y, synflow_flag, dist_option, spars):
+    def train_one_batch(self, x, y, dist_option, spars, synflow_flag):
+        # print ("in train_one_batch")
         out = self.forward(x)
-        loss = self.softmax_cross_entropy(out, y)
+        # print ("train_one_batch x.data: \n", x.data)
+        # print ("train_one_batch y.data: \n", y.data)
+        # print ("train_one_batch out.data: \n", out.data)
+        if synflow_flag:
+            # print ("sum_error")
+            loss = self.sum_error(out)
+        else:  # normal training
+            # print ("softmax_cross_entropy")
+            loss = self.softmax_cross_entropy(out, y)
+        # print ("train_one_batch loss.data: \n", loss.data)
 
         if dist_option == 'plain':
+            # print ("before pn_p_g_list = self.optimizer(loss)")
             pn_p_g_list = self.optimizer(loss)
+            # print ("after pn_p_g_list = self.optimizer(loss)")
         elif dist_option == 'half':
             self.optimizer.backward_and_update_half(loss)
         elif dist_option == 'partialUpdate':
@@ -119,17 +134,24 @@ def train_one_batch(self, x, y, synflow_flag, dist_option, spars):
             self.optimizer.backward_and_sparse_update(loss,
                                                       topK=False,
                                                       spars=spars)
+        # print ("len(pn_p_g_list): \n", len(pn_p_g_list))
+        # print ("len(pn_p_g_list[0]): \n", len(pn_p_g_list[0]))
+        # print ("pn_p_g_list[0][0]: \n", pn_p_g_list[0][0])
+        # print ("pn_p_g_list[0][1].data: \n", pn_p_g_list[0][1].data)
+        # print ("pn_p_g_list[0][2].data: \n", pn_p_g_list[0][2].data)
         return pn_p_g_list, out, loss
+        # return pn_p_g_list[0], pn_p_g_list[1], pn_p_g_list[2], out, loss
 
     def set_optimizer(self, optimizer):
         self.optimizer = optimizer
 
 
 def create_model(pretrained=False, **kwargs):
     """Constructs a CNN model.
+
     Args:
         pretrained (bool): If True, returns a pre-trained model.
-    
+
     Returns:
         The created CNN model.
     """
@@ -196,4 +218,4 @@ def create_model(pretrained=False, **kwargs):
         out, loss = model(tx, ty, 'fp32', spars=None)
 
         if i % 100 == 0:
-            print("training loss = ", tensor.to_numpy(loss)[0])
+            print("training loss = ", tensor.to_numpy(loss)[0])
@@ -0,0 +1,270 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import json
+import os
+import time
+from glob import glob
+
+import numpy as np
+from PIL import Image
+from singa import device, layer, model, opt, tensor
+from tqdm import tqdm
+
+from transforms import Compose, Normalize, ToTensor
+
+np_dtype = {"float16": np.float16, "float32": np.float32}
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+
+class ClassDataset(object):
+    """Folder-backed image classification dataset that yields NumPy batches.
+
+    Every sub-folder of ``img_folder`` is treated as one class. The sub-folder
+    name is used as the label, so it must be an integer string ("0", "1", ...).
+
+    Args:
+        img_folder (Str): Folder path of the training/validation images.
+        transforms (Transform): Preprocess transforms; must expose ``forward(img)``.
+    """
+    def __init__(self, img_folder, transforms):
+        super(ClassDataset, self).__init__()
+
+        self.img_list = list()
+        self.transforms = transforms
+
+        # os.listdir()/glob() return entries in arbitrary order; sort them so
+        # the sample order is reproducible across runs and machines.
+        for class_name in sorted(os.listdir(img_folder)):
+            pattern = os.path.join(img_folder, class_name, "*")
+            for img_path in sorted(glob(pattern)):
+                self.img_list.append((img_path, class_name))
+
+    def __len__(self) -> int:
+        """Return the number of (image path, label) samples found."""
+        return len(self.img_list)
+
+    def __getitem__(self, index: int):
+        """Load, transform and return the sample stored at ``index``.
+
+        Returns:
+            img: Output of ``transforms.forward`` for the image.
+            label (Numpy ndarray): 0-d int32 array holding the class index.
+        """
+        img_path, label_str = self.img_list[index]
+        # Image.open() is lazy and keeps the file open; run the transforms
+        # inside the context manager so the handle is released right away.
+        with Image.open(img_path) as pil_img:
+            img = self.transforms.forward(pil_img)
+        # int() gives a clear ValueError if a folder name is not numeric.
+        label = np.array(int(label_str), dtype=np.int32)
+
+        return img, label
+
+    def batchgenerator(self, indexes, batch_size, data_size):
+        """Generate batch arrays from transformed image list.
+
+        Rows beyond ``len(indexes)`` are left as zeros.
+
+        Args:
+            indexes (Sequence): current batch indexes list, e.g. [n, n + 1, ..., n + batch_size]
+            batch_size (int): number of rows allocated for the batch
+            data_size (Tuple): input image size of shape (C, H, W)
+
+        Return:
+            batch_x (Numpy ndarray): batch array of input images (B, C, H, W)
+            batch_y (Numpy ndarray): batch array of ground truth labels (B, 1)
+        """
+        batch_x = np.zeros((batch_size,) + data_size)
+        batch_y = np.zeros((batch_size,) + (1,), dtype=np.int32)
+        for row, sample_index in enumerate(indexes):
+            sample_x, sample_y = self[sample_index]
+            batch_x[row, :, :, :] = sample_x
+            batch_y[row, :] = sample_y
+
+        return batch_x, batch_y
+
+
+class CNNModel(model.Model):
+    """Convolutional classifier: five conv + batch-norm stages and three linear layers.
+
+    Max-pooling follows the second and the fifth convolution stage; the
+    flattened features then pass through two ReLU-activated linear layers and
+    a final linear layer with ``num_classes`` outputs.
+
+    Args:
+        num_classes (int): Size of the output layer.
+    """
+
+    def __init__(self, num_classes):
+        super(CNNModel, self).__init__()
+        self.input_size = 28
+        self.dimension = 4
+        self.num_classes = num_classes
+
+        # Feature extractor (creation order intentionally left unchanged).
+        self.layer1 = layer.Conv2d(16, kernel_size=3, activation="RELU")
+        self.bn1 = layer.BatchNorm2d()
+        self.layer2 = layer.Conv2d(16, kernel_size=3, activation="RELU")
+        self.bn2 = layer.BatchNorm2d()
+        self.pooling2 = layer.MaxPool2d(kernel_size=2, stride=2)
+        self.layer3 = layer.Conv2d(64, kernel_size=3, activation="RELU")
+        self.bn3 = layer.BatchNorm2d()
+        self.layer4 = layer.Conv2d(64, kernel_size=3, activation="RELU")
+        self.bn4 = layer.BatchNorm2d()
+        self.layer5 = layer.Conv2d(64, kernel_size=3, padding=1, activation="RELU")
+        self.bn5 = layer.BatchNorm2d()
+        self.pooling5 = layer.MaxPool2d(kernel_size=2, stride=2)
+
+        self.flatten = layer.Flatten()
+
+        # Classifier head.
+        self.linear1 = layer.Linear(128)
+        self.linear2 = layer.Linear(128)
+        self.linear3 = layer.Linear(self.num_classes)
+
+        self.relu = layer.ReLU()
+
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+        # Created but not applied anywhere in forward().
+        self.dropout = layer.Dropout(ratio=0.3)
+
+    def forward(self, x):
+        """Run ``x`` through every stage in order and return the class scores."""
+        stages = (
+            self.layer1, self.bn1,
+            self.layer2, self.bn2, self.pooling2,
+            self.layer3, self.bn3,
+            self.layer4, self.bn4,
+            self.layer5, self.bn5, self.pooling5,
+            self.flatten,
+            self.linear1, self.relu,
+            self.linear2, self.relu,
+            self.linear3,
+        )
+        for stage in stages:
+            x = stage(x)
+        return x
+
+    def set_optimizer(self, optimizer):
+        """Attach the optimizer used by ``train_one_batch``."""
+        self.optimizer = optimizer
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        """Forward one batch, compute the loss and update the parameters.
+
+        Args:
+            x: Input batch.
+            y: Ground-truth labels for ``x``.
+            dist_option (str): One of 'plain', 'half', 'partialUpdate',
+                'sparseTopK' or 'sparseThreshold'; any other value skips
+                the update step.
+            spars: Sparsity argument forwarded to the sparse update modes.
+
+        Returns:
+            Tuple ``(out, loss)`` with the network output and the loss.
+        """
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        if dist_option in ('sparseTopK', 'sparseThreshold'):
+            # Both sparse modes share one call and differ only in ``topK``.
+            self.optimizer.backward_and_sparse_update(
+                loss, topK=(dist_option == 'sparseTopK'), spars=spars)
+        elif dist_option == 'plain':
+            self.optimizer(loss)
+        elif dist_option == 'half':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        return out, loss
+
+
+def accuracy(pred, target):
+    """Count the correctly classified samples of a batch.
+
+    Args:
+        pred (Numpy ndarray): Prediction array, should be in shape (B, C)
+        target (Numpy ndarray): Ground truth array, in shape (B, ) or (B, 1)
+
+    Return:
+        correct (int): Number of samples whose arg-max prediction equals the
+            ground truth (divide by the sample count to get a ratio).
+    """
+    # Predicted class index of every sample.
+    predicted = np.argmax(pred, axis=1)
+    # Flatten the labels so that (B,) and (B, 1) targets are both compared
+    # element-wise; comparing a (B, 1) column against a (B,) vector would
+    # broadcast to a (B, B) matrix and overcount the matches.
+    expected = np.asarray(target).reshape(-1)
+    correct = (predicted == expected).sum()
+    return correct
+
+
+# Define pre-processing methods (transforms)
+transforms = Compose([
+    ToTensor(),
+    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+])
+
+# Dataset loading
+dataset_path = "./bloodmnist"
+train_path = os.path.join(dataset_path, "train")
+val_path = os.path.join(dataset_path, "val")
+cfg_path = os.path.join(dataset_path, "param.json")
+
+with open(cfg_path, 'r') as load_f:
+    num_class = json.load(load_f)["num_classes"]
+
+train_dataset = ClassDataset(train_path, transforms)
+val_dataset = ClassDataset(val_path, transforms)
+
+batch_size = 256
+
+# Model configuration for CNN
+model = CNNModel(num_classes=num_class)
+# Not used by the loops below: the model computes its own loss.
+criterion = layer.SoftMaxCrossEntropy()
+optimizer_ft = opt.Adam(lr=1e-3)
+
+# Start training
+dev = device.create_cpu_device()
+dev.SetRandSeed(0)
+np.random.seed(0)
+
+data_size = (3, model.input_size, model.input_size)
+tx = tensor.Tensor((batch_size,) + data_size, dev, singa_dtype['float32'])
+ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+
+# Only full batches are used; a trailing partial batch is dropped.
+num_train_batch = len(train_dataset) // batch_size
+num_val_batch = len(val_dataset) // batch_size
+idx = np.arange(len(train_dataset), dtype=np.int32)
+val_idx = np.arange(len(val_dataset), dtype=np.int32)
+
+model.set_optimizer(optimizer_ft)
+model.compile([tx], is_train=True, use_graph=False, sequential=False)
+dev.SetVerbosity(0)
+
+max_epoch = 100
+for epoch in range(max_epoch):
+    print(f'Epoch {epoch}:')
+
+    start_time = time.time()
+
+    # Plain Python scalars keep the '%f' formatting below well defined.
+    train_correct = 0
+    test_correct = 0
+    train_loss = 0.0
+
+    # The dataset lists samples class by class, so reshuffle every epoch;
+    # otherwise each batch would contain (almost) a single class.
+    np.random.shuffle(idx)
+
+    # Training part
+    model.train()
+    for b in tqdm(range(num_train_batch)):
+        # Extract batch from image list
+        x, y = train_dataset.batchgenerator(
+            idx[b * batch_size:(b + 1) * batch_size],
+            batch_size=batch_size, data_size=data_size)
+        x = x.astype(np_dtype['float32'])
+
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+
+        out, loss = model(tx, ty, dist_option="plain", spars=None)
+        train_correct += accuracy(tensor.to_numpy(out), y)
+        train_loss += float(tensor.to_numpy(loss)[0])
+    print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   max(1, num_train_batch * batch_size)))
+
+    # Validation part: batches must come from the validation split.
+    model.eval()
+    for b in tqdm(range(num_val_batch)):
+        x, y = val_dataset.batchgenerator(
+            val_idx[b * batch_size:(b + 1) * batch_size],
+            batch_size=batch_size, data_size=data_size)
+        x = x.astype(np_dtype['float32'])
+
+        tx.copy_from_numpy(x)
+
+        out = model(tx)
+        test_correct += accuracy(tensor.to_numpy(out), y)
+
+    print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / max(1, num_val_batch * batch_size),
+                   time.time() - start_time))
@@ -0,0 +1,45 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+# CNN demo model on BloodMnist dataset
+
+## About dataset
+Download address: https://drive.google.com/drive/folders/1Ze9qri1UtAsIRoI0SJ4YRpdt5kUUMBEn?usp=sharing
+
+BloodMNIST, a subset of [MedMNIST](https://medmnist.com/), is based on a dataset of individual normal cells, captured from individuals without infection, hematologic or oncologic disease and free of any pharmacologic treatment at the moment of blood collection.
+It contains a total of 17,092 images and is organized into 8 classes. 
+It is split with a ratio of 7:1:2 into training, validation and test sets.
+The source images with resolution 3×360×363 pixels are center-cropped into 3×200×200, and then resized into 3×28×28.
+
+8 classes of the dataset: 
+```python
+"0": "basophil",
+"1": "eosinophil",
+"2": "erythroblast",
+"3": "ig (immature granulocytes)",
+"4": "lymphocyte",
+"5": "monocyte",
+"6": "neutrophil",
+"7": "platelet"
+```
+
+# Run the demo
+Run
+```
+python ClassDemo.py
+```
@@ -0,0 +1,166 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+
+import numpy as np
+from PIL import Image
+
+
+class Compose(object):
+    """Chain several transforms into one pipeline applied in list order.
+
+    Args:
+        transforms: list of transforms to compose; each one must provide
+            a ``forward(img)`` method.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.ToTensor(),
+        >>>     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        >>> ])
+
+    """
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or numpy array): Image to be processed.
+
+        Returns:
+            PIL Image or numpy array: Output of the last transform.
+        """
+        result = img
+        for step in self.transforms:
+            result = step.forward(result)
+        return result
+
+    def __repr__(self):
+        # One indented line per transform, wrapped in "ClassName(" ... ")".
+        body = ''.join('\n    {0}'.format(step) for step in self.transforms)
+        return self.__class__.__name__ + '(' + body + '\n)'
+
+
+class ToTensor(object):
+    """Convert a ``PIL Image`` to ``numpy.ndarray``.
+
+    Converts a PIL Image (H x W x C) in the range [0, 255] to a float64
+    ``numpy.array`` of shape (C x H x W) in the range [0.0, 1.0] when the
+    pixel data is stored as 8-bit unsigned integers. Single-channel images
+    (H x W) get a leading channel axis, i.e. shape (1 x H x W).
+
+    In the other cases (modes ``I``, ``I;16``, ``F``), arrays are returned
+    as float64 without scaling.
+
+    .. note::
+        Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
+        transforming target image masks.
+    """
+
+    def forward(self, pic):
+        """
+        Args:
+            pic (PIL Image): Image to be converted to array.
+
+        Returns:
+            Array: Converted image, float64, channel-first.
+
+        Raises:
+            TypeError: If ``pic`` is not a PIL Image.
+        """
+        if not isinstance(pic, Image.Image):
+            raise TypeError('pic should be PIL Image. Got {}'.format(type(pic)))
+
+        # Handle PIL Image
+        mode_to_nptype = {'I': np.int32, 'I;16': np.int16, 'F': np.float32}
+        img = np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)
+
+        if pic.mode == '1':
+            # Bilevel images hold 0/1; stretch them to the 0/255 range.
+            img = 255 * img
+
+        # Single-channel modes produce an (H, W) array; add the channel axis
+        # so the transpose below works for them as well.
+        if img.ndim == 2:
+            img = img[:, :, None]
+
+        # Put it from HWC to CHW format
+        img = np.transpose(img, (2, 0, 1))
+
+        # ``np.float`` was removed in NumPy 1.24; it was an alias of the
+        # builtin float, so float64 is the equivalent explicit dtype.
+        if img.dtype == np.uint8:
+            return (np.float32(img) / 255.0).astype(np.float64)
+        return img.astype(np.float64)
+
+    def __repr__(self):
+        return self.__class__.__name__ + '()'
+
+
+class Normalize(object):
+    """Normalize a ``numpy.array`` image with mean and standard deviation.
+
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``numpy.array`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        By default this transform acts out of place, i.e., it does not mutate
+        the input array. With ``inplace=True`` the input array is overwritten
+        and returned.
+
+    Args:
+        mean (Sequence): Sequence of means for each channel.
+        std (Sequence): Sequence of standard deviations for each channel.
+        inplace(bool, optional): Bool to make this operation in-place.
+
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, img: np.ndarray):
+        """
+        Args:
+            img (Numpy ndarray): Float array image of size (..., C, H, W).
+
+        Returns:
+            d_res (Numpy ndarray): Normalized image with the dtype of ``img``.
+
+        Raises:
+            TypeError: If ``img`` is not a floating-point numpy array.
+            ValueError: If ``img`` has fewer than 3 dimensions or a std is zero.
+        """
+        if not isinstance(img, np.ndarray):
+            raise TypeError('Input img should be a numpy array. Got {}.'.format(type(img)))
+
+        # ``np.float`` no longer exists in NumPy >= 1.24; accept every
+        # floating dtype (float64 included) instead.
+        if not np.issubdtype(img.dtype, np.floating):
+            raise TypeError('Input array should be a float array. Got {}.'.format(img.dtype))
+
+        if img.ndim < 3:
+            raise ValueError('Expected array to be an array image of size (..., C, H, W). Got img.shape = '
+                            '{}.'.format(img.shape))
+
+        dtype = img.dtype
+        mean = np.array(self.mean, dtype=dtype)
+        std = np.array(self.std, dtype=dtype)
+        if (std == 0).any():
+            raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
+
+        # Write into the caller's buffer only when in-place was requested;
+        # otherwise np.subtract allocates the result, so no extra copy is needed.
+        target = img if self.inplace else None
+        d_res = np.subtract(img, mean[:, None, None], out=target)
+        np.divide(d_res, std[:, None, None], out=d_res)
+
+        return d_res
+
+    def __repr__(self):
+        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
+
+
@@ -0,0 +1,44 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+
+# Singa for Malaria Detection Task
+
+## Malaria
+
+Malaria is caused by parasites and can be transmitted through infected mosquitoes. There are about 200 million cases worldwide and about 400,000 deaths per year, which makes malaria a major threat to global health.
+
+Although malaria is a curable disease, inadequate diagnostics make it harder to reduce mortality. As a result, a fast and reliable diagnostic test is a promising and effective way to fight malaria.
+
+To mitigate the problem, we use Singa to implement a machine learning model to help with Malaria diagnosis. The dataset is from Kaggle https://www.kaggle.com/datasets/miracle9to9/files1?resource=download. Please download the dataset before running the scripts.
+
+## Structure
+
+* `data` includes the scripts for preprocessing Malaria image datasets.
+
+* `model` includes the CNN model construction code, which creates
+  a subclass of `Model` to wrap the neural network operations
+  of each model.
+
+* `train_cnn.py` is the training script, which controls the training flow by
+  doing BackPropagation and SGD update.
+
+## Command
+```bash
+python train_cnn.py cnn malaria -dir pathToDataset
+```
@@ -0,0 +1,122 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+try:
+    import pickle
+except ImportError:
+    import cPickle as pickle
+
+import numpy as np
+import os
+import sys
+from PIL import Image
+
+
+# need to save to specific local directories
+def _load_split(dir_path, split, resize_size):
+    """Load every image of one dataset split into channel-first arrays.
+
+    Args:
+        dir_path: Root folder of the malaria dataset.
+        split: Sub-folder name, "training_set" or "testing_set".
+        resize_size: Target image size as (height, width).
+
+    Returns:
+        Tuple ``(images, labels)``: float32 array of shape (N, 3, H, W) with
+        raw 0-255 pixel values, and int32 array of shape (N,) where
+        0 = Uninfected and 1 = Parasitized. Uninfected samples come first.
+    """
+    dir_path = check_dataset_exist(dirpath=dir_path)
+    height, width = resize_size
+
+    # Label index == position in this tuple.
+    class_dirs = (os.path.join(dir_path, split, "Uninfected"),
+                  os.path.join(dir_path, split, "Parasitized"))
+    samples = []
+    for label, class_dir in enumerate(class_dirs):
+        # Sort for a reproducible sample order (os.listdir is unordered).
+        for file_name in sorted(load_image_path(os.listdir(class_dir))):
+            samples.append((os.path.join(class_dir, file_name), label))
+
+    images = np.empty((len(samples), 3, height, width), dtype=np.uint8)
+    for i, (image_path, _) in enumerate(samples):
+        # Close each file as soon as its pixels are read.
+        with Image.open(image_path) as raw_image:
+            # PIL expects (width, height) while the buffer is laid out as
+            # (height, width); swap so non-square sizes work too.
+            resized = raw_image.resize((width, height)).convert("RGB")
+        images[i] = np.array(resized).transpose(2, 0, 1)
+
+    images = np.array(images, dtype=np.float32)
+    labels = np.array([label for _, label in samples], dtype=np.int32)
+    return images, labels
+
+
+def load_train_data(dir_path="/tmp/malaria", resize_size=(128, 128)):
+    """Load the ``training_set`` split; see ``_load_split`` for the format."""
+    return _load_split(dir_path, "training_set", resize_size)
+
+
+# need to save to specific local directories
+def load_test_data(dir_path='/tmp/malaria', resize_size=(128, 128)):
+    """Load the ``testing_set`` split; see ``_load_split`` for the format."""
+    return _load_split(dir_path, "testing_set", resize_size)
+
+
+def load_image_path(list):
+    """Return the entries of ``list`` that end with ``.png`` or ``.jpg``.
+
+    The match is case-sensitive and the input order is preserved.
+    """
+    # The parameter name shadows the builtin but is kept for existing callers.
+    return [name for name in list if name.endswith((".png", ".jpg"))]
+
+
+def check_dataset_exist(dirpath):
+    """Return ``dirpath`` if it exists, otherwise abort the program.
+
+    Args:
+        dirpath: Path of the dataset root folder.
+
+    Returns:
+        The unchanged ``dirpath``.
+
+    Raises:
+        SystemExit: If ``dirpath`` does not exist.
+    """
+    if not os.path.exists(dirpath):
+        # A missing dataset is a failure: passing the message to sys.exit()
+        # prints it to stderr and exits with status 1 instead of 0.
+        sys.exit('Please download the malaria dataset first')
+    return dirpath
+
+
+def normalize(train_x, val_x):
+    """Scale pixels to [0, 1] and standardize every colour channel in place.
+
+    Args:
+        train_x: Float array of shape (N, 3, H, W) with 0-255 pixel values.
+        val_x: Float array of shape (M, 3, H, W) with 0-255 pixel values.
+
+    Returns:
+        The same two arrays, modified in place.
+    """
+    mean = [0.5339, 0.4180, 0.4460]  # mean for malaria dataset
+    std = [0.3329, 0.2637, 0.2761]  # std for malaria dataset
+    train_x /= 255
+    val_x /= 255
+    # Iterate over all three channels; the previous range(0, 2) left the
+    # last channel un-normalized.
+    for ch, (ch_mean, ch_std) in enumerate(zip(mean, std)):
+        for data in (train_x, val_x):
+            data[:, ch, :, :] -= ch_mean
+            data[:, ch, :, :] /= ch_std
+    return train_x, val_x
+
+
+def load(dir_path):
+    """Load and normalize the malaria training and testing splits.
+
+    Returns:
+        Tuple ``(train_x, train_y, val_x, val_y)`` with normalized images
+        and flattened label arrays.
+    """
+    train_x, train_y = load_train_data(dir_path=dir_path)
+    val_x, val_y = load_test_data(dir_path=dir_path)
+    train_x, val_x = normalize(train_x, val_x)
+    return train_x, train_y.flatten(), val_x, val_y.flatten()
@@ -0,0 +1,94 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from singa import layer
+from singa import model
+
+
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.num_classes = num_classes
+        self.input_size = 128
+        self.dimension = 4
+        self.conv1 = layer.Conv2d(num_channels, 32, 3, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(32, 64, 3, padding=0, activation="RELU")
+        self.conv3 = layer.Conv2d(64, 64, 3, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(128)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling3 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+        self.sigmoid = layer
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.pooling2(y)
+        y = self.conv3(y)
+        y = self.pooling3(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        if dist_option == 'plain':
+            self.optimizer(loss)
+        elif dist_option == 'half':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+
+def create_model(**kwargs):
+    """Constructs a CNN model.
+
+    Args:
+        pretrained (bool): If True, returns a pre-trained model.
+
+    Returns:
+        The created CNN model.
+    """
+    model = CNN(**kwargs)
+
+    return model
+
+
+# Names exported by a star-import of this module.
+__all__ = ['CNN', 'create_model']
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from singa import layer
+from singa import model
+from singa import tensor
+from singa import opt
+from singa import device
+import argparse
+import numpy as np
+
+# Lookup tables from a precision name to the matching NumPy / SINGA dtype.
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+
+class MLP(model.Model):
+
+    def __init__(self, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+        self.num_classes = num_classes
+        self.dimension = 2
+
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, inputs):
+        y = self.linear1(inputs)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        if dist_option == 'plain':
+            self.optimizer(loss)
+        elif dist_option == 'half':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+
+def create_model(**kwargs):
+    """Constructs a CNN model.
+
+    Returns:
+        The created CNN model.
+    """
+    model = MLP(**kwargs)
+
+    return model
+
+
+# Names exported by a star-import of this module.
+__all__ = ['MLP', 'create_model']
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+### malaria dataset
+python train_cnn.py cnn malaria -dir pathToDataset
@@ -0,0 +1,294 @@
+from singa import singa_wrap as singa
+from singa import device
+from singa import tensor
+from singa import opt
+import numpy as np
+import time
+import argparse
+import sys
+from PIL import Image
+
+# Lookup tables from a precision name to the matching NumPy / SINGA dtype.
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+
+# Data augmentation
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :,
+                               offset[0]:offset[0] + x.shape[2],
+                               offset[1]:offset[1] + x.shape[2]]
+        if_flip = np.random.randint(2)
+        if (if_flip):
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+# Calculate accuracy
+def accuracy(pred, target):
+    # y is network output to be compared with ground truth (int)
+    y = np.argmax(pred, axis=1)
+    a = y == target
+    correct = np.array(a, "int").sum()
+    return correct
+
+
+# Data partition according to the rank
+def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
+    # Partition training data
+    data_per_rank = train_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    train_x = train_x[idx_start:idx_end]
+    train_y = train_y[idx_start:idx_end]
+
+    # Partition evaluation data
+    data_per_rank = val_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    val_x = val_x[idx_start:idx_end]
+    val_y = val_y[idx_start:idx_end]
+    return train_x, train_y, val_x, val_y
+
+
+# Function to all reduce NUMPY accuracy and loss from multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+def resize_dataset(x, image_size):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, image_size, image_size),
+                 dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
+                (image_size, image_size), Image.BILINEAR),
+                dtype=np.float32)
+    return X
+
+
+def run(global_rank,
+        world_size,
+        dir_path,
+        max_epoch,
+        batch_size,
+        model,
+        data,
+        sgd,
+        graph,
+        verbosity,
+        dist_option='plain',
+        spars=None,
+        precision='float32'):
+    # now CPU version only, could change to GPU device for GPU-support machines
+    dev = device.get_default_device()
+    dev.SetRandSeed(0)
+    np.random.seed(0)
+    if data == 'malaria':
+        from data import malaria
+        train_x, train_y, val_x, val_y = malaria.load(dir_path=dir_path)
+    else:
+        print(
+            'Wrong dataset!'
+        )
+        sys.exit(0)
+
+    num_channels = train_x.shape[1]
+    image_size = train_x.shape[2]
+    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
+    num_classes = (np.max(train_y) + 1).item()
+
+    if model == 'cnn':
+        from model import cnn
+        model = cnn.create_model(num_channels=num_channels,
+                                 num_classes=num_classes)
+    else:
+        print(
+            'Wrong model!'
+        )
+        sys.exit(0)
+
+    # For distributed training, sequential has better performance
+    if hasattr(sgd, "communicator"):
+        DIST = True
+        sequential = True
+    else:
+        DIST = False
+        sequential = False
+
+    if DIST:
+        train_x, train_y, val_x, val_y = partition(global_rank, world_size,
+                                                   train_x, train_y, val_x,
+                                                   val_y)
+
+    if model.dimension == 4:
+        tx = tensor.Tensor(
+            (batch_size, num_channels, model.input_size, model.input_size), dev,
+            singa_dtype[precision])
+    elif model.dimension == 2:
+        tx = tensor.Tensor((batch_size, data_size),
+                           dev, singa_dtype[precision])
+        np.reshape(train_x, (train_x.shape[0], -1))
+        np.reshape(val_x, (val_x.shape[0], -1))
+
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_val_batch = val_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    # Attach model to graph
+    model.set_optimizer(sgd)
+    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
+    dev.SetVerbosity(verbosity)
+
+    # Training and evaluation loop
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if global_rank == 0:
+            print('Starting Epoch %d:' % (epoch))
+
+        # Training phase
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        model.train()
+        for b in range(num_train_batch):
+            # if b % 100 == 0:
+            #     print ("b: \n", b)
+            # Generate the patch data in this iteration
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            if model.dimension == 4:
+                x = augmentation(x, batch_size)
+                if (image_size != model.input_size):
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+
+            # Copy the patch data into input tensors
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+
+            # Train the model
+            out, loss = model(tx, ty, dist_option, spars)
+            train_correct += accuracy(tensor.to_numpy(out), y)
+            train_loss += tensor.to_numpy(loss)[0]
+
+        if DIST:
+            # Reduce the evaluation accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, sgd, reducer)
+            train_loss = reduce_variable(train_loss, sgd, reducer)
+
+        if global_rank == 0:
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        # Evaluation phase
+        model.eval()
+        for b in range(num_val_batch):
+            x = val_x[b * batch_size:(b + 1) * batch_size]
+            if model.dimension == 4:
+                if (image_size != model.input_size):
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = val_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+        if DIST:
+            # Reduce the evaulation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, sgd, reducer)
+
+        # Output the evaluation accuracy
+        if global_rank == 0:
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_val_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+    dev.PrintTimeProfiling()
+
+
+if __name__ == '__main__':
+    # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training
+    parser = argparse.ArgumentParser(
+        description='Training using the autograd and graph.')
+    parser.add_argument(
+        'model',
+        choices=['cnn'],
+        default='cnn')
+    parser.add_argument('data',
+                        choices=['malaria'],
+                        default='malaria')
+    parser.add_argument('-p',
+                        choices=['float32', 'float16'],
+                        default='float32',
+                        dest='precision')
+    parser.add_argument('-dir',
+                        '--dir-path',
+                        default="/tmp/malaria",
+                        type=str,
+                        help='the directory to store the malaria dataset',
+                        dest='dir_path')
+    parser.add_argument('-m',
+                        '--max-epoch',
+                        default=100,
+                        type=int,
+                        help='maximum epochs',
+                        dest='max_epoch')
+    parser.add_argument('-b',
+                        '--batch-size',
+                        default=64,
+                        type=int,
+                        help='batch size',
+                        dest='batch_size')
+    parser.add_argument('-l',
+                        '--learning-rate',
+                        default=0.005,
+                        type=float,
+                        help='initial learning rate',
+                        dest='lr')
+    parser.add_argument('-g',
+                        '--disable-graph',
+                        default='True',
+                        action='store_false',
+                        help='disable graph',
+                        dest='graph')
+    parser.add_argument('-v',
+                        '--log-verbosity',
+                        default=0,
+                        type=int,
+                        help='logging verbosity',
+                        dest='verbosity')
+
+    args = parser.parse_args()
+
+    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5,
+                  dtype=singa_dtype[args.precision])
+    run(0,
+        1,
+        args.dir_path,
+        args.max_epoch,
+        args.batch_size,
+        args.model,
+        args.data,
+        sgd,
+        args.graph,
+        args.verbosity,
+        precision=args.precision)
@@ -0,0 +1,318 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from singa import singa_wrap as singa
+from singa import device
+from singa import tensor
+from singa import opt
+import numpy as np
+import time
+import argparse
+import sys
+sys.path.append("../../..")
+
+from PIL import Image
+
+from healthcare.data import malaria
+from healthcare.models import malaria_net
+
+# Lookup tables from a precision name to the matching NumPy / SINGA dtype.
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+
+# Data augmentation
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :,
+                               offset[0]:offset[0] + x.shape[2],
+                               offset[1]:offset[1] + x.shape[2]]
+        if_flip = np.random.randint(2)
+        if (if_flip):
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+# Calculate accuracy
+def accuracy(pred, target):
+    # y is network output to be compared with ground truth (int)
+    y = np.argmax(pred, axis=1)
+    a = y == target
+    correct = np.array(a, "int").sum()
+    return correct
+
+
+# Data partition according to the rank
+def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
+    # Partition training data
+    data_per_rank = train_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    train_x = train_x[idx_start:idx_end]
+    train_y = train_y[idx_start:idx_end]
+
+    # Partition evaluation data
+    data_per_rank = val_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    val_x = val_x[idx_start:idx_end]
+    val_y = val_y[idx_start:idx_end]
+    return train_x, train_y, val_x, val_y
+
+
+# Function to all reduce NUMPY accuracy and loss from multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+def resize_dataset(x, image_size):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, image_size, image_size),
+                 dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
+                (image_size, image_size), Image.BILINEAR),
+                dtype=np.float32)
+    return X
+
+
+def run(global_rank,
+        world_size,
+        dir_path,
+        max_epoch,
+        batch_size,
+        model,
+        data,
+        sgd,
+        graph,
+        verbosity,
+        dist_option='plain',
+        spars=None,
+        precision='float32'):
+    # now CPU version only, could change to GPU device for GPU-support machines
+    dev = device.get_default_device()
+    dev.SetRandSeed(0)
+    np.random.seed(0)
+    if data == 'malaria':
+
+        train_x, train_y, val_x, val_y = malaria.load(dir_path=dir_path)
+    else:
+        print(
+            'Wrong dataset!'
+        )
+        sys.exit(0)
+
+    num_channels = train_x.shape[1]
+    image_size = train_x.shape[2]
+    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
+    num_classes = (np.max(train_y) + 1).item()
+
+    if model == 'cnn':
+        model = malaria_net.create_model(model_option='cnn', num_channels=num_channels,
+                                         num_classes=num_classes)
+    else:
+        print(
+            'Wrong model!'
+        )
+        sys.exit(0)
+
+    # For distributed training, sequential has better performance
+    if hasattr(sgd, "communicator"):
+        DIST = True
+        sequential = True
+    else:
+        DIST = False
+        sequential = False
+
+    if DIST:
+        train_x, train_y, val_x, val_y = partition(global_rank, world_size,
+                                                   train_x, train_y, val_x,
+                                                   val_y)
+
+    if model.dimension == 4:
+        tx = tensor.Tensor(
+            (batch_size, num_channels, model.input_size, model.input_size), dev,
+            singa_dtype[precision])
+    elif model.dimension == 2:
+        tx = tensor.Tensor((batch_size, data_size),
+                           dev, singa_dtype[precision])
+        np.reshape(train_x, (train_x.shape[0], -1))
+        np.reshape(val_x, (val_x.shape[0], -1))
+
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_val_batch = val_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    # Attach model to graph
+    model.set_optimizer(sgd)
+    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
+    dev.SetVerbosity(verbosity)
+
+    # Training and evaluation loop
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if global_rank == 0:
+            print('Starting Epoch %d:' % (epoch))
+
+        # Training phase
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        model.train()
+        for b in range(num_train_batch):
+            # if b % 100 == 0:
+            #     print ("b: \n", b)
+            # Generate the patch data in this iteration
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            if model.dimension == 4:
+                x = augmentation(x, batch_size)
+                if (image_size != model.input_size):
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+
+            # Copy the patch data into input tensors
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+
+            # Train the model
+            out, loss = model(tx, ty, dist_option, spars)
+            train_correct += accuracy(tensor.to_numpy(out), y)
+            train_loss += tensor.to_numpy(loss)[0]
+
+            # print('batch training loss = %f' % train_loss, flush=True)
+
+        if DIST:
+            # Reduce the evaluation accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, sgd, reducer)
+            train_loss = reduce_variable(train_loss, sgd, reducer)
+
+        if global_rank == 0:
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        # Evaluation phase
+        model.eval()
+        for b in range(num_val_batch):
+            x = val_x[b * batch_size:(b + 1) * batch_size]
+            if model.dimension == 4:
+                if (image_size != model.input_size):
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = val_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+        if DIST:
+            # Reduce the evaulation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, sgd, reducer)
+
+        # Output the evaluation accuracy
+        if global_rank == 0:
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_val_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+    dev.PrintTimeProfiling()
+
+
+if __name__ == '__main__':
+    # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training
+    parser = argparse.ArgumentParser(
+        description='Training using the autograd and graph.')
+    parser.add_argument(
+        'model',
+        choices=['cnn'],
+        default='cnn')
+    parser.add_argument('data',
+                        choices=['malaria'],
+                        default='malaria')
+    parser.add_argument('-p',
+                        choices=['float32', 'float16'],
+                        default='float32',
+                        dest='precision')
+    parser.add_argument('-dir',
+                        '--dir-path',
+                        default="/tmp/malaria",
+                        type=str,
+                        help='the directory to store the malaria dataset',
+                        dest='dir_path')
+    parser.add_argument('-m',
+                        '--max-epoch',
+                        default=100,
+                        type=int,
+                        help='maximum epochs',
+                        dest='max_epoch')
+    parser.add_argument('-b',
+                        '--batch-size',
+                        default=64,
+                        type=int,
+                        help='batch size',
+                        dest='batch_size')
+    parser.add_argument('-l',
+                        '--learning-rate',
+                        default=0.005,
+                        type=float,
+                        help='initial learning rate',
+                        dest='lr')
+    parser.add_argument('-g',
+                        '--disable-graph',
+                        default='True',
+                        action='store_false',
+                        help='disable graph',
+                        dest='graph')
+    parser.add_argument('-v',
+                        '--log-verbosity',
+                        default=0,
+                        type=int,
+                        help='logging verbosity',
+                        dest='verbosity')
+
+    args = parser.parse_args()
+
+    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5,
+                  dtype=singa_dtype[args.precision])
+    run(0,
+        1,
+        args.dir_path,
+        args.max_epoch,
+        args.batch_size,
+        args.model,
+        args.data,
+        sgd,
+        args.graph,
+        args.verbosity,
+        precision=args.precision);
@@ -0,0 +1,11 @@
+# Convolutional Prototype Learning
+
+We have successfully applied the idea of prototype loss to various medical image classification tasks to improve performance, for example, detecting thyroid eye disease from CT images. Here we provide the implementation of the convolutional prototype model in Singa. Due to data privacy, we are not able to release the CT image dataset used. The training script `./train.py` demonstrates how to apply this model to the CIFAR-10 dataset.
+
+## run
+
+From the Singa project root directory, run `python examples/healthcare/application/TED_CT_Detection/train.py`.
+
+## reference
+
+[Robust Classification with Convolutional Prototype Learning](https://arxiv.org/abs/1805.03438)
@@ -0,0 +1,119 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from singa import layer
+from singa import model
+import singa.tensor as tensor
+from singa import autograd
+from singa.tensor import Tensor
+
+
+class CPLayer(layer.Layer):
+    """Prototype layer: scores features by their distance to learned prototypes.
+
+    The forward pass assembles ``|f|^2 + |p|^2 - 2 f.p`` for each feature
+    row ``f`` and prototype column ``p`` and returns that value multiplied
+    by ``-1 / temp``.
+    """
+
+    def __init__(self, prototype_count=2, temp=10.0):
+        """Store the number of prototypes and the temperature divisor."""
+        super(CPLayer, self).__init__()
+        self.prototype_count = prototype_count
+        self.temp = temp
+
+    def initialize(self, x):
+        """Create the prototype tensor from the shape and device of ``x``."""
+        # The feature dimension is taken from axis 1 of the input.
+        self.feature_dim = x.shape[1]
+        # One column per prototype, randomly initialised on the input's device.
+        self.prototype = tensor.random(
+            (self.feature_dim, self.prototype_count), device=x.device
+        )
+
+    def forward(self, feat):
+        """Return the logits ``-dist / temp`` for the feature batch ``feat``."""
+        self.device_check(feat, self.prototype)
+        self.dtype_check(feat, self.prototype)
+
+        # |f|^2 for every row of ``feat``, repeated along axis 1.
+        # NOTE(review): the repeat count is feature_dim, while cross_term below
+        # has prototype_count columns -- confirm the shapes agree when
+        # feature_dim != prototype_count.
+        feat_sq = autograd.mul(feat, feat)
+        feat_sq_sum = autograd.reduce_sum(feat_sq, axes=[1], keepdims=1)
+        feat_sq_sum_tile = autograd.tile(feat_sq_sum, repeats=[1, self.feature_dim])
+
+        # |p|^2 for every prototype column, repeated once per row of ``feat``.
+        prototype_sq = autograd.mul(self.prototype, self.prototype)
+        prototype_sq_sum = autograd.reduce_sum(prototype_sq, axes=[0], keepdims=1)
+        prototype_sq_sum_tile = autograd.tile(prototype_sq_sum, repeats=feat.shape[0])
+
+        # -2 * f.p, with the constant held in a tensor that needs no gradient.
+        cross_term = autograd.matmul(feat, self.prototype)
+        cross_term_scale = Tensor(
+            shape=cross_term.shape, device=cross_term.device, requires_grad=False
+        ).set_value(-2)
+        cross_term_scaled = autograd.mul(cross_term, cross_term_scale)
+
+        # Sum of the three terms.
+        dist = autograd.add(feat_sq_sum_tile, prototype_sq_sum_tile)
+        dist = autograd.add(dist, cross_term_scaled)
+
+        # Constant -1/temp coefficient, so a smaller distance gives a larger logit.
+        logits_coeff = (
+            tensor.ones((feat.shape[0], self.prototype.shape[1]), device=feat.device)
+            * -1.0
+            / self.temp
+        )
+        logits_coeff.requires_grad = False
+        logits = autograd.mul(logits_coeff, dist)
+
+        return logits
+
+    def get_params(self):
+        """Return the prototype tensor in a dict keyed by its name."""
+        return {self.prototype.name: self.prototype}
+
+    def set_params(self, parameters):
+        """Copy the prototype values from ``parameters`` (keyed by tensor name)."""
+        self.prototype.copy_from(parameters[self.prototype.name])
+
+
+class CPL(model.Model):
+
+    def __init__(
+        self,
+        backbone: model.Model,
+        prototype_count=2,
+        lamb=0.5,
+        temp=10,
+        label=None,
+        prototype_weight=None,
+    ):
+        super(CPL, self).__init__()
+        # config
+        self.lamb = lamb
+        self.prototype_weight = prototype_weight
+        self.prototype_label = label
+
+        # layer
+        self.backbone = backbone
+        self.cplayer = CPLayer(prototype_count=prototype_count, temp=temp)
+        # optimizer
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, x):
+        feat = self.backbone.forward(x)
+        logits = self.cplayer(feat)
+        return logits
+
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+
+def create_model(backbone, prototype_count=2, lamb=0.5, temp=10.0):
+    model = CPL(backbone, prototype_count=prototype_count, lamb=lamb, temp=temp)
+    return model
+
+
+# Names exported by a star-import of this module.
+__all__ = ["CPL", "create_model"]
@@ -0,0 +1,191 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from singa import device
+from singa import opt
+from singa import tensor
+import argparse
+import numpy as np
+import time
+from PIL import Image
+
+import sys
+
+sys.path.append(".")
+print(sys.path)
+
+import examples.cnn.model.cnn as cnn
+from examples.cnn.data import cifar10
+import model as cpl
+
+
+def accuracy(pred, target):
+    # y is network output to be compared with ground truth (int)
+    y = np.argmax(pred, axis=1)
+    a = y == target
+    correct = np.array(a, "int").sum()
+    return correct
+
+
+def resize_dataset(x, image_size):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, image_size, image_size), dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(
+                Image.fromarray(x[n, d, :, :]).resize(
+                    (image_size, image_size), Image.BILINEAR
+                ),
+                dtype=np.float32,
+            )
+    return X
+
+
+def run(
+    local_rank,
+    max_epoch,
+    batch_size,
+    sgd,
+    graph,
+    verbosity,
+    dist_option="plain",
+    spars=None,
+):
+    dev = device.create_cuda_gpu_on(local_rank)
+    dev.SetRandSeed(0)
+    np.random.seed(0)
+
+    train_x, train_y, val_x, val_y = cifar10.load()
+
+    num_channels = train_x.shape[1]
+    data_size = np.prod(train_x.shape[1 : train_x.ndim]).item()
+    num_classes = (np.max(train_y) + 1).item()
+
+    backbone = cnn.create_model(num_channels=num_channels, num_classes=num_classes)
+    model = cpl.create_model(backbone, prototype_count=10, lamb=0.5, temp=10)
+
+    if backbone.dimension == 4:
+        tx = tensor.Tensor(
+            (batch_size, num_channels, backbone.input_size, backbone.input_size), dev
+        )
+        train_x = resize_dataset(train_x, backbone.input_size)
+        val_x = resize_dataset(val_x, backbone.input_size)
+    elif backbone.dimension == 2:
+        tx = tensor.Tensor((batch_size, data_size), dev)
+        np.reshape(train_x, (train_x.shape[0], -1))
+        np.reshape(val_x, (val_x.shape[0], -1))
+
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_val_batch = val_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    model.set_optimizer(sgd)
+    model.compile([tx], is_train=True, use_graph=graph, sequential=True)
+    dev.SetVerbosity(verbosity)
+
+    for epoch in range(max_epoch):
+        print(f"Epoch {epoch}")
+        np.random.shuffle(idx)
+
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        model.train()
+        for b in range(num_train_batch):
+            x = train_x[idx[b * batch_size : (b + 1) * batch_size]]
+            y = train_y[idx[b * batch_size : (b + 1) * batch_size]]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+
+            out, loss = model(tx, ty, dist_option, spars)
+            train_correct += accuracy(tensor.to_numpy(out), y)
+            train_loss += tensor.to_numpy(loss)[0]
+        print(
+            "Training loss = %f, training accuracy = %f"
+            % (train_loss, train_correct / (num_train_batch * batch_size)),
+            flush=True,
+        )
+
+    model.eval()
+    for b in range(num_val_batch):
+        x = val_x[b * batch_size : (b + 1) * batch_size]
+        y = val_y[b * batch_size : (b + 1) * batch_size]
+
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+
+        out_test = model(tx, ty, dist_option="fp32", spars=None)
+        test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Train a CPL model")
+    parser.add_argument(
+        "-m",
+        "--max-epoch",
+        default=20,
+        type=int,
+        help="maximum epochs",
+        dest="max_epoch",
+    )
+    parser.add_argument(
+        "-b", "--batch-size", default=64, type=int, help="batch size", dest="batch_size"
+    )
+    parser.add_argument(
+        "-l",
+        "--learning-rate",
+        default=0.005,
+        type=float,
+        help="initial learning rate",
+        dest="lr",
+    )
+    parser.add_argument(
+        "-i",
+        "--device-id",
+        default=0,
+        type=int,
+        help="which GPU to use",
+        dest="device_id",
+    )
+    parser.add_argument(
+        "-g",
+        "--disable-graph",
+        default="True",
+        action="store_false",
+        help="disable graph",
+        dest="graph",
+    )
+    parser.add_argument(
+        "-v",
+        "--log-verbosity",
+        default=0,
+        type=int,
+        help="logging verbosity",
+        dest="verbosity",
+    )
+    args = parser.parse_args()
+    print(args)
+
+    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)
+    run(
+        args.device_id, args.max_epoch, args.batch_size, sgd, args.graph, args.verbosity
+    )
@@ -0,0 +1,240 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import os
+import json
+from glob import glob
+import numpy as np
+from PIL import Image
+
+
+class Compose(object):
+    """Compose several transforms together.
+
+    Args:
+        transforms: list of transforms to compose.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.ToTensor(),
+        >>>     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        >>> ])
+
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def forward(self, img):
+        """
+        Args:
+            img (PIL Image or numpy array): Image to be processed.
+
+        Returns:
+            PIL Image or numpy array: Processed image.
+        """
+        for t in self.transforms:
+            img = t.forward(img)
+        return img
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + '('
+        for t in self.transforms:
+            format_string += '\n'
+            format_string += '    {0}'.format(t)
+        format_string += '\n)'
+        return format_string
+
+
+class ToTensor(object):
+    """Convert a ``PIL Image`` to ``numpy.ndarray``.
+
+    Converts a PIL Image (H x W x C) in the range [0, 255] to a ``numpy.array`` of shape
+    (C x H x W) in the range [0.0, 1.0]
+    if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1).
+
+    In the other cases, tensors are returned without scaling.
+
+    .. note::
+        Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
+        transforming target image masks.
+    """
+
+    def forward(self, pic):
+        """
+        Args:
+            pic (PIL Image): Image to be converted to array.
+
+        Returns:
+            Array: Converted image.
+        """
+        if not isinstance(pic, Image.Image):
+            raise TypeError('pic should be PIL Image. Got {}'.format(type(pic)))
+
+        # Handle PIL Image
+        mode_to_nptype = {'I': np.int32, 'I;16': np.int16, 'F': np.float32}
+        img = np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)
+
+        if pic.mode == '1':
+            img = 255 * img
+
+        # Put it from HWC to CHW format
+        img = np.transpose(img, (2, 0, 1))
+
+        if img.dtype == np.uint8:
+            return np.array(np.float32(img) / 255.0, dtype=np.float)
+        else:
+            return np.float(img)
+
+    def __repr__(self):
+        return self.__class__.__name__ + '()'
+
+
+class Normalize(object):
+    """Normalize a ``numpy.array`` image with mean and standard deviation.
+
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``numpy.array`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input array.
+
+    Args:
+        mean (Sequence): Sequence of means for each channel.
+        std (Sequence): Sequence of standard deviations for each channel.
+        inplace(bool, optional): Bool to make this operation in-place.
+
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, img: np.ndarray):
+        """
+        Args:
+            img (Numpy ndarray): Array image to be normalized.
+
+        Returns:
+            d_res (Numpy ndarray): Normalized Tensor image.
+        """
+        if not isinstance(img, np.ndarray):
+            raise TypeError('Input img should be a numpy array. Got {}.'.format(type(img)))
+
+        if not img.dtype == np.float:
+            raise TypeError('Input array should be a float array. Got {}.'.format(img.dtype))
+
+        if img.ndim < 3:
+            raise ValueError('Expected array to be an array image of size (..., C, H, W). Got img.shape = '
+                             '{}.'.format(img.shape))
+
+        if not self.inplace:
+            img = img.copy()
+
+        dtype = img.dtype
+        mean = np.array(self.mean, dtype=dtype)
+        std = np.array(self.std, dtype=dtype)
+        if (std == 0).any():
+            raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
+        s_res = np.subtract(img, mean[:, None, None])
+        d_res = np.divide(s_res, std[:, None, None])
+
+        return d_res
+
+    def __repr__(self):
+        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
+
+
+class ClassDataset(object):
+    """Fetch data from file and generate batches.
+
+    Load data from folder as PIL.Images and convert them into batch array.
+
+    Args:
+        img_folder (Str): Folder path of the training/validation images.
+        transforms (Transform):  Preprocess transforms.
+    """
+
+    def __init__(self, img_folder, transforms):
+        super(ClassDataset, self).__init__()
+
+        self.img_list = list()
+        self.transforms = transforms
+
+        classes = os.listdir(img_folder)
+        for i in classes:
+            images = glob(os.path.join(img_folder, i, "*"))
+            for img in images:
+                self.img_list.append((img, i))
+
+    def __len__(self) -> int:
+        return len(self.img_list)
+
+    def __getitem__(self, index: int):
+        img_path, label_str = self.img_list[index]
+        img = Image.open(img_path)
+        img = self.transforms.forward(img)
+        label = np.array(label_str, dtype=np.int32)
+
+        return img, label
+
+    def batchgenerator(self, indexes, batch_size, data_size):
+        """Generate batch arrays from transformed image list.
+
+        Args:
+            indexes (Sequence): current batch indexes list, e.g. [n, n + 1, ..., n + batch_size]
+            batch_size (int):
+            data_size (Tuple): input image size of shape (C, H, W)
+
+        Return:
+            batch_x (Numpy ndarray): batch array of input images (B, C, H, W)
+            batch_y (Numpy ndarray): batch array of ground truth lables (B,)
+        """
+        batch_x = np.zeros((batch_size,) + data_size)
+        batch_y = np.zeros((batch_size,) + (1,), dtype=np.int32)
+        for idx, i in enumerate(indexes):
+            sample_x, sample_y = self.__getitem__(i)
+            batch_x[idx, :, :, :] = sample_x
+            batch_y[idx, :] = sample_y
+
+        return batch_x, batch_y
+
+
+def load(dir_path="tmp/bloodmnist"):
+    # Dataset loading
+    train_path = os.path.join(dir_path, "train")
+    val_path = os.path.join(dir_path, "val")
+    cfg_path = os.path.join(dir_path, "param.json")
+
+    with open(cfg_path, 'r') as load_f:
+        num_class = json.load(load_f)["num_classes"]
+
+    # Define pre-processing methods (transforms)
+    transforms = Compose([
+        ToTensor(),
+        Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    train_dataset = ClassDataset(train_path, transforms)
+    val_dataset = ClassDataset(val_path, transforms)
+    return train_dataset, val_dataset, num_class
@@ -0,0 +1,122 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+try:
+    import pickle
+except ImportError:
+    import cPickle as pickle
+
+import numpy as np
+import os
+import sys
+from PIL import Image
+
+
+# need to save to specific local directories
+def load_train_data(dir_path="/tmp/malaria", resize_size=(128, 128)):
+    dir_path = check_dataset_exist(dirpath=dir_path)
+    path_train_label_1 = os.path.join(dir_path, "training_set/Parasitized")
+    path_train_label_0 = os.path.join(dir_path, "training_set/Uninfected")
+    train_label_1 = load_image_path(os.listdir(path_train_label_1))
+    train_label_0 = load_image_path(os.listdir(path_train_label_0))
+    labels = []
+    Images = np.empty((len(train_label_1) + len(train_label_0),
+                       3, resize_size[0], resize_size[1]), dtype=np.uint8)
+    for i in range(len(train_label_0)):
+        image_path = os.path.join(path_train_label_0, train_label_0[i])
+        temp_image = np.array(Image.open(image_path).resize(
+            resize_size).convert("RGB")).transpose(2, 0, 1)
+        Images[i] = temp_image
+        labels.append(0)
+    for i in range(len(train_label_1)):
+        image_path = os.path.join(path_train_label_1, train_label_1[i])
+        temp_image = np.array(Image.open(image_path).resize(
+            resize_size).convert("RGB")).transpose(2, 0, 1)
+        Images[i + len(train_label_0)] = temp_image
+        labels.append(1)
+
+    Images = np.array(Images, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int32)
+    return Images, labels
+
+
+# need to save to specific local directories
+def load_test_data(dir_path='/tmp/malaria', resize_size=(128, 128)):
+    dir_path = check_dataset_exist(dirpath=dir_path)
+    path_test_label_1 = os.path.join(dir_path, "testing_set/Parasitized")
+    path_test_label_0 = os.path.join(dir_path, "testing_set/Uninfected")
+    test_label_1 = load_image_path(os.listdir(path_test_label_1))
+    test_label_0 = load_image_path(os.listdir(path_test_label_0))
+    labels = []
+    Images = np.empty((len(test_label_1) + len(test_label_0),
+                       3, resize_size[0], resize_size[1]), dtype=np.uint8)
+    for i in range(len(test_label_0)):
+        image_path = os.path.join(path_test_label_0, test_label_0[i])
+        temp_image = np.array(Image.open(image_path).resize(
+            resize_size).convert("RGB")).transpose(2, 0, 1)
+        Images[i] = temp_image
+        labels.append(0)
+    for i in range(len(test_label_1)):
+        image_path = os.path.join(path_test_label_1, test_label_1[i])
+        temp_image = np.array(Image.open(image_path).resize(
+            resize_size).convert("RGB")).transpose(2, 0, 1)
+        Images[i + len(test_label_0)] = temp_image
+        labels.append(1)
+
+    Images = np.array(Images, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int32)
+    return Images, labels
+
+
+def load_image_path(list):
+    new_list = []
+    for image_path in list:
+        if (image_path.endswith(".png") or image_path.endswith(".jpg")):
+            new_list.append(image_path)
+    return new_list
+
+
+def check_dataset_exist(dirpath):
+    if not os.path.exists(dirpath):
+        print(
+            'Please download the malaria dataset first'
+        )
+        sys.exit(0)
+    return dirpath
+
+
+def normalize(train_x, val_x):
+    mean = [0.5339, 0.4180, 0.4460]  # mean for malaria dataset
+    std = [0.3329, 0.2637, 0.2761]  # std for malaria dataset
+    train_x /= 255
+    val_x /= 255
+    for ch in range(0, 2):
+        train_x[:, ch, :, :] -= mean[ch]
+        train_x[:, ch, :, :] /= std[ch]
+        val_x[:, ch, :, :] -= mean[ch]
+        val_x[:, ch, :, :] /= std[ch]
+    return train_x, val_x
+
+
+def load(dir_path):
+    train_x, train_y = load_train_data(dir_path=dir_path)
+    val_x, val_y = load_test_data(dir_path=dir_path)
+    train_x, val_x = normalize(train_x, val_x)
+    train_y = train_y.flatten()
+    val_y = val_y.flatten()
+    return train_x, train_y, val_x, val_y
@@ -0,0 +1,146 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from singa import layer
+from singa import model
+from singa import tensor
+from singa import opt
+from singa import device
+
+import numpy as np
+
+# Maps a precision name to the matching NumPy dtype.
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+# Maps a precision name to the matching SINGA tensor dtype.
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.num_classes = num_classes
+        self.input_size = 128
+        self.dimension = 4
+        self.conv1 = layer.Conv2d(num_channels, 32, 3, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(32, 64, 3, padding=0, activation="RELU")
+        self.conv3 = layer.Conv2d(64, 64, 3, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(128)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling3 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+        self.sigmoid = layer
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.pooling2(y)
+        y = self.conv3(y)
+        y = self.pooling3(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        if dist_option == 'plain':
+            self.optimizer(loss)
+        elif dist_option == 'half':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+
+class MLP(model.Model):
+
+    def __init__(self, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+        self.num_classes = num_classes
+        self.dimension = 2
+
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, inputs):
+        y = self.linear1(inputs)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        if dist_option == 'plain':
+            self.optimizer(loss)
+        elif dist_option == 'half':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+
+def create_model(model_option='cnn', **kwargs):
+    """Constructs a CNN model.
+
+    Args:
+        pretrained (bool): If True, returns a pre-trained model.
+
+    Returns:
+        The created CNN model.
+    """
+    model = CNN(**kwargs)
+    if model_option=='mlp':
+        model = MLP(**kwargs)
+
+    return model
+
+
+# Names exported by a star-import of this module.
+__all__ = ['CNN', 'MLP', 'create_model']
@@ -0,0 +1,294 @@
+from singa import singa_wrap as singa
+from singa import device
+from singa import tensor
+from singa import opt
+import numpy as np
+import time
+import argparse
+import sys
+from PIL import Image
+
+# Maps a precision name to the matching NumPy dtype.
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+# Maps a precision name to the matching SINGA tensor dtype.
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+
+# Data augmentation
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :,
+                               offset[0]:offset[0] + x.shape[2],
+                               offset[1]:offset[1] + x.shape[2]]
+        if_flip = np.random.randint(2)
+        if (if_flip):
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+# Calculate accuracy
+def accuracy(pred, target):
+    # y is network output to be compared with ground truth (int)
+    y = np.argmax(pred, axis=1)
+    a = y == target
+    correct = np.array(a, "int").sum()
+    return correct
+
+
+# Data partition according to the rank
+def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
+    # Partition training data
+    data_per_rank = train_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    train_x = train_x[idx_start:idx_end]
+    train_y = train_y[idx_start:idx_end]
+
+    # Partition evaluation data
+    data_per_rank = val_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    val_x = val_x[idx_start:idx_end]
+    val_y = val_y[idx_start:idx_end]
+    return train_x, train_y, val_x, val_y
+
+
+# Function to all reduce NUMPY accuracy and loss from multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+def resize_dataset(x, image_size):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, image_size, image_size),
+                 dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
+                (image_size, image_size), Image.BILINEAR),
+                dtype=np.float32)
+    return X
+
+
+def run(global_rank,
+        world_size,
+        dir_path,
+        max_epoch,
+        batch_size,
+        model,
+        data,
+        sgd,
+        graph,
+        verbosity,
+        dist_option='plain',
+        spars=None,
+        precision='float32'):
+    # now CPU version only, could change to GPU device for GPU-support machines
+    dev = device.get_default_device()
+    dev.SetRandSeed(0)
+    np.random.seed(0)
+    if data == 'malaria':
+        from data import malaria
+        train_x, train_y, val_x, val_y = malaria.load(dir_path=dir_path)
+    else:
+        print(
+            'Wrong dataset!'
+        )
+        sys.exit(0)
+
+    num_channels = train_x.shape[1]
+    image_size = train_x.shape[2]
+    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
+    num_classes = (np.max(train_y) + 1).item()
+
+    if model == 'cnn':
+        from model import cnn
+        model = cnn.create_model(num_channels=num_channels,
+                                 num_classes=num_classes)
+    else:
+        print(
+            'Wrong model!'
+        )
+        sys.exit(0)
+
+    # For distributed training, sequential has better performance
+    if hasattr(sgd, "communicator"):
+        DIST = True
+        sequential = True
+    else:
+        DIST = False
+        sequential = False
+
+    if DIST:
+        train_x, train_y, val_x, val_y = partition(global_rank, world_size,
+                                                   train_x, train_y, val_x,
+                                                   val_y)
+
+    if model.dimension == 4:
+        tx = tensor.Tensor(
+            (batch_size, num_channels, model.input_size, model.input_size), dev,
+            singa_dtype[precision])
+    elif model.dimension == 2:
+        tx = tensor.Tensor((batch_size, data_size),
+                           dev, singa_dtype[precision])
+        np.reshape(train_x, (train_x.shape[0], -1))
+        np.reshape(val_x, (val_x.shape[0], -1))
+
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_val_batch = val_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    # Attach model to graph
+    model.set_optimizer(sgd)
+    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
+    dev.SetVerbosity(verbosity)
+
+    # Training and evaluation loop
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if global_rank == 0:
+            print('Starting Epoch %d:' % (epoch))
+
+        # Training phase
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        model.train()
+        for b in range(num_train_batch):
+            # if b % 100 == 0:
+            #     print ("b: \n", b)
+            # Generate the patch data in this iteration
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            if model.dimension == 4:
+                x = augmentation(x, batch_size)
+                if (image_size != model.input_size):
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+
+            # Copy the patch data into input tensors
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+
+            # Train the model
+            out, loss = model(tx, ty, dist_option, spars)
+            train_correct += accuracy(tensor.to_numpy(out), y)
+            train_loss += tensor.to_numpy(loss)[0]
+
+        if DIST:
+            # Reduce the evaluation accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, sgd, reducer)
+            train_loss = reduce_variable(train_loss, sgd, reducer)
+
+        if global_rank == 0:
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        # Evaluation phase
+        model.eval()
+        for b in range(num_val_batch):
+            x = val_x[b * batch_size:(b + 1) * batch_size]
+            if model.dimension == 4:
+                if (image_size != model.input_size):
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = val_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+        if DIST:
+            # Reduce the evaulation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, sgd, reducer)
+
+        # Output the evaluation accuracy
+        if global_rank == 0:
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_val_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+    dev.PrintTimeProfiling()
+
+
+if __name__ == '__main__':
+    # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training
+    parser = argparse.ArgumentParser(
+        description='Training using the autograd and graph.')
+    parser.add_argument(
+        'model',
+        choices=['cnn'],
+        default='cnn')
+    parser.add_argument('data',
+                        choices=['malaria'],
+                        default='malaria')
+    parser.add_argument('-p',
+                        choices=['float32', 'float16'],
+                        default='float32',
+                        dest='precision')
+    parser.add_argument('-dir',
+                        '--dir-path',
+                        default="/tmp/malaria",
+                        type=str,
+                        help='the directory to store the malaria dataset',
+                        dest='dir_path')
+    parser.add_argument('-m',
+                        '--max-epoch',
+                        default=100,
+                        type=int,
+                        help='maximum epochs',
+                        dest='max_epoch')
+    parser.add_argument('-b',
+                        '--batch-size',
+                        default=64,
+                        type=int,
+                        help='batch size',
+                        dest='batch_size')
+    parser.add_argument('-l',
+                        '--learning-rate',
+                        default=0.005,
+                        type=float,
+                        help='initial learning rate',
+                        dest='lr')
+    parser.add_argument('-g',
+                        '--disable-graph',
+                        default='True',
+                        action='store_false',
+                        help='disable graph',
+                        dest='graph')
+    parser.add_argument('-v',
+                        '--log-verbosity',
+                        default=0,
+                        type=int,
+                        help='logging verbosity',
+                        dest='verbosity')
+
+    args = parser.parse_args()
+
+    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5,
+                  dtype=singa_dtype[args.precision])
+    run(0,
+        1,
+        args.dir_path,
+        args.max_epoch,
+        args.batch_size,
+        args.model,
+        args.data,
+        sgd,
+        args.graph,
+        args.verbosity,
+        precision=args.precision)
@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from singa import tensor
+from singa.tensor import Tensor
+from singa import autograd
+from singa import opt
+import numpy as np
+from singa import device
+import argparse
+
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+if __name__ == "__main__":
+    # Command-line options: numeric precision (-p) and number of epochs (-m).
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p',
+                        choices=['float32', 'float16'],
+                        default='float32',
+                        dest='precision')
+    parser.add_argument('-m',
+                        '--max-epoch',
+                        default=1001,
+                        type=int,
+                        help='maximum epochs',
+                        dest='max_epoch')
+    args = parser.parse_args()
+
+    # fixed seed so the synthetic data and the initial weights are repeatable
+    np.random.seed(0)
+
+    autograd.training = True
+
+    # prepare training data in numpy array
+
+    def f(v):
+        """Return the value of the linear boundary 5 * v + 1 at v."""
+        return 5 * v + 1
+
+    # generate the boundary (not used by the training loop below)
+    bd_x = np.linspace(-1.0, 1, 200)
+    bd_y = f(bd_x)
+
+    # generate the training data: boundary values plus Gaussian noise
+    x = np.random.uniform(-1, 1, 400)
+    y = f(x) + 2 * np.random.randn(len(x))
+
+    # convert training data to 2d space: each sample is the point (x, y) and
+    # its label is True when the boundary value at x is greater than y
+    label = f(x) > y
+    data = np.column_stack((x, y)).astype(np.float32)
+
+    def to_categorical(y, num_classes):
+        """
+        Converts a class vector (integers) to binary class matrix.
+        Args:
+            y: class vector to be converted into a matrix
+                (integers from 0 to num_classes - 1).
+            num_classes: total number of classes.
+        Returns:
+            A binary matrix representation of the input, of shape
+            (len(y), num_classes), with a single 1 per row.
+        """
+        y = np.array(y, dtype="int")
+        n = y.shape[0]
+        categorical = np.zeros((n, num_classes))
+        # row i gets a 1 in the column given by its class index y[i]
+        categorical[np.arange(n), y] = 1
+        return categorical
+
+    label = to_categorical(label, 2).astype(np.float32)
+    print("train_data_shape:", data.shape)
+    print("train_label_shape:", label.shape)
+
+    # resolve the requested precision for both singa tensors and numpy arrays
+    precision = singa_dtype[args.precision]
+    np_precision = np_dtype[args.precision]
+
+    dev = device.create_cuda_gpu()
+
+    inputs = Tensor(data=data, device=dev)
+    target = Tensor(data=label, device=dev)
+
+    # inputs follow the requested precision; the one-hot target is cast to int32
+    inputs = inputs.as_type(precision)
+    target = target.as_type(tensor.int32)
+
+    # first layer parameters: 2 input features -> 3 hidden units
+    w0_np = np.random.normal(0, 0.1, (2, 3)).astype(np_precision)
+    w0 = Tensor(data=w0_np,
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b0 = Tensor(shape=(3,),
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b0.set_value(0.0)
+
+    # second layer parameters: 3 hidden units -> 2 classes
+    w1_np = np.random.normal(0, 0.1, (3, 2)).astype(np_precision)
+    w1 = Tensor(data=w1_np,
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b1 = Tensor(shape=(2,),
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b1.set_value(0.0)
+
+    # learning rate 0.05, momentum 0.8; the optimizer is given the same dtype
+    # as the parameters so that -p float16 is honoured end to end
+    sgd = opt.SGD(0.05, 0.8, dtype=precision)
+
+    # training process
+    for i in range(args.max_epoch):
+        # forward pass: linear -> relu -> linear
+        x = autograd.matmul(inputs, w0)
+        x = autograd.add_bias(x, b0)
+        x = autograd.relu(x)
+        x = autograd.matmul(x, w1)
+        x = autograd.add_bias(x, b1)
+        loss = autograd.softmax_cross_entropy(x, target)
+        # calling the optimizer on the loss performs the parameter update
+        sgd(loss)
+
+        if i % 100 == 0:
+            print("%d, training loss = " % i, tensor.to_numpy(loss)[0])
@@ -46,7 +46,12 @@ optional arguments:
   --n_layers         int        transformer model n_layers              default 6
 ```
 
-run the example
+**run the example**
+
+step 1: Download the dataset from https://www.manythings.org/anki/ into the cmn-eng directory.
+
+step 2: Run the following script.
+
 ```
-python train.py --dataset cmn-2000.txt --max-epoch 100 --batch-size 32 --lr 0.01
+python train.py --dataset cmn-eng/cmn-2000.txt --max-epoch 100 --batch-size 32 --lr 0.01
 ```
@@ -56,12 +56,12 @@ def __len__(self):
 
 
 class CmnDataset:
-    def __init__(self, path='cmn-eng/cmn.txt', shuffle=False, batch_size=32, train_ratio=0.8, random_seed=0):
+    def __init__(self, path, shuffle=False, batch_size=32, train_ratio=0.8, random_seed=0):
         """
         cmn dataset, download from https://www.manythings.org/anki/, contains 29909 Chinese and English translation
         pairs, the pair format: English + TAB + Chinese + TAB + Attribution
         Args:
-            path: the path of the dataset, default 'cmn-eng/cnn.txt'
+            path: the path of the dataset
             shuffle: shuffle the dataset, default False
             batch_size: the size of every batch, default 32
             train_ratio: the proportion of the training set to the total data set, default 0.8
 
@@ -18,4 +18,4 @@
 #
 
 # run this example
-python train.py --dataset cmn-2000.txt --max-epoch 300 --batch-size 32 --lr 0.01
+python train.py --dataset cmn-eng/cmn-2000.txt --max-epoch 100 --batch-size 32 --lr 0.01
@@ -35,7 +35,7 @@ def run(args):
     np.random.seed(args.seed)
 
     batch_size = args.batch_size
-    cmn_dataset = CmnDataset(path="cmn-eng/"+args.dataset, shuffle=args.shuffle, batch_size=batch_size, train_ratio=0.8)
+    cmn_dataset = CmnDataset(path=args.dataset, shuffle=args.shuffle, batch_size=batch_size, train_ratio=0.8)
 
     print("【step-0】 prepare dataset...")
     src_vocab_size, tgt_vocab_size = cmn_dataset.en_vab_size, cmn_dataset.cn_vab_size
@@ -151,8 +151,7 @@ def run(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Training Transformer Model.")
-    parser.add_argument('--dataset', choices=['cmn.txt', 'cmn-15000.txt',
-                                              'cmn-2000.txt'],  default='cmn-2000.txt')
+    parser.add_argument('--dataset', default='cmn-eng/cmn-2000.txt')
     parser.add_argument('--max-epoch', default=100, type=int, help='maximum epochs.', dest='max_epoch')
     parser.add_argument('--batch-size', default=64, type=int, help='batch size', dest='batch_size')
     parser.add_argument('--shuffle', default=True, type=bool, help='shuffle the dataset', dest='shuffle')
Original file line number	Diff line number	Diff line change
`@@ -18,4 +18,4 @@`
`18`	`18`	`#`
`19`	`19`
`20`	`20`	`# run this example`
`21`		`-python train.py --dataset cmn-2000.txt --max-epoch 300 --batch-size 32 --lr 0.01`
	`21`	`+python train.py --dataset cmn-eng/cmn-2000.txt --max-epoch 100 --batch-size 32 --lr 0.01`