LBANN proto integration #76

Open · wants to merge 3 commits into master
36 changes: 36 additions & 0 deletions workflows/lbann/.gitignore
@@ -0,0 +1,36 @@
# Compiled Object files
*.slo
*.lo
*.o
*.x

# Compiled Dynamic libraries
*.so

# Compiled Static libraries
*.lai
*.la
*.a

*~
.DS_Store

*.d
/.cproject
/.project
/.settings

Debug/
Release/

/output/
scratch/

# ipython notebooks
*.ipynb
.ipynb_checkpoints

# dataspace runtime conf file
conf
experiments
test_data/combo_model.h5
67 changes: 67 additions & 0 deletions workflows/lbann/data/mnist_params.json
@@ -0,0 +1,67 @@
[
  {
    "name": "conv",
    "type": "constant",
    "value": "6 5 1 16 5 1",
    "debug_value": "0 0 0"
  },

  {
    "name": "classes",
    "type": "constant",
    "value": 10,
    "comment": "debug: 1000, default: remove this entry"
  },

  {
    "name": "dense",
    "type": "constant",
    "value": "120 84"
  },

  {
    "name": "activation",
    "type": "categorical",
    "element_type": "string",
    "values": ["relu", "elu", "relu", "tanh"]
  },

  {
    "name": "optimizer",
    "type": "categorical",
    "element_type": "string",
    "values": ["adam", "sgd", "adagrad"]
  },

  {
    "name": "pool_mode",
    "type": "categorical",
    "element_type": "string",
    "values": ["max", "average"]
  },

  {
    "name": "lr",
    "type": "float",
    "lower": 0.0001,
    "upper": 0.01,
    "sigma": 0.045
  },

  {
    "name": "batch_size",
    "type": "ordered",
    "element_type": "int",
    "values": [128, 128, 128, 128, 128, 128],
    "sigma": 1
  },

  {
    "name": "epochs",
    "type": "int",
    "lower": 40,
    "upper": 60,
    "sigma": 1
  }
]
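These entries appear to follow the CANDLE/Supervisor hyperparameter-space convention (constant, categorical, ordered, float, and int definitions). Note that under naive choice-with-replacement sampling, repeated entries in a values list weight the draw: "relu" listed twice biases toward ReLU, and six copies of 128 effectively pin the batch size. As a reference, here is a minimal sketch of how a search driver might draw one candidate from this file; the sampling logic is illustrative only, not the workflow's actual sampler (it ignores sigma and element_type, which a real sampler would honor):

```python
import json
import random

def sample_params(path):
    """Draw one candidate from a parameter-definition file like the one above.

    Illustrative only: ignores sigma and element_type.
    """
    with open(path) as f:
        space = json.load(f)
    sample = {}
    for param in space:
        kind = param["type"]
        if kind == "constant":
            sample[param["name"]] = param["value"]
        elif kind in ("categorical", "ordered"):
            sample[param["name"]] = random.choice(param["values"])
        elif kind == "float":
            sample[param["name"]] = random.uniform(param["lower"], param["upper"])
        elif kind == "int":
            sample[param["name"]] = random.randint(param["lower"], param["upper"])
    return sample

print(sample_params("mnist_params.json"))
```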
Empty file.
5 changes: 5 additions & 0 deletions workflows/lbann/models/mnist/data/mnist/.gitignore
@@ -0,0 +1,5 @@
*.gz
train-images-idx3-ubyte
train-labels-idx1-ubyte
t10k-images-idx3-ubyte
t10k-labels-idx1-ubyte
59 changes: 59 additions & 0 deletions workflows/lbann/models/mnist/data/mnist/__init__.py
@@ -0,0 +1,59 @@
import gzip
import os
import os.path
import urllib.request

import google.protobuf.text_format
import lbann

# Paths
data_dir = os.path.dirname(os.path.realpath(__file__))

def download_data():
    """Download MNIST data files, if needed.

    Data files are downloaded from http://yann.lecun.com/exdb/mnist/
    and uncompressed. Does nothing if the files already exist.

    """

    # MNIST data files and associated URLs
    urls = {
        'train-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'train-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        't10k-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    }

    # Download and uncompress MNIST data files, if needed
    for data_file, url in urls.items():
        data_file = os.path.join(data_dir, data_file)
        compressed_file = data_file + '.gz'
        if not os.path.isfile(data_file):
            urllib.request.urlretrieve(url, filename=compressed_file)
            with gzip.open(compressed_file, 'rb') as in_file:
                with open(data_file, 'wb') as out_file:
                    out_file.write(in_file.read())

def make_data_reader():
    """Make Protobuf message for MNIST data reader.

    MNIST data is downloaded if needed.

    """

    # Download MNIST data files
    download_data()

    # Load Protobuf message from file
    protobuf_file = os.path.join(data_dir, 'data_reader.prototext')
    message = lbann.lbann_pb2.LbannPB()
    with open(protobuf_file, 'r') as f:
        google.protobuf.text_format.Merge(f.read(), message)
    message = message.data_reader

    # Set paths
    for reader in message.reader:
        reader.data_filedir = data_dir

    return message
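A quick usage sketch for this module (run from the model directory; the four IDX files download next to the module on first call):

```python
import data.mnist

# First call fetches the MNIST files if absent, then returns the
# DataReader message with data_filedir rewritten to this package.
message = data.mnist.make_data_reader()
for reader in message.reader:
    print(reader.role, reader.data_filedir)
```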
30 changes: 30 additions & 0 deletions workflows/lbann/models/mnist/data/mnist/data_reader.prototext
@@ -0,0 +1,30 @@
data_reader {
  reader {
    name: "mnist"
    role: "train"
    shuffle: true
    data_filedir: "lbann/applications/vision/data/mnist"
    data_filename: "train-images-idx3-ubyte"
    label_filename: "train-labels-idx1-ubyte"
    validation_percent: 0.1
    percent_of_data_to_use: 1.0
    transforms {
      scale {
        scale: 0.003921568627 # 1/255
      }
    }
  }
  reader {
    name: "mnist"
    role: "test"
    data_filedir: "lbann/applications/vision/data/mnist"
    data_filename: "t10k-images-idx3-ubyte"
    label_filename: "t10k-labels-idx1-ubyte"
    percent_of_data_to_use: 1.0
    transforms {
      scale {
        scale: 0.003921568627 # 1/255
      }
    }
  }
}
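The scale transform multiplies every pixel by 0.003921568627 = 1/255, mapping raw bytes into [0, 1]; the hard-coded data_filedir values are overwritten at runtime by make_data_reader() above. For readers unfamiliar with the IDX files named here, a standalone sketch of the equivalent preprocessing (this uses the standard idx3-ubyte header layout and assumes NumPy; it is not part of the PR):

```python
import struct
import numpy as np

def load_mnist_images(path):
    """Read an idx3-ubyte file and apply the same 1/255 scaling."""
    with open(path, 'rb') as f:
        # Big-endian header: magic number, image count, rows, cols
        magic, count, rows, cols = struct.unpack('>IIII', f.read(16))
        assert magic == 2051, "not an idx3-ubyte image file"
        pixels = np.frombuffer(f.read(), dtype=np.uint8)
    return pixels.reshape(count, rows, cols).astype(np.float32) / 255.0

images = load_mnist_images('train-images-idx3-ubyte')
print(images.shape, images.min(), images.max())  # (60000, 28, 28) 0.0 1.0
```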
185 changes: 185 additions & 0 deletions workflows/lbann/models/mnist/mnist_baseline.py
@@ -0,0 +1,185 @@
import os
import argparse
## LBANN stuff
import lbann
import data.mnist
import lbann.contrib.args
import lbann.contrib.launcher

try:
    import configparser
except ImportError:
    import ConfigParser as configparser


file_path = os.path.dirname(os.path.realpath(__file__))

def common_parser(parser):

    parser.add_argument("--config_file", dest='config_file', type=str,
                        default=os.path.join(file_path, 'mnist_default_model.txt'),
                        help="specify model configuration file")
    parser.add_argument("--nodes", type=int, default=8)

    return parser

def get_model_parser():

    parser = argparse.ArgumentParser(prog='mnist_baseline',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='MNIST LBANN baseline')

    return common_parser(parser).parse_args()

def read_config_file(file):
    # Values in the config file are Python literals, read with eval()
    #print("Reading default config (param) file : ", file)
    config = configparser.ConfigParser()
    config.read(file)
    section = config.sections()
    fileParams = {}

    fileParams['model_name'] = eval(config.get(section[0], 'model_name'))
    fileParams['conv'] = eval(config.get(section[0], 'conv'))
    fileParams['dense'] = eval(config.get(section[0], 'dense'))
    fileParams['activation'] = eval(config.get(section[0], 'activation'))
    fileParams['pool_mode'] = eval(config.get(section[0], 'pool_mode'))
    #fileParams['optimizer'] = eval(config.get(section[0], 'optimizer'))
    fileParams['epochs'] = eval(config.get(section[0], 'epochs'))
    fileParams['batch_size'] = eval(config.get(section[0], 'batch_size'))
    fileParams['classes'] = eval(config.get(section[0], 'classes'))
    fileParams['save'] = eval(config.get(section[0], 'save'))
    fileParams['lr'] = eval(config.get(section[0], 'lr'))

    return fileParams

def initialize_parameters(args):
    # Get parameters from configuration file
    gParameters = read_config_file(args.config_file)
    return gParameters

def get_activation(name, x):
    if name == 'relu':
        return lbann.Relu(x)
    elif name == 'tanh':
        return lbann.Tanh(x)
    elif name == 'elu':
        return lbann.Elu(x)
    elif name == 'selu':
        return lbann.Selu(x)
    elif name == 'leaky_relu':
        return lbann.LeakyRelu(x)
    elif name == 'softplus':
        return lbann.Softplus(x)


def run(gParameters, run_args, exp_dir=None):

    # conv holds triples of (out_channels, conv_dim, conv_stride)
    conv_outc = []
    conv_dim = []
    conv_stride = []
    for i in range(0, len(gParameters['conv']), 3):
        conv_outc.append(gParameters['conv'][i])
        conv_dim.append(gParameters['conv'][i + 1])
        conv_stride.append(gParameters['conv'][i + 2])

    # Input data
    input_ = lbann.Input(target_mode='classification')
    images = lbann.Identity(input_)
    labels = lbann.Identity(input_)

    # LeNet
    x = lbann.Convolution(images,
                          num_dims=2,
                          num_output_channels=conv_outc[0],
                          num_groups=1,
                          conv_dims_i=conv_dim[0],
                          conv_strides_i=conv_stride[0],
                          conv_dilations_i=1,
                          has_bias=True)
    x = get_activation(gParameters['activation'], x)
    x = lbann.Pooling(x,
                      num_dims=2,
                      pool_dims_i=2,
                      pool_strides_i=2,
                      pool_mode=str(gParameters['pool_mode']))
    x = lbann.Convolution(x,
                          num_dims=2,
                          num_output_channels=conv_outc[1],
                          num_groups=1,
                          conv_dims_i=conv_dim[1],
                          conv_strides_i=conv_stride[1],
                          conv_dilations_i=1,
                          has_bias=True)
    x = get_activation(gParameters['activation'], x)
    x = lbann.Pooling(x,
                      num_dims=2,
                      pool_dims_i=2,
                      pool_strides_i=2,
                      pool_mode=str(gParameters['pool_mode']))
    x = lbann.FullyConnected(x, num_neurons=gParameters['dense'][0], has_bias=True)
    x = get_activation(gParameters['activation'], x)
    x = lbann.FullyConnected(x, num_neurons=gParameters['dense'][1], has_bias=True)
    x = get_activation(gParameters['activation'], x)
    x = lbann.FullyConnected(x, num_neurons=gParameters['classes'], has_bias=True)
    probs = lbann.Softmax(x)

    # Loss function and accuracy
    loss = lbann.CrossEntropy(probs, labels)
    acc = lbann.CategoricalAccuracy(probs, labels)
    lr = gParameters['lr']
    opt = lbann.SGD(learn_rate=lr, momentum=0.9)
    ## Uncomment to support optimizer exchange
    '''
    if gParameters['optimizer'] == 'adam':
        opt = lbann.Adam(learn_rate=lr, beta1=0.9, beta2=0.99, eps=1e-8)
    elif gParameters['optimizer'] == 'adagrad':
        opt = lbann.AdaGrad(learn_rate=lr, eps=1e-8)
    '''
    model = lbann.Model(gParameters['epochs'],
                        layers=lbann.traverse_layer_graph(input_),
                        objective_function=loss,
                        metrics=[lbann.Metric(acc, name='accuracy', unit='%')],
                        callbacks=[lbann.CallbackPrintModelDescription(),
                                   lbann.CallbackPrint(),
                                   lbann.CallbackTimer()])
                                   #lbann.CallbackLTFB(batch_interval=100, metric='accuracy')])

    # Setup data reader
    data_reader = data.mnist.make_data_reader()

    # Setup trainer; run_id is expected to be injected by the workflow driver
    job_name = "t" + str(gParameters['run_id'] - 1)
    trainer = lbann.Trainer(name=job_name, mini_batch_size=gParameters['batch_size'])
    status = lbann.contrib.launcher.run(
        trainer,
        model,
        data_reader,
        opt,
        #work_dir=gParameters['save'],
        work_dir=exp_dir,
        nodes=run_args.nodes,
        #proto_file_name=job_name + "exp.prototext",
        proto_file_name="experiment.prototext.trainer" + str(gParameters['run_id'] - 1),
        job_name=job_name,
        setup_only=True,
        #batch_job=True,
        lbann_args=['--generate_multi_proto --procs_per_trainer=4']
        #lbann_args=['--generate_multi_proto']
    )

def main():

    args = get_model_parser()
    gParameters = initialize_parameters(args)
    # run_id is normally injected by the workflow; default it here so
    # the script can also run standalone
    gParameters.setdefault('run_id', 1)
    run(gParameters, args)

if __name__ == '__main__':
    main()
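Since read_config_file() pulls every value through eval(), mnist_default_model.txt must be an INI-style file whose values are Python literals (lists for conv and dense, quoted strings for names). A hypothetical config consistent with the keys read above, plus the standalone invocation; the actual defaults shipped with the PR may differ:

```python
# Write a hypothetical mnist_default_model.txt. Every key below is one
# that read_config_file() looks up; the section name is arbitrary since
# the code reads config.sections()[0].
config_text = """[mnist]
model_name = 'mnist_lenet'
conv = [6, 5, 1, 16, 5, 1]
dense = [120, 84]
activation = 'relu'
pool_mode = 'max'
epochs = 50
batch_size = 128
classes = 10
save = 'output'
lr = 0.001
"""
with open('mnist_default_model.txt', 'w') as f:
    f.write(config_text)

# Then: python mnist_baseline.py --config_file mnist_default_model.txt --nodes 8
```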