Load images from directory and train a model

soerface · soerface · commit 313bdd4b700f · 2018-09-21T22:09:48.000+02:00
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
@@ -21,4 +21,5 @@ WORKDIR /opt/openautoml/runner
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
+COPY data/ data/
 COPY src/ src/
diff --git a/Dockerfile.nvidia b/Dockerfile.nvidia
@@ -38,5 +38,5 @@ RUN rm -rf pytorch
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
-
+COPY data/ data/
 COPY src/ src/
diff --git a/models/mnist_model.h5 b/models/mnist_model.h5
diff --git a/models/mnist_model.png b/models/mnist_model.png
diff --git a/src/constants.py b/src/constants.py
@@ -0,0 +1,6 @@
+import os
+from pathlib import Path
+
+BASE_PATH = Path(os.path.realpath(__file__)).parents[1]  # directory that contains src/ (symlinks resolved)
+# Data sets live in data/, a sibling of src/ under BASE_PATH.
+DATA_PATH = BASE_PATH / 'data'
diff --git a/src/hello_flowers.py b/src/hello_flowers.py
@@ -0,0 +1,36 @@
+from autokeras.image_supervised import ImageClassifier
+from datetime import datetime
+from autokeras.image_supervised import load_image_dataset
+from constants import DATA_PATH
+
+
+def main():
+    """Train an AutoKeras image classifier on the flowers data set, evaluate it and save it as flower_model.h5."""
+    started = datetime.now()
+    x_train, y_train = load_image_dataset(csv_file_path=DATA_PATH / 'flowers_one_dir' / 'train' / 'labels.csv',
+                                          images_path=DATA_PATH / 'flowers_one_dir' / 'train')
+    # The files listed in test/labels.csv live in test/, so the images must be read from there as well.
+    x_test, y_test = load_image_dataset(csv_file_path=DATA_PATH / 'flowers_one_dir' / 'test' / 'labels.csv',
+                                        images_path=DATA_PATH / 'flowers_one_dir' / 'test')
+
+    clf = ImageClassifier(verbose=True, searcher_args={
+        'trainer_args': {
+            'max_iter_num': 10,
+        }
+    })
+    print(clf)
+    # Architecture search with a budget of six hours (time_limit is given in seconds).
+    clf.fit(x_train, y_train, time_limit=6 * 60 * 60)
+    # Final training of the best model found by the search, validated against the test split.
+    clf.final_fit(x_train, y_train, x_test, y_test, trainer_args={
+        'max_iter_num': 10,
+    })
+    y = clf.evaluate(x_test, y_test)
+    print(y)
+    clf.load_searcher().load_best_model().produce_keras_model().save('flower_model.h5')
+    finished = datetime.now() - started
+    print(f'Total training duration: {finished}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/hello_mnist.py b/src/hello_mnist.py
@@ -2,6 +2,7 @@
 from autokeras.image_supervised import ImageClassifier
 from datetime import datetime
 
+
 def main():
     started = datetime.now()
     (x_train, y_train), (x_test, y_test) = mnist.load_data()
@@ -10,19 +11,19 @@ def main():
 
     clf = ImageClassifier(verbose=True, searcher_args={
         'trainer_args': {
-            'max_iter_num': 3,
+            'max_iter_num': 10,
         }
     })
     print(clf)
     # clf.fit(x_train, y_train, time_limit=12 * 60 * 60)
-    clf.fit(x_train, y_train, time_limit=10 * 60)
+    clf.fit(x_train, y_train, time_limit=60 * 60)
     # clf.final_fit(x_train, y_train, x_test, y_test, retrain=True)
     clf.final_fit(x_train, y_train, x_test, y_test, trainer_args={
-        'max_iter_num': 3,
+        'max_iter_num': 10,
     })
     y = clf.evaluate(x_test, y_test)
     print(y)
-    clf.load_searcher().load_best_model().produce_keras_model().save('my_model.h5')
+    clf.load_searcher().load_best_model().produce_keras_model().save('mnist_model.h5')
     finished = datetime.now() - started
     print(f'Total training duration: {finished}')
 
diff --git a/src/preprocess_flowers.py b/src/preprocess_flowers.py
@@ -0,0 +1,48 @@
+from constants import DATA_PATH
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import shutil
+
+
+# TODO: Scrap this preprocessing once https://github.com/jhfjhfj1/autokeras/issues/204 is fixed and use
+# TODO: keras.preprocessing.image.ImageDataGenerator.flow_from_directory(dir)
+# TODO: https://keras.io/preprocessing/image/
+
+# Split every class folder of data/flowers 85/15 and flatten the result into flowers_one_dir/{train,test}.
+flowers_dir = DATA_PATH / 'flowers'
+train_parts = []  # per-class frames, concatenated once below (DataFrame.append was removed in pandas 2.0)
+test_parts = []
+
+for dir_path in flowers_dir.glob('*'):
+    df = pd.DataFrame([
+        {
+            'File Name': f'{dir_path.name}_{x.name}',
+            'Label': dir_path.name,
+            'Source': x,  # original location, so the copy step need not parse it back out of the file name
+        } for x in dir_path.glob('*.jpg')
+    ])
+    if df.empty:
+        continue  # stray file or folder without *.jpg images; train_test_split would raise on it
+
+    df_train, df_test = train_test_split(df, test_size=.15)
+    train_parts.append(df_train)
+    test_parts.append(df_test)
+
+if not train_parts:
+    # Fail before the previous output is deleted below.
+    raise FileNotFoundError(f'No *.jpg images found in the sub-directories of {flowers_dir}')
+
+df_all_train = pd.concat(train_parts, ignore_index=True)
+df_all_test = pd.concat(test_parts, ignore_index=True)
+
+shutil.rmtree(DATA_PATH / 'flowers_one_dir', ignore_errors=True)
+
+train_dir = DATA_PATH / 'flowers_one_dir' / 'train'
+test_dir = DATA_PATH / 'flowers_one_dir' / 'test'
+
+for split_dir, df_split in ((train_dir, df_all_train), (test_dir, df_all_test)):
+    split_dir.mkdir(parents=True)
+    # Keep labels.csv to the original two columns; 'Source' is only needed for the copy below.
+    df_split[['File Name', 'Label']].to_csv(split_dir / 'labels.csv', index=False)
+    for src, file_name in zip(df_split['Source'], df_split['File Name']):
+        shutil.copy(src, split_dir / file_name)