init

ChenglongChen · Jun 10, 2018 · 1a04249 · 1a04249
1 parent 654ccea
commit 1a04249
Show file tree

Hide file tree

Showing 12 changed files with 1,714 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,111 @@
+# Byte-compiled / optimized / DLL files
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# Project-specific and local files
+.idea
+__pycache__
+analysis.ipynb
+dp.pkl
+tmp.py
+weights/*
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Chenglong Chen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/__init__.py b/__init__.py
diff --git a/config.py b/config.py
@@ -0,0 +1,5 @@
+
+
+DATA_DIR = "../data"  # Relative path; resolved against the process working directory when opened.
+
+TRAIN_FILE = DATA_DIR + "/atec_nlp_sim_train_all.csv"  # Training data file; main.train() reads it with pd.read_csv(header=None, sep="\t").
diff --git a/main.py b/main.py
@@ -0,0 +1,154 @@
+
+import config
+import sys
+import numpy as np
+import pandas as pd
+import pickle as pkl
+
+from keras.preprocessing.sequence import pad_sequences
+
+import utils
+from preprocessor import DataProcessor
+from model import SemanticMatchingModel
+
+
+def get_model_data(dataset, params):
+    """Build the dict of arrays handed to the model from a processed dataset.
+
+    Args:
+        dataset: object exposing ``id`` and ``label`` columns (via ``[...]``),
+            a ``shape`` attribute, and the attributes ``seq_word_left``,
+            ``seq_word_right``, ``seq_char_left`` and ``seq_char_right``.
+        params: settings dict; reads ``max_sequence_length_word``,
+            ``max_sequence_length_char``, ``pad_sequences_padding`` and
+            ``pad_sequences_truncating``.
+
+    Returns:
+        dict with keys ``id``, ``label``, ``seq_<level>_left``,
+        ``seq_<level>_right`` and ``sequence_length_<level>`` for
+        ``<level>`` in ``word`` and ``char``.
+    """
+    X = {
+        "id": dataset["id"].values,
+        "label": dataset["label"].values,
+    }
+
+    num_rows = dataset.shape[0]
+    pad_kwargs = {
+        "padding": params["pad_sequences_padding"],
+        "truncating": params["pad_sequences_truncating"],
+    }
+
+    # The word-level and char-level inputs are built the same way.
+    for level in ("word", "char"):
+        max_len = params["max_sequence_length_%s" % level]
+
+        X["seq_%s_left" % level] = pad_sequences(
+            getattr(dataset, "seq_%s_left" % level), maxlen=max_len, **pad_kwargs)
+        # Every row reports the padded length, not the length of the raw sequence.
+        X["sequence_length_%s" % level] = max_len * np.ones(num_rows)
+        X["seq_%s_right" % level] = pad_sequences(
+            getattr(dataset, "seq_%s_right" % level), maxlen=max_len, **pad_kwargs)
+
+    return X
+
+params = {  # Settings dict passed to get_model_data() and to SemanticMatchingModel in train()/submit().
+    "offline_model_dir": "./weights/semantic_matching",
+    "batch_size": 32,
+    "epoch": 5,
+    "l2_lambda": 0.0001,
+
+    "embedding_dropout": 0.2,
+    "embedding_word_dim": 128,
+    "embedding_char_dim": 128,
+    "embedding_dim": 128,
+
+    "max_num_words": 10000,  # passed to DataProcessor(max_num_words=...) in train()
+    "max_num_chars": 10000,  # passed to DataProcessor(max_num_chars=...) in train()
+
+    "threshold": 0.217277,  # NOTE(review): not read in the code visible here; train() passes threshold=0.2 to the model instead
+
+    "max_sequence_length_word": 20,  # maxlen of the padded word-level sequences in get_model_data()
+    "max_sequence_length_char": 30,  # maxlen of the padded char-level sequences in get_model_data()
+    "pad_sequences_padding": "post",  # `padding` argument of pad_sequences in get_model_data()
+    "pad_sequences_truncating": "post",  # `truncating` argument of pad_sequences in get_model_data()
+
+    "optimizer_type": "nadam",
+    "init_lr": 0.001,
+    "beta1": 0.975,
+    "beta2": 0.999,
+    "decay_steps": 500,
+    "decay_rate": 0.95,
+    "schedule_decay": 0.004,
+    "random_seed": 2018,
+    "eval_every_num_update": 100,
+
+    "encode_method": "fasttext",
+    "attend_method": "attention",
+
+    "cnn_num_filters": 32,
+    "cnn_filter_sizes": [1, 2, 3],
+    "cnn_timedistributed": False,
+
+    "rnn_num_units": 20,
+    "rnn_cell_type": "gru",
+
+    # fc block
+    "fc_type": "fc",
+    "fc_dim": 64,
+    "fc_dropout": 0,
+}
+
+model_name = "semantic_matching"  # first positional argument of SemanticMatchingModel in train() and submit()
+
+def train():
+    """Train the matching model and persist everything submit() needs.
+
+    Steps:
+      1. Create ``../logs`` and ``../output`` and open a timestamped log file.
+      2. Read ``config.TRAIN_FILE`` (header-less, tab-separated columns
+         id / left / right / label) and drop rows with missing values.
+      3. Shuffle the rows with ``params["random_seed"]`` so the
+         train/validation split is the same on every run.
+      4. Fit the DataProcessor on the whole frame, then use the first 60% of
+         rows for training and the rest for validation.
+      5. Fit the model, save its session, and pickle
+         ``(processor, model.threshold)`` to ``dp.pkl``.
+    """
+    utils._makedirs("../logs")
+    utils._makedirs("../output")
+    logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())
+
+    dfTrain = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
+    dfTrain.columns = ["id", "left", "right", "label"]
+    dfTrain = dfTrain.dropna()
+
+    # Seeded shuffle: an unseeded sample() gives a different train/validation
+    # split on every run, which makes results impossible to compare.
+    dfTrain = dfTrain.sample(frac=1.0, random_state=params["random_seed"])
+
+    dp = DataProcessor(max_num_words=params["max_num_words"], max_num_chars=params["max_num_chars"])
+    # NOTE(review): the processor is fitted before the split, so it also sees
+    # the validation rows.
+    dfTrain = dp.fit_transform(dfTrain)
+
+    N = dfTrain.shape[0]
+    train_ratio = 0.6
+    train_num = int(N * train_ratio)
+    X_train = get_model_data(dfTrain[:train_num], params)
+    X_valid = get_model_data(dfTrain[train_num:], params)
+
+    # NOTE(review): the literal 0.2 is used here, not params["threshold"];
+    # confirm which value is intended.
+    model = SemanticMatchingModel(model_name, params, logger=logger, threshold=0.2)
+    # The rows were already shuffled above, so fit() is asked not to reshuffle.
+    model.fit(X_train, validation_data=X_valid, shuffle=False)
+
+    # save model
+    model.save_session()
+    # protocol=2 is kept as in the original (readable from Python 2 as well).
+    with open("dp.pkl", "wb") as f:
+        pkl.dump((dp, model.threshold), f, protocol=2)
+
+
+def submit(input_file, output_file):
+    """Predict labels for ``input_file`` and write them to ``output_file``.
+
+    Loads the ``(processor, threshold)`` pair that train() pickled to
+    ``dp.pkl``, restores the saved model session, and scores a header-less,
+    tab-separated file with columns id / left / right. The output is a
+    header-less, tab-separated file with columns id / label.
+
+    Args:
+        input_file: path of the file to score.
+        output_file: path the predictions are written to.
+    """
+    print("read %s" % input_file)
+    print("write %s" % output_file)
+
+    # Unpickling can execute arbitrary code: only load a dp.pkl that was
+    # produced by train() on a trusted machine.
+    with open("dp.pkl", "rb") as pkl_file:
+        processor, threshold = pkl.load(pkl_file)
+    model = SemanticMatchingModel(model_name, params, logger=None, threshold=threshold, training=False)
+    model.restore_session()
+
+    test_frame = pd.read_csv(input_file, header=None, sep="\t")
+    test_frame.columns = ["id", "left", "right"]
+    # Placeholder labels: get_model_data() reads a "label" column.
+    test_frame["label"] = np.zeros(test_frame.shape[0])
+
+    test_frame = processor.transform(test_frame)
+    model_inputs = get_model_data(test_frame, params)
+
+    # Overwrite the placeholder with the model output.
+    test_frame["label"] = model.predict(model_inputs)
+
+    test_frame[["id", "label"]].to_csv(output_file, header=False, index=False, sep="\t")
+
+
+if __name__ == "__main__":
+    # With at least two command-line arguments, run prediction:
+    #   submit(<input_file>, <output_file>)
+    # With fewer, train a new model.
+    if len(sys.argv) <= 2:
+        train()
+    else:
+        submit(*sys.argv[1:3])
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,5 @@


		DATA_DIR = "../data"

		TRAIN_FILE = DATA_DIR + "/atec_nlp_sim_train_all.csv"