diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..7a5426e9 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + analytics = false + remote = remote_storage +['remote "remote_storage"'] + url = /home/mlops/dvc_remote diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/data/prepared/.gitignore b/data/prepared/.gitignore index e69de29b..22a65dd9 100644 --- a/data/prepared/.gitignore +++ b/data/prepared/.gitignore @@ -0,0 +1,2 @@ +/train.csv +/test.csv diff --git a/data/prepared/test.csv.dvc b/data/prepared/test.csv.dvc new file mode 100644 index 00000000..3b350283 --- /dev/null +++ b/data/prepared/test.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: cbd4ba69ced15e40820a635e7e741627 + size: 71491 + path: test.csv diff --git a/data/prepared/train.csv.dvc b/data/prepared/train.csv.dvc new file mode 100644 index 00000000..ae08b157 --- /dev/null +++ b/data/prepared/train.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 7f79cf9a4ab1316f7d7246d0b92ea16c + size: 178060 + path: train.csv diff --git a/data/raw/.gitignore b/data/raw/.gitignore index e69de29b..a5d9d98f 100644 --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -0,0 +1,2 @@ +/train +/val diff --git a/data/raw/train.dvc b/data/raw/train.dvc new file mode 100644 index 00000000..14162364 --- /dev/null +++ b/data/raw/train.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 7adc7abb69056f4d7afb512c78f2fce9.dir + size: 75309082 + nfiles: 9470 + path: train diff --git a/data/raw/val.dvc b/data/raw/val.dvc new file mode 100644 index 00000000..237148f4 --- /dev/null +++ b/data/raw/val.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 0ad4dcf197b452735726bf8d8777201d.dir + size: 31248080 + nfiles: 3925 + path: val diff --git a/metrics/accuracy.json b/metrics/accuracy.json new file mode 100644 index 00000000..39176609 --- /dev/null +++ b/metrics/accuracy.json @@ -0,0 +1 @@ +{"accuracy": 0.7490494296577946} \ No newline at end of file diff --git a/model/.gitignore b/model/.gitignore index e69de29b..565a9d50 100644 --- a/model/.gitignore +++ b/model/.gitignore @@ -0,0 +1 @@ +/model.joblib diff --git a/model/model.joblib.dvc b/model/model.joblib.dvc new file mode 100644 index 00000000..4b96d593 --- /dev/null +++ b/model/model.joblib.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 22490d1b369e3f7423c5b6ebd4db4234 + size: 241075 + path: model.joblib diff --git a/src/train.py b/src/train.py index e5feeda6..5d3c7c2c 100644 --- a/src/train.py +++ b/src/train.py @@ -37,7 +37,7 @@ def load_data(data_path): def main(repo_path): train_csv_path = repo_path / "data/prepared/train.csv" train_data, labels = load_data(train_csv_path) - sgd = SGDClassifier(max_iter=10) + sgd = SGDClassifier(max_iter=100) trained_model = sgd.fit(train_data, labels) dump(trained_model, repo_path / "model/model.joblib")