changed shell script distribution

stephenhky · Dec 14, 2024 · 151fe01 · 151fe01
1 parent d2dd2a0
commit 151fe01
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 59 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,6 +53,7 @@ Documentation = "https://shorttext.readthedocs.io"
 [tool.setuptools]
 packages = [
     "shorttext",
+    "shorttext.cli",
     "shorttext.utils",
     "shorttext.classifiers",
     "shorttext.classifiers.embed",
@@ -77,7 +78,8 @@ packages = [
 zip-safe = false
 
 [project.scripts]
-#retrieve_stock_symbols = "finsim.retrieve_stock_symbols_cli:main_cli"
+ShortTextCategorizerConsole = "shorttext.cli.categorization:main"
+ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main"
 
 [project.optional-dependencies]
 test = ["unittest2", "pytest", "pytest-cython", "simplerepresentations>=0.0.4"]
diff --git a/setup.py b/setup.py
diff --git a/shorttext/cli/__init__.py b/shorttext/cli/__init__.py
diff --git a/bin/ShortTextCategorizerConsole → shorttext/cli/categorization.py b/bin/ShortTextCategorizerConsole → shorttext/cli/categorization.py
@@ -1,36 +1,20 @@
-#!/usr/bin/env python
 
-# argument parsing
+import os
+from functools import partial
 import argparse
 
-def get_argparser():
-    argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.')
-    argparser.add_argument('model_filepath', help='Path of the trained (compact) model.')
-    argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)')
-    argparser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)')
-    argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)')
-    argparser.add_argument('--inputtext', default=None, help='single input text for classification. Run console if set to None. (Default: None)')
-    argparser.add_argument('--type', default='word2vec',
-                           help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare", "word2vec_nonbinary", "poincare_binary")')
-    return argparser
+from ..utils.compactmodel_io import get_model_classifier_name
+from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException
+from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model
+from ..smartload import smartload_compact_model
+from ..classifiers import TopicVectorCosineDistanceClassifier
 
-argparser = get_argparser()
-args = argparser.parse_args()
 
 allowed_classifiers = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 'topic_sklearn',
                        'nnlibvec', 'sumvec', 'maxent']
 needembedded_classifiers = ['nnlibvec', 'sumvec']
 topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder']
 
-# library loading
-import os
-
-import shorttext
-from shorttext.utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException
-from shorttext.utils import load_word2vec_model, load_fasttext_model, load_poincare_model
-
-from functools import partial
-
 load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False)
 load_poincare_binary_model = partial(load_poincare_model, binary=True)
 
@@ -40,15 +24,30 @@ def get_argparser():
             'poincare': load_poincare_model,
             'poincare_binary': load_poincare_binary_model}
 
+
+def get_argparser():
+    argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.')
+    argparser.add_argument('model_filepath', help='Path of the trained (compact) model.')
+    argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)')
+    argparser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)')
+    argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)')
+    argparser.add_argument('--inputtext', default=None, help='single input text for classification. Run console if set to None. (Default: None)')
+    argparser.add_argument('--type', default='word2vec',
+                           help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare", "word2vec_nonbinary", "poincare_binary")')
+    return argparser
+
 # main block
-if __name__ == '__main__':
+def main():
+    # argument parsing
+    args = get_argparser().parse_args()
+
     # check if the model file is given
     if not os.path.exists(args.model_filepath):
         raise IOError('Model file '+args.model_filepath+' not found!')
 
     # get the name of the classifier
     print('Retrieving classifier name...')
-    classifier_name = shorttext.utils.compactmodel_io.get_model_classifier_name(args.model_filepath)
+    classifier_name = get_model_classifier_name(args.model_filepath)
     if not (classifier_name in allowed_classifiers):
         raise AlgorithmNotExistException(classifier_name)
 
@@ -66,10 +65,10 @@ def get_argparser():
     print('Initializing the classifier...')
     classifier = None
     if classifier_name in topicmodels:
-        topicmodel = shorttext.smartload.smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
-        classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodel)
+        topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
+        classifier = TopicVectorCosineDistanceClassifier(topicmodel)
     else:
-        classifier = shorttext.smartload.smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
+        classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
 
 
     if args.inputtext != None:
@@ -91,4 +90,4 @@ def get_argparser():
             else:
                 run = False
 
-        print('Done.')
+        print('Done.')
diff --git a/bin/ShortTextWordEmbedSimilarity → shorttext/cli/wordembedsim.py b/bin/ShortTextWordEmbedSimilarity → shorttext/cli/wordembedsim.py
@@ -1,7 +1,22 @@
-#!/usr/bin/env python
 
-# argument parsing
 import argparse
+import time
+
+from scipy.spatial.distance import cosine
+
+from ..metrics.embedfuzzy import jaccardscore_sents
+from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model
+from ..utils import shorttext_to_avgvec
+from ..metrics.wasserstein import word_mover_distance
+from ..metrics.dynprog.jaccard import soft_jaccard_score
+
+
+typedict = {
+    'word2vec': load_word2vec_model,
+    'fasttext': load_fasttext_model,
+    'poincare': load_poincare_model
+}
+
 
 def getargparser():
     parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.')
@@ -10,27 +25,11 @@ def getargparser():
                         help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")')
     return parser
 
-parser = getargparser()
-args = parser.parse_args()
-
-
-import time
-
-from scipy.spatial.distance import cosine
-
-from shorttext.metrics.embedfuzzy import jaccardscore_sents
-from shorttext.utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model
-from shorttext.utils import shorttext_to_avgvec
-from shorttext.metrics.wasserstein import word_mover_distance
-from shorttext.metrics.dynprog.jaccard import soft_jaccard_score
-
-
-typedict = {'word2vec': load_word2vec_model,
-            'fasttext': load_fasttext_model,
-            'poincare': load_poincare_model}
 
+def main():
+    # argument parsing
+    args = getargparser().parse_args()
 
-if __name__ == '__main__':
     # preload tokenizer
     tokenize('Mogu is cute.')