diff --git a/pyproject.toml b/pyproject.toml index 434fc636..d57efba4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ Documentation = "https://shorttext.readthedocs.io" [tool.setuptools] packages = [ "shorttext", + "shorttext.cli", "shorttext.utils", "shorttext.classifiers", "shorttext.classifiers.embed", @@ -77,7 +78,8 @@ packages = [ zip-safe = false [project.scripts] -#retrieve_stock_symbols = "finsim.retrieve_stock_symbols_cli:main_cli" +ShortTextCategorizerConsole = "shorttext.cli.categorization:main" +ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main" [project.optional-dependencies] test = ["unittest2", "pytest", "pytest-cython", "simplerepresentations>=0.0.4"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 1b57de84..00000000 --- a/setup.py +++ /dev/null @@ -1,8 +0,0 @@ - -from setuptools import setup - - -setup( - scripts=['bin/ShortTextCategorizerConsole', - 'bin/ShortTextWordEmbedSimilarity'] - ) diff --git a/shorttext/cli/__init__.py b/shorttext/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bin/ShortTextCategorizerConsole b/shorttext/cli/categorization.py similarity index 80% rename from bin/ShortTextCategorizerConsole rename to shorttext/cli/categorization.py index ad511561..446c4ea7 100644 --- a/bin/ShortTextCategorizerConsole +++ b/shorttext/cli/categorization.py @@ -1,36 +1,20 @@ -#!/usr/bin/env python -# argument parsing +import os +from functools import partial import argparse -def get_argparser(): - argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.') - argparser.add_argument('model_filepath', help='Path of the trained (compact) model.') - argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)') - argparser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)') - argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)') - argparser.add_argument('--inputtext', default=None, help='single input text for classification. Run console if set to None. (Default: None)') - argparser.add_argument('--type', default='word2vec', - help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare", "word2vec_nonbinary", "poincare_binary")') - return argparser +from ..utils.compactmodel_io import get_model_classifier_name +from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException +from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model +from ..smartload import smartload_compact_model +from ..classifiers import TopicVectorCosineDistanceClassifier -argparser = get_argparser() -args = argparser.parse_args() allowed_classifiers = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 'topic_sklearn', 'nnlibvec', 'sumvec', 'maxent'] needembedded_classifiers = ['nnlibvec', 'sumvec'] topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder'] -# library loading -import os - -import shorttext -from shorttext.utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException -from shorttext.utils import load_word2vec_model, load_fasttext_model, load_poincare_model - -from functools import partial - load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False) load_poincare_binary_model = partial(load_poincare_model, binary=True) @@ -40,15 +24,30 @@ def get_argparser(): 'poincare': load_poincare_model, 'poincare_binary': load_poincare_binary_model} + +def get_argparser(): + argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.') + argparser.add_argument('model_filepath', help='Path of the trained (compact) model.') + argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)') + argparser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)') + argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)') + argparser.add_argument('--inputtext', default=None, help='single input text for classification. Run console if set to None. (Default: None)') + argparser.add_argument('--type', default='word2vec', + help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare", "word2vec_nonbinary", "poincare_binary")') + return argparser + # main block -if __name__ == '__main__': +def main(): + # argument parsing + args = get_argparser().parse_args() + # check if the model file is given if not os.path.exists(args.model_filepath): raise IOError('Model file '+args.model_filepath+' not found!') # get the name of the classifier print('Retrieving classifier name...') - classifier_name = shorttext.utils.compactmodel_io.get_model_classifier_name(args.model_filepath) + classifier_name = get_model_classifier_name(args.model_filepath) if not (classifier_name in allowed_classifiers): raise AlgorithmNotExistException(classifier_name) @@ -66,10 +65,10 @@ def get_argparser(): print('Initializing the classifier...') classifier = None if classifier_name in topicmodels: - topicmodel = shorttext.smartload.smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) - classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodel) + topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) + classifier = TopicVectorCosineDistanceClassifier(topicmodel) else: - classifier = shorttext.smartload.smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) + classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize) if args.inputtext != None: @@ -91,4 +90,4 @@ def get_argparser(): else: run = False - print('Done.') \ No newline at end of file + print('Done.') diff --git a/bin/ShortTextWordEmbedSimilarity b/shorttext/cli/wordembedsim.py similarity index 70% rename from bin/ShortTextWordEmbedSimilarity rename to shorttext/cli/wordembedsim.py index eb22f768..db62fcbd 100644 --- a/bin/ShortTextWordEmbedSimilarity +++ b/shorttext/cli/wordembedsim.py @@ -1,7 +1,22 @@ -#!/usr/bin/env python -# argument parsing import argparse +import time + +from scipy.spatial.distance import cosine + +from ..metrics.embedfuzzy import jaccardscore_sents +from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model +from ..utils import shorttext_to_avgvec +from ..metrics.wasserstein import word_mover_distance +from ..metrics.dynprog.jaccard import soft_jaccard_score + + +typedict = { + 'word2vec': load_word2vec_model, + 'fasttext': load_fasttext_model, + 'poincare': load_poincare_model +} + def getargparser(): parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.') @@ -10,27 +25,11 @@ def getargparser(): help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")') return parser -parser = getargparser() -args = parser.parse_args() - - -import time - -from scipy.spatial.distance import cosine - -from shorttext.metrics.embedfuzzy import jaccardscore_sents -from shorttext.utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model -from shorttext.utils import shorttext_to_avgvec -from shorttext.metrics.wasserstein import word_mover_distance -from shorttext.metrics.dynprog.jaccard import soft_jaccard_score - - -typedict = {'word2vec': load_word2vec_model, - 'fasttext': load_fasttext_model, - 'poincare': load_poincare_model} +def main(): + # argument parsing + args = getargparser().parse_args() -if __name__ == '__main__': # preload tokenizer tokenize('Mogu is cute.')