Skip to content

Commit

Permalink
changed shell script distribution
Browse files Browse the repository at this point in the history
  • Loading branch information
stephenhky committed Dec 14, 2024
1 parent d2dd2a0 commit 151fe01
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 59 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Documentation = "https://shorttext.readthedocs.io"
[tool.setuptools]
packages = [
"shorttext",
"shorttext.cli",
"shorttext.utils",
"shorttext.classifiers",
"shorttext.classifiers.embed",
Expand All @@ -77,7 +78,8 @@ packages = [
zip-safe = false

[project.scripts]
#retrieve_stock_symbols = "finsim.retrieve_stock_symbols_cli:main_cli"
ShortTextCategorizerConsole = "shorttext.cli.categorization:main"
ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main"

[project.optional-dependencies]
test = ["unittest2", "pytest", "pytest-cython", "simplerepresentations>=0.0.4"]
8 changes: 0 additions & 8 deletions setup.py

This file was deleted.

Empty file added shorttext/cli/__init__.py
Empty file.
57 changes: 28 additions & 29 deletions bin/ShortTextCategorizerConsole → shorttext/cli/categorization.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,20 @@
#!/usr/bin/env python

# argument parsing
import os
from functools import partial
import argparse

def get_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for short-text prediction.

    The parser takes one positional argument, ``model_filepath``, and the
    options ``--wv``, ``--vecsize``, ``--topn``, ``--inputtext`` and
    ``--type``.  The value of ``--type`` is not validated here (no
    ``choices`` are declared); any string is accepted by the parser.

    Returns:
        A configured ``argparse.ArgumentParser``; call ``parse_args()`` on it.
    """
    argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.')
    argparser.add_argument('model_filepath', help='Path of the trained (compact) model.')
    argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)')
    argparser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)')
    argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)')
    argparser.add_argument('--inputtext', default=None, help='single input text for classification. Run console if set to None. (Default: None)')
    argparser.add_argument('--type', default='word2vec',
                           help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare", "word2vec_nonbinary", "poincare_binary")')
    return argparser
from ..utils.compactmodel_io import get_model_classifier_name
from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException
from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model
from ..smartload import smartload_compact_model
from ..classifiers import TopicVectorCosineDistanceClassifier

argparser = get_argparser()
args = argparser.parse_args()

# Classifier names this script accepts; the main block raises
# AlgorithmNotExistException for a model whose classifier name is not listed.
allowed_classifiers = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 'topic_sklearn',
'nnlibvec', 'sumvec', 'maxent']
# NOTE(review): presumably the classifiers that need a word-embedding model
# (the --wv option); the code that consumes this list is not visible here -- confirm.
needembedded_classifiers = ['nnlibvec', 'sumvec']
# Topic-model names: models of these kinds are loaded and then wrapped in
# TopicVectorCosineDistanceClassifier by the main block.
topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder']

# library loading
import os

import shorttext
from shorttext.utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException
from shorttext.utils import load_word2vec_model, load_fasttext_model, load_poincare_model

from functools import partial

# Loader variants with the `binary` keyword pre-set via functools.partial, so
# they can sit next to the plain loaders in the embedding-type lookup table.
load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False)
load_poincare_binary_model = partial(load_poincare_model, binary=True)

Expand All @@ -40,15 +24,30 @@ def get_argparser():
'poincare': load_poincare_model,
'poincare_binary': load_poincare_binary_model}


def get_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for short-text prediction.

    The parser takes one positional argument, ``model_filepath``, and the
    options ``--wv``, ``--vecsize``, ``--topn``, ``--inputtext`` and
    ``--type``.  The value of ``--type`` is not validated here (no
    ``choices`` are declared); any string is accepted by the parser.

    Returns:
        A configured ``argparse.ArgumentParser``; call ``parse_args()`` on it.
    """
    argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.')
    argparser.add_argument('model_filepath', help='Path of the trained (compact) model.')
    argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)')
    argparser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)')
    argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. (Default: 10)')
    argparser.add_argument('--inputtext', default=None, help='single input text for classification. Run console if set to None. (Default: None)')
    argparser.add_argument('--type', default='word2vec',
                           help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare", "word2vec_nonbinary", "poincare_binary")')
    return argparser

# main block
if __name__ == '__main__':
def main():
# argument parsing
args = get_argparser().parse_args()

# check if the model file is given
if not os.path.exists(args.model_filepath):
raise IOError('Model file '+args.model_filepath+' not found!')

# get the name of the classifier
print('Retrieving classifier name...')
classifier_name = shorttext.utils.compactmodel_io.get_model_classifier_name(args.model_filepath)
classifier_name = get_model_classifier_name(args.model_filepath)
if not (classifier_name in allowed_classifiers):
raise AlgorithmNotExistException(classifier_name)

Expand All @@ -66,10 +65,10 @@ def get_argparser():
print('Initializing the classifier...')
classifier = None
if classifier_name in topicmodels:
topicmodel = shorttext.smartload.smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodel)
topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
classifier = TopicVectorCosineDistanceClassifier(topicmodel)
else:
classifier = shorttext.smartload.smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)


if args.inputtext != None:
Expand All @@ -91,4 +90,4 @@ def get_argparser():
else:
run = False

print('Done.')
print('Done.')
41 changes: 20 additions & 21 deletions bin/ShortTextWordEmbedSimilarity → shorttext/cli/wordembedsim.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
#!/usr/bin/env python

# argument parsing
import argparse
import time

from scipy.spatial.distance import cosine

from ..metrics.embedfuzzy import jaccardscore_sents
from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model
from ..utils import shorttext_to_avgvec
from ..metrics.wasserstein import word_mover_distance
from ..metrics.dynprog.jaccard import soft_jaccard_score


# Word-embedding type name -> loader function.  The keys are the same names
# the --type option's help text advertises ("word2vec", "fasttext", "poincare").
# NOTE(review): presumably indexed with the parsed --type value; the lookup
# itself is outside the visible part of this file -- confirm.
typedict = {
'word2vec': load_word2vec_model,
'fasttext': load_fasttext_model,
'poincare': load_poincare_model
}


def getargparser():
parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.')
Expand All @@ -10,27 +25,11 @@ def getargparser():
help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")')
return parser

parser = getargparser()
args = parser.parse_args()


import time

from scipy.spatial.distance import cosine

from shorttext.metrics.embedfuzzy import jaccardscore_sents
from shorttext.utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model
from shorttext.utils import shorttext_to_avgvec
from shorttext.metrics.wasserstein import word_mover_distance
from shorttext.metrics.dynprog.jaccard import soft_jaccard_score


# Word-embedding type name -> loader function.  The keys are the same names
# the --type option's help text advertises ("word2vec", "fasttext", "poincare").
# NOTE(review): presumably indexed with the parsed --type value; the lookup
# itself is outside the visible part of this file -- confirm.
typedict = {'word2vec': load_word2vec_model,
'fasttext': load_fasttext_model,
'poincare': load_poincare_model}

def main():
# argument parsing
args = getargparser().parse_args()

if __name__ == '__main__':
# preload tokenizer
tokenize('Mogu is cute.')

Expand Down

0 comments on commit 151fe01

Please sign in to comment.