From 9b50b627b4693cbc08a0b706671348eb1ce5452d Mon Sep 17 00:00:00 2001 From: Kwan-Yuet Ho <stephenhky@yahoo.com.hk> Date: Sat, 14 Dec 2024 21:30:03 -0500 Subject: [PATCH] release 2.1.0 --- .readthedocs.yml | 2 +- README.md | 11 +++------- docs/codes.rst | 8 ++++++- docs/install.rst | 2 +- docs/intro.rst | 1 - docs/news.rst | 8 +++++++ docs/scripts.rst | 6 ++++- docs/tutorial_wordembedAPI.rst | 40 ---------------------------------- pyproject.toml | 2 +- 9 files changed, 26 insertions(+), 54 deletions(-) delete mode 100644 docs/tutorial_wordembedAPI.rst diff --git a/.readthedocs.yml b/.readthedocs.yml index c0d768a6..9b7e02d5 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -12,7 +12,7 @@ sphinx: build: os: ubuntu-22.04 tools: - python: "3.9" + python: "3.12" # Build documentation with MkDocs #mkdocs: diff --git a/README.md b/README.md index 9da23c04..46f86fe0 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ representation of the texts and documents are needed before they are put into any classification algorithm. In this package, it facilitates various types of these representations, including topic modeling and word-embedding algorithms. -The package `shorttext` runs on Python 3.8, 3.9, 3.10, and 3.11. +The package `shorttext` runs on Python 3.9, 3.10, 3.11, and 3.12. Characteristics: - example data provided (including subject keywords and NIH RePORT); @@ -31,8 +31,7 @@ Characteristics: - maximum entropy classification; - metrics of phrases differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD); - character-level sequence-to-sequence (seq2seq) learning; -- spell correction; -- API for word-embedding algorithm for one-time loading; and +- spell correction; and - Sentence encodings and similarities based on BERT. ## Documentation @@ -84,6 +83,7 @@ If you would like to contribute, feel free to submit the pull requests. You can ## News +* 12/14/2024: `shorttext` 2.1.0 released. 
* 07/12/2024: `shorttext` 2.0.0 released. * 12/21/2023: `shorttext` 1.6.1 released. * 08/26/2023: `shorttext` 1.6.0 released. @@ -159,8 +159,3 @@ If you would like to contribute, feel free to submit the pull requests. You can * 12/21/2016: `shorttext` 0.2.0 released. * 11/25/2016: `shorttext` 0.1.2 released. * 11/21/2016: `shorttext` 0.1.1 released. - -## Possible Future Updates - -- [ ] Dividing components to other packages; -- [ ] More available corpus. diff --git a/docs/codes.rst b/docs/codes.rst index e77c2d4a..b86ecd42 100644 --- a/docs/codes.rst +++ b/docs/codes.rst @@ -65,7 +65,13 @@ Module `shorttext.metrics.dynprog` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. automodule:: shorttext.metrics.dynprog.jaccard - :members: soft_intersection_list + :members: + +.. automodule:: shorttext.metrics.dynprog.dldist + :members: + +.. automodule:: shorttext.metrics.dynprog.lcp + :members: Module `shorttext.metrics.wassersterin` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/install.rst b/docs/install.rst index 99cee6cf..d1d2bacc 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -41,7 +41,7 @@ you may try one (or more) of the following: :: - pip install -U python3-dev + pip install python3-dev diff --git a/docs/intro.rst b/docs/intro.rst index 3a129c5e..929816de 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -23,7 +23,6 @@ Characteristics: - metrics of phrases differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD); (see :doc:`tutorial_metrics`) - character-level sequence-to-sequence (seq2seq) learning; (see :doc:`tutorial_charbaseseq2seq`) - spell correction; (see :doc:`tutorial_spell`) -- API for word-embedding algorithm for one-time loading; (see :doc:`tutorial_wordembedAPI`) and - Sentence encodings and similarities based on BERT (see :doc:`tutorial_wordembed` and :doc:`tutorial_metrics`). 
Author: Kwan Yuet Stephen Ho (LinkedIn_, ResearchGate_, Twitter_) diff --git a/docs/news.rst b/docs/news.rst index 4b5a4740..b1b9c266 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -1,6 +1,7 @@ News ==== +* 12/14/2024: `shorttext` 2.1.0 released. * 07/12/2024: `shorttext` 2.0.0 released. * 12/21/2023: `shorttext` 1.6.1 released. * 08/26/2023: `shorttext` 1.6.0 released. @@ -81,6 +82,13 @@ News What's New ---------- +Released 2.1.0 (December 14, 2024) +---------------------------------- + +* Use of `pyproject.toml` for package distribution. +* Removed Cython components. +* Huge relative import refactoring. + Released 2.0.0 (July 13, 2024) ------------------------------ diff --git a/docs/scripts.rst b/docs/scripts.rst index 5911a92d..a729b6ed 100644 --- a/docs/scripts.rst +++ b/docs/scripts.rst @@ -12,6 +12,7 @@ ShortTextCategorizerConsole usage: ShortTextCategorizerConsole [-h] [--wv WV] [--vecsize VECSIZE] [--topn TOPN] [--inputtext INPUTTEXT] + [--type TYPE] model_filepath Perform prediction on short text with a given trained model. @@ -19,7 +20,7 @@ ShortTextCategorizerConsole positional arguments: model_filepath Path of the trained (compact) model. - optional arguments: + options: -h, --help show this help message and exit --wv WV Path of the pre-trained Word2Vec model. (None if not needed) @@ -28,6 +29,9 @@ ShortTextCategorizerConsole --inputtext INPUTTEXT single input text for classification. Run console if set to None. 
(Default: None) + --type TYPE Type of word-embedding model (default: "word2vec"; + other options: "fasttext", "poincare", + "word2vec_nonbinary", "poincare_binary") ShortTextWordEmbedSimilarity diff --git a/docs/tutorial_wordembedAPI.rst b/docs/tutorial_wordembedAPI.rst deleted file mode 100644 index e38a237b..00000000 --- a/docs/tutorial_wordembedAPI.rst +++ /dev/null @@ -1,40 +0,0 @@ -Word Embedding Models in API -============================ - -A lot of embedding models take a few minutes to load, and it would be desirable -for such a model to be loaded in the memory first. It is why such an API -has been developed. - -Model Preloading ----------------- - -To preload the model, use the script `WordEmbedAPI` provided. In -the command-line shell / Terminal, type: - -``` -> WordEmbedAPI /path/to/GoogleNews-vectors-negative300.bin.gz -``` - -After a few minutes, it will be loaded. - -For details about using `WordEmbedAPI`, please refer to: :doc:`scripts` . - -Class for Preloaded Model -------------------------- - -After the model is loaded, it can be used like other word-embedding models -using `RESTfulKeyedVectors`: - -``` ->>> import shorttext ->>> wmodel = shorttext.utils.wordembed.RESTfulKeyedVectors('http://localhost', port='5000') -``` - -This model can be used like other `gensim` `KeyedVectors`. - -.. autoclass:: shorttext.utils.wordembed.RESTfulKeyedVectors - :members: - - -Home: :doc:`index` - diff --git a/pyproject.toml b/pyproject.toml index 1649aebb..d85ebc10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "shorttext" -version = "2.1.0a1" +version = "2.1.0" authors = [ {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"} ]