diff --git a/Makefile b/Makefile index ab6a709..fe10090 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,17 @@ COMMIT_HASH := $(shell eval git rev-parse HEAD) -cython: - python setup.py build_ext --inplace --force - execute-notebooks: jupyter nbconvert --execute --to notebook --inplace docs/*/*.ipynb --ExecutePreprocessor.timeout=-1 render-notebooks: -doc: render-notebooks - python docs/scripts/index.py - mkdocs build - -livedoc: doc +livedoc: + mkdocs build --clean mkdocs serve --dirtyreload +deploydoc: + mkdocs gh-deploy --force + .PHONY: bench bench: asv run ${COMMIT_HASH} --config benchmarks/asv.conf.json --steps 1 diff --git a/README.md b/README.md index 0751530..39ae11b 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ Cherche enables the development of a neural search pipeline that employs retrievers and pre-trained language models both as retrievers and rankers. The primary advantage of Cherche lies in its capacity to construct end-to-end pipelines. Additionally, Cherche is well-suited for offline semantic search due to its compatibility with batch computation. 
+Here are some of the features Cherche offers: + [Live demo of a NLP search engine powered by Cherche](https://raphaelsty.github.io/knowledge/?query=cherche%20neural%20search) ![Alt text](docs/img/explain.png) @@ -82,12 +84,18 @@ Here is an example of a neural search pipeline composed of a TF-IDF that quickly ```python from cherche import data, retrieve, rank from sentence_transformers import SentenceTransformer +from lenlp import sparse # List of dicts documents = data.load_towns() # Retrieve on fields title and article -retriever = retrieve.TfIdf(key="id", on=["title", "article"], documents=documents, k=30) +retriever = retrieve.BM25( + key="id", + on=["title", "article"], + documents=documents, + k=30 +) # Rank on fields title and article ranker = rank.Encoder( @@ -163,6 +171,7 @@ search(["Bordeaux", "Paris", "Toulouse"]) Cherche provides [retrievers](https://raphaelsty.github.io/cherche/retrieve/retrieve/) that filter input documents based on a query. - retrieve.TfIdf +- retrieve.BM25 - retrieve.Lunr - retrieve.Flash - retrieve.Encoder @@ -193,7 +202,7 @@ We welcome all contributions. ## Acknowledgements πŸ‘ -TfIdf retriever is a wrapper around [scikit-learn's TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). Lunr retriever is a wrapper around [Lunr.py](https://github.com/yeraydiazdiaz/lunr.py). Flash retriever is a wrapper around [FlashText](https://github.com/vi3k6i5/flashtext). DPR, Encode and CrossEncoder rankers are wrappers dedicated to the use of the pre-trained models of [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html) in a neural search pipeline. +Lunr retriever is a wrapper around [Lunr.py](https://github.com/yeraydiazdiaz/lunr.py). Flash retriever is a wrapper around [FlashText](https://github.com/vi3k6i5/flashtext). 
DPR, Encode and CrossEncoder rankers are wrappers dedicated to the use of the pre-trained models of [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html) in a neural search pipeline. ## Citations diff --git a/cherche/__version__.py b/cherche/__version__.py index 5996f20..e6b021f 100644 --- a/cherche/__version__.py +++ b/cherche/__version__.py @@ -1,3 +1,3 @@ -VERSION = (2, 0, 6) +VERSION = (2, 1, 0) __version__ = ".".join(map(str, VERSION)) diff --git a/cherche/evaluate/evaluate.py b/cherche/evaluate/evaluate.py index 89b2f78..d0b6820 100644 --- a/cherche/evaluate/evaluate.py +++ b/cherche/evaluate/evaluate.py @@ -3,8 +3,6 @@ import collections import typing -import numpy as np - __all__ = ["evaluation"] @@ -56,7 +54,7 @@ def evaluation( -------- >>> from pprint import pprint as print >>> from cherche import data, evaluate, retrieve - >>> from sklearn.feature_extraction.text import TfidfVectorizer + >>> from lenlp import sparse >>> documents, query_answers = data.arxiv_tags( ... arxiv_title=True, arxiv_summary=False, comment=False @@ -66,7 +64,7 @@ def evaluation( ... key="uri", ... on=["prefLabel_text", "altLabel_text"], ... documents=documents, - ... tfidf=TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char"), + ... tfidf=sparse.TfidfVectorizer(normalize=True, ngram_range=(3, 7), analyzer="char"), ... ) + documents >>> scores = evaluate.evaluation(search=search, query_answers=query_answers, k=10) diff --git a/cherche/query/prf.py b/cherche/query/prf.py index 8cbefd3..875f1d0 100644 --- a/cherche/query/prf.py +++ b/cherche/query/prf.py @@ -2,7 +2,7 @@ import numpy as np import sklearn -from sklearn.feature_extraction.text import TfidfVectorizer +from lenlp import sparse from sklearn.metrics.pairwise import cosine_similarity from ..utils import yield_batch_single @@ -21,7 +21,7 @@ class PRF(Query): on Fields to use for fitting the spelling corrector on. tf - defaults to sklearn.feature_extraction.text.TfidfVectorizer. 
+ defaults to lenlp.sparse.TfidfVectorizer. If you want to implement your own tf, it needs to follow the sklearn base API and provides the `transform` `fit_transform` and `get_feature_names_out` methods. See sklearn documentation for more information. nb_docs @@ -65,7 +65,7 @@ def __init__( self, on: typing.Union[str, list], documents: list, - tf: sklearn.feature_extraction.text.CountVectorizer = TfidfVectorizer(), + tf: sklearn.feature_extraction.text.CountVectorizer = sparse.TfidfVectorizer(), nb_docs: int = 5, nb_terms_per_doc: int = 3, ) -> None: diff --git a/cherche/retrieve/__init__.py b/cherche/retrieve/__init__.py index 9a4a891..bf713ac 100644 --- a/cherche/retrieve/__init__.py +++ b/cherche/retrieve/__init__.py @@ -1,4 +1,5 @@ from .base import Retriever +from .bm25 import BM25 from .dpr import DPR from .embedding import Embedding from .encoder import Encoder @@ -9,6 +10,7 @@ __all__ = [ "Retriever", + "BM25", "DPR", "Embedding", "Encoder", diff --git a/cherche/retrieve/bm25.py b/cherche/retrieve/bm25.py new file mode 100644 index 0000000..5ee4511 --- /dev/null +++ b/cherche/retrieve/bm25.py @@ -0,0 +1,111 @@ +__all__ = ["BM25"] + +import typing + +from lenlp import sparse + +from .tfidf import TfIdf + + +class BM25(TfIdf): + """BM25 retriever built on the LeNLP BM25Vectorizer. + + Parameters + ---------- + key + Field identifier of each document. + on + Fields to use to match the query to the documents. + documents + Documents in the BM25 retriever are static. The retriever must be reset to index new + documents. + k + Number of documents to retrieve. Default is `None`, i.e. all documents that match the query + will be retrieved. + count_vectorizer + BM25Vectorizer class of LeNLP to create a custom BM25 retriever. + + Examples + -------- + + >>> from pprint import pprint as print + >>> from cherche import retrieve + + >>> documents = [ + ... {"id": 0, "title": "Paris", "article": "Eiffel tower"}, + ... 
{"id": 1, "title": "Montreal", "article": "Montreal is in Canada."}, + ... {"id": 2, "title": "Paris", "article": "Eiffel tower"}, + ... {"id": 3, "title": "Montreal", "article": "Montreal is in Canada."}, + ... ] + + >>> retriever = retrieve.BM25( + ... key="id", + ... on=["title", "article"], + ... documents=documents, + ... ) + + >>> documents = [ + ... {"id": 4, "title": "Paris", "article": "Eiffel tower"}, + ... {"id": 5, "title": "Montreal", "article": "Montreal is in Canada."}, + ... {"id": 6, "title": "Paris", "article": "Eiffel tower"}, + ... {"id": 7, "title": "Montreal", "article": "Montreal is in Canada."}, + ... ] + + >>> retriever = retriever.add(documents) + + >>> print(retriever(q=["paris", "canada"], k=4)) + [[{'id': 6, 'similarity': 0.5404109029445249}, + {'id': 0, 'similarity': 0.5404109029445249}, + {'id': 2, 'similarity': 0.5404109029445249}, + {'id': 4, 'similarity': 0.5404109029445249}], + [{'id': 7, 'similarity': 0.3157669764669935}, + {'id': 5, 'similarity': 0.3157669764669935}, + {'id': 3, 'similarity': 0.3157669764669935}, + {'id': 1, 'similarity': 0.3157669764669935}]] + + >>> print(retriever(["unknown", "montreal paris"], k=2)) + [[], + [{'id': 7, 'similarity': 0.7391866872635209}, + {'id': 5, 'similarity': 0.7391866872635209}]] + + + >>> print(retriever(q="paris")) + [{'id': 6, 'similarity': 0.5404109029445249}, + {'id': 0, 'similarity': 0.5404109029445249}, + {'id': 2, 'similarity': 0.5404109029445249}, + {'id': 4, 'similarity': 0.5404109029445249}] + + References + ---------- + 1. [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) + 2. 
[Python: tf-idf-cosine: to find document similarity](https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity) + + """ + + def __init__( + self, + key: str, + on: typing.Union[str, list], + documents: typing.List[typing.Dict[str, str]] = None, + count_vectorizer: sparse.BM25Vectorizer = None, + k: typing.Optional[int] = None, + batch_size: int = 1024, + fit: bool = True, + ) -> None: + count_vectorizer = ( + sparse.BM25Vectorizer( + normalize=True, ngram_range=(3, 5), analyzer="char_wb" + ) + if count_vectorizer is None + else count_vectorizer + ) + + super().__init__( + key=key, + on=on, + documents=documents, + tfidf=count_vectorizer, + k=k, + batch_size=batch_size, + fit=fit, + ) diff --git a/cherche/retrieve/tfidf.py b/cherche/retrieve/tfidf.py index 511921e..6dd93c0 100644 --- a/cherche/retrieve/tfidf.py +++ b/cherche/retrieve/tfidf.py @@ -3,8 +3,8 @@ import typing import numpy as np +from lenlp import sparse from scipy.sparse import csc_matrix, hstack -from sklearn.feature_extraction.text import TfidfVectorizer from ..utils import yield_batch from .base import Retriever @@ -92,7 +92,7 @@ def __init__( key: str, on: typing.Union[str, list], documents: typing.List[typing.Dict[str, str]] = None, - tfidf: TfidfVectorizer = None, + tfidf: sparse.TfidfVectorizer = None, k: typing.Optional[int] = None, batch_size: int = 1024, fit: bool = True, @@ -100,7 +100,9 @@ def __init__( super().__init__(key=key, on=on, k=k, batch_size=batch_size) self.tfidf = ( - TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char_wb") + sparse.TfidfVectorizer( + normalize=True, ngram_range=(3, 7), analyzer="char_wb" + ) if tfidf is None else tfidf ) @@ -207,7 +209,7 @@ def __call__( ranked = [] for batch in yield_batch( - q, + array=q, batch_size=batch_size if batch_size is not None else self.batch_size, desc=f"{self.__class__.__name__} retriever", tqdm_bar=tqdm_bar, diff --git a/docs/api/evaluate/evaluation.md 
b/docs/api/evaluate/evaluation.md index 38a0d3b..4be42f4 100644 --- a/docs/api/evaluate/evaluation.md +++ b/docs/api/evaluate/evaluation.md @@ -33,7 +33,7 @@ Evaluation function ```python >>> from pprint import pprint as print >>> from cherche import data, evaluate, retrieve ->>> from sklearn.feature_extraction.text import TfidfVectorizer +>>> from lenlp import sparse >>> documents, query_answers = data.arxiv_tags( ... arxiv_title=True, arxiv_summary=False, comment=False @@ -43,7 +43,7 @@ Evaluation function ... key="uri", ... on=["prefLabel_text", "altLabel_text"], ... documents=documents, -... tfidf=TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char"), +... tfidf=sparse.TfidfVectorizer(normalize=True, ngram_range=(3, 7), analyzer="char"), ... ) + documents >>> scores = evaluate.evaluation(search=search, query_answers=query_answers, k=10) diff --git a/docs/api/query/PRF.md b/docs/api/query/PRF.md index 2bbaf28..a3fc4c4 100644 --- a/docs/api/query/PRF.md +++ b/docs/api/query/PRF.md @@ -12,9 +12,9 @@ Pseudo (or blind) Relevance-Feedback module. The Query-Augmentation method appli - **documents** (*list*) -- **tf** (*sklearn.feature_extraction.text.CountVectorizer*) – defaults to `TfidfVectorizer()` +- **tf** (*sklearn.feature_extraction.text.CountVectorizer*) – defaults to `sparse.TfidfVectorizer()` - defaults to sklearn.feature_extraction.text.TfidfVectorizer. If you want to implement your own tf, it needs to follow the sklearn base API and provides the `transform` `fit_transform` and `get_feature_names_out` methods. See sklearn documentation for more information. + defaults to lenlp.sparse.TfidfVectorizer. If you want to implement your own tf, it needs to follow the sklearn base API and provides the `transform` `fit_transform` and `get_feature_names_out` methods. See sklearn documentation for more information. 
- **nb_docs** (*int*) – defaults to `5` diff --git a/docs/api/retrieve/TfIdf.md b/docs/api/retrieve/TfIdf.md index 3235084..e88e948 100644 --- a/docs/api/retrieve/TfIdf.md +++ b/docs/api/retrieve/TfIdf.md @@ -18,9 +18,9 @@ TfIdf retriever based on cosine similarities. Documents in TFIdf retriever are static. The retriever must be reseted to index new documents. -- **tfidf** (*sklearn.feature_extraction.text.TfidfVectorizer*) – defaults to `None` +- **tfidf** (*lenlp.sparse.TfidfVectorizer*) – defaults to `None` - TfidfVectorizer class of Sklearn to create a custom TfIdf retriever. + sparse.TfidfVectorizer class of LeNLP to create a custom TfIdf retriever. - **k** (*Optional[int]*) – defaults to `None` @@ -37,7 +37,7 @@ TfIdf retriever based on cosine similarities. ```python >>> from pprint import pprint as print >>> from cherche import retrieve ->>> from sklearn.feature_extraction.text import TfidfVectorizer +>>> from lenlp import sparse >>> documents = [ ... {"id": 0, "title": "Paris", "article": "Eiffel tower"}, @@ -122,6 +122,6 @@ TfIdf retriever based on cosine similarities. ## References -1. [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) +1. [lenlp.sparse.TfidfVectorizer](https://github.com/raphaelsty/LeNLP) 2. 
[Python: tf-idf-cosine: to find document similarity](https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity) diff --git a/docs/pipeline/pipeline.md b/docs/pipeline/pipeline.md index a99e627..cdc23a5 100644 --- a/docs/pipeline/pipeline.md +++ b/docs/pipeline/pipeline.md @@ -162,7 +162,7 @@ And here is the code: ```python >>> from cherche import retrieve, rank, data >>> from sentence_transformers import SentenceTransformer ->>> from sklearn.feature_extraction.text import TfidfVectorizer +>>> from lenlp import sparse >>> documents, _ = data.arxiv_tags(arxiv_title=True, arxiv_summary=False, comment=False) @@ -185,7 +185,7 @@ And here is the code: ... key = "uri", ... on = ["prefLabel_text", "altLabel_text"], ... documents = documents, -... tfidf = TfidfVectorizer(lowercase=True, min_df=1, max_df=0.9, ngram_range=(3, 7), analyzer="char"), +... tfidf = sparse.TfidfVectorizer(normalize=True, min_df=1, max_df=0.9, ngram_range=(3, 7), analyzer="char"), ... k = 100, ... ) + ranker diff --git a/docs/retrieve/.pages b/docs/retrieve/.pages index 89f9481..c9c1160 100644 --- a/docs/retrieve/.pages +++ b/docs/retrieve/.pages @@ -1,6 +1,7 @@ title: Retrieve nav: - retrieve.md + - bm25.md - tfidf.md - flash.md - lunr.md diff --git a/docs/retrieve/bm25.md b/docs/retrieve/bm25.md new file mode 100644 index 0000000..5aca474 --- /dev/null +++ b/docs/retrieve/bm25.md @@ -0,0 +1,108 @@ +# BM25 + +Our BM25 retriever relies on the [sparse.BM25Vectorizer](https://github.com/raphaelsty/LeNLP) of LeNLP. + +```python +>>> from cherche import retrieve + +>>> documents = [ +... { +... "id": 0, +... "article": "Paris is the capital and most populous city of France", +... "title": "Paris", +... "url": "https://en.wikipedia.org/wiki/Paris" +... }, +... { +... "id": 1, +... "article": "Paris has been one of Europe major centres of finance, diplomacy , commerce , fashion , gastronomy , science , and arts.", +... "title": "Paris", +... 
"url": "https://en.wikipedia.org/wiki/Paris" +... }, +... { +... "id": 2, +... "article": "The City of Paris is the centre and seat of government of the region and province of Île-de-France .", +... "title": "Paris", +... "url": "https://en.wikipedia.org/wiki/Paris" +... } +... ] + +>>> retriever = retrieve.BM25(key="id", on=["title", "article"], documents=documents, k=30) + +>>> retriever("france") +[{'id': 0, 'similarity': 0.1236413097778466}, + {'id': 2, 'similarity': 0.08907655343363269}, + {'id': 1, 'similarity': 0.0031730868527342104}] +``` + +We can also initialize the retriever with a custom [sparse.BM25Vectorizer](https://github.com/raphaelsty/LeNLP). + + + +```python +>>> from cherche import retrieve +>>> from lenlp import sparse + +>>> documents = [ +... { +... "id": 0, +... "article": "Paris is the capital and most populous city of France", +... "title": "Paris", +... "url": "https://en.wikipedia.org/wiki/Paris" +... }, +... { +... "id": 1, +... "article": "Paris has been one of Europe major centres of finance, diplomacy , commerce , fashion , gastronomy , science , and arts.", +... "title": "Paris", +... "url": "https://en.wikipedia.org/wiki/Paris" +... }, +... { +... "id": 2, +... "article": "The City of Paris is the centre and seat of government of the region and province of Île-de-France .", +... "title": "Paris", +... "url": "https://en.wikipedia.org/wiki/Paris" +... } +... ] + +>>> count_vectorizer = sparse.BM25Vectorizer( +... normalize=True, ngram_range=(3, 7), analyzer="char_wb") + +>>> retriever = retrieve.BM25( +... key="id", on=["title", "article"], documents=documents, count_vectorizer=count_vectorizer) + +>>> retriever("fra", k=3) +[{'id': 0, 'similarity': 0.15055477454160002}, + {'id': 2, 'similarity': 0.022883459495904895}] +``` + +## Batch retrieval + +If we have several queries for which we want to retrieve the top k documents then we can +pass a list of queries to the retriever. This is much faster for multiple queries. 
In batch-mode, +retriever returns a list of lists of documents instead of a list of documents. + +```python +>>> retriever(["fra", "arts", "capital"], k=3) +[[{'id': 0, 'similarity': 0.051000705070125066}, # Match query 1 + {'id': 2, 'similarity': 0.03415513704304113}], + [{'id': 1, 'similarity': 0.07021399356970497}], # Match query 2 + [{'id': 0, 'similarity': 0.25972148184421534}]] # Match query 3 +``` + +## Map keys to documents + +We can map documents to retrieved keys. + +```python +>>> retriever += documents +>>> retriever("fra") +[{'id': 0, + 'article': 'Paris is the capital and most populous city of France', + 'title': 'Paris', + 'url': 'https://en.wikipedia.org/wiki/Paris', + 'similarity': 0.15055477454160002}, + {'id': 2, + 'article': 'The City of Paris is the centre and seat of government of the region and province of Île-de-France .', + 'title': 'Paris', + 'url': 'https://en.wikipedia.org/wiki/Paris', + 'similarity': 0.022883459495904895}] +``` diff --git a/docs/retrieve/tfidf.md b/docs/retrieve/tfidf.md index 5c339d5..fefc7e1 100644 --- a/docs/retrieve/tfidf.md +++ b/docs/retrieve/tfidf.md @@ -1,6 +1,6 @@ # TfIdf -Our TF-IDF retriever relies on the [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) of Sklearn. It computes the dot product between the query TF-IDF vector and the documents TF-IDF matrix and retrieves the highest match. TfIdf retriever stores a sparse matrix and an index that links the rows of the matrix to document identifiers. +Our TF-IDF retriever relies on the [sparse.TfidfVectorizer](https://github.com/raphaelsty/LeNLP) of LeNLP. It computes the dot product between the query TF-IDF vector and the documents TF-IDF matrix and retrieves the highest match. TfIdf retriever stores a sparse matrix and an index that links the rows of the matrix to document identifiers. 
```python >>> from cherche import retrieve @@ -34,11 +34,11 @@ Our TF-IDF retriever relies on the [TfidfVectorizer](https://scikit-learn.org/st {'id': 1, 'similarity': 0.02505818772920329}] ``` -We can also initialize the retriever with a custom [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). +We can also initialize the retriever with a custom [sparse.TfidfVectorizer](https://github.com/raphaelsty/LeNLP). ```python >>> from cherche import retrieve ->>> from sklearn.feature_extraction.text import TfidfVectorizer +>>> from lenlp import sparse >>> documents = [ ... { @@ -61,8 +61,8 @@ We can also initialize the retriever with a custom [TfidfVectorizer](https://sci ... } ... ] ->>> tfidf = TfidfVectorizer( -... lowercase=True, ngram_range=(3, 7), analyzer="char_wb") +>>> tfidf = sparse.TfidfVectorizer( +... normalize=True, ngram_range=(3, 7), analyzer="char_wb") >>> retriever = retrieve.TfIdf( ... key="id", on=["title", "article"], documents=documents, tfidf=tfidf) diff --git a/setup.py b/setup.py index 45535ff..080baec 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ "flashtext >= 2.7", "tqdm >= 4.62.3", "scipy >= 1.7.3", + "lenlp >= 1.0.3", ] cpu = ["sentence-transformers >= 2.2.2", "faiss-cpu >= 1.7.4"]