Merge pull request #20 from raphaelsty/test

BM25 retriever
raphaelsty · May 30, 2024 · 5f149e2 · 5f149e2
2 parents 6aabe70 + 71b66ff
commit 5f149e2
Show file tree

Hide file tree

Showing 16 changed files with 266 additions and 37 deletions.
diff --git a/Makefile b/Makefile
@@ -1,20 +1,17 @@
 COMMIT_HASH := $(shell eval git rev-parse HEAD)
 
-cython:
-	python setup.py build_ext --inplace --force
-
 execute-notebooks:
 	jupyter nbconvert --execute --to notebook --inplace docs/*/*.ipynb --ExecutePreprocessor.timeout=-1
 
 render-notebooks:
 
-doc: render-notebooks
-	python docs/scripts/index.py
-	mkdocs build
-
-livedoc: doc
+livedoc:
+	mkdocs build --clean
 	mkdocs serve --dirtyreload
 
+deploydoc:
+	mkdocs gh-deploy --force
+
 .PHONY: bench
 bench:
 	asv run ${COMMIT_HASH} --config benchmarks/asv.conf.json --steps 1

diff --git a/README.md b/README.md
@@ -17,6 +17,8 @@
 
 Cherche enables the development of a neural search pipeline that employs retrievers and pre-trained language models both as retrievers and rankers. The primary advantage of Cherche lies in its capacity to construct end-to-end pipelines. Additionally, Cherche is well-suited for offline semantic search due to its compatibility with batch computation.
 
+Here are some of the features Cherche offers:
+
 [Live demo of a NLP search engine powered by Cherche](https://raphaelsty.github.io/knowledge/?query=cherche%20neural%20search)
 
 ![Alt text](docs/img/explain.png)
@@ -82,12 +84,18 @@ Here is an example of a neural search pipeline composed of a TF-IDF that quickly
 ```python
 from cherche import data, retrieve, rank
 from sentence_transformers import SentenceTransformer
+from lenlp import sparse
 
 # List of dicts
 documents = data.load_towns()
 
 # Retrieve on fields title and article
-retriever = retrieve.TfIdf(key="id", on=["title", "article"], documents=documents, k=30)
+retriever = retrieve.BM25(
+  key="id", 
+  on=["title", "article"], 
+  documents=documents, 
+  k=30
+)
 
 # Rank on fields title and article
 ranker = rank.Encoder(
@@ -163,6 +171,7 @@ search(["Bordeaux", "Paris", "Toulouse"])
 Cherche provides [retrievers](https://raphaelsty.github.io/cherche/retrieve/retrieve/) that filter input documents based on a query.
 
 - retrieve.TfIdf
+- retrieve.BM25
 - retrieve.Lunr
 - retrieve.Flash
 - retrieve.Encoder
@@ -193,7 +202,7 @@ We welcome all contributions.
 
 ## Acknowledgements 👏
 
-TfIdf retriever is a wrapper around [scikit-learn's TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). Lunr retriever is a wrapper around [Lunr.py](https://github.com/yeraydiazdiaz/lunr.py). Flash retriever is a wrapper around [FlashText](https://github.com/vi3k6i5/flashtext). DPR, Encode and CrossEncoder rankers are wrappers dedicated to the use of the pre-trained models of [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html) in a neural search pipeline.
+Lunr retriever is a wrapper around [Lunr.py](https://github.com/yeraydiazdiaz/lunr.py). Flash retriever is a wrapper around [FlashText](https://github.com/vi3k6i5/flashtext). DPR, Encoder and CrossEncoder rankers are wrappers dedicated to the use of the pre-trained models of [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html) in a neural search pipeline.
 
 ## Citations
 

diff --git a/cherche/__version__.py b/cherche/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (2, 0, 6)
+VERSION = (2, 1, 0)
 
 __version__ = ".".join(map(str, VERSION))
diff --git a/cherche/evaluate/evaluate.py b/cherche/evaluate/evaluate.py
@@ -3,8 +3,6 @@
 import collections
 import typing
 
-import numpy as np
-
 __all__ = ["evaluation"]
 
 
@@ -56,7 +54,7 @@ def evaluation(
     --------
     >>> from pprint import pprint as print
     >>> from cherche import data, evaluate, retrieve
-    >>> from sklearn.feature_extraction.text import TfidfVectorizer
+    >>> from lenlp import sparse
 
     >>> documents, query_answers = data.arxiv_tags(
     ...    arxiv_title=True, arxiv_summary=False, comment=False
@@ -66,7 +64,7 @@ def evaluation(
     ...     key="uri",
     ...     on=["prefLabel_text", "altLabel_text"],
     ...     documents=documents,
-    ...     tfidf=TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char"),
+    ...     tfidf=sparse.TfidfVectorizer(normalize=True, ngram_range=(3, 7), analyzer="char"),
     ... ) + documents
 
     >>> scores = evaluate.evaluation(search=search, query_answers=query_answers, k=10)

diff --git a/cherche/query/prf.py b/cherche/query/prf.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import sklearn
-from sklearn.feature_extraction.text import TfidfVectorizer
+from lenlp import sparse
 from sklearn.metrics.pairwise import cosine_similarity
 
 from ..utils import yield_batch_single
@@ -21,7 +21,7 @@ class PRF(Query):
     on
         Fields to use for fitting the spelling corrector on.
     tf
-        defaults to sklearn.feature_extraction.text.TfidfVectorizer.
+        defaults to lenlp.sparse.TfidfVectorizer.
         If you want to implement your own tf, it needs to follow the sklearn base API and provides the `transform`
         `fit_transform` and `get_feature_names_out` methods. See sklearn documentation for more information.
     nb_docs
@@ -65,7 +65,7 @@ def __init__(
         self,
         on: typing.Union[str, list],
         documents: list,
-        tf: sklearn.feature_extraction.text.CountVectorizer = TfidfVectorizer(),
+        tf: sklearn.feature_extraction.text.CountVectorizer = sparse.TfidfVectorizer(),
         nb_docs: int = 5,
         nb_terms_per_doc: int = 3,
     ) -> None:

diff --git a/cherche/retrieve/__init__.py b/cherche/retrieve/__init__.py
@@ -1,4 +1,5 @@
 from .base import Retriever
+from .bm25 import BM25
 from .dpr import DPR
 from .embedding import Embedding
 from .encoder import Encoder
@@ -9,6 +10,7 @@
 
 __all__ = [
     "Retriever",
+    "BM25",
     "DPR",
     "Embedding",
     "Encoder",

diff --git a/cherche/retrieve/bm25.py b/cherche/retrieve/bm25.py
@@ -0,0 +1,111 @@
+__all__ = ["BM25"]
+
+import typing
+
+from lenlp import sparse
+
+from .tfidf import TfIdf
+
+
+class BM25(TfIdf):
+    """BM25 retriever based on cosine similarities.
+
+    Parameters
+    ----------
+    key
+        Field identifier of each document.
+    on
+        Fields to use to match the query to the documents.
+    documents
+        Documents in the BM25 retriever are static. The retriever must be reset to index new
+        documents.
+    k
+        Number of documents to retrieve. Default is `None`, i.e all documents that match the query
+        will be retrieved.
+    count_vectorizer
+        BM25Vectorizer class of LeNLP to create a custom BM25 retriever.
+
+    Examples
+    --------
+
+    >>> from pprint import pprint as print
+    >>> from cherche import retrieve
+
+    >>> documents = [
+    ...     {"id": 0, "title": "Paris", "article": "Eiffel tower"},
+    ...     {"id": 1, "title": "Montreal", "article": "Montreal is in Canada."},
+    ...     {"id": 2, "title": "Paris", "article": "Eiffel tower"},
+    ...     {"id": 3, "title": "Montreal", "article": "Montreal is in Canada."},
+    ... ]
+
+    >>> retriever = retrieve.BM25(
+    ...     key="id",
+    ...     on=["title", "article"],
+    ...     documents=documents,
+    ... )
+
+    >>> documents = [
+    ...     {"id": 4, "title": "Paris", "article": "Eiffel tower"},
+    ...     {"id": 5, "title": "Montreal", "article": "Montreal is in Canada."},
+    ...     {"id": 6, "title": "Paris", "article": "Eiffel tower"},
+    ...     {"id": 7, "title": "Montreal", "article": "Montreal is in Canada."},
+    ... ]
+
+    >>> retriever = retriever.add(documents)
+
+    >>> print(retriever(q=["paris", "canada"], k=4))
+    [[{'id': 6, 'similarity': 0.5404109029445249},
+      {'id': 0, 'similarity': 0.5404109029445249},
+      {'id': 2, 'similarity': 0.5404109029445249},
+      {'id': 4, 'similarity': 0.5404109029445249}],
+     [{'id': 7, 'similarity': 0.3157669764669935},
+      {'id': 5, 'similarity': 0.3157669764669935},
+      {'id': 3, 'similarity': 0.3157669764669935},
+      {'id': 1, 'similarity': 0.3157669764669935}]]
+
+    >>> print(retriever(["unknown", "montreal paris"], k=2))
+    [[],
+     [{'id': 7, 'similarity': 0.7391866872635209},
+      {'id': 5, 'similarity': 0.7391866872635209}]]
+
+
+    >>> print(retriever(q="paris"))
+    [{'id': 6, 'similarity': 0.5404109029445249},
+     {'id': 0, 'similarity': 0.5404109029445249},
+     {'id': 2, 'similarity': 0.5404109029445249},
+     {'id': 4, 'similarity': 0.5404109029445249}]
+
+    References
+    ----------
+    1. [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
+    2. [Python: tf-idf-cosine: to find document similarity](https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity)
+
+    """
+
+    def __init__(
+        self,
+        key: str,
+        on: typing.Union[str, list],
+        documents: typing.List[typing.Dict[str, str]] = None,
+        count_vectorizer: sparse.BM25Vectorizer = None,
+        k: typing.Optional[int] = None,
+        batch_size: int = 1024,
+        fit: bool = True,
+    ) -> None:
+        count_vectorizer = (
+            sparse.BM25Vectorizer(
+                normalize=True, ngram_range=(3, 5), analyzer="char_wb"
+            )
+            if count_vectorizer is None
+            else count_vectorizer
+        )
+
+        super().__init__(
+            key=key,
+            on=on,
+            documents=documents,
+            tfidf=count_vectorizer,
+            k=k,
+            batch_size=batch_size,
+            fit=fit,
+        )
diff --git a/cherche/retrieve/tfidf.py b/cherche/retrieve/tfidf.py
@@ -3,8 +3,8 @@
 import typing
 
 import numpy as np
+from lenlp import sparse
 from scipy.sparse import csc_matrix, hstack
-from sklearn.feature_extraction.text import TfidfVectorizer
 
 from ..utils import yield_batch
 from .base import Retriever
@@ -92,15 +92,17 @@ def __init__(
         key: str,
         on: typing.Union[str, list],
         documents: typing.List[typing.Dict[str, str]] = None,
-        tfidf: TfidfVectorizer = None,
+        tfidf: sparse.TfidfVectorizer = None,
         k: typing.Optional[int] = None,
         batch_size: int = 1024,
         fit: bool = True,
     ) -> None:
         super().__init__(key=key, on=on, k=k, batch_size=batch_size)
 
         self.tfidf = (
-            TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char_wb")
+            sparse.TfidfVectorizer(
+                normalize=True, ngram_range=(3, 7), analyzer="char_wb"
+            )
             if tfidf is None
             else tfidf
         )
@@ -207,7 +209,7 @@ def __call__(
         ranked = []
 
         for batch in yield_batch(
-            q,
+            array=q,
             batch_size=batch_size if batch_size is not None else self.batch_size,
             desc=f"{self.__class__.__name__} retriever",
             tqdm_bar=tqdm_bar,

diff --git a/docs/api/evaluate/evaluation.md b/docs/api/evaluate/evaluation.md
@@ -33,7 +33,7 @@ Evaluation function
 ```python
 >>> from pprint import pprint as print
 >>> from cherche import data, evaluate, retrieve
->>> from sklearn.feature_extraction.text import TfidfVectorizer
+>>> from lenlp import sparse
 
 >>> documents, query_answers = data.arxiv_tags(
 ...    arxiv_title=True, arxiv_summary=False, comment=False
@@ -43,7 +43,7 @@ Evaluation function
 ...     key="uri",
 ...     on=["prefLabel_text", "altLabel_text"],
 ...     documents=documents,
-...     tfidf=TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char"),
+...     tfidf=sparse.TfidfVectorizer(normalize=True, ngram_range=(3, 7), analyzer="char"),
 ... ) + documents
 
 >>> scores = evaluate.evaluation(search=search, query_answers=query_answers, k=10)

diff --git a/docs/api/query/PRF.md b/docs/api/query/PRF.md
@@ -12,9 +12,9 @@ Pseudo (or blind) Relevance-Feedback module. The Query-Augmentation method appli
 
 - **documents** (*list*)
 
-- **tf** (*sklearn.feature_extraction.text.CountVectorizer*) – defaults to `TfidfVectorizer()`
+- **tf** (*sklearn.feature_extraction.text.CountVectorizer*) – defaults to `sparse.TfidfVectorizer()`
 
-    defaults to sklearn.feature_extraction.text.TfidfVectorizer. If you want to implement your own tf, it needs to follow the sklearn base API and provides the `transform` `fit_transform` and `get_feature_names_out` methods. See sklearn documentation for more information.
+    defaults to lenlp.sparse.TfidfVectorizer. If you want to implement your own tf, it needs to follow the sklearn base API and provides the `transform` `fit_transform` and `get_feature_names_out` methods. See sklearn documentation for more information.
 
 - **nb_docs** (*int*) – defaults to `5`
 

diff --git a/docs/api/retrieve/TfIdf.md b/docs/api/retrieve/TfIdf.md
@@ -18,9 +18,9 @@ TfIdf retriever based on cosine similarities.
 
     Documents in TFIdf retriever are static. The retriever must be reseted to index new documents.
 
-- **tfidf** (*sklearn.feature_extraction.text.TfidfVectorizer*) – defaults to `None`
+- **tfidf** (*lenlp.sparse.TfidfVectorizer*) – defaults to `None`
 
-    TfidfVectorizer class of Sklearn to create a custom TfIdf retriever.
+    sparse.TfidfVectorizer class of LeNLP to create a custom TfIdf retriever.
 
 - **k** (*Optional[int]*) – defaults to `None`
 
@@ -37,7 +37,7 @@ TfIdf retriever based on cosine similarities.
 ```python
 >>> from pprint import pprint as print
 >>> from cherche import retrieve
->>> from sklearn.feature_extraction.text import TfidfVectorizer
+>>> from lenlp import sparse
 
 >>> documents = [
 ...     {"id": 0, "title": "Paris", "article": "Eiffel tower"},
@@ -122,6 +122,6 @@ TfIdf retriever based on cosine similarities.
 
 ## References
 
-1. [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
+1. [lenlp.sparse.TfidfVectorizer](https://github.com/raphaelsty/LeNLP)
 2. [Python: tf-idf-cosine: to find document similarity](https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity)
 
diff --git a/docs/pipeline/pipeline.md b/docs/pipeline/pipeline.md
@@ -162,7 +162,7 @@ And here is the code:
 ```python
 >>> from cherche import retrieve, rank, data
 >>> from sentence_transformers import SentenceTransformer
->>> from sklearn.feature_extraction.text import TfidfVectorizer
+>>> from lenlp import sparse
 
 >>> documents, _ = data.arxiv_tags(arxiv_title=True, arxiv_summary=False, comment=False)
 
@@ -185,7 +185,7 @@ And here is the code:
 ...    key = "uri",
 ...    on = ["prefLabel_text", "altLabel_text"],
 ...    documents = documents,
-...    tfidf = TfidfVectorizer(lowercase=True, min_df=1, max_df=0.9, ngram_range=(3, 7), analyzer="char"),
+...    tfidf = sparse.TfidfVectorizer(normalize=True, min_df=1, max_df=0.9, ngram_range=(3, 7), analyzer="char"),
 ...    k = 100,
 ... ) + ranker
 

diff --git a/docs/retrieve/.pages b/docs/retrieve/.pages
@@ -1,6 +1,7 @@
 title: Retrieve
 nav:
     - retrieve.md
+    - bm25.md
     - tfidf.md
     - flash.md
     - lunr.md