|
2 | 2 |
|
3 | 3 | import logging
|
4 | 4 | import os
|
5 |
| -from typing import List, Optional |
| 5 | +import warnings |
| 6 | +from typing import Any, Dict, List, Literal, Optional |
6 | 7 |
|
7 | 8 | import requests
|
| 9 | + |
8 | 10 | from llama_index.core.base.base_retriever import BaseRetriever
|
9 | 11 | from llama_index.core.callbacks.base import CallbackManager
|
10 | 12 | from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
|
|
13 | 15 |
|
14 | 16 |
|
class YouRetriever(BaseRetriever):
    """
    Retriever for You.com's Search and News API.

    [API reference](https://documentation.you.com/api-reference/)

    Args:
        api_key: you.com API key. When omitted, the key is read from the
            `YOU_API_KEY` environment variable, falling back to `YDC_API_KEY`.
        callback_manager: Optional callback manager forwarded to `BaseRetriever`.
        endpoint: you.com endpoint to query, either "search" or "news"
        num_web_results: The max number of web results to return, must be under 20
        safesearch: Safesearch settings, one of "off", "moderate", "strict", defaults to moderate
        country: Country code, ex: 'US' for United States, see API reference for more info
        search_lang: (News API) Language codes, ex: 'en' for English, see API reference for more info
        ui_lang: (News API) User interface language for the response, ex: 'en' for English, see API reference for more info
        spellcheck: (News API) Whether to spell check query or not, defaults to True

    Raises:
        KeyError: If no `api_key` is passed and neither `YOU_API_KEY` nor
            `YDC_API_KEY` is set in the environment.
        ValueError: If `endpoint` is neither "search" nor "news".
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        callback_manager: Optional[CallbackManager] = None,
        endpoint: Literal["search", "news"] = "search",
        num_web_results: Optional[int] = None,
        safesearch: Optional[Literal["off", "moderate", "strict"]] = None,
        country: Optional[str] = None,
        search_lang: Optional[str] = None,
        ui_lang: Optional[str] = None,
        spellcheck: Optional[bool] = None,
    ) -> None:
        """Init params."""
        # Should deprecate `YOU_API_KEY` in favour of `YDC_API_KEY` for standardization purposes
        self._api_key = api_key or os.getenv("YOU_API_KEY") or os.environ["YDC_API_KEY"]
        super().__init__(callback_manager)

        if endpoint not in ("search", "news"):
            raise ValueError('`endpoint` must be either "search" or "news"')

        # Warn if News API-specific fields are set but endpoint is not "news".
        # Fields are tracked by *name* so the message identifies the offending
        # argument, and compared against None so that an explicit
        # `spellcheck=False` is still reported as "set".
        if endpoint != "news":
            news_api_fields = {
                "search_lang": search_lang,
                "ui_lang": ui_lang,
                "spellcheck": spellcheck,
            }
            for field_name, field_value in news_api_fields.items():
                if field_value is not None:
                    warnings.warn(
                        (
                            f"News API-specific field '{field_name}' is set but `{endpoint=}`. "
                            "This will have no effect."
                        ),
                        UserWarning,
                    )

        self.endpoint = endpoint
        self.num_web_results = num_web_results
        self.safesearch = safesearch
        self.country = country
        self.search_lang = search_lang
        self.ui_lang = ui_lang
        self.spellcheck = spellcheck

    def _generate_params(self, query: str) -> Dict[str, Any]:
        """
        Build the query-string parameters for the configured endpoint.

        The two endpoints use different parameter names for the query text and
        the result limit (`query`/`num_web_results` for "search",
        `q`/`count` for "news"). Options left as `None` are omitted.

        Args:
            query: The search query text.

        Returns:
            A dict of request parameters containing only non-`None` values.
        """
        params: Dict[str, Any] = {
            "safesearch": self.safesearch,
            "country": self.country,
        }

        if self.endpoint == "search":
            params.update(
                query=query,
                num_web_results=self.num_web_results,
            )
        elif self.endpoint == "news":
            params.update(
                q=query,
                count=self.num_web_results,
                search_lang=self.search_lang,
                ui_lang=self.ui_lang,
                spellcheck=self.spellcheck,
            )

        # Remove `None` values
        return {k: v for k, v in params.items() if v is not None}

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """
        Retrieve.

        Sends a GET request to the configured you.com endpoint and converts
        each result into a `TextNode`; every node is returned with a fixed
        score of 1.0.

        Args:
            query_bundle: Bundle whose `query_str` is used as the search query.

        Returns:
            One `NodeWithScore` per search hit or news article.

        Raises:
            requests.HTTPError: If the API responds with an error status
                (via `response.raise_for_status()`).
        """
        headers = {"X-API-Key": self._api_key}
        params = self._generate_params(query_bundle.query_str)
        response = requests.get(
            f"https://api.ydc-index.io/{self.endpoint}",
            params=params,
            headers=headers,
        )
        response.raise_for_status()
        results = response.json()

        nodes: List[TextNode]
        if self.endpoint == "search":
            # Each hit carries a list of snippets; join them into one text body.
            nodes = [
                TextNode(text="\n".join(hit["snippets"]))
                for hit in results["hits"]
            ]
        else:  # news endpoint
            nodes = [
                TextNode(
                    text=article["description"],
                    extra_info={"url": article["url"], "age": article["age"]},
                )
                for article in results["news"]["results"]
            ]

        return [NodeWithScore(node=node, score=1.0) for node in nodes]
0 commit comments