Commit 37deee0

Added caching feature for search engine
1 parent f3ae96a commit 37deee0

5 files changed: +110 −9 lines changed

Diff for: .gitignore

+3
@@ -102,3 +102,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+#search_engine_parser cache
+**/cache/**

Diff for: README.md

+14
@@ -125,6 +125,19 @@ For localization, you can pass the `url` keyword and a localized url. This would
 results = gsearch.search(*search_args, url="google.de")
 ```
 
+The results are automatically cached for engine searches, you can either bypass cache by adding `cache=False` to the `search` method or clear the engines cache
+```python
+github = GitHubSearch()
+# bypass the cache
+github.search("search-engine-parser", cache=False)
+
+#OR
+
+# clear cache before search
+github.clear_cache()
+github.search("search-engine-parser")
+```
+
 #### Async
 search-engine-parser supports `async` hence you could use codes like
 ```python
@@ -203,6 +216,7 @@ optional arguments:
   -t TYPE, --type TYPE  Type of detail to return i.e full, links, desciptions
                         or titles (default: full)
   -r RANK, --rank RANK  ID of Detail to return e.g 5 (default: 0)
+  -cc, --clear_cache    Clear cache of engine before searching
 ```
 
 ## Code of Conduct

Diff for: search_engine_parser/core/base.py

+20 −9
@@ -15,6 +15,7 @@
 from search_engine_parser.core import utils
 from search_engine_parser.core.exceptions import NoResultsOrTrafficError
 
+CACHEHANDLER = utils.CacheHandler()
 
 @unique
 class ReturnType(Enum):
@@ -153,7 +154,18 @@ def headers(self):
         }
         return headers
 
-    async def get_source(self, url):
+    def clear_cache(self, all_cache=False):
+        """
+        Triggers the clear cache function for a particular engine
+
+        :param all_cache: if True, deletes for all engines
+        """
+        if all_cache:
+            CACHEHANDLER.clear()
+        else:
+            CACHEHANDLER.clear(self.name)
+
+    async def get_source(self, url, cache=True):
         """
         Returns the source code of a webpage.
 
@@ -162,20 +174,18 @@ async def get_source(self, url):
         :return: html source code of a given URL.
         """
         try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers=self.headers()) as resp:
-                    html = await resp.text()
+            html = await CACHEHANDLER.get_source(self.name, url, self.headers(), cache)
         except Exception as exc:
             raise Exception('ERROR: {}\n'.format(exc))
-        return str(html)
+        return html
 
-    async def get_soup(self, url):
+    async def get_soup(self, url, cache):
         """
         Get the html soup of a query
 
         :rtype: `bs4.element.ResultSet`
         """
-        html = await self.get_source(url)
+        html = await self.get_source(url, cache)
         return BeautifulSoup(html, 'lxml')
 
     def get_search_url(self, query=None, page=None, **kwargs):
@@ -222,7 +232,7 @@ def get_results(self, soup, **kwargs):
 
         return search_results
 
-    def search(self, query=None, page=1, **kwargs):
+    def search(self, query=None, page=1, cache=True, **kwargs):
         """
         Query the search engine
 
@@ -237,7 +247,8 @@ def search(self, query=None, page=1, **kwargs):
         soup = loop.run_until_complete(
             self.get_soup(
                 self.get_search_url(
-                    query, page, **kwargs)))
+                    query, page, **kwargs),
+                cache=cache))
         return self.get_results(soup, **kwargs)
 
     async def async_search(self, query=None, page=1, callback=None, **kwargs):
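
Taken together with the README snippet above, the new `search`/`clear_cache` hooks on the base engine class can be exercised roughly as follows. This is a minimal sketch, assuming `GitHubSearch` is already imported as in the README examples and that the module-level `CACHEHANDLER` has been constructed on import:

```python
github = GitHubSearch()  # assumed import, as in the README examples

# first call hits the network and stores the page source in this engine's cache dir
results = github.search("search-engine-parser", page=1)

# an identical call is now answered from cache; cache=False forces a fresh request
fresh = github.search("search-engine-parser", page=1, cache=False)

github.clear_cache()                 # drop cached pages for this engine only
github.clear_cache(all_cache=True)   # or wipe the cache for every engine
```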

Diff for: search_engine_parser/core/cli.py

+12
@@ -6,6 +6,7 @@
 import argparse
 import sys
 from importlib import import_module
+from datetime import datetime
 
 from blessed import Terminal
 from search_engine_parser import __version__
@@ -69,9 +70,15 @@ def main(args):  # pylint: disable=too-many-branches
     # Initialize search Engine with required params
     engine = engine_class()
     try:
+        if args['clear_cache']:
+            engine.clear_cache()
         # Display full details: Header, Link, Description
+        start = datetime.now()
         results = engine.search(args['query'], args['page'], return_type=ReturnType(args["type"]), url=args.get("url"))
+        duration = datetime.now() - start
         display(results, term, type=args.get('type'), rank=args.get('rank'))
+        print("Total search took -> %s seconds" %(duration))
+        print("Used Cache -> {}".format(not args["clear_cache"]))
     except NoResultsOrTrafficError as exc:
         print('\n', '{}'.format(term.red(str(exc))))
 
@@ -113,6 +120,11 @@ def runner():
         '-t', '--type',
         help='Type of detail to return i.e full, links, desciptions or titles (default: full)',
         default="full")
+    parser_search.add_argument(
+        '-cc', '--clear_cache',
+        action='store_true',
+        help='Clear cache of engine before searching'
+    )
     parser_search.add_argument(
         '-r',
         '--rank',
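
The flag is wired through argparse's `store_true` action, so `args['clear_cache']` is a plain boolean that both triggers the cache clear and drives the "Used Cache" line. A stripped-down sketch of the same pattern, with a hypothetical `run` helper rather than the project's actual CLI entry point:

```python
import argparse
from datetime import datetime

def run(argv=None):
    # hypothetical helper mirroring the flag wiring added in cli.py
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--query', required=True)
    parser.add_argument('-cc', '--clear_cache', action='store_true',
                        help='Clear cache of engine before searching')
    args = vars(parser.parse_args(argv))

    if args['clear_cache']:
        print('cache cleared before this search')

    start = datetime.now()
    # ... engine.search(...) would run here ...
    duration = datetime.now() - start
    print("Total search took -> %s seconds" % duration)
    print("Used Cache -> {}".format(not args['clear_cache']))

run(['-q', 'search-engine-parser', '-cc'])
```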

Diff for: search_engine_parser/core/utils.py

+61
@@ -1,4 +1,10 @@
+import os
 import random
+import pickle
+import hashlib
+import aiohttp
+
+FILEPATH = os.path.dirname(os.path.abspath(__file__))
 
 # prevent caching
 USER_AGENT_LIST = [
@@ -16,3 +22,58 @@
 
 def get_rand_user_agent():
     return random.choice(USER_AGENT_LIST)
+
+
+class CacheHandler:
+    def __init__(self):
+        if not os.path.exists(os.path.join(FILEPATH, "cache")):
+            os.makedirs("cache")
+        self.cache = os.path.join(FILEPATH, "cache")
+        enginelist = os.listdir(os.path.join(FILEPATH, "engines"))
+        self.engine_cache = {i[:-3]: os.path.join(self.cache, i[:-3]) for i in enginelist if i not in
+                             ("__init__.py")}
+        for cache in self.engine_cache.values():
+            if not os.path.exists(cache):
+                os.makedirs(cache)
+
+    async def get_source(self, engine, url, headers, cache=True):
+        """
+        Retrieves source code of webpage from internet or from cache
+
+        :rtype: str
+        :param engine: engine of the engine saving
+        :param url: URL to pull source code from
+        :param headers: request headers to make use of
+        :param cache: use cache or not
+        """
+        encodedUrl = url.encode("utf-8")
+        urlhash = hashlib.sha256(encodedUrl).hexdigest()
+        engine = engine.lower()
+        cache_path = os.path.join(self.engine_cache[engine], urlhash)
+        if os.path.exists(cache_path) and cache:
+            with open(cache_path, 'rb') as stream:
+                return pickle.load(stream)
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers) as resp:
+                html = await resp.text()
+        with open(cache_path, 'wb') as stream:
+            pickle.dump(str(html), stream)
+        return str(html)
+
+    def clear(self, engine=None):
+        """
+        Clear the entire cache either by engine name
+        or just all
+
+        :param engine: engine to clear
+        """
+        if not engine:
+            for engine_cache in self.engine_cache.values():
+                for root, dirs, files in os.walk(engine_cache):
+                    for f in files:
+                        os.remove(os.path.join(engine_cache, f))
+        else:
+            engine_cache = self.engine_cache[engine.lower()]
+            for _, _, files in os.walk(engine_cache):
+                for f in files:
+                    os.remove(os.path.join(engine_cache, f))
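
The cache layout implied by `CacheHandler` is one pickled page per URL, keyed by the SHA-256 hex digest of the URL and grouped into a directory per engine under `core/cache/`. A minimal sketch of how a cached entry could be located on disk; the `google` engine name and the example URL are illustrative only:

```python
import hashlib
import os
import pickle

# stand-in for FILEPATH (the search_engine_parser/core directory) in utils.py
CORE_DIR = os.path.dirname(os.path.abspath(__file__))

def cache_file_for(engine, url):
    """Return the path CacheHandler would use for this engine/URL pair."""
    urlhash = hashlib.sha256(url.encode("utf-8")).hexdigest()
    return os.path.join(CORE_DIR, "cache", engine.lower(), urlhash)

path = cache_file_for("google", "https://www.google.com/search?q=preaching+to+the+choir")
if os.path.exists(path):
    with open(path, "rb") as stream:
        html = pickle.load(stream)  # cached page source, written via pickle.dump(str(html), ...)
```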

0 commit comments
