-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_ss_data.py
45 lines (38 loc) · 1.47 KB
/
fetch_ss_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
Queries the Semantic Scholar API by arXiv id and downloads the paper data.
The script is intended to read from a database pickle (by default db.p),
generated by `./fetch_paper_titles.py`.
"""
import pickle
from arxiv_net.ss import semantic_scholar_api as ss
from arxiv_net.utilities import Config, safe_pickle_dump
if __name__ == "__main__":
    # Script entry point: for every arXiv id in the arXiv database that is
    # not yet present in the Semantic Scholar (ss) database, query
    # ss.get_data and store the result, checkpointing to Config.ss_db_path
    # as we go and once more at the end.

    # Number of newly added papers between checkpoint saves.
    save_every = 100

    # Let's load the database of arXiv ids into memory. Nothing can be done
    # without it, so a failure here is reported and re-raised.
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # Config.db_path / Config.ss_db_path at files from a trusted source.
    try:
        with open(Config.db_path, 'rb') as db_file:
            db = pickle.load(db_file)
    except Exception:
        print('error loading arxiv id database:')
        raise

    # Let's load the existing ss database into memory. Any failure (e.g. the
    # file does not exist yet) is reported and we fall back to an empty dict.
    try:
        with open(Config.ss_db_path, 'rb') as ss_db_file:
            ss_db = pickle.load(ss_db_file)
    except Exception as e:
        print('error loading semantic scholar database:')
        print(e)
        print('starting from an empty database')
        ss_db = {}

    # -------------------------------------------------------------------------
    # main loop where we fetch the new results
    print('database has %d entries at start' % (len(db),))
    num_added_total = 0
    for arxiv_id in db:
        # Already fetched on a previous run: nothing to do.
        if arxiv_id in ss_db:
            continue

        ss_data = ss.get_data(arxiv_id)
        # get_data returned no data for this id; leave it out so that a
        # later run will try it again.
        if ss_data is None:
            continue

        ss_db[arxiv_id] = ss_data
        num_added_total += 1

        # Periodic checkpoint so a crash loses at most `save_every` papers.
        if num_added_total % save_every == 0:
            print(
                'Saving database with %d papers to %s' % (
                    len(ss_db), Config.ss_db_path))
            safe_pickle_dump(ss_db, Config.ss_db_path)

    # Final flush: persist whatever was added since the last checkpoint.
    # Without this, the trailing (< save_every) additions were never saved.
    if num_added_total % save_every != 0:
        print(
            'Saving database with %d papers to %s' % (
                len(ss_db), Config.ss_db_path))
        safe_pickle_dump(ss_db, Config.ss_db_path)