# fetch_ss_data.py

"""
Queries the Semantic Scholar API by arXiv id and downloads paper data.
The script is intended to read from a database pickle (by default db.p),
generated by `./fetch_paper_titles.py`.
"""

import pickle
from arxiv_net.ss import semantic_scholar_api as ss

from arxiv_net.utilities import Config, safe_pickle_dump


if __name__ == "__main__":
    # Number of newly fetched papers to accumulate between checkpoints.
    SAVE_EVERY = 100

    def _save_ss_db():
        """Write the in-memory Semantic Scholar database to Config.ss_db_path."""
        print(
            'Saving database with %d papers to %s' % (
                len(ss_db), Config.ss_db_path))
        safe_pickle_dump(ss_db, Config.ss_db_path)

    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling. Both files below must come from a trusted source (they are
    # expected to be produced locally by this project's own scripts).

    # Load the database of arXiv ids into memory. Without it there is nothing
    # to do, so any failure is reported and re-raised.
    try:
        with open(Config.db_path, 'rb') as db_file:
            db = pickle.load(db_file)
    except Exception:
        print('error loading arxiv id database:')
        raise  # bare raise keeps the original traceback intact

    # Load the existing Semantic Scholar database, if any. This is
    # deliberately best-effort: on any failure we start from scratch.
    try:
        with open(Config.ss_db_path, 'rb') as ss_db_file:
            ss_db = pickle.load(ss_db_file)
    except Exception as e:
        print('error loading semantic scholar database:')
        print(e)
        print('starting from an empty database')
        ss_db = {}

    # -----------------------------------------------------------------------------
    # main loop where we fetch the new results
    print('database has %d entries at start' % (len(db),))
    num_added_total = 0
    num_unsaved = 0  # entries added since the last checkpoint
    for arxiv_id in db:
        if arxiv_id in ss_db:
            continue  # already fetched on a previous run

        ss_data = ss.get_data(arxiv_id)
        if ss_data is None:
            continue  # nothing to store for this id

        ss_db[arxiv_id] = ss_data
        num_added_total += 1
        num_unsaved += 1

        # Checkpoint only after real progress; testing the running total on
        # every iteration would rewrite the file for each skipped id while
        # the total sits on a multiple of SAVE_EVERY (including zero).
        if num_unsaved >= SAVE_EVERY:
            _save_ss_db()
            num_unsaved = 0

    # Flush whatever was fetched after the last checkpoint so the tail of
    # the run is not lost.
    if num_unsaved:
        _save_ss_db()