Skip to content

Commit

Permalink
v0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
alech97 committed Sep 19, 2018
0 parents commit bb6e0f4
Show file tree
Hide file tree
Showing 2 changed files with 2,832 additions and 0 deletions.
94 changes: 94 additions & 0 deletions ScopusParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import bs4, requests, sys, pprint

DOMAIN = 'https://www.scopus.com'
END_TAIL = '/search/submit/authorFreeLookup.uri'

#Parses html response from scopus for an author search
def search_author(firstname, lastname, org, pp=False):
text_response = search_author_request(firstname, lastname, org).text
bs = bs4.BeautifulSoup(text_response, 'html.parser')
authors = []
for row in bs.find_all('tr', class_='searchArea'):
author = {}
names = []
author['names'] = names
#Get names and url from name_col
name_col = row.find('td', class_='authorResultsNamesCol')
a_first = name_col.find('a')
if a_first:
author['url'] = a_first.get('href')
names.append(a_first.text.strip())
author['scopus_auth_id'] = author['url'].split('authorID=')[1].split('&')[0]
for div in name_col.find_all('div'):
names.append(div.text.strip())
if len(names) == 0:
names.append(name_col.text.strip())
#Get number of documents
author['documents'] = int(
row.find(
'td',
id=lambda x: x and x.startswith('resultsDocumentsCol')
).find('a').text
)
#Get Affiliation
a_aff = row.find('td', class_='dataCol5').find('a')
author['affiliation_url'] = a_aff.get('href')
author['affiliation'] = a_aff.text.strip()
#Add author
authors.append(author)
if pp:
print_author_results(authors)
return authors

#Sends a post request to scopus based on three search fields, returns response
def search_author_request(firstname, lastname, org):
form_data = {
'origin': 'searchauthorfreelookup',
'freeSearch': True,
'src': '',
'edit': '',
'poppUp': '',
'exactSearch': 'on',
'searchterm1': lastname,
'searchterm2': firstname,
'institute': org,
'submitButtonName': 'Search',
'orcidId': '',
'authSubject': 'LFSC',
'_authSubject': 'on',
'authSubject': 'HLSC',
'_authSubject': 'on',
'authSubject': 'PHSC',
'_authSubject': 'on',
'authSubject': 'SOSC',
'_authSubject': 'on'
}
s = requests.Session()
s.get('https://www.scopus.com/freelookup/form/author.uri?zone=&origin=AuthorProfile')
r = s.post(DOMAIN + END_TAIL, data=form_data)
return r

#Pretty prints the return of a search_author call
def print_author_results(authors):
for i in range(len(authors)):
print(i, '\t', authors[i]['names'][0])
for j in range(1, len(authors[i]['names'])):
print('\t', authors[i]['names'][j])
print('Affiliation:', authors[i]['affiliation'])
print('Documents:', authors[i]['documents'])
print('')

def main():
pp = pprint.PrettyPrinter(indent=4)
if len(sys.argv) != 4:
print('Incorrect formatting. Should be: "fname" "lname" "org"')
else:
authors = search_author(*sys.argv[1:], pp=True)
ind = 0
if len(authors) > 1:
ind = int(input('Select the author using the number above.\n'))
print('Selected', ind)
pp.pprint(authors[ind])

if __name__ == "__main__":
main()
Loading

0 comments on commit bb6e0f4

Please sign in to comment.