anb.py
"""
This module defines what to do when searching on the Oxford Am Natl Bio site.
The goal is to reduce this to lists and let Biohunter do the work.
Jul172013sw
"""
import requests
from bs4 import BeautifulSoup

def seek(parameters):
    '''
    Search API translator for Oxford ANB with ND proxy info.
    Available search parameters: fulltext, name, gender, birth start year,
    birth end year, birth state, and birth country.
    '''
    if not isinstance(parameters, dict):
        return 0
    # Prefer the birth state over the birth country as the birthplace field.
    if parameters["birthstate"] != "":
        parameters["birthplace"] = parameters["birthstate"]
    elif parameters["birthcountry"] != "":
        parameters["birthplace"] = parameters["birthcountry"]
    else:
        parameters["birthplace"] = ""
    # ANB wants full dates, so pad a bare year range out to Jan 1 - Dec 31.
    if parameters["StartYear"] != "":
        parameters["startday"] = "1"
        parameters["startmonth"] = "1"
    else:
        parameters["startday"] = ""
        parameters["startmonth"] = ""
    if parameters["EndYear"] != "":
        parameters["endday"] = "31"
        parameters["endmonth"] = "12"
    else:
        parameters["endday"] = ""
        parameters["endmonth"] = ""
    url = ("http://www.anb.org.proxy.library.nd.edu/articles/bin/search.cgi"
           "?func=advanced_search&fulltext={0}&idxa=-at&idxb=-bib"
           "&field-Name={1}&field-gender={2}"
           "&realms_top=Writing+and+Publishing&field-Realm=Writing+and+Publishing"
           "&date-m-birthS={3}&date-d-birthS={4}&date-y-birthS={5}"
           "&date-m-birthE={6}&date-d-birthE={7}&date-y-birthE={8}"
           "&date-m-deathS=&date-d-deathS=&date-y-deathS="
           "&date-m-deathE=&date-d-deathE=&date-y-deathE="
           "&field-BirthPlace={9}&bp_state={10}&bp_country={11}"
           "&field-Contrib=&meta-dc=500").format(
               "+".join(parameters["fulltext"].split()),
               "+".join(parameters["name"].split()),
               parameters["gender"],
               parameters["startmonth"], parameters["startday"], parameters["StartYear"],
               parameters["endmonth"], parameters["endday"], parameters["EndYear"],
               "+".join(parameters["birthplace"].split()),
               "+".join(parameters["birthstate"].split()),
               "+".join(parameters["birthcountry"].split()))
    # Download the search page and read its contents.
    # Using the requests module since Oxford requires cookies to be accepted.
    page = requests.get(url)
    searchtext = page.text
    # Keep only the markup between the results header and the footer comment;
    # the +27 skips past the "Search Results List" header itself.
    yummyloc = searchtext.find("Search Results List") + 27
    nomoreyummy = searchtext.find("<!--back to the top-->")
    washedtext = searchtext[yummyloc:nomoreyummy]
    # TODO: count the number of results to determine whether there's more than
    # one page of data.
    soup = BeautifulSoup(washedtext, "html.parser")
    goodstuff = soup.ol.find_all("a")
    link_list = [anchor["href"] for anchor in goodstuff]
    # Rewrite each relative result link as an absolute link to its print view.
    for count, link in enumerate(link_list):
        if "?" in link:
            stoppoint = link.find("?") - 5  # drop ".html" before the query string
        else:
            stoppoint = len(link) - 5       # drop ".html"
        link_list[count] = ("http://www.anb.org.proxy.library.nd.edu/articles/"
                            + link[3:stoppoint] + "-print.html")
    return link_list
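
# A minimal sketch of the dict seek() expects. The keys are the ones the
# function actually reads; the sample values are invented for illustration.
#
#     example_query = {
#         "fulltext": "",
#         "name": "Mark Twain",
#         "gender": "",
#         "StartYear": "1830",
#         "EndYear": "1840",
#         "birthstate": "Missouri",
#         "birthcountry": "",
#     }
#     links = seek(example_query)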

def reap(victim):
    '''Fetch one article's print page and return its <p> blocks as a string.'''
    writefile = ""
    soupfile = requests.get(victim).text
    # Trim the page down to the article body between the header table cell
    # and the closing <HR>.
    yummyloc = soupfile.find("</TD>") + 11
    nomoreyummy = soupfile.find("<HR>") - 15
    soupfile = soupfile[yummyloc:nomoreyummy]
    soup = BeautifulSoup(soupfile, "html.parser")
    for paragraph in soup.find_all("p"):
        writefile += str(paragraph) + "\n"
    return writefile

def redeemer(soul):
    '''Strip the <p> tags from reap()'s output, leaving plain text.'''
    soup = BeautifulSoup(soul.replace("\n", " "), "html.parser")
    repeat = len(soup.find_all("p"))
    # Unwrap each paragraph in turn, appending a newline first so the
    # paragraphs stay separated in the plain-text output.
    for _ in range(repeat):
        soup.p.append("\n")
        soup.p.unwrap()
    return soup.get_text()
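
# A minimal end-to-end sketch, assuming an authenticated ND proxy session is
# available; the query values here are invented examples, not part of the
# original module.
if __name__ == "__main__":
    query = {"fulltext": "", "name": "", "gender": "",
             "StartYear": "1835", "EndYear": "1835",
             "birthstate": "", "birthcountry": ""}
    for link in seek(query):
        # reap() grabs the article's print page; redeemer() strips the tags.
        print(redeemer(reap(link)))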