Skip to content

Commit ef2990b

Browse files
Michelle Janowiecki
authored and
Michelle Janowiecki
committed
Improved workflows to use pre-existing URIs
1 parent 5ef7665 commit ef2990b

6 files changed

+62
-39
lines changed

.~lock.cleaned_verified_marcRecords_2020-07-21 09.39.20.csv#

-1
This file was deleted.

addGeoNames.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -110,18 +110,18 @@ def explodeHeadingsByIndex(dataframe):
110110
fastresults = explodeHeadingsByIndex(fastresults)
111111
lcnafresults = explodeHeadingsByIndex(lcnafresults)
112112

113-
# Merge results from FAST and LCNAF into one new column 'spatial_coverage.'
114-
# Remove duplicate result from spatial_coverage.
113+
# Merge results from FAST and LCNAF into one new column 'spatial.'
114+
# Remove duplicate result from spatial.
115115
frame = pd.merge(fastresults, lcnafresults, on='oindex', suffixes=('_1', '_2'))
116116
print(frame.head)
117-
frame['spatial_coverage'] = frame['fullName_1']+'|'+frame['fullName_2']
118-
frame.spatial_coverage = frame.spatial_coverage.str.split('|')
119-
frame.spatial_coverage = frame.apply(lambda row:
120-
set(row['spatial_coverage']), axis=1)
121-
frame.spatial_coverage = frame.spatial_coverage.str.join('|')
117+
frame['spatial'] = frame['fullName_1']+'|'+frame['fullName_2']
118+
frame.spatial = frame.spatial.str.split('|')
119+
frame.spatial = frame.apply(lambda row:
120+
set(row['spatial']), axis=1)
121+
frame.spatial = frame.spatial.str.join('|')
122122
frame = frame.drop(columns=['fullName_1', 'fullName_2'])
123123

124-
# Merge 'spatial_coverage' column into marc spreadsheet.
124+
# Merge 'spatial' column into marc spreadsheet.
125125
frame.oindex = frame.oindex.astype('int64')
126126
print(frame.head)
127127
updated = pd.merge(df, frame, left_index=True, right_on='oindex')

cleanUpExtractedMarc.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,15 @@ def addDictonary(columnName, vocab):
3434
else:
3535
terms = [term]
3636
for x in terms:
37-
vocabDict = {'vocab': vocab, 'term': x,
37+
if "http://id.loc.gov/authorities/names/" in x:
38+
termURI = x.rsplit(' ', 1)
39+
uri = termURI[-1].strip()
40+
x = termURI[0].strip()
41+
else:
42+
uri = 'None'
43+
vocabDict = {'vocab': vocab, 'term': x, 'uri': uri,
3844
'field': columnName, 'oindex': index}
45+
print(vocabDict)
3946
searchList.append(vocabDict)
4047

4148

@@ -65,13 +72,12 @@ def addDictonary(columnName, vocab):
6572
df_2 = pd.DataFrame.from_dict(searchList)
6673
df_2.term = df_2['term'].str.strip()
6774
df_2.term = df_2['term'].str.rstrip(',')
68-
# Drop null values before attempting validation.
69-
df_2.dropna(axis=0, inplace=True)
7075
# Group/condense matching headings together in order to perform fewer searches.
7176
# Create column 'oindex' to keep track of original index of headings.
72-
pivoted = pd.pivot_table(df_2, index=['term', 'vocab', 'field'],
77+
pivoted = pd.pivot_table(df_2, index=['term', 'vocab', 'field', 'uri'],
7378
values='oindex',
7479
aggfunc=lambda x: '|'.join(str(v) for v in x))
80+
print(pivoted.head)
7581
pivoted.reset_index(inplace=True)
7682
# Convert dataframe back to dictionary.
7783
updatedList = pd.DataFrame.to_dict(pivoted, orient='records')

convertCSVToJSON.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,9 @@ def addListToDict(json_file, key, value):
9393
addToDict(json_file, 'dc_rights_s', 'rights')
9494
addToDict(json_file, 'dc_title_s', 'title')
9595
addToDict(json_file, 'layer_slug_s', 'layer_slug')
96-
fixGeom(json_file, 'solr_geom', 'solr_geom')
96+
fixGeom(json_file, 'solr_geom', 'bounding_box')
9797
addToDictInt(json_file, 'solr_year_i', 'solr_year')
98+
addToDict(json_file, 'dct_issued_s', 'date_issued')
9899
addListToDict(json_file, 'dc_creator_sm', 'creators')
99100
addToDict(json_file, 'dc_description_s', 'description')
100101
addToDict(json_file, 'dc_format_s', 'format')

convertGeoNamesFromLCNAF.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import requests
2-
from rdflib import Namespace, Graph, URIRef, Literal, RDFS
2+
from rdflib import Namespace, Graph, URIRef, RDFS
33
from bs4 import BeautifulSoup as Soup
44

55
# Convert geographic names from LCNAF to geonames identifiers.

verifyHeadings.py

+41-24
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import requests
22
from bs4 import BeautifulSoup as Soup
33
from rdflib import Namespace, Graph, URIRef
4+
from fuzzywuzzy import fuzz
45

56

67
# Configuration for requests.
@@ -28,37 +29,52 @@ def getGraph(url, format):
2829
return graph
2930

3031

32+
def findTermFromLabel(searchTerm, type):
33+
newURL = None
34+
url = baseURL+type+'/label/'+searchTerm
35+
try:
36+
data = lc.get(url, timeout=30, headers=headers)
37+
foundName = data.ok
38+
newURL = data.url
39+
if foundName:
40+
newURL = data.url
41+
if newURL:
42+
newURL = newURL.replace('.html', '')
43+
return newURL
44+
except requests.Timeout:
45+
pass
46+
47+
48+
def getInfoFromGraph(graph, item, searchTerm, type):
49+
if graph:
50+
for result in graph.subject_objects((mads.authoritativeLabel)):
51+
if auth+type in result[0]:
52+
ratio = fuzz.ratio(result[1].value, searchTerm)
53+
if ratio > 95:
54+
print('Heading validated')
55+
item['authURI'] = result[0].toPython()
56+
item['authLabel'] = result[1].value
57+
58+
3159
def verifyHeadingList(searchList):
3260
all_items = []
3361
for item in searchList:
34-
print(item)
35-
vocab = item.get('vocab')
3662
searchTerm = item.get('term')
37-
print(vocab)
38-
print(searchTerm)
3963
if searchTerm:
40-
searchTerm.rstrip('.')
64+
vocab = item.get('vocab')
4165
type = authorities.get(vocab)
66+
print(vocab)
67+
print(searchTerm)
4268
if vocab != 'fast':
43-
url = baseURL+type+'/label/'+searchTerm
44-
try:
45-
data = lc.get(url, timeout=30, headers=headers)
46-
foundName = data.ok
47-
newURL = data.url
48-
if foundName:
49-
newURL = data.url
50-
newURL = newURL.replace('.html', '')
51-
print(newURL)
69+
if item.get('uri') != 'None':
70+
newURL = item.get('uri')
71+
graph = getGraph(newURL+'.nt', 'nt')
72+
getInfoFromGraph(graph, item, searchTerm, type)
73+
else:
74+
newURL = findTermFromLabel(searchTerm, type)
75+
if newURL:
5276
graph = getGraph(newURL+'.nt', 'nt')
53-
if graph:
54-
for result in graph.subject_objects((mads.authoritativeLabel)):
55-
if auth+type in result[0]:
56-
if result[1].value == searchTerm:
57-
print('Heading validated')
58-
item['authURI'] = result[0]
59-
item['authLabel'] = result[1].value
60-
except requests.Timeout:
61-
pass
77+
getInfoFromGraph(graph, item, searchTerm, type)
6278
else:
6379
data = ft.get(fastURL+searchTerm+fastPara)
6480
data = data.content
@@ -69,10 +85,11 @@ def verifyHeadingList(searchList):
6985
identifier = identifier.string
7086
authLabel = record.find('skos:preflabel')
7187
authLabel = authLabel.string
72-
print(authLabel)
7388
if authLabel == searchTerm:
7489
print('Heading validated')
7590
item['authLabel'] = authLabel
7691
item['authURI'] = identifier
92+
del item['uri']
93+
print(item)
7794
all_items.append(item)
7895
return all_items

0 commit comments

Comments
 (0)