Skip to content

Commit ef2990b

Browse files
Michelle Janowiecki
authored and
Michelle Janowiecki
committed
Improved workflows to use pre-existing URIs
1 parent 5ef7665 commit ef2990b

6 files changed

+62
-39
lines changed

.~lock.cleaned_verified_marcRecords_2020-07-21 09.39.20.csv#

-1
This file was deleted.

addGeoNames.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -110,18 +110,18 @@ def explodeHeadingsByIndex(dataframe):
110110
fastresults = explodeHeadingsByIndex(fastresults)
111111
lcnafresults = explodeHeadingsByIndex(lcnafresults)
112112

113-
# Merge results from FAST and LCNAF into one new column 'spatial_coverage.'
114-
# Remove duplicate result from spatial_coverage.
113+
# Merge results from FAST and LCNAF into one new column 'spatial.'
114+
# Remove duplicate result from spatial.
115115
frame = pd.merge(fastresults, lcnafresults, on='oindex', suffixes=('_1', '_2'))
116116
print(frame.head)
117-
frame['spatial_coverage'] = frame['fullName_1']+'|'+frame['fullName_2']
118-
frame.spatial_coverage = frame.spatial_coverage.str.split('|')
119-
frame.spatial_coverage = frame.apply(lambda row:
120-
set(row['spatial_coverage']), axis=1)
121-
frame.spatial_coverage = frame.spatial_coverage.str.join('|')
117+
frame['spatial'] = frame['fullName_1']+'|'+frame['fullName_2']
118+
frame.spatial = frame.spatial.str.split('|')
119+
frame.spatial = frame.apply(lambda row:
120+
set(row['spatial']), axis=1)
121+
frame.spatial = frame.spatial.str.join('|')
122122
frame = frame.drop(columns=['fullName_1', 'fullName_2'])
123123

124-
# Merge 'spatial_coverage' column into marc spreadsheet.
124+
# Merge 'spatial' column into marc spreadsheet.
125125
frame.oindex = frame.oindex.astype('int64')
126126
print(frame.head)
127127
updated = pd.merge(df, frame, left_index=True, right_on='oindex')

cleanUpExtractedMarc.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,15 @@ def addDictonary(columnName, vocab):
3434
else:
3535
terms = [term]
3636
for x in terms:
37-
vocabDict = {'vocab': vocab, 'term': x,
37+
if "http://id.loc.gov/authorities/names/" in x:
38+
termURI = x.rsplit(' ', 1)
39+
uri = termURI[-1].strip()
40+
x = termURI[0].strip()
41+
else:
42+
uri = 'None'
43+
vocabDict = {'vocab': vocab, 'term': x, 'uri': uri,
3844
'field': columnName, 'oindex': index}
45+
print(vocabDict)
3946
searchList.append(vocabDict)
4047

4148

@@ -65,13 +72,12 @@ def addDictonary(columnName, vocab):
6572
df_2 = pd.DataFrame.from_dict(searchList)
6673
df_2.term = df_2['term'].str.strip()
6774
df_2.term = df_2['term'].str.rstrip(',')
68-
# Drop null values before attempting validation.
69-
df_2.dropna(axis=0, inplace=True)
7075
# Group/condense matching headings together in order to perform fewer searches.
7176
# Create column 'oindex' to keep track of original index of headings.
72-
pivoted = pd.pivot_table(df_2, index=['term', 'vocab', 'field'],
77+
pivoted = pd.pivot_table(df_2, index=['term', 'vocab', 'field', 'uri'],
7378
values='oindex',
7479
aggfunc=lambda x: '|'.join(str(v) for v in x))
80+
print(pivoted.head)
7581
pivoted.reset_index(inplace=True)
7682
# Convert dataframe back to dictionary.
7783
updatedList = pd.DataFrame.to_dict(pivoted, orient='records')

convertCSVToJSON.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,9 @@ def addListToDict(json_file, key, value):
9393
addToDict(json_file, 'dc_rights_s', 'rights')
9494
addToDict(json_file, 'dc_title_s', 'title')
9595
addToDict(json_file, 'layer_slug_s', 'layer_slug')
96-
fixGeom(json_file, 'solr_geom', 'solr_geom')
96+
fixGeom(json_file, 'solr_geom', 'bounding_box')
9797
addToDictInt(json_file, 'solr_year_i', 'solr_year')
98+
addToDict(json_file, 'dct_issued_s', 'date_issued')
9899
addListToDict(json_file, 'dc_creator_sm', 'creators')
99100
addToDict(json_file, 'dc_description_s', 'description')
100101
addToDict(json_file, 'dc_format_s', 'format')

convertGeoNamesFromLCNAF.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import requests
2-
from rdflib import Namespace, Graph, URIRef, Literal, RDFS
2+
from rdflib import Namespace, Graph, URIRef, RDFS
33
from bs4 import BeautifulSoup as Soup
44

55
# Convert geographic names from LCNAF to geonames identifiers.

verifyHeadings.py

+41-24
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import requests
22
from bs4 import BeautifulSoup as Soup
33
from rdflib import Namespace, Graph, URIRef
4+
from fuzzywuzzy import fuzz
45

56

67
# Configuration for requests.
@@ -28,37 +29,52 @@ def getGraph(url, format):
2829
return graph
2930

3031

32+
def findTermFromLabel(searchTerm, type):
33+
newURL = None
34+
url = baseURL+type+'/label/'+searchTerm
35+
try:
36+
data = lc.get(url, timeout=30, headers=headers)
37+
foundName = data.ok
38+
newURL = data.url
39+
if foundName:
40+
newURL = data.url
41+
if newURL:
42+
newURL = newURL.replace('.html', '')
43+
return newURL
44+
except requests.Timeout:
45+
pass
46+
47+
48+
def getInfoFromGraph(graph, item, searchTerm, type):
49+
if graph:
50+
for result in graph.subject_objects((mads.authoritativeLabel)):
51+
if auth+type in result[0]:
52+
ratio = fuzz.ratio(result[1].value, searchTerm)
53+
if ratio > 95:
54+
print('Heading validated')
55+
item['authURI'] = result[0].toPython()
56+
item['authLabel'] = result[1].value
57+
58+
3159
def verifyHeadingList(searchList):
3260
all_items = []
3361
for item in searchList:
34-
print(item)
35-
vocab = item.get('vocab')
3662
searchTerm = item.get('term')
37-
print(vocab)
38-
print(searchTerm)
3963
if searchTerm:
40-
searchTerm.rstrip('.')
64+
vocab = item.get('vocab')
4165
type = authorities.get(vocab)
66+
print(vocab)
67+
print(searchTerm)
4268
if vocab != 'fast':
43-
url = baseURL+type+'/label/'+searchTerm
44-
try:
45-
data = lc.get(url, timeout=30, headers=headers)
46-
foundName = data.ok
47-
newURL = data.url
48-
if foundName:
49-
newURL = data.url
50-
newURL = newURL.replace('.html', '')
51-
print(newURL)
69+
if item.get('uri') != 'None':
70+
newURL = item.get('uri')
71+
graph = getGraph(newURL+'.nt', 'nt')
72+
getInfoFromGraph(graph, item, searchTerm, type)
73+
else:
74+
newURL = findTermFromLabel(searchTerm, type)
75+
if newURL:
5276
graph = getGraph(newURL+'.nt', 'nt')
53-
if graph:
54-
for result in graph.subject_objects((mads.authoritativeLabel)):
55-
if auth+type in result[0]:
56-
if result[1].value == searchTerm:
57-
print('Heading validated')
58-
item['authURI'] = result[0]
59-
item['authLabel'] = result[1].value
60-
except requests.Timeout:
61-
pass
77+
getInfoFromGraph(graph, item, searchTerm, type)
6278
else:
6379
data = ft.get(fastURL+searchTerm+fastPara)
6480
data = data.content
@@ -69,10 +85,11 @@ def verifyHeadingList(searchList):
6985
identifier = identifier.string
7086
authLabel = record.find('skos:preflabel')
7187
authLabel = authLabel.string
72-
print(authLabel)
7388
if authLabel == searchTerm:
7489
print('Heading validated')
7590
item['authLabel'] = authLabel
7691
item['authURI'] = identifier
92+
del item['uri']
93+
print(item)
7794
all_items.append(item)
7895
return all_items

0 commit comments

Comments
 (0)