identify_priority_services.py (forked from b-cube/semantics_pipeline)
import glob
import json
import logging

from lib.identifier import Identify
from lib.parser import Parser
from lib.rawresponse import RawResponse
'''
A quick script to demonstrate the identification process
against a local file store of solr responses.
'''
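# The harvested solr response documents aren't included here; judging from
# the fields accessed below, each JSON file is assumed to look roughly like
# this (values hypothetical, digest taken from the commented-out glob below):
#
#     {
#         "digest": "60de9ec6341a2116ff4bb2739c307739",
#         "url": "http://example.com/wms?request=GetCapabilities",
#         "raw_content": "<WMT_MS_Capabilities ...>"
#     }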
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

YAML_FILE = 'lib/configs/all_identifiers.yaml'

# responses = glob.glob('testdata/docs/response_60de9ec6341a2116ff4bb2739c307739.json')
responses = glob.glob('testdata/second_harvest/docs/*.json')

# with open('testdata/probable_ogc.txt', 'r') as f:
#     digests = f.readlines()
# responses = ['testdata/docs/response_%s.json' % d.strip() for d in digests]

with open('testdata/second_harvest/priority_identification_all.csv', 'w') as f:
    f.write('digest|url|protocol|subtype|service|has dataset|has metadata|version|is error\n')
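# The output is pipe-delimited, presumably because harvested URLs can
# contain commas; any literal ',' or '|' in a URL is replaced with ';'
# before the row is written. The file is re-opened in append mode inside
# the loop below, presumably so partial results survive an interrupted run.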
for response in responses:
    with open(response, 'r') as f:
        data = json.loads(f.read())

    digest = data['digest']
    raw_content = data['raw_content']
    url = data['url']

    rr = RawResponse(url.upper(), raw_content, digest)
    cleaned_text = rr.clean_raw_content().strip()
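    # note: RawResponse receives the upper-cased URL, but the CSV row below
    # records the URL in its original casing; the cleaning internals live in
    # lib/rawresponse.py and aren't shown here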
    # skip any document that can't be parsed as XML; the failure is logged
    # with its digest so it can be inspected later
    try:
        parser = Parser(cleaned_text)
    except Exception:
        logger.debug('xml parsing error: %s', digest, exc_info=True)
        continue
    print(digest)
    identifier = Identify(YAML_FILE, cleaned_text, url, parser=parser, ignore_case=True)
    identifier.identify()

    protocol = identifier.protocol
    subtype = identifier.subtype
    service = identifier.service
    has_dataset = identifier.has_dataset
    has_metadata = identifier.has_metadata
    version = identifier.version
    is_error = identifier.is_error
    # if not protocol:
    #     continue

    with open('testdata/second_harvest/priority_identification_all.csv', 'a') as f:
        f.write('|'.join([digest, url.replace(',', ';').replace('|', ';'), protocol,
                          str(subtype), service, str(has_dataset), str(has_metadata),
                          str(version), str(is_error)]) + '\n')
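# Usage: run from the repository root so the lib/ imports and the
# testdata/ paths resolve:
#
#     python identify_priority_services.py
#
# A minimal sketch for summarizing the output afterwards, assuming only
# the pipe-delimited format written above (not part of the original
# pipeline):
#
#     import csv
#     from collections import Counter
#
#     with open('testdata/second_harvest/priority_identification_all.csv') as f:
#         counts = Counter(row['protocol'] for row in csv.DictReader(f, delimiter='|'))
#     print(counts.most_common())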