merge.py
# -*- coding: utf-8 -*-
from data_source import DataSource, DataSourceConnection
from model import Publication


class Merge(DataSource):
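    """Data source that merges results from several underlying sources."""
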
def __init__(self, *data_sources):
self.data_sources = data_sources

    def connect(self):
return MergeConnection(self.data_sources)


class MergeConnection(DataSourceConnection):
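    """Connection that fans each query out to every wrapped source."""
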
def __init__(self, data_sources):
self.data_sources = data_sources

    def search_by_author(self, surname, name=None, year=None):
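        # Stream matches from every wrapped source in turn.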
        for data_source in self.data_sources:  # TODO: move connection setup to __enter__
with data_source() as conn:
for pub in conn.search_by_author(surname, name=name, year=year):
yield pub

    def search_citations(self, publications):
cit = []
        for data_source in self.data_sources:  # TODO: move connection setup to __enter__
with data_source() as conn:
cit.extend(list(conn.search_citations(publications)))
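        # Two stable sorts: the list ends up ordered by year, then by title.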
cit.sort(key=lambda r: r.title)
cit.sort(key=lambda r: r.year)
merged = []
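        # Greedily group records that compare equal and collapse each group
        # into a single merged Publication.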
while len(cit) > 0:
cur = cit.pop(0)
bucket = [cur]
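            # Sweep backwards so deletions do not shift the indexes still to visit.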
for i in xrange(len(cit) - 1, -1, -1):
if cit[i] == cur:
bucket.append(cit[i])
del cit[i]
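
            # Pick the (publication, value) pair with the longest non-None value
            # of `attr` in the bucket; (None, None) if every value is missing.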
            def find_longest(attr):
                longest = None
                for p in bucket:
                    v = getattr(p, attr)
                    if v is not None:
                        if longest is None:
                            longest = (p, v)
                        elif len(v) > len(longest[1]):
                            longest = (p, v)
                if longest is None:
                    return (None, None)
                return longest
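
            # Build the merged record, preferring the longest variant of each field.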
lauthors_pub, lauthors = find_longest('authors')
mpub = Publication(find_longest('title')[1], lauthors, cur.year)
mpub.authors_incomplete = lauthors_pub.authors_incomplete
mpub.published_in = find_longest('published_in')[1]
mpub.pages = find_longest('pages')[1]
mpub.volume = find_longest('volume')[1]
mpub.series = find_longest('series')[1]
mpub.issue = find_longest('issue')[1]
mpub.special_issue = find_longest('special_issue')[1]
mpub.supplement = find_longest('supplement')[1]
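            # times_cited: keep the highest count reported by any source.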
mpub.times_cited = max(p.times_cited for p in bucket)
mpub.article_no = find_longest('article_no')[1]
mpub.publisher = find_longest('publisher')[1]
mpub.publisher_city = find_longest('publisher_city')[1]
mpub.edition = find_longest('edition')[1]
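            # URL, identifier, and index lists are unioned across all duplicates.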
mpub.source_urls = list(set([x for p in bucket for x in p.source_urls]))
mpub.cite_urls = list(set([x for p in bucket for x in p.cite_urls]))
mpub.identifiers = list(set([x for p in bucket for x in p.identifiers]))
mpub.indexes = list(set([x for p in bucket for x in p.indexes]))
mpub.merge_sources = bucket
merged.append(mpub)
return merged

    def assign_indexes(self, publications):
        for data_source in self.data_sources:  # TODO: move connection setup to __enter__
with data_source() as conn:
conn.assign_indexes(publications)

    def close(self):
        pass  # TODO: close connections created in __enter__
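

# A minimal usage sketch (illustrative only: `make_source_a` and `make_source_b`
# are hypothetical zero-argument factories returning DataSourceConnection
# context managers, matching how `data_source()` is invoked above):
#
#     merged = Merge(make_source_a, make_source_b)
#     conn = merged.connect()
#     pubs = list(conn.search_by_author('Novak', name='Jan'))
#     citations = conn.search_citations(pubs)
#     conn.close()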