-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathxml_parse.py
169 lines (141 loc) · 6.75 KB
/
xml_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 14 23:54:35 2020
@author: Administrator
"""
# Use xml.sax to parse dblp.xml. Give each author an ID and write the authors to a file.
# Find the collaboration relations among all authors and write them to a file, one relation per line in the form "id1 id2".
# For each paper we first extract all of its authors, then update the dict of authors, and finally extract
# the collaboration relation(s) in that paper.
import xml.sax
from xml.sax.handler import feature_external_ges
paper_tags = ('article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis', 'www')
# class authorHandler(xml.sax.ContentHandler): # extract all authors
# def __init__(self):
# self.CurrentData = "" # tag's name
# self.dict = {} # save all authors. The key is an author's name, the value is his id
# self.name = "" # the name of an author
# self.id = 0 # the ID of an author
# self.contents = []
# self.author = [] # all authors for the same paper
# self.year = "" # the year of publication
# def resolveEntity(self, publicID, systemID):
# print("authorHandler.resolveEntity(): %s %s" % (publicID, systemID))
# return systemID
# def startElement(self, tag, attributes):
# if tag != None and len(tag.strip()) > 0:
# self.CurrentData = tag
# def endElement(self, tag):
# if tag != None and len(tag.strip()) > 0:
# if tag in paper_tags:
# if len(self.author) > 0 and self.year != '2015':
# #if self.CurrentData == 'author': # this tag is author, save it in the dict
# for authorname in self.author:
# exist = self.dict.get(authorname, -1)
# if exist == -1: # if this author have not been added into dict
# self.dict[authorname] = self.id
# self.id = self.id + 1
#
# self.author.clear()
# elif self.CurrentData == 'author':
# self.author.append(self.name)
# self.contents.clear()
# def characters(self, content):
# if content != '\n':
# if self.CurrentData == 'author':
# self.contents.append(content)
# self.name = ''.join(self.contents)
# # self.name += content.strip()
# elif self.CurrentData == "year":
# self.year = content.strip()
class collabrationHandler(xml.sax.ContentHandler):  # extract all collaboration relations
    """SAX handler that assigns author ids and writes co-author edges.

    For every element whose tag is in the module-level ``paper_tags``, the
    handler collects the text of its ``author`` children and of its ``year``
    child.  When the record closes, and unless its year equals
    ``exclude_year``, every author not seen before receives the next integer
    id, and each pair of distinct ids in the record is written once to
    ``file`` as ``"<smaller id> <larger id>\\n"``.

    Args:
        file: Writable text stream that receives the edge lines.
        exclude_year: Records whose year text equals this string are skipped
            entirely (no ids assigned, no edges written).  Defaults to
            ``'2015'``; pass ``None`` to keep every record.
    """

    def __init__(self, file, exclude_year='2015'):
        super().__init__()
        self.CurrentData = ""  # name of the element currently open ("" between elements)
        self.dict = {}  # author name -> author id
        self.name = ""  # most recently completed author name
        self.id = 0  # next id to hand out
        self.paper = False  # True while inside an element listed in paper_tags
        self.author = []  # author names of the record being read
        self.file = file  # output stream for collaboration edges
        self.edge = set()  # (smaller id, larger id) pairs already written
        self.contents = []  # text chunks of the author element being read
        self.year = ""  # year text of the record being read
        self.author_id = []  # ids of the authors of the record being read
        self.exclude_year = exclude_year  # records with this year are ignored

    def resolveEntity(self, publicID, systemID):
        """Log the entity being resolved and let the parser load ``systemID``."""
        print("collabrationHandler.resolveEntity(): %s %s" % (publicID, systemID))
        return systemID

    def startElement(self, tag, attributes):
        """Remember the opened element and reset state at record boundaries."""
        if tag is None or not tag.strip():
            return
        self.CurrentData = tag
        if tag in paper_tags:
            # A new record starts: nothing from the previous record may leak
            # into it.  In particular the year must be cleared, otherwise a
            # record without a <year> child would inherit the previous one.
            self.author_id.clear()
            self.author.clear()
            self.contents.clear()
            self.year = ""
            self.paper = True
        elif tag == 'author':
            self.contents.clear()
        elif tag == 'year':
            self.year = ""

    def characters(self, content):
        """Accumulate text of the open ``author`` or ``year`` element.

        The parser may deliver the text of one element in several calls, so
        both the author name and the year are built up piece by piece.
        """
        if not self.paper or content == '\n':
            return
        if self.CurrentData == 'author':
            self.contents.append(content)
        elif self.CurrentData == "year":
            self.year = (self.year + content).strip()

    def endElement(self, tag):
        """Finish an author name, or finish the record and emit its edges."""
        if tag is None or not tag.strip():
            return
        if tag in paper_tags:
            self._finish_paper()
        elif tag == 'author' and self.paper:
            self.name = ''.join(self.contents)
            if self.name:  # ignore empty <author/> elements
                self.author.append(self.name)
            self.contents.clear()
        # Text that follows a closing tag belongs to no tracked element, so
        # stop attributing it to the element that was just closed.
        self.CurrentData = ""

    def _finish_paper(self):
        """Assign ids to the record's authors and write its unseen edges."""
        if self.author and self.year != self.exclude_year:
            # update the dict of authors
            for authorname in self.author:
                if authorname not in self.dict:
                    self.dict[authorname] = self.id
                    self.id = self.id + 1
            # get the id of authors
            self.author_id.extend(self.dict[authorname] for authorname in self.author)
        self.author.clear()
        self.paper = False
        # author_id is empty for skipped records, so this loop writes nothing
        # for them.  Each undirected edge is stored with the smaller id first.
        for i in self.author_id:
            for j in self.author_id:
                if i < j and (i, j) not in self.edge:
                    self.file.write(str(i) + ' ' + str(j) + '\n')
                    self.edge.add((i, j))
# Set up the SAX parser.  Namespace processing is switched off so element
# names reach the handler as plain strings such as 'author' and 'year'.
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
# Allow external general entities to be loaded; presumably required so the
# DTD referenced by dblp.xml can be resolved -- confirm against the input.
# NOTE: with this feature on, only parse XML files from a trusted source.
parser.setFeature(feature_external_ges, True)

# Pass over dblp.xml once: collaboration edges are streamed to
# collaboration.txt while the handler builds its author -> id dict.
with open('collaboration.txt', 'w') as f:
    handler = collabrationHandler(f)
    parser.setContentHandler(handler)
    parser.setEntityResolver(handler)  # the handler provides resolveEntity()
    # The handler is deliberately not registered as DTD handler: it does not
    # define notationDecl()/unparsedEntityDecl(), so the parser's default
    # no-op DTD handler is the safe choice.
    parser.parse('dblp.xml')

# Dump the author table, one "<id> <name>" line per author.  The `with`
# statements close both files, so no explicit close() is needed.
with open('authors.txt', 'w', encoding='utf-8') as f:
    for k, v in handler.dict.items():
        f.write(str(v) + ' ' + k + '\n')
print("done")