-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape
executable file
·97 lines (79 loc) · 2.8 KB
/
scrape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
##!/usr/bin/python
import sys
import re
# Post-processes "diagram.xhtml" into "syntax.xhtml": the body of the diagram
# document is cut out, cleaned up by the helpers below, and substituted for the
# "{{BODY}}" placeholder of "syntax_template.xhtml".  All paths are relative to
# the current working directory.
#
# Every helper is a pure str -> str function.  A helper that cannot find the
# closing marker it needs leaves the remaining text untouched instead of
# slicing with the -1 that str.find() returns for "not found".

INPUT_PATH = "diagram.xhtml"
TEMPLATE_PATH = "syntax_template.xhtml"
OUTPUT_PATH = "syntax.xhtml"

BODY_OPEN = "<body>"
BODY_CLOSE = "</body>"
# Start of the trailing paragraph-with-table; the body is cut just before it.
FOOTER_OPEN = '<xhtml:p xmlns:xhtml="http://www.w3.org/1999/xhtml"><xhtml:table'

POLYGON_OPEN = "<polygon "

EBNF_ECHO_OPEN = '<xhtml:p xmlns:xhtml="http://www.w3.org/1999/xhtml"><xhtml:div class="ebnf">'
EBNF_ECHO_CLOSE = "</xhtml:div></xhtml:p>"
EBNF_DIV_OPEN = '<xhtml:div class="ebnf">'
EBNF_DIV_CLOSE = "</xhtml:div>"

TITLE_P_OPEN = '<xhtml:p xmlns:xhtml="http://www.w3.org/1999/xhtml" style="font-size: 14px; font-weight:bold">'
TITLE_H3_OPEN = '<xhtml:h3 xmlns:xhtml="http://www.w3.org/1999/xhtml">'
TITLE_H3_MARKER = "<xhtml:h3 xmlns:xhtml"
P_CLOSE = "</xhtml:p>"
H3_CLOSE = "</xhtml:h3>"

UL_OPEN = "<xhtml:ul>"
UL_CLOSE = "</xhtml:ul>"
LI_OPEN = "<xhtml:li>"
LI_CLOSE = "</xhtml:li>"


def extract_body(src):
    """Return the part of *src* between "<body>" and the footer/"</body>".

    The slice starts right after "<body>" (or at the start of *src* when there
    is no such tag) and ends at the footer marker (or at the end of *src* when
    there is none).  If "</body>" still occurs inside that slice, the text is
    cut there as well.
    """
    start = src.find(BODY_OPEN)
    start = 0 if start < 0 else start + len(BODY_OPEN)
    end = src.find(FOOTER_OPEN)
    if end < 0:
        end = len(src)
    body = src[start:end]
    close = body.find(BODY_CLOSE)
    # Only cut when the tag is really there; slicing with -1 would silently
    # drop the last character of the body.
    if close >= 0:
        body = body[:close]
    return body


def remove_spans(text, opener, closer):
    """Return *text* with every ``opener ... closer`` span removed.

    Spans are matched left to right and are not nested-aware: a span ends at
    the first *closer* found at or after its *opener*.  An *opener* without a
    following *closer* is left in place together with everything after it.
    """
    kept = []
    pos = 0
    while True:
        start = text.find(opener, pos)
        if start < 0:
            break
        end = text.find(closer, start)
        if end < 0:
            break
        kept.append(text[pos:start])
        pos = end + len(closer)
    kept.append(text[pos:])
    return "".join(kept)


def drop_leading_arrows(text):
    """Strip the outermost arrows: drop the first "<polygon " of every pair.

    Everything from the first polygon of a pair up to the start of the second
    one is removed; the second polygon is kept.  A final unpaired polygon is
    left untouched.
    """
    first = text.find(POLYGON_OPEN)
    while first >= 0:
        second = text.find(POLYGON_OPEN, first + 1)
        if second < 0:
            break
        text = text[:first] + text[second:]
        # The kept polygon now sits at `first`; resume the search after it.
        first = text.find(POLYGON_OPEN, first + 1)
    return text


def convert_titles(text):
    """Turn the bold 14px title paragraphs into ``<xhtml:h3>`` elements.

    The opening tags are swapped with a plain replace; then, for each h3
    opening tag, the next "</xhtml:p>" is rewritten to "</xhtml:h3>".
    """
    text = text.replace(TITLE_P_OPEN, TITLE_H3_OPEN)
    start = text.find(TITLE_H3_MARKER)
    while start >= 0:
        close = text.find(P_CLOSE, start)
        if close < 0:
            break
        text = text[:close] + H3_CLOSE + text[close + len(P_CLOSE):]
        start = text.find(TITLE_H3_MARKER, start + 1)
    return text


def list_items(markup):
    """Return the contents of each ``<xhtml:li>`` element found in *markup*."""
    items = []
    start = markup.find(LI_OPEN)
    while start >= 0:
        content_start = start + len(LI_OPEN)
        end = markup.find(LI_CLOSE, content_start)
        if end < 0:
            # Unterminated item: keep the remainder rather than lose text.
            items.append(markup[content_start:])
            break
        items.append(markup[content_start:end])
        start = markup.find(LI_OPEN, end)
    return items


def flatten_reference_lists(text):
    """Collapse the reference lists to a single comma-separated line.

    "no references" is removed outright.  From the first "referenced by:"
    onwards, every ``<xhtml:ul>`` list is replaced by its items joined with
    ", ".  Lists appearing before the first "referenced by:" are not touched,
    and nothing is flattened when that text does not occur at all.
    """
    text = text.replace("no references", "")
    anchor = text.find("referenced by:")
    if anchor < 0:
        return text
    start = text.find(UL_OPEN, anchor)
    while start >= 0:
        end = text.find(UL_CLOSE, start + len(UL_OPEN))
        if end < 0:
            break
        joined = ", ".join(list_items(text[start + len(UL_OPEN):end]))
        text = text[:start] + joined + text[end + len(UL_CLOSE):]
        start = text.find(UL_OPEN, start)
    return text


def transform(src):
    """Return the cleaned-up body markup extracted from the document *src*."""
    body = extract_body(src)
    # Strip the redundant styles (so the template's styles stick).
    body = remove_spans(body, "<defs>", "</defs>")
    body = drop_leading_arrows(body)
    # Strip out the ebnf echo paragraphs.
    body = remove_spans(body, EBNF_ECHO_OPEN, EBNF_ECHO_CLOSE)
    body = convert_titles(body)
    # Strip out any remaining ebnf blocks.
    body = remove_spans(body, EBNF_DIV_OPEN, EBNF_DIV_CLOSE)
    body = flatten_reference_lists(body)
    # Simplify the text: drop the "xhtml:" prefix from the element names.
    return body.replace("xhtml:", "")


def main():
    """Read the diagram and the template, then write the combined page."""
    # open() instead of file(): file() only exists on Python 2.
    with open(INPUT_PATH) as f:
        src = f.read()
    with open(TEMPLATE_PATH) as f:
        templ = f.read()
    page = templ.replace("{{BODY}}", transform(src))
    # Mode "w" already truncates; the with-block closes the file even if the
    # write fails.
    with open(OUTPUT_PATH, "w") as outfile:
        outfile.write(page)


if __name__ == "__main__":
    main()