#!/usr/bin/python
# -*- coding: utf-8 -*-
# Check tgwiki sitelinks on Wikidata items and remove those whose target page returns HTTP 404
# Mike Peel 09-Jun-2018 v1 - start
from __future__ import unicode_literals
import pywikibot
import numpy as np
import time
import string
from pywikibot import pagegenerators
import urllib
import csv
from pibot_functions import *
import sys
# Python 2 idiom: make UTF-8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

# Paging of the SPARQL query: `stepsize` rows are requested per query and
# `numsteps` queries are issued, so offsets run from 0 up to `maximum`.
maximum = 10000000
stepsize = 1000
numsteps = maximum // stepsize

# Connection to Wikidata used by the rest of the script.
wikidata_site = pywikibot.Site("wikidata", "wikidata")
repo = wikidata_site.data_repository() # this is a DataSite object

# Debug switch (not read anywhere in this file's own code).
debug = 1
def update_report(qid, tgwp, empty=False):
    """Append an entry for a fixed item to the on-wiki report page.

    The report lives at 'User:Mike Peel/tgwiki sitelink fixes' on
    ``wikidata_site``. One bullet line is built for the item; if that exact
    line is already present in the page text nothing is saved.

    Parameters:
        qid: identifier of the Wikidata item (converted with ``str``).
        tgwp: title of the tgwiki page the removed sitelink pointed to.
        empty: when True the entry is written in bold with an ``EMPTY``
            suffix; the caller passes True when the item has no sitelinks
            left after the removal.

    Returns:
        None. A failure while saving is reported on stdout and otherwise
        ignored (best effort).
    """
    report = pywikibot.Page(wikidata_site, 'User:Mike Peel/tgwiki sitelink fixes')
    report_text = report.get()

    if empty:
        rep = u"\n*'''{{Q|" + str(qid) + "}} - [[:tg:" + tgwp + "]] - EMPTY'''"
    else:
        rep = u'\n*{{Q|' + str(qid) + '}} - [[:tg:' + tgwp + ']]'

    # Avoid duplicating an entry that an earlier run already recorded.
    if rep in report_text:
        return

    report.text = report_text + rep
    try:
        report.save('Update report to include ' + str(qid))
    except Exception:
        # Best effort: a failed report save must not stop the bot, but
        # KeyboardInterrupt/SystemExit are no longer swallowed here.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Could not save report!')
# Walk the tgwiki sitelinks known to Wikidata one SPARQL page at a time and
# remove every sitelink whose tg.wikipedia.org page answers with HTTP 404.
# Each removal is logged on the wiki through update_report().
#
# Single-argument print(...) is used throughout so every line parses both as
# a Python 2 print statement and as a Python 3 function call.
#
# NOTE(review): the query pages with LIMIT/OFFSET but has no ORDER BY, so the
# pages may not be stable between requests - confirm this is acceptable.
for i in range(0, numsteps):
    offset = i * stepsize
    print('Starting at ' + str(offset))
    query = ('SELECT ?item ?itemLabel ?article\n'
             'WHERE\n'
             '{\n'
             ' ?article schema:about ?item ;\n'
             ' schema:isPartOf <https://tg.wikipedia.org/> .\n'
             ' SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],tg" }\n'
             '}\n'
             'LIMIT ' + str(stepsize) + ' OFFSET ' + str(offset))
    print(query)

    generator = pagegenerators.WikidataSPARQLPageGenerator(query, site=wikidata_site)
    for page in generator:
        # Load the item; skip anything that cannot be fetched.
        try:
            item_dict = page.get()
            qid = page.title()
        except Exception:
            # Not a bare except, so Ctrl-C can still stop the run.
            print('Huh - no page found')
            continue
        print("\n" + qid)

        # Extract the tgwiki title; skip items that no longer have one.
        try:
            tgwp = get_sitelink_title(item_dict['sitelinks']['tgwiki'])
            print(tgwp)
        except Exception:
            print('tgwiki sitelink not found!')
            continue
        print(tgwp.decode('utf-8'))

        # Build the percent-encoded article URL (':' and '/' stay literal).
        url = u'https://tg.wikipedia.org/wiki/' + tgwp.replace(' ', '_')
        url = urllib.quote(url.encode('utf8'), ':/')
        print(url)

        # Only the status code is needed; always close the response so the
        # connection is not leaked across a very large number of items.
        response = urllib.urlopen(url)
        try:
            code = response.getcode()
        finally:
            response.close()
        print(code)

        if code != 404:
            continue

        # The target page does not exist: drop the sitelink, then re-read the
        # item to see whether any sitelinks remain.
        page.removeSitelink(site='tgwiki', summary=u'Removing broken sitelink to tgwiki')
        item_dict = page.get()
        remaining = item_dict['sitelinks']
        print(remaining)
        print(len(remaining))
        update_report(qid, tgwp, empty=(len(remaining) == 0))
# EOF