Skip to content

Commit fe6c095

Browse files
committed
不足していたファイルを追加
1 parent 5f8db06 commit fe6c095

12 files changed

+1651
-1
lines changed

Makefile_juman

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
2+
# Installation prefix of JUMAN
JUMAN_PREFIX=/share/usr-x86_64
# Directory that holds the KKN tools (mkdarts is run from here)
KKN_PREFIX=/home/morita/work/violet/kkn

SCRIPT_DIR=scripts
INFLECTION_DIR=inflection/blib/lib/

# Alternative: discover the dictionary directories automatically.
#DIC_DIRS=$(shell find . -maxdepth 1 -type d -name "*dic")
# Plain word list; the former `$(shell echo -e ...)` relied on a bash-only
# echo option and yields a stray "-e" word when /bin/sh is dash.
DIC_DIRS=dic wikipediadic wiktionarydic autodic onomatopedic
DA_LIST=$(addsuffix /jumandic.da,$(DIC_DIRS))
MDIC_LIST=$(addsuffix .mdic,$(DIC_DIRS))
BASIC_DICTS=$(shell find dic -name "*.dic"|grep -v "Rengo.dic"|grep -v "ContentW.dic")

all: juman kkn kkn_nominalize

# Build every JUMAN double-array dictionary and record the source revision.
juman: $(DA_LIST)
	git log -1 --date=local --format="%ad-%h" > dic.version

# KKN dictionary built from all .mdic files.
kkn: $(MDIC_LIST)
	mkdir -p kkn &&\
	cat $^ | PERL5LIB="" perl -I$(SCRIPT_DIR) -I$(INFLECTION_DIR) $(SCRIPT_DIR)/jumandic2morphdic.perl > kkn.mdic &&\
	$(KKN_PREFIX)/mkdarts kkn.mdic kkn/dic &&\
	git log -1 --date=local --format="%ad-%h" > kkn/version

# Same as kkn, with the converter's --nominalize option.
kkn_nominalize: $(MDIC_LIST)
	mkdir -p kkn_m &&\
	cat $^ | PERL5LIB="" perl -I$(SCRIPT_DIR) -I$(INFLECTION_DIR) $(SCRIPT_DIR)/jumandic2morphdic.perl --nominalize > kkn_m.mdic &&\
	$(KKN_PREFIX)/mkdarts kkn_m.mdic kkn_m/dic &&\
	git log -1 --date=local --format="%ad-%h" > kkn_m/version

# Same as kkn, with the converter's --nominalize and --okurigana options.
kkn_hikkomi: $(MDIC_LIST)
	mkdir -p kkn_h &&\
	cat $^ | PERL5LIB="" perl -I$(SCRIPT_DIR) -I$(INFLECTION_DIR) $(SCRIPT_DIR)/jumandic2morphdic.perl --nominalize --okurigana > kkn_h.mdic &&\
	$(KKN_PREFIX)/mkdarts kkn_h.mdic kkn_h/dic &&\
	git log -1 --date=local --format="%ad-%h" > kkn_h/version

# Special-case Wikipedia (for JUMAN): depend on the cleaned dictionary file.
wikipediadic/jumandic.da: wikipediadic/wikipedia.dic
	sh $(SCRIPT_DIR)/update.sh -d wikipediadic

%/jumandic.da: %
	sh $(SCRIPT_DIR)/update.sh -d $<

%.mdic: %
	cat $</*.dic > $@

# The Wikipedia .mdic is taken from the uncleaned original file.
wikipediadic.mdic: wikipediadic wikipediadic/wikipedia.dic.orig
	cat wikipediadic/wikipedia.dic.orig > $@

wikipediadic/wikipedia.dic: wikipediadic/wikipedia.dic.orig
	cat $< | ruby $(SCRIPT_DIR)/clean.dic.rb > $@ 2> wikipediadic/clean.log

dic.mdic: $(BASIC_DICTS) dic/ContentW.marked_dic dic/lexicon_from_rengo.mdic
	cat $(BASIC_DICTS) dic/ContentW.marked_dic dic/lexicon_from_rengo.mdic > dic.mdic
57+
58+

del_weight.rb

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/usr/bin/env ruby
# encoding: utf-8
#
# Filter: reads lines from the files named on the command line (or stdin)
# and, on lines starting with "(", rewrites every "(TOKEN NUMBER)" group to
# just TOKEN.  Groups whose token contains "見出し語" are left untouched,
# as are all other lines.

# "(token weight)" where the token has no spaces/parentheses and the weight
# starts with a digit.  Group 1 is the whole match, group 2 the token.
WEIGHTED_ENTRY = /(\( *([^ \(\)]*?) ([0-9][0-9\.]*) *\))/

while (line = gets)
  unless line =~ /^\(/
    puts line
    next
  end

  puts line.gsub(WEIGHTED_ENTRY) { |source|
    midasi = Regexp.last_match(2)
    # Keep the original text when the token is the "見出し語" keyword itself.
    midasi !~ /見出し語/ ? midasi : source
  }
end

ken2town_method.py

+261
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
#!/bin/env python
2+
#coding: utf-8
3+
4+
import re
5+
import csv
6+
import doctest
7+
from collections import defaultdict
8+
9+
def read_town_list(csvname):#{{{
    """Read a cp932-encoded address CSV and return (town name, area) pairs.

    Columns are taken by position: row[2] is the postal code, row[6] the
    area label and row[8] the town name.  (Presumably the Japan Post
    KEN_ALL.CSV layout -- confirm against the data actually used.)

    A town name can be split over several consecutive rows.  A row is
    treated as a continuation of the previous one when it has the same
    postal code and the name collected so far has unbalanced brackets; its
    name is then appended to the pending one.

    Each returned name is paired with the area of the row on which that
    name started, prefixed with "日本:".  An empty file yields an empty
    list.

    Args:
        csvname: path of the CSV file.

    Returns:
        A list of ``(town_name, "日本:" + area)`` tuples in file order.
    """
    town_list = []
    with open(csvname, newline='', encoding='cp932') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')

        line = None         # town name being assembled; None before row 1
        line_area = None    # area of the row on which `line` started
        last_number = None  # postal code of the previous row
        for row in reader:
            place_number = row[2]
            place_area = row[6]
            place_name = row[8]
            # Same postal code as the previous row and still-open brackets:
            # this row continues the previous town name.
            if (line is not None
                    and place_number == last_number
                    and not bracket_match(line)):
                line += place_name
            else:
                if line is not None:
                    # Flush the finished name with its *own* area, not the
                    # area of the row that happens to follow it.
                    town_list.append((line, "日本:" + line_area))
                line = place_name
                line_area = place_area
            last_number = place_number
        if line is not None:
            town_list.append((line, "日本:" + line_area))
    return town_list
#}}}
40+
41+
42+
def bracket_match(string):#{{{
    """Return True when opening and closing parentheses occur equally often.

    Only the counts are compared; nesting order is ignored.  Both ASCII
    "()" and full-width "()" parentheses are counted, so the check works
    whichever width the input uses.

    >>> bracket_match("((()))")  # equal counts
    True
    >>> bracket_match(")))(((")  # order does not matter
    True
    >>> bracket_match("((())")  # one closing parenthesis missing
    False
    >>> bracket_match("あいうえお")  # no parentheses at all
    True
    >>> bracket_match("(大通")  # full-width parenthesis left open
    False
    """
    opened = string.count("(") + string.count("(")
    closed = string.count(")") + string.count(")")
    return opened == closed
#}}}
59+
60+
def _compile_preprocess_rules():
    """Build the ordered (compiled pattern, replacement) table of preprocess()."""
    # Character-class fragments.  Each accepts the ASCII and the full-width
    # form of a symbol so the rules work on either kind of input.
    parts = {
        "d": "0-90-9",              # digits
        "k": "一二三四五六七八九十",  # kanji numerals
        "t": "~~",                  # tilde
        "w": "〜",                   # wave dash
        "h": r"\--",               # hyphen (escaped so it never forms a range)
        "s": " \u3000",             # space / ideographic space
        "lp": "[((]",              # opening parenthesis
        "rp": "[))]",              # closing parenthesis
        "nrp": "[^))]",            # anything but a closing parenthesis
    }
    templates = (
        # --- generalise numbered parts into "[...]" placeholders ---
        # "N条" goes first: names such as 北二十五条 were reported to be
        # cut at 北二十 otherwise.
        (r"[{k}{d}]+条", "[N条]"),
        (r"[{k}{d}]+筋目", "[N条]"),
        (r"[{k}{d}{t}]+区", "[N区]"),
        (r"[{k}{d}{t}]+線", "[N線]"),
        (r"[{k}{d}{t}]+号", "[N号]"),
        # Longest alternative first so that 町内 is consumed as a whole.
        (r"第[{k}{d}{t}]+(?:町内|町)?", "[第N]"),
        (r"[{d}{t}{w}、の]+丁目", "[N丁目]"),
        (r"{lp}[{d}]+丁{rp}", ""),
        (r"[{d}{t}{w}{h}、・の]+番地", "[N番地]"),
        (r"[{d}{t}{w}{h}、の]+番町", "[N番町]"),
        (r"[{d}{t}{w}{h}、{s}]+地割", "[N地割]"),
        (r"[{d}{t}{w}{h}]+の通り", "[Nの通り]"),
        (r"{lp}[{d}{t}{w}{h}]+階{rp}", "[N階]"),
        (r"[{d}{t}{w}{h}]+以[上下]{rp}", "[以上])"),
        (r"「[{d}{t}{w}{h}、]+を除く」", "[を除く])"),
        (r"{lp}[東西南北{d}{w}]+{rp}", ""),
        (r"[{d}{t}{w}{h}、の{s}]+{rp}", "[番地])"),
        (r"[{k}{d}{t}・]+番", "[N番]"),
        (r"「[{d}{t}{w}{h}、]*」", ""),
        (r"[{d}]+[{t}{w}{h}、の]+[{d}{t}{w}{h}、の]+", "[番地]"),
        # --- drop address-specific wording ---
        (r"駅[東西南北]$", ""),
        (r"その他", ""),
        (r"番地のみ", ""),
        (r"次のビルを除く", ""),
        (r"地階・階層不明", ""),
        (r"・", " "),
        (r"市街地", ""),
        # --- brackets that are opened but never closed ---
        (r"{lp}{nrp}*$", ""),
        (r"「[^」]*$", ""),
        # --- placeholders and remaining punctuation become separators ---
        (r"\[.*?\]", " "),
        (r"{lp}[{s}]*{rp}", " "),
        (r"{lp}", " "),
        (r"「", " "),
        (r"」", " "),
        (r"{rp}", " "),
        (r"、", " "),
        # "地割 ~ 地割" left over from a range of 地割 numbers
        (r"地割[{s}]*[{t}][{s}]*地割", " "),
        # direction words: 西入 / 東入 / 上る / 下る, trailing "...側", "の上" etc.
        (r"[西東]入", " "),
        (r"[上下]る", " "),
        (r"[東西南北]側$", " "),
        (r"の[上下前後外]$", " "),
        # --- collapse runs of blanks into one ASCII space ---
        (r"[{s}]+", " "),
    )
    return tuple((re.compile(template.format(**parts)), replacement)
                 for template, replacement in templates)


_PREPROCESS_RULES = _compile_preprocess_rules()


def preprocess(place_name):#{{{
    """Generalise and strip address-specific parts of a town name.

    Numbered parts (N丁目, N番地, N条, ...) are first replaced by bracketed
    placeholders, address-only wording is dropped, and finally placeholders,
    brackets and list punctuation are turned into single ASCII spaces.  The
    rules are applied strictly in table order; digits, tildes, hyphens,
    spaces and parentheses are recognised in both ASCII and full-width form.

    >>> preprocess("十区")
    ' '
    >>> preprocess("東大阪3丁目")
    '東大阪 '
    >>> preprocess("大通西(1〜19丁目)")
    '大通西 '
    """
    for pattern, replacement in _PREPROCESS_RULES:
        place_name = pattern.sub(replacement, place_name)
    return place_name
#}}}
121+
122+
def is_townname(name):#{{{
    """Decide whether *name* is usable as a town name.

    Rejected are: the literal placeholder entry "以下に掲載がない場合",
    names containing the bracketed power-station note, fragments made up
    only of digits/range punctuation/parentheses (these appear when a
    house-number list is broken across rows), names with a closing bracket
    that has no opening partner, and "...の次に番地がくる場合" entries.
    Digits, hyphens, tildes and parentheses are recognised in both ASCII
    and full-width form.

    >>> is_townname("御徒町")
    True
    >>> is_townname("以下に掲載がない場合")
    False
    >>> is_townname("境町の次に番地がくる場合")
    False
    >>> is_townname("1〜3")
    False
    """
    if name == "以下に掲載がない場合":
        return False
    if "〔東京電力福島第二原子力発電所構内〕" in name:
        return False
    # Fragment left over when a house-number list was split across rows.
    # The hyphen is escaped so it cannot form a character range.
    if re.match(r"^[0-90-9\--、~~〜の()()]*$", name):
        return False
    # Closing bracket without a preceding opening bracket.
    if (re.match(r"^[^((]*[))]$", name)
            or re.match(r"^[^「]*」.*$", name)):
        return False
    if re.match(r"^.*の次に番地がくる場合$", name):
        return False
    return True
#}}}
147+
148+
def get_all_prefix_map(str1, str2, prefix_map):#{{{
    """Record every common prefix (two or more characters) of two strings.

    For each common prefix the two remaining suffixes (of str1, then of
    str2) are appended to ``prefix_map[prefix]``.  A split is skipped when
    the prefix ends in hiragana and both suffixes start with hiragana.
    *prefix_map* is modified in place and also returned.

    >>> sorted(get_all_prefix_map("東大阪市","東大阪町",defaultdict(list)).items())
    [('東大', ['阪市', '阪町']), ('東大阪', ['市', '町'])]
    >>> sorted(get_all_prefix_map("あいうえお","あいうえおか",defaultdict(list)).items())
    [('あいうえお', ['', 'か'])]
    >>> sorted(get_all_prefix_map("あい","あい2",get_all_prefix_map("あい","あい1",defaultdict(list))).items())
    [('あい', ['', '1', '', '2'])]
    >>> sorted(get_all_prefix_map("あいうえお","あえおか",defaultdict(list)).items())
    []
    >>> sorted(get_all_prefix_map('末吉', '住吉',defaultdict(list)).items())
    []
    """
    # Nothing to record unless both strings exist and share their first char.
    if not str1 or not str2 or str1[0] != str2[0]:
        return prefix_map

    # `cut` is the length of the candidate prefix (2, 3, ...).
    for cut, (left, right) in enumerate(zip(str1[1:], str2[1:]), start=2):
        if left != right:
            break
        head = str1[:cut]
        tail1 = str1[cut:]
        tail2 = str2[cut:]
        # Do not split in the middle of a hiragana run.
        inside_kana = (re.search("[ぁ-ん]$", head)
                       and re.search("^[ぁ-ん]", tail1)
                       and re.search("^[ぁ-ん]", tail2))
        if inside_kana:
            continue
        prefix_map[head].append(tail1)
        prefix_map[head].append(tail2)
    return prefix_map
#}}}
176+
177+
# Not used anywhere.
def get_prefix(str1, str2):#{{{
    """Return the longest common prefix of str1 and str2.

    No minimum length is enforced: a single shared leading character is
    returned as well, and '' is returned when the first characters differ.

    >>> get_prefix("abcd","abcefg")
    'abc'
    >>> get_prefix("abcd","bcd")
    ''
    """
    shared = []
    for left, right in zip(str1, str2):
        if left != right:
            break
        shared.append(left)
    return "".join(shared)
#}}}
195+
def get_all_prefix(str1, str2):#{{{
    """Return every common prefix of str1 and str2 that has two or more characters.

    The prefixes are listed from shortest to longest.  An empty list is
    returned when either string is empty or the first characters differ.

    >>> get_all_prefix("あいうえお","あいうえおか")
    ['あい', 'あいう', 'あいうえ', 'あいうえお']
    >>> get_all_prefix("あいうえお","あえおか")
    []
    >>> get_all_prefix("末吉","住吉")
    []
    """
    prefixes = []
    # The first character must match too; otherwise strings that merely
    # agree from the second character on would be reported as sharing a
    # prefix.
    if not str1 or not str2 or str1[0] != str2[0]:
        return prefixes
    for length in range(2, min(len(str1), len(str2)) + 1):
        if str1[length - 1] != str2[length - 1]:
            break
        prefixes.append(str1[:length])
    return prefixes
#}}}
212+
def _compile_preprocess_old_rules():
    """Build the ordered (compiled pattern, replacement) table of preprocess_old()."""
    # Character-class fragments.  Each accepts the ASCII and the full-width
    # form of a symbol so the rules work on either kind of input.
    parts = {
        "d": "0-90-9",              # digits
        "k": "一二三四五六七八九十",  # kanji numerals
        "t": "~~",                  # tilde
        "w": "〜",                   # wave dash
        "h": r"\--",               # hyphen (escaped so it never forms a range)
        "s": " \u3000",             # space / ideographic space
        "lp": "[((]",              # opening parenthesis
        "rp": "[))]",              # closing parenthesis
        "nrp": "[^))]",            # anything but a closing parenthesis
    }
    templates = (
        # --- generalise numbered parts into "[...]" placeholders ---
        (r"[{k}{d}]+条", "[N条]"),
        (r"[{k}{d}{t}]+区", "[N区]"),
        (r"[{k}{d}{t}]+線", "[N線]"),
        (r"[{k}{d}{t}]+号", "[N号]"),
        # Longest alternative first so that 町内 is consumed as a whole.
        (r"第[{k}{d}{t}]+(?:町内|町)?", "[第N]"),
        (r"[{d}{t}{w}、の]+丁目", "[N丁目]"),
        (r"{lp}[{d}]+丁{rp}", ""),
        (r"[{d}{t}{w}{h}、・の]+番地", "[N番地]"),
        (r"[{d}{t}{w}{h}、の]+番町", "[N番町]"),
        (r"[{d}{t}{w}{h}、{s}]+地割", "[N地割]"),
        (r"[{d}{t}{w}{h}]+の通り", "[Nの通り]"),
        (r"{lp}[{d}{t}{w}{h}]+階{rp}", "[N階]"),
        (r"[{d}{t}{w}{h}]+以[上下]{rp}", "[以上])"),
        (r"「[{d}{t}{w}{h}、]+を除く」", "[を除く])"),
        (r"{lp}[東西南北{d}{w}]+{rp}", ""),
        (r"[{d}{t}{w}{h}、の{s}]+{rp}", "[番地])"),
        (r"[{k}{d}{t}・]+番", "[N番]"),
        (r"「[{d}{t}{w}{h}、]*」", ""),
        (r"[{d}]+[{t}{w}{h}、の]+[{d}{t}{w}{h}、の]+", "[番地]"),
        # --- drop address-specific wording ---
        (r"その他", ""),
        (r"次のビルを除く", ""),
        (r"地階・階層不明", ""),
        (r"・", " "),
        (r"市街地", ""),
        # --- brackets that are opened but never closed ---
        (r"{lp}{nrp}*$", ""),
        (r"「[^」]*$", ""),
        # --- placeholders and remaining punctuation become separators ---
        (r"\[.*?\]", " "),
        (r"{lp}[{s}]*{rp}", " "),
        (r"{lp}", " "),
        (r"「", " "),
        (r"」", " "),
        (r"{rp}", " "),
        (r"、", " "),
        # --- collapse runs of blanks into one ASCII space ---
        (r"[{s}]+", " "),
    )
    return tuple((re.compile(template.format(**parts)), replacement)
                 for template, replacement in templates)


_PREPROCESS_OLD_RULES = _compile_preprocess_old_rules()


def preprocess_old(place_name):#{{{
    """Older, smaller rule set for generalising a town name.

    Numbered parts (N丁目, N番地, N条, ...) are replaced by bracketed
    placeholders, address-only wording is dropped, and finally placeholders,
    brackets and list punctuation are turned into single ASCII spaces.  The
    rules are applied strictly in table order; digits, tildes, hyphens,
    spaces and parentheses are recognised in both ASCII and full-width form.

    >>> preprocess_old("十条")
    ' '
    >>> preprocess_old("東大阪3丁目")
    '東大阪 '
    """
    for pattern, replacement in _PREPROCESS_OLD_RULES:
        place_name = pattern.sub(replacement, place_name)
    return place_name
#}}}
259+
260+
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    doctest.testmod()

0 commit comments

Comments
 (0)