|
| 1 | +#!/bin/env python |
| 2 | +#coding: utf-8 |
| 3 | + |
| 4 | +import re |
| 5 | +import csv |
| 6 | +import doctest |
| 7 | +from collections import defaultdict |
| 8 | + |
def read_town_list(csvname):#{{{
    """Read a cp932-encoded address CSV and return its town names.

    Column 2 is compared between consecutive rows to detect continuation
    rows, column 6 is used as the area label and column 8 as the town name.
    NOTE(review): this looks like the Japan Post KEN_ALL layout (postal code,
    prefecture, town) -- confirm against the data source.

    A long town name may be split over several consecutive rows.  A row is
    treated as a continuation of the previous one when it has the same
    column-2 value and the text gathered so far still has unbalanced
    parentheses (see ``bracket_match``); its name is then appended to the
    pending name.

    Args:
        csvname: Path of the CSV file to read.

    Returns:
        A list of ``(town_name, "日本:" + area)`` tuples in file order.  The
        area is taken from the row that started the entry.  An empty file
        yields an empty list.

    Raises:
        IndexError: If a row has fewer than nine columns.
    """
    town_list = []
    line = None        # name being assembled; None until the first row is seen
    line_area = None   # area of the row that started ``line``
    last_number = None
    with open(csvname, newline='', encoding='cp932') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            place_number = row[2]
            place_area = row[6]
            place_name = row[8]
            # Same number as the previous row and the parentheses are still
            # open: this row continues the pending name.
            if (line is not None
                    and last_number == place_number
                    and not bracket_match(line)):
                line += place_name
            else:
                # Flush the finished entry together with its own area, not
                # the area of the row that happens to follow it.
                if line is not None:
                    town_list.append((line, "日本:" + line_area))
                line = place_name
                line_area = place_area
            last_number = place_number
    if line is not None:
        town_list.append((line, "日本:" + line_area))
    return town_list
#}}}
| 40 | + |
| 41 | + |
def bracket_match(string):#{{{
    """Check whether opening and closing parentheses occur equally often.

    Both ASCII parentheses and their full-width forms (U+FF08 / U+FF09) are
    counted, and the two forms are treated as interchangeable.  Only the
    counts are compared; nesting order is ignored.

    >>> bracket_match("((()))") # equal counts -> True
    True
    >>> bracket_match(")))(((") # order does not matter
    True
    >>> bracket_match("((())") # unequal counts -> False
    False
    >>> bracket_match("あいうえお") # no parentheses at all -> True
    True

    Args:
        string: Text to inspect.

    Returns:
        True if the number of opening parentheses equals the number of
        closing ones, otherwise False.
    """
    # Full-width characters are spelled as escapes so they cannot be mistaken
    # for (or silently normalised into) their ASCII look-alikes.
    opened = len(re.findall("[(\uff08]", string))
    closed = len(re.findall("[)\uff09]", string))
    return opened == closed
#}}}
| 59 | + |
# Character-class fragments used by the rewrite rules of ``preprocess``.
# Every symbol is accepted in both its ASCII and its full-width form.  The
# full-width characters are written as \u escapes so that they cannot be
# confused with, or silently normalised into, their ASCII look-alikes.
_PREPROCESS_TOKENS = {
    "d": "0-9\uff10-\uff19",        # ASCII and full-width digits
    "k": "一二三四五六七八九十",    # kanji numerals
    "t": "~\uff5e",                 # ASCII and full-width tilde
    "h": "\\-\uff0d",               # ASCII and full-width hyphen-minus
    "s": " \u3000",                 # ASCII and ideographic space
    "lp": "[(\uff08]",              # an opening parenthesis
    "rp": "[)\uff09]",              # a closing parenthesis
    "nrp": "[^)\uff09]",            # anything except a closing parenthesis
}

# Ordered (compiled pattern, replacement) pairs.  The order matters: earlier
# rules leave "[...]" placeholders that later rules rely on or remove.
# NOTE(review): bare parentheses in the numbered-unit rules are read as
# literal brackets in the data, not regex groups -- confirm against the data.
_PREPROCESS_RULES = tuple(
    (re.compile(template.format(**_PREPROCESS_TOKENS)), replacement)
    for template, replacement in (
        # --- generalise numbered units into placeholders ---
        (r"[{k}{d}]+条", "[N条]"),
        (r"[{k}{d}]+筋目", "[N条]"),
        (r"[{k}{d}{t}]+区", "[N区]"),
        (r"[{k}{d}{t}]+線", "[N線]"),
        (r"[{k}{d}{t}]+号", "[N号]"),
        # The longer alternative comes first; otherwise it can never match.
        (r"第[{k}{d}{t}]+(?:町内|町)?", "[第N]"),
        (r"[{d}{t}〜、の]+丁目", "[N丁目]"),
        (r"{lp}[{d}]+丁{rp}", ""),
        (r"[{d}{t}〜{h}、・の]+番地", "[N番地]"),
        (r"[{d}{t}〜{h}、の]+番町", "[N番町]"),
        (r"[{d}{t}〜{h}、{s}]+地割", "[N地割]"),
        (r"[{d}{t}〜{h}]+の通り", "[Nの通り]"),
        (r"{lp}[{d}{t}〜{h}]+階{rp}", "[N階]"),
        (r"[{d}{t}〜{h}]+以[上下]{rp}", "[以上])"),
        (r"「[{d}{t}〜{h}、]+を除く」", "[を除く])"),
        (r"{lp}[東西南北{d}〜]+{rp}", ""),
        (r"[{d}{t}〜{h}、の{s}]+{rp}", "[番地])"),
        (r"[{k}{d}{t}・]+番", "[N番]"),
        (r"「[{d}{t}〜{h}、]*」", ""),
        (r"[{d}]+[{t}〜{h}、の]+[{d}{t}〜{h}、の]+", "[番地]"),
        # --- drop address-specific wording ---
        (r"駅[東西南北]$", ""),
        (r"その他", ""),
        (r"番地のみ", ""),
        (r"次のビルを除く", ""),
        (r"地階・階層不明", ""),
        (r"・", " "),
        (r"市街地", ""),
        # --- remove brackets that are opened but never closed ---
        (r"{lp}{nrp}*$", ""),
        (r"「[^」]*$", ""),
        # --- turn placeholders and leftover punctuation into spaces ---
        (r"\[.*?\]", " "),
        (r"{lp}[{s}]*{rp}", " "),
        (r"{lp}", " "),
        (r"「", " "),
        (r"」", " "),
        (r"{rp}", " "),
        (r"、", " "),
        # "地割 ~ 地割" ranges
        (r"地割[{s}]*[{t}][{s}]*地割", " "),
        # direction words: 西入 / 東入 / 上る / 下る and trailing side markers
        (r"[西東]入", " "),
        (r"[上下]る", " "),
        (r"[東西南北]側$", " "),
        (r"の[上下前後外]$", " "),
        # --- collapse runs of blanks into a single ASCII space ---
        (r"[{s}]+", " "),
    )
)


def preprocess(place_name):#{{{
    """Normalise a town name by stripping numbering and address boilerplate.

    Numbered parts (blocks, lot numbers, floors, ...) are first replaced by
    bracketed placeholders, address-specific phrases are deleted, unclosed
    brackets are cut off, and finally placeholders and punctuation become
    single spaces.  Digits, tildes, hyphens, spaces and parentheses are
    recognised in both ASCII and full-width form.

    >>> preprocess("十区")
    ' '
    >>> preprocess("東大阪3丁目")
    '東大阪 '

    Args:
        place_name: Raw town name.

    Returns:
        The normalised name.  Removed parts leave a single ASCII space
        behind, so the result may start or end with a space.
    """
    for regex, replacement in _PREPROCESS_RULES:
        place_name = regex.sub(replacement, place_name)
    return place_name
#}}}
| 121 | + |
def is_townname(name):#{{{
    """Decide whether ``name`` is usable as a town name.

    Rejected are: the placeholder entry "以下に掲載がない場合", names
    containing the nuclear-plant site marker, fragments made up solely of
    digits / separators / parentheses (including the empty string), names
    with a closing bracket that has no opening partner, and names ending in
    "の次に番地がくる場合".  Digits, hyphens, tildes and parentheses are
    recognised in both ASCII and full-width form.

    >>> is_townname("御徒町")
    True
    >>> is_townname("以下に掲載がない場合")
    False
    >>> is_townname("境町の次に番地がくる場合")
    False
    >>> is_townname("12、13")
    False

    Args:
        name: Candidate town name.

    Returns:
        True if the name passes every filter, otherwise False.
    """
    if name == "以下に掲載がない場合":
        return False
    if "〔東京電力福島第二原子力発電所構内〕" in name:
        return False
    # Leftover fragment produced when a lot-number list was split across
    # rows: nothing but digits, hyphens, commas, tildes, "の" and brackets.
    # Full-width characters are spelled as escapes to keep them distinct
    # from their ASCII look-alikes.
    if re.match(
            "^[0-9\uff10-\uff19\\-\uff0d、~\uff5e〜の()\uff08\uff09]*$", name):
        return False
    # Closing bracket without a matching opening one.
    if (re.match("^[^(\uff08]*[)\uff09]$", name)
            or re.match(r"^[^「]*」.*$", name)):
        return False
    if re.match(r"^.*の次に番地がくる場合$", name):
        return False
    return True
#}}}
| 147 | + |
def get_all_prefix_map(str1, str2, prefix_map):#{{{
    """Record every common prefix of two or more characters with its suffixes.

    For each common prefix of ``str1`` and ``str2`` that is at least two
    characters long, the remainder of ``str1`` and the remainder of ``str2``
    are appended (in that order) to ``prefix_map[prefix]``.  A split is
    skipped when the prefix ends in hiragana and both remainders start with
    hiragana.  ``prefix_map`` is modified in place and also returned; the
    examples pass a ``defaultdict(list)``.

    >>> sorted(get_all_prefix_map("東大阪市","東大阪町",defaultdict(list)).items())
    [('東大', ['阪市', '阪町']), ('東大阪', ['市', '町'])]
    >>> sorted(get_all_prefix_map("あいうえお","あいうえおか",defaultdict(list)).items())
    [('あいうえお', ['', 'か'])]
    >>> sorted(get_all_prefix_map("あい","あい2",get_all_prefix_map("あい","あい1",defaultdict(list))).items())
    [('あい', ['', '1', '', '2'])]
    >>> sorted(get_all_prefix_map("あいうえお","あえおか",defaultdict(list)).items())
    []
    >>> sorted(get_all_prefix_map('末吉', '住吉',defaultdict(list)).items())
    []
    """
    for end, (left, right) in enumerate(zip(str1, str2), start=1):
        if left != right:
            # The common prefix stops here; nothing longer can match.
            break
        if end < 2:
            # One-character prefixes are never recorded.
            continue
        head = str1[:end]
        tail1 = str1[end:]
        tail2 = str2[end:]
        # Do not split in the middle of a hiragana run.
        inside_hiragana = (re.search("[ぁ-ん]$", head)
                           and re.search("^[ぁ-ん]", tail1)
                           and re.search("^[ぁ-ん]", tail2))
        if not inside_hiragana:
            prefix_map[head].extend((tail1, tail2))
    return prefix_map
#}}}
| 176 | + |
# Not used.
def get_prefix(str1, str2):#{{{
    """Return the longest common prefix of ``str1`` and ``str2``.

    The result may be a single character, or the empty string when the
    first characters already differ or either input is empty.

    >>> get_prefix("abcd","abcefg")
    'abc'
    >>> get_prefix("abcd","bcd")
    ''
    """
    matched = 0
    for left, right in zip(str1, str2):
        if left != right:
            break
        matched += 1
    return str1[:matched]
#}}}
def get_all_prefix(str1, str2):#{{{
    """Return every common prefix of two or more characters, shortest first.

    The strings are compared from their first character, so two strings that
    differ at position 0 have no common prefix even if later characters
    coincide.

    >>> get_all_prefix("あいうえお","あいうえおか")
    ['あい', 'あいう', 'あいうえ', 'あいうえお']
    >>> get_all_prefix("あいうえお","あえおか")
    []
    >>> get_all_prefix("末吉","住吉")
    []

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A list of the common prefixes of length >= 2, in increasing length.
    """
    prefixes = []
    for end, (left, right) in enumerate(zip(str1, str2), start=1):
        if left != right:
            break
        # One-character prefixes are compared but not reported.
        if end >= 2:
            prefixes.append(str1[:end])
    return prefixes
#}}}
# Character-class fragments used by the rewrite rules of ``preprocess_old``.
# Every symbol is accepted in both its ASCII and its full-width form.  The
# full-width characters are written as \u escapes so that they cannot be
# confused with, or silently normalised into, their ASCII look-alikes.
_PREPROCESS_OLD_TOKENS = {
    "d": "0-9\uff10-\uff19",        # ASCII and full-width digits
    "k": "一二三四五六七八九十",    # kanji numerals
    "t": "~\uff5e",                 # ASCII and full-width tilde
    "h": "\\-\uff0d",               # ASCII and full-width hyphen-minus
    "s": " \u3000",                 # ASCII and ideographic space
    "lp": "[(\uff08]",              # an opening parenthesis
    "rp": "[)\uff09]",              # a closing parenthesis
    "nrp": "[^)\uff09]",            # anything except a closing parenthesis
}

# Ordered (compiled pattern, replacement) pairs.  The order matters: earlier
# rules leave "[...]" placeholders that later rules rely on or remove.
# NOTE(review): bare parentheses in the numbered-unit rules are read as
# literal brackets in the data, not regex groups -- confirm against the data.
_PREPROCESS_OLD_RULES = tuple(
    (re.compile(template.format(**_PREPROCESS_OLD_TOKENS)), replacement)
    for template, replacement in (
        # --- generalise numbered units into placeholders ---
        (r"[{k}{d}]+条", "[N条]"),
        (r"[{k}{d}{t}]+区", "[N区]"),
        (r"[{k}{d}{t}]+線", "[N線]"),
        (r"[{k}{d}{t}]+号", "[N号]"),
        # The longer alternative comes first; otherwise it can never match.
        (r"第[{k}{d}{t}]+(?:町内|町)?", "[第N]"),
        (r"[{d}{t}〜、の]+丁目", "[N丁目]"),
        (r"{lp}[{d}]+丁{rp}", ""),
        (r"[{d}{t}〜{h}、・の]+番地", "[N番地]"),
        (r"[{d}{t}〜{h}、の]+番町", "[N番町]"),
        (r"[{d}{t}〜{h}、{s}]+地割", "[N地割]"),
        (r"[{d}{t}〜{h}]+の通り", "[Nの通り]"),
        (r"{lp}[{d}{t}〜{h}]+階{rp}", "[N階]"),
        (r"[{d}{t}〜{h}]+以[上下]{rp}", "[以上])"),
        (r"「[{d}{t}〜{h}、]+を除く」", "[を除く])"),
        (r"{lp}[東西南北{d}〜]+{rp}", ""),
        (r"[{d}{t}〜{h}、の{s}]+{rp}", "[番地])"),
        (r"[{k}{d}{t}・]+番", "[N番]"),
        (r"「[{d}{t}〜{h}、]*」", ""),
        (r"[{d}]+[{t}〜{h}、の]+[{d}{t}〜{h}、の]+", "[番地]"),
        # --- drop address-specific wording ---
        (r"その他", ""),
        (r"次のビルを除く", ""),
        (r"地階・階層不明", ""),
        (r"・", " "),
        (r"市街地", ""),
        # --- remove brackets that are opened but never closed ---
        (r"{lp}{nrp}*$", ""),
        (r"「[^」]*$", ""),
        # --- turn placeholders and leftover punctuation into spaces ---
        (r"\[.*?\]", " "),
        (r"{lp}[{s}]*{rp}", " "),
        (r"{lp}", " "),
        (r"「", " "),
        (r"」", " "),
        (r"{rp}", " "),
        (r"、", " "),
        # --- collapse runs of blanks into a single ASCII space ---
        (r"[{s}]+", " "),
    )
)


def preprocess_old(place_name):#{{{
    """Older variant of the town-name normaliser.

    Numbered parts (blocks, lot numbers, floors, ...) are first replaced by
    bracketed placeholders, address-specific phrases are deleted, unclosed
    brackets are cut off, and finally placeholders and punctuation become
    single spaces.  Digits, tildes, hyphens, spaces and parentheses are
    recognised in both ASCII and full-width form.

    >>> preprocess_old("十条")
    ' '
    >>> preprocess_old("東大阪3丁目")
    '東大阪 '

    Args:
        place_name: Raw town name.

    Returns:
        The normalised name.  Removed parts leave a single ASCII space
        behind, so the result may start or end with a space.
    """
    for regex, replacement in _PREPROCESS_OLD_RULES:
        place_name = regex.sub(replacement, place_name)
    return place_name
#}}}
| 259 | + |
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module's docstrings.
    doctest.testmod()
0 commit comments