3131
3232LOG = logging .getLogger ()
3333
# Pairs of (index name suffix, value of the word table's 'type' column).
# One partial index on the 'word' column is maintained per entry.
WORD_TYPES = (
    ('country_names', 'C'),
    ('postcodes', 'P'),
    ('full_word', 'W'),
    ('housenumbers', 'H'),
)
38+
3439def create (dsn : str , data_dir : Path ) -> 'ICUTokenizer' :
3540 """ Create a new instance of the tokenizer provided by this module.
3641 """
@@ -62,7 +67,8 @@ def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
6267
6368 if init_db :
6469 self .update_sql_functions (config )
65- self ._init_db_tables (config )
70+ self ._setup_db_tables (config , 'word' )
71+ self ._create_base_indices (config , 'word' )
6672
6773
6874 def init_from_project (self , config : Configuration ) -> None :
@@ -80,9 +86,7 @@ def finalize_import(self, config: Configuration) -> None:
8086 """ Do any required postprocessing to make the tokenizer data ready
8187 for use.
8288 """
83- with connect (self .dsn ) as conn :
84- sqlp = SQLPreprocessor (conn , config )
85- sqlp .run_sql_file (conn , 'tokenizer/legacy_tokenizer_indices.sql' )
89+ self ._create_lookup_indices (config , 'word' )
8690
8791
8892 def update_sql_functions (self , config : Configuration ) -> None :
@@ -100,24 +104,35 @@ def check_database(self, config: Configuration) -> None:
100104 self .init_from_project (config )
101105
102106
def update_statistics(self, config: Configuration) -> None:
    """ Recompute frequencies for all name words.

        The word table is not updated in place. A copy named 'tmp_word'
        is created which has the recomputed counts merged into the
        'info' column. The copy then gets its indexes and finally
        replaces the 'word' table.

        Nothing is done when the 'search_name' table does not exist.
    """
    with connect(self.dsn) as conn:
        if not conn.table_exists('search_name'):
            return

        with conn.cursor() as cur:
            LOG.info('Computing word frequencies')
            cur.drop_table('word_frequencies')
            cur.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id""")
            cur.execute('CREATE INDEX ON word_frequencies(id)')
            LOG.info('Update word table with recomputed frequencies')
            cur.drop_table('tmp_word')
            # LEFT JOIN keeps words that do not appear in search_name;
            # their info column is copied unchanged.
            cur.execute("""CREATE TABLE tmp_word AS
                            SELECT word_id, word_token, type, word,
                                   (CASE WHEN wf.count is null THEN info
                                      ELSE info || jsonb_build_object('count', wf.count)
                                    END) as info
                            FROM word LEFT JOIN word_frequencies wf
                                 ON word.word_id = wf.id""")
            cur.drop_table('word_frequencies')

        # A table made with CREATE TABLE AS starts without the privileges
        # of the table it was copied from. Grant read access to the web
        # user here, otherwise it is lost once tmp_word replaces word.
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_string(conn,
                        'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
        conn.commit()

    # The new table must be committed before the helpers below open
    # their own connections to index and rename it.
    self._create_base_indices(config, 'tmp_word')
    self._create_lookup_indices(config, 'tmp_word')
    self._move_temporary_word_table('tmp_word')
135+
121136
122137
123138 def _cleanup_housenumbers (self ) -> None :
@@ -219,16 +234,81 @@ def _save_config(self) -> None:
219234 self .loader .save_config_to_db (conn )
220235
221236
222- def _init_db_tables (self , config : Configuration ) -> None :
def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
    """ Drop the word table with the given name and create it anew,
        together with its sequence 'seq_<table_name>'. Read access to
        both is granted to the configured web user.
    """
    create_sql = """
        CREATE TABLE {{table_name}} (
              word_id INTEGER,
              word_token text NOT NULL,
              type text NOT NULL,
              word text,
              info jsonb
            ) {{db.tablespace.search_data}};
        GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";

        DROP SEQUENCE IF EXISTS seq_{{table_name}};
        CREATE SEQUENCE seq_{{table_name}} start 1;
        GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
    """
    with connect(self.dsn) as conn:
        # Remove any previous table before the template recreates it.
        with conn.cursor() as cur:
            cur.drop_table(table_name)
        preprocessor = SQLPreprocessor(conn, config)
        preprocessor.run_string(conn, create_sql, table_name=table_name)
259+
260+
def _create_base_indices(self, config: Configuration, table_name: str) -> None:
    """ Create the basic indexes on the given word table: one index
        over 'word_token' and one partial index over 'word' for each
        entry in WORD_TYPES.
    """
    token_index_sql = """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                         USING BTREE (word_token) {{db.tablespace.search_index}}"""
    typed_index_sql = """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                         USING BTREE (word) {{db.tablespace.address_index}}
                         WHERE type = '{{column_type}}'
                      """
    with connect(self.dsn) as conn:
        preprocessor = SQLPreprocessor(conn, config)
        preprocessor.run_string(conn, token_index_sql, table_name=table_name)
        # One partial index per word type, restricted to rows of that type.
        for idx_name, column_type in WORD_TYPES:
            preprocessor.run_string(conn, typed_index_sql,
                                    table_name=table_name, idx_name=idx_name,
                                    column_type=column_type)
279+
280+
def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
    """ Create additional indexes used when running the API.
    """
    # Index required for details lookup.
    word_id_index_sql = """
        CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
          ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
    """
    with connect(self.dsn) as conn:
        preprocessor = SQLPreprocessor(conn, config)
        preprocessor.run_string(conn, word_id_index_sql, table_name=table_name)
292+
293+
def _move_temporary_word_table(self, old: str) -> None:
    """ Replace the 'word' table with the table named `old`: drop the
        current 'word' table, rename `old` to 'word' and rename its
        indexes from 'idx_<old>_*' to 'idx_word_*'.
    """
    # Fixed indexes first, then the per-type partial indexes.
    index_suffixes = ('word_token', 'word_id',
                      *(type_name for type_name, _ in WORD_TYPES))
    with connect(self.dsn) as conn:
        with conn.cursor() as cur:
            cur.drop_table('word')
            cur.execute(f"ALTER TABLE {old} RENAME TO word")
            for suffix in index_suffixes:
                cur.execute(f"""ALTER INDEX idx_{old}_{suffix}
                                  RENAME TO idx_word_{suffix}""")
        conn.commit()
230308
231309
310+
311+
232312class ICUNameAnalyzer (AbstractAnalyzer ):
233313 """ The ICU analyzer uses the ICU library for splitting names.
234314
0 commit comments