Merge pull request #3328 from lonvia/word-count-into-new-table
Recreate word table when refreshing counts
lonvia authored Feb 5, 2024
2 parents 33c0f24 + 81eed06 commit f523c01
Showing 10 changed files with 130 additions and 82 deletions.
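In short: recomputing word statistics (for instance via "nominatim refresh --word-counts", assuming the standard CLI entry point) no longer updates counts in place in the word table. The ICU tokenizer now writes the recomputed counts into a fresh tmp_word table, recreates the indices on it and renames it over the old table. To run the required table- and index-creation SQL, the tokenizer's update_statistics() hook now receives the project Configuration, which is why the call sites and tests below change as well.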
40 changes: 0 additions & 40 deletions lib-sql/tokenizer/icu_tokenizer_tables.sql

This file was deleted.

2 changes: 1 addition & 1 deletion nominatim/clicmd/refresh.py
@@ -110,7 +110,7 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, t

if args.word_counts:
LOG.warning('Recompute word statistics')
self._get_tokenizer(args.config).update_statistics()
self._get_tokenizer(args.config).update_statistics(args.config)

if args.address_levels:
LOG.warning('Updating address levels')
2 changes: 1 addition & 1 deletion nominatim/clicmd/setup.py
@@ -169,7 +169,7 @@ def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements
tokenizer.finalize_import(args.config)

LOG.warning('Recompute word counts')
tokenizer.update_statistics()
tokenizer.update_statistics(args.config)

webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
12 changes: 12 additions & 0 deletions nominatim/db/sql_preprocessor.py
@@ -90,6 +90,18 @@ def __init__(self, conn: Connection, config: Configuration) -> None:
self.env.globals['postgres'] = _setup_postgresql_features(conn)


def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
""" Execute the given SQL template string on the connection.
The keyword arguments may supply additional parameters
for preprocessing.
"""
sql = self.env.from_string(template).render(**kwargs)

with conn.cursor() as cur:
cur.execute(sql)
conn.commit()


def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
""" Execute the given SQL file on the connection. The keyword arguments
may supply additional parameters for preprocessing.
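A minimal usage sketch of the new run_string() helper, for orientation only; dsn, config and the template shown here are illustrative and not part of this change:

    from nominatim.db.connection import connect
    from nominatim.db.sql_preprocessor import SQLPreprocessor

    with connect(dsn) as conn:  # dsn assumed to point at the Nominatim database
        sqlp = SQLPreprocessor(conn, config)
        # Keyword arguments become Jinja2 template variables; the rendered
        # statement is executed and committed on the given connection.
        sqlp.run_string(conn,
                        "CREATE INDEX IF NOT EXISTS idx_{{name}}_word_id ON {{name}} (word_id)",
                        name='tmp_word')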
2 changes: 1 addition & 1 deletion nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]:


@abstractmethod
def update_statistics(self) -> None:
def update_statistics(self, config: Configuration) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
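Since the abstract signature now takes the configuration, every caller must pass it along. A rough illustration (how the tokenizer object is obtained is assumed here, not part of this diff):

    from nominatim.tokenizer import factory as tokenizer_factory

    tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
    tokenizer.update_statistics(config)  # the Configuration argument is now required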
120 changes: 100 additions & 20 deletions nominatim/tokenizer/icu_tokenizer.py
@@ -31,6 +31,11 @@

LOG = logging.getLogger()

WORD_TYPES =(('country_names', 'C'),
('postcodes', 'P'),
('full_word', 'W'),
('housenumbers', 'H'))

def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
@@ -62,7 +67,8 @@ def init_new_db(self, config: Configuration, init_db: bool = True) -> None:

if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
self._setup_db_tables(config, 'word')
self._create_base_indices(config, 'word')


def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
self._create_lookup_indices(config, 'word')


def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,35 @@ def check_database(self, config: Configuration) -> None:
self.init_from_project(config)


def update_statistics(self) -> None:
def update_statistics(self, config: Configuration) -> None:
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word
SET info = info || jsonb_build_object('count', count)
FROM word_frequencies WHERE word_id = id""")
cur.drop_table("word_frequencies")
if not conn.table_exists('search_name'):
return

with conn.cursor() as cur:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.count is null THEN info
ELSE info || jsonb_build_object('count', wf.count)
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id""")
cur.drop_table('word_frequencies')
conn.commit()
self._create_base_indices(config, 'tmp_word')
self._create_lookup_indices(config, 'tmp_word')
self._move_temporary_word_table('tmp_word')



def _cleanup_housenumbers(self) -> None:
@@ -219,16 +234,81 @@ def _save_config(self) -> None:
self.loader.save_config_to_db(conn)


def _init_db_tables(self, config: Configuration) -> None:
def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table(table_name)
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE {{table_name}} (
word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
word text,
info jsonb
) {{db.tablespace.search_data}};
GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_{{table_name}};
CREATE SEQUENCE seq_{{table_name}} start 1;
GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
""", table_name=table_name)


def _create_base_indices(self, config: Configuration, table_name: str) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
USING BTREE (word_token) {{db.tablespace.search_index}}""",
table_name=table_name)
for name, ctype in WORD_TYPES:
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
USING BTREE (word) {{db.tablespace.address_index}}
WHERE type = '{{column_type}}'
""",
table_name=table_name, idx_name=name,
column_type=ctype)


def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
""" Create addtional indexes used when running the API.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
# Index required for details lookup.
sqlp.run_string(conn, """
CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
""",
table_name=table_name)


def _move_temporary_word_table(self, old: str) -> None:
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
RENAME TO idx_word_{idx}""")
for name, _ in WORD_TYPES:
cur.execute(f"""ALTER INDEX idx_{old}_{name}
RENAME TO idx_word_{name}""")
conn.commit()




class ICUNameAnalyzer(AbstractAnalyzer):
""" The ICU analyzer uses the ICU library for splitting names.
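Taken together, update_statistics() in the ICU tokenizer now builds a complete replacement table instead of patching counts into the live word table: frequencies are computed into a temporary table, merged into tmp_word, the indices are recreated on tmp_word, and the finished table is renamed into place, so the existing word table stays untouched until the replacement is fully indexed. The final swap amounts to roughly the following simplified sketch (single index shown; the authoritative code is _move_temporary_word_table above):

    with connect(dsn) as conn:  # dsn assumed as in the tokenizer
        with conn.cursor() as cur:
            cur.execute('DROP TABLE IF EXISTS word')
            cur.execute('ALTER TABLE tmp_word RENAME TO word')
            cur.execute('ALTER INDEX idx_tmp_word_word_token RENAME TO idx_word_word_token')
        conn.commit()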
2 changes: 1 addition & 1 deletion nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None:
self._save_config(conn, config)


def update_statistics(self) -> None:
def update_statistics(self, _: Configuration) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
4 changes: 2 additions & 2 deletions test/python/cli/conftest.py
@@ -38,10 +38,10 @@ def update_sql_functions(self, *args):
def finalize_import(self, *args):
self.finalize_import_called = True

def update_statistics(self):
def update_statistics(self, *args):
self.update_statistics_called = True

def update_word_tokens(self):
def update_word_tokens(self, *args):
self.update_word_tokens_called = True


20 changes: 8 additions & 12 deletions test/python/tokenizer/test_icu.py
@@ -7,7 +7,6 @@
"""
Tests for ICU tokenizer.
"""
import shutil
import yaml
import itertools

@@ -32,8 +31,6 @@ def test_config(project_env, tmp_path):
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))

project_env.lib_dir.sql = sqldir

@@ -204,16 +201,14 @@ def test_update_sql_functions(db_prop, temp_db_cursor,

def test_finalize_import(tokenizer_factory, temp_db_conn,
temp_db_cursor, test_config, sql_preprocessor_cfg):
func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")

tok = tokenizer_factory()
tok.init_new_db(test_config)

assert not temp_db_conn.index_exists('idx_word_word_id')

tok.finalize_import(test_config)

temp_db_cursor.scalar('SELECT test()') == 'b'
assert temp_db_conn.index_exists('idx_word_word_id')


def test_check_database(test_config, tokenizer_factory,
@@ -224,19 +219,20 @@ def test_check_database(test_config, tokenizer_factory,
assert tok.check_database(test_config) is None


def test_update_statistics_reverse_only(word_table, tokenizer_factory):
def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
tok = tokenizer_factory()
tok.update_statistics()
tok.update_statistics(test_config)


def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
def test_update_statistics(word_table, table_factory, temp_db_cursor,
tokenizer_factory, test_config):
word_table.add_full_word(1000, 'hello')
table_factory('search_name',
'place_id BIGINT, name_vector INT[]',
[(12, [1000])])
tok = tokenizer_factory()

tok.update_statistics()
tok.update_statistics(test_config)

assert temp_db_cursor.scalar("""SELECT count(*) FROM word
WHERE type = 'W' and
8 changes: 4 additions & 4 deletions test/python/tokenizer/test_legacy.py
@@ -238,19 +238,19 @@ def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
assert tok.check_database(False) is not None


def test_update_statistics_reverse_only(word_table, tokenizer_factory):
def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
tok = tokenizer_factory()
tok.update_statistics()
tok.update_statistics(test_config)


def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
word_table.add_full_word(1000, 'hello')
table_factory('search_name',
'place_id BIGINT, name_vector INT[]',
[(12, [1000])])
tok = tokenizer_factory()

tok.update_statistics()
tok.update_statistics(test_config)

assert temp_db_cursor.scalar("""SELECT count(*) FROM word
WHERE word_token like ' %' and
