|
3 | 3 | from typing import Dict, List |
4 | 4 |
|
5 | 5 | import pandas as pd |
| 6 | +from sqlalchemy import insert |
| 7 | + |
6 | 8 | from ..crud.metadata import cast_tabular_cell_value_type |
7 | 9 | from ..crud import dimension_types as types_crud |
8 | 10 |
|
@@ -33,11 +35,12 @@ class PropertyValuePair: |
33 | 35 | class MetadataCacheEntry: |
34 | 36 | properties_to_index_df: pd.DataFrame |
35 | 37 | columns_metadata: Dict[str, ColumnMetadata] |
36 | | - label_by_given_id: Dict[str, str] |
37 | | - rows_by_index: Dict[str, Dict[str, str]] |
| 38 | + |
| 39 | + def get_label_for_given_id(self, given_id): |
| 40 | + return self.properties_to_index_df.loc[given_id, "label"] |
38 | 41 |
|
39 | 42 | def get_properties_dict(self, given_id: str): |
40 | | - return self.rows_by_index.get(given_id) |
| 43 | + return self.properties_to_index_df.loc[given_id].to_dict() |
41 | 44 |
|
42 | 45 |
|
43 | 46 | class MetadataCache: |
@@ -74,19 +77,13 @@ def get(self, dimension_type_name: str): |
74 | 77 |
|
75 | 78 | columns_metadata = dict(dimension_type.dataset.columns_metadata) |
76 | 79 |
|
77 | | - label_by_given_id = get_dimension_type_metadata_col( |
78 | | - self.db, dimension_type_name=dimension_type.name, col_name="label" |
79 | | - ) |
80 | | - |
81 | 80 | rows_by_index = {} |
82 | 81 | for record in properties_to_index_df.to_records(): |
83 | 82 | rows_by_index[record.index] = record |
84 | 83 |
|
85 | 84 | entry = MetadataCacheEntry( |
86 | 85 | properties_to_index_df=properties_to_index_df, |
87 | 86 | columns_metadata=columns_metadata, |
88 | | - label_by_given_id=label_by_given_id, |
89 | | - rows_by_index=rows_by_index, |
90 | 87 | ) |
91 | 88 | self.cache[dimension_type_name] = entry |
92 | 89 |
|
@@ -140,36 +137,46 @@ def refresh_search_index_for_dimension_type( |
140 | 137 | _delete_search_index_records(db, dimension_type) |
141 | 138 | log.info("_delete_search_index_records complete") |
142 | 139 |
|
143 | | - dimension_search_index_rows = [] |
144 | | - |
145 | 140 | cache_entry = metadata_cache.get(dimension_type.name) |
146 | 141 |
|
147 | | - for given_id in cache_entry.properties_to_index_df.index: |
148 | | - for record in get_property_value_pairs_for_given_id( |
149 | | - db=db, |
150 | | - dimension_type_name=dimension_type.name, |
151 | | - given_id=given_id, |
152 | | - metadata_cache=metadata_cache, |
153 | | - ): |
154 | | - # if given_id in cache_entry.dimension_id_by_given_id: |
155 | | - dimension_search_index_rows.append( |
156 | | - DimensionSearchIndex( |
157 | | - # dimension_id=cache_entry.dimension_id_by_given_id[given_id], |
| 142 | + def row_generator(): |
| 143 | + for given_id in cache_entry.properties_to_index_df.index: |
| 144 | + for record in get_property_value_pairs_for_given_id( |
| 145 | + db=db, |
| 146 | + dimension_type_name=dimension_type.name, |
| 147 | + given_id=given_id, |
| 148 | + metadata_cache=metadata_cache, |
| 149 | + ): |
| 150 | + # if given_id in cache_entry.dimension_id_by_given_id: |
| 151 | + yield dict( |
158 | 152 | property=record.property, |
159 | 153 | value=record.value, |
160 | 154 | group_id=dimension_type.dataset.group_id, |
161 | 155 | dimension_type_name=dimension_type.name, |
162 | 156 | dimension_given_id=given_id, |
163 | | - label=cache_entry.label_by_given_id[given_id], |
| 157 | + label=cache_entry.get_label_for_given_id(given_id), |
164 | 158 | ) |
165 | | - ) |
| 159 | + |
| 160 | + dimension_search_index_row_count = 0 |
| 161 | + for batch in _make_batches(row_generator(), batch_size=1000): |
| 162 | + db.execute(insert(DimensionSearchIndex), batch) |
| 163 | + dimension_search_index_row_count += len(batch) |
| 164 | + f"Wrote batch of {len(batch)} search index records for {dimension_type_name}" |
166 | 165 |
|
167 | 166 | log.info( |
168 | | - f"refresh_search_index_for_dimension_type generated {len(dimension_search_index_rows)} search index records for {len(cache_entry.properties_to_index_df.index)} rows in {dimension_type_name}. Writing..." |
| 167 | + f"Finished writing all {(dimension_search_index_row_count)} search index records for {len(cache_entry.properties_to_index_df.index)} rows in {dimension_type_name}" |
169 | 168 | ) |
170 | 169 |
|
171 | | - db.bulk_save_objects(dimension_search_index_rows) |
172 | | - log.info("refresh_search_index_for_dimension_type complete") |
| 170 | + |
| 171 | +def _make_batches(iterable, batch_size): |
| 172 | + batch = [] |
| 173 | + for item in iterable: |
| 174 | + batch.append(item) |
| 175 | + if len(batch) >= batch_size: |
| 176 | + yield batch |
| 177 | + batch = [] |
| 178 | + if len(batch) > 0: |
| 179 | + yield batch |
173 | 180 |
|
174 | 181 |
|
175 | 182 | def _get_datatypes_referencing(db, dimension_type_name): |
|
0 commit comments