|
# Table naming conventions.
VIEW_SUFFIX = "_view"

# In distributed mode tables are sharded randomly unless they are listed here.
# Listed tables are sharded by sipHash64(<field>) instead, so that all data for
# a specific ID goes to the same shard, which enables efficient point lookups.
# Key: table name. Value: the field used for sharding.
ID_SHARDED_TABLES: dict[str, str] = {
    "calls_complete": "id",
}
| 93 | + |
88 | 94 |
|
89 | 95 | @dataclass(frozen=True) |
90 | 96 | class PostMigrationHookContext: |
@@ -781,12 +787,21 @@ def _format_distributed_sql(self, sql_query: str) -> DistributedTransformResult: |
781 | 787 | ) |
782 | 788 |
|
def _create_distributed_table_sql(self, table_name: str) -> str:
    """Build the CREATE TABLE statement for the distributed table `table_name`.

    The statement creates `table_name` on `self.replicated_cluster` with the
    structure of the local table (`table_name` plus
    `ch_settings.LOCAL_TABLE_SUFFIX`), using the Distributed engine over
    that local table.

    Sharding key:
        * Tables listed in `ID_SHARDED_TABLES` use `sipHash64(<field>)`, so
          all data for a specific ID goes to the same shard, enabling
          efficient point lookups.
        * Every other table uses `rand()` for even distribution.
    """
    local_table_name = table_name + ch_settings.LOCAL_TABLE_SUFFIX
    id_field = ID_SHARDED_TABLES.get(table_name)
    # No entry (or an empty field name) means the table is sharded randomly.
    sharding_key = f"sipHash64({id_field})" if id_field else "rand()"
    return f"""
    CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {self.replicated_cluster}
    AS {local_table_name}
    ENGINE = Distributed({self.replicated_cluster}, currentDatabase(), {local_table_name}, {sharding_key})
    """
791 | 806 |
|
792 | 807 | @staticmethod |
|
0 commit comments