---
title: WarehousePG Copy configuration file
navTitle: whpg-copy configuration file
description: The complete reference to the WarehousePG Copy configuration file.
---

This is the reference for the TOML-based WarehousePG Copy configuration file.

!!! Note
    Command-line arguments take precedence over settings defined in a TOML file.

You can generate a sample configuration file using the [whpg-copy config-example](whpg-copy-utility#config-example) command.

```toml
# ==============================================================================
# whpg-copy Configuration Example
# ==============================================================================
# Source database connection URL.
# Supports standard PostgreSQL connection strings.
src_url = "postgres://gpadmin:@10.0.0.1:5432/source_db"
# Destination database connection URL.
dst_url = "postgres://gpadmin:@10.0.0.2:5432/target_db"
# Tables to include in the copy operation. If schema or table names contain
# uppercase letters or other special characters, quote them following
# PostgreSQL's qualified identifier rules.
# Default: None
include_table = [
    "public.users",
    "\"Inventory\".\"StockItems\""
]
# Tables to exclude.
# Default: None
exclude_table = [
    "public.temp_cache",
    "sales.test_data"
]
# Enable compression during data transfer to reduce network bandwidth usage.
# Recommended for transfers over WAN or slow networks.
# Default: true
compression = true
# Copy partitioned tables through their leaf partitions in parallel.
# Disable this to force data to go through the root or an intermediate
# partition table.
# Default: true
through_partition_leaves = true
# How to handle existing tables on the destination:
# "append"        : Insert data into existing tables.
# "truncate"      : Clear the destination table before copying.
# "skip-existing" : Do not copy if the table already exists.
# Default: append
target_mode = "append"
# Validation method to perform after copying data:
# "none"     : No validation.
# "count"    : Compare row counts between source and destination.
# "checksum" : Calculate and compare data hashes.
# Default: none
validate_method = "count"
# Number of parallel workers to run.
# Default: 4
workers = 4
# Listening port range on the destination for data transfer. These ports
# must be reachable from the source segments. whpg-copy tries to listen
# on the ports in the range one by one. The range must contain at least
# one port.
port_range = "60000-60001"
# Define rules to rename schemas or tables during the copy process.
# Each rule requires at least a source pattern.
#
# public.sales -> new_schema.sales
[[mapping_rules]]
src_table = "sales"
dst_schema = "new_schema"
# old_schema.raw_logs -> new_schema.processed_logs
[[mapping_rules]]
src_schema = "old_schema"
src_table = "raw_logs"
dst_schema = "new_schema"
dst_table = "processed_logs"
# Rename using regex capture groups, e.g. old_schema1.old_table2 -> new_schema1.new_table2
[[mapping_rules]]
src_schema = "old_schema(\\d+)"
dst_schema = "new_schema${1}"
src_table = "old_table(\\d+)"
dst_table = "new_table${1}"
# Run the operation without actually changing any data on the destination.
dry_run = false
```

## Keywords and values

**src_url**

Connection string for the source database. Supports standard PostgreSQL connection strings in the format `postgres://[user@]host[:port][/dbname]`.

**dst_url**

Connection string for the destination database. It follows the format `postgres://[user@]host[:port][/dbname]`.
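
For example, a minimal pair of URLs in this format (the hosts, port, user, and database names below are placeholders for your own values):

```toml
# Placeholder connection URLs; point them at the source and destination clusters.
src_url = "postgres://gpadmin@10.0.0.1:5432/source_db"
dst_url = "postgres://gpadmin@10.0.0.2:5432/target_db"
```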

**include_table**

Specifies tables to include. Use the format `schema.table` to specify the relations. If schema or table names contain uppercase letters or other special characters, quote them following PostgreSQL's qualified identifier rules.

**exclude_table**

Specifies tables to exclude. Uses the same format as `include_table`.
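
For example, a hypothetical `include_table` entry where a mixed-case schema and table are double-quoted following PostgreSQL's qualified identifier rules (with the quotes escaped inside the TOML string):

```toml
# Illustrative names; quoted identifiers keep their exact case.
include_table = [
    "sales.orders",
    "\"Finance\".\"Invoices\""
]
```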

**compression**

Enables or disables ZSTD compression during data transfer. Recommended for transfers over WAN or slow networks. Default is `true`.

**through_partition_leaves**

If `true` (default), copies data directly between leaf partitions in parallel. If `false`, data goes through the root or intermediate partition table.

**target_mode**

Determines how to handle existing tables on the destination. The supported options are:
- **append**: (Default) Inserts data into existing tables.
- **truncate**: Truncates the destination table before copying.
- **skip-existing**: Skips the copy operation if the table already exists.
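
For example, a configuration that re-populates existing destination tables on every run might set:

```toml
# Clear each destination table before copying data into it.
target_mode = "truncate"
```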

**validate_method**

Validation to perform after copying. The supported options are:
- **none**: (Default) No validation.
- **count**: Compares row counts.
- **checksum**: Calculates and compares data hashes.

**workers**

Specifies the number of concurrent worker tasks. Default is 4.

**port_range**

Defines the ports on the destination cluster used to receive data from the source segments. `whpg-copy` scans this range sequentially and binds to the first available port it finds. You must specify at least one port and ensure the entire range is accessible from the source segment hosts.
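
For example, a wider range (the numbers here are placeholders) gives `whpg-copy` more candidates to try if some ports are already in use:

```toml
# Placeholder range; open every port in it from the source segment hosts
# to the destination, since whpg-copy binds to the first free one it finds.
port_range = "60000-60010"
```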

**mapping_rules**

Mapping rules allow for powerful renaming and selection logic using regular expressions (regex). You can define multiple `[[mapping_rules]]` blocks in your configuration file. The supported options are:

- **src_schema**, **src_table**: Define the source objects. These fields use standard Rust regular expression patterns to match your source objects. Patterns are automatically anchored (wrapped in `^` and `$`). For example, `src_table = "users"` matches only the table `users`, not `super_users`. To match multiple tables, use the `.*` wildcard.
- **dst_schema**, **dst_table**: Define the destination objects. These fields support regex capture groups. If your source pattern contains groups in parentheses `()`, you can reference them in the destination using `${1}`, `${2}`, and so on.
- **sql**: Custom SQL query to use for extracting data from the source table. Instead of copying the entire table, `whpg-copy` executes this SQL and copies its result. The query supports the placeholders `${src_schema}` and `${src_table}`, which the utility automatically replaces with the escaped source objects. This is ideal for joining tables, masking sensitive data, or changing data types on the fly.

!!! Note
    If your mapping rule involves a rename (the destination schema or table name is different from the source), `whpg-copy` cannot automatically create the table on the destination cluster. You must ensure the destination table exists with the correct schema before initiating the copy.
!!!
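
As an illustration, a rename rule such as the following (the schema and table names are made up) assumes that `reporting.daily_sales` already exists on the destination with a matching definition:

```toml
# Illustrative rename: staging.daily_sales -> reporting.daily_sales.
# Create reporting.daily_sales on the destination before starting the copy.
[[mapping_rules]]
src_schema = "staging"
src_table = "daily_sales"
dst_schema = "reporting"
```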

**dry_run**

Run the operation without changing any data on the destination (default is `false`).

## Examples

- Copy the table `public.users` to a table named `public.customers`:

    ```toml
    [[mapping_rules]]
    src_schema = "public"
    src_table = "users"
    dst_table = "customers"
    ```

- Copy all tables in schema `legacy` to the schema `archived`:

    ```toml
    [[mapping_rules]]
    src_schema = "legacy"
    src_table = ".*" # Match all tables in the 'legacy' schema
    dst_schema = "archived"
    ```
| 172 | + |
| 173 | +- Copy using Capture Groups to dynamically rename tables during the copy operation. |
| 174 | + |
| 175 | + By using parentheses `()` in your `src_table` pattern, you can "save" parts of the table name and "paste" them into the new name. |
| 176 | + |
| 177 | + ``` |
| 178 | + [[mapping_rules]] |
| 179 | + src_table = "data_(\\d+)_(\\d+)" |
| 180 | + dst_table = "record_${1}_v${2}" |
| 181 | + ``` |
| 182 | + |
| 183 | + The `src_table` pattern looks for tables starting with `data_`, followed by two groups of digits. |
| 184 | + |
| 185 | + The `dst_table` template defines how the new table should be named using the saved groups. |
| 186 | + |
| 187 | + As a result, a table named `data_2023_01` will be copied as `record_2023_v01`. |

- Use a custom SQL query to copy only orders from 2024 onward:

    ```toml
    [[mapping_rules]]
    src_table = "orders"
    sql = "SELECT * FROM ${src_schema}.${src_table} WHERE order_date >= '2024-01-01'"
    ```