Payments: Remove over-aggressive file deduping (#2990)
* remove over-aggressive file deduping

* add content hash docs and yaml anchor for fuzzy uniqueness
lauriemerrell authored Oct 5, 2023
1 parent 32c1f76 commit 3491334
Showing 3 changed files with 12 additions and 16 deletions.
12 changes: 1 addition & 11 deletions warehouse/macros/littlepay_staging_transforms.sql
@@ -32,17 +32,7 @@ CASE
 END
 {% endmacro %}
 
-{% macro qualify_dedupe_lp_files(instance_col = 'instance', file_dt_col = 'littlepay_export_date', file_ts_col = 'littlepay_export_ts', ts_col = 'ts') %}
-
--- remove duplicate instances of the same file (file defined as date-level update from LP)
--- partition by file date, order by LP-defined timestamp (most recent first), and then order by our extract timestamp (most recent first)
--- use dense rank instead of row number because we need to allow all rows from a given file to be included (allow ties)
-QUALIFY DENSE_RANK()
-OVER (PARTITION BY {{ instance_col }}, {{ file_dt_col }} ORDER BY {{ file_ts_col }} DESC, {{ ts_col }} DESC) = 1
-
-{% endmacro %}
-
-{% macro qualify_dedupe_full_duplicate_lp_rows(content_hash_col = 'content_hash', file_ts_col = 'littlepay_export_ts', line_number_col = '_line_number') %}
+{% macro qualify_dedupe_full_duplicate_lp_rows(content_hash_col = '_content_hash', file_ts_col = 'littlepay_export_ts', line_number_col = '_line_number') %}
 
 -- remove full duplicate rows where *all* content is the same
 -- get most recent instance across files and then highest-line-number instance within most recent file
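The body of the surviving qualify_dedupe_full_duplicate_lp_rows macro is folded out of the hunk above. Based solely on its visible comments, a minimal sketch of what the folded QUALIFY clause plausibly looks like (the exact window framing is an assumption, not shown in this diff):

{% macro qualify_dedupe_full_duplicate_lp_rows(content_hash_col = '_content_hash', file_ts_col = 'littlepay_export_ts', line_number_col = '_line_number') %}

-- keep exactly one copy per content hash: prefer the most recent export,
-- then the highest line number within that export (framing assumed from the comments above)
QUALIFY ROW_NUMBER()
OVER (PARTITION BY {{ content_hash_col }} ORDER BY {{ file_ts_col }} DESC, {{ line_number_col }} DESC) = 1

{% endmacro %}

Unlike the removed qualify_dedupe_lp_files, this clause only collapses rows whose entire content is identical, so non-duplicate rows that appear only in an older export are never discarded.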
8 changes: 7 additions & 1 deletion warehouse/models/staging/payments/littlepay/_littlepay.yml
@@ -181,13 +181,19 @@ models:
         tests:
           - not_null
           - unique
-      - name: _payments_key
+      - &payments_key_fuzzy_uniqueness
+        name: _payments_key
         description: |
           Synthetic key composed of the elements that define a natural key within the source data (primary key according to the Littlepay schema).
         tests:
           - not_null
           - unique_proportion:
               at_least: 0.999
+      - &_content_hash
+        name: _content_hash
+        description: |
+          Hash of all data columns to uniquely identify row's content, mostly for debugging purposes.
+          Should ideally be handled by uniqueness of _payments_key but surfaced for troubleshooting.
   - name: stg_littlepay__customer_funding_source
     tests:
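The new &payments_key_fuzzy_uniqueness and &_content_hash YAML anchors let these column definitions, including the 0.999 unique_proportion tolerance, be dereferenced on the other Littlepay staging models in this file instead of being copy-pasted. A sketch of the reuse pattern (placement hypothetical; YAML aliases only resolve within the same file), using the stg_littlepay__customer_funding_source model that appears just below in the diff:

  - name: stg_littlepay__customer_funding_source
    columns:
      - *payments_key_fuzzy_uniqueness
      - *_content_hash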
8 changes: 4 additions & 4 deletions warehouse/models/staging/payments/littlepay/stg_littlepay__authorisations.sql
@@ -37,18 +37,17 @@ clean_columns_and_dedupe_files AS (
         {{ dbt_utils.generate_surrogate_key(['participant_id',
             'aggregation_id', 'acquirer_id', 'request_type', 'transaction_amount', 'currency_code',
             'retrieval_reference_number', 'littlepay_reference_number', 'external_reference_number',
-            'response_code', 'status', 'authorisation_date_time_utc']) }} AS content_hash,
+            'response_code', 'status', 'authorisation_date_time_utc']) }} AS _content_hash,
     FROM source
     -- drop extra header rows
     WHERE aggregation_id != "aggregation_id"
-    {{ qualify_dedupe_lp_files() }}
 ),
 
 add_keys_drop_full_dupes AS (
     SELECT
         *,
-        -- generate keys now that input columns have been trimmed & cast and files deduped
-        {{ dbt_utils.generate_surrogate_key(['littlepay_export_date', '_line_number', 'instance']) }} AS _key,
+        -- generate keys now that input columns have been trimmed & cast
+        {{ dbt_utils.generate_surrogate_key(['littlepay_export_ts', '_line_number', 'instance']) }} AS _key,
         {{ dbt_utils.generate_surrogate_key(['aggregation_id', 'authorisation_date_time_utc']) }} AS _payments_key,
     FROM clean_columns_and_dedupe_files
     {{ qualify_dedupe_full_duplicate_lp_rows() }}
@@ -90,6 +89,7 @@ stg_littlepay__authorisations AS (
         ts,
         _key,
         _payments_key,
+        _content_hash,
     FROM add_keys_drop_full_dupes
     LEFT JOIN same_timestamp_simple_dupes
         USING(_payments_key)
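For context on the "over-aggressive" label: the removed qualify_dedupe_lp_files kept only the rows from the single most recent export (by littlepay_export_ts, then ts) within each (instance, littlepay_export_date) partition. If Littlepay re-sent a file for the same date with some rows missing, the earlier export's extra rows were silently dropped. A toy BigQuery illustration of that failure mode (table and values hypothetical, partition columns collapsed to one):

-- two exports for the same export date; the re-sent 02:00 file is missing row 'b'
WITH rows_seen AS (
    SELECT 'a' AS aggregation_id, TIMESTAMP '2023-10-01 01:00:00' AS littlepay_export_ts
    UNION ALL SELECT 'b', TIMESTAMP '2023-10-01 01:00:00'
    UNION ALL SELECT 'a', TIMESTAMP '2023-10-01 02:00:00'
)

SELECT *
FROM rows_seen
WHERE TRUE  -- BigQuery requires a WHERE (or GROUP BY/HAVING) alongside QUALIFY
-- the removed macro's logic: only the latest export survives, so row 'b' is lost entirely
QUALIFY DENSE_RANK() OVER (ORDER BY littlepay_export_ts DESC) = 1

After this commit, every row is kept and only exact duplicates (identical _content_hash) are collapsed; residual near-duplicates are surfaced by the fuzzy unique_proportion test on _payments_key rather than silently deleted.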
