Payments: Remove over-aggressive file deduping (#2990)
* remove over-aggressive file deduping

* add content hash docs and yaml anchor for fuzzy uniqueness
lauriemerrell authored Oct 5, 2023
1 parent 32c1f76 commit 3491334
Showing 3 changed files with 12 additions and 16 deletions.
12 changes: 1 addition & 11 deletions warehouse/macros/littlepay_staging_transforms.sql
@@ -32,17 +32,7 @@ CASE
 END
 {% endmacro %}
 
-{% macro qualify_dedupe_lp_files(instance_col = 'instance', file_dt_col = 'littlepay_export_date', file_ts_col = 'littlepay_export_ts', ts_col = 'ts') %}
-
--- remove duplicate instances of the same file (file defined as date-level update from LP)
--- partition by file date, order by LP-defined timestamp (most recent first), and then order by our extract timestamp (most recent first)
--- use dense rank instead of row number because we need to allow all rows from a given file to be included (allow ties)
-QUALIFY DENSE_RANK()
-OVER (PARTITION BY {{ instance_col }}, {{ file_dt_col }} ORDER BY {{ file_ts_col }} DESC, {{ ts_col }} DESC) = 1
-
-{% endmacro %}
-
-{% macro qualify_dedupe_full_duplicate_lp_rows(content_hash_col = 'content_hash', file_ts_col = 'littlepay_export_ts', line_number_col = '_line_number') %}
+{% macro qualify_dedupe_full_duplicate_lp_rows(content_hash_col = '_content_hash', file_ts_col = 'littlepay_export_ts', line_number_col = '_line_number') %}
 
 -- remove full duplicate rows where *all* content is the same
 -- get most recent instance across files and then highest-line-number instance within most recent file
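The body of the surviving qualify_dedupe_full_duplicate_lp_rows macro is folded out of the hunk above. Based solely on its visible comments, a minimal sketch of what the folded QUALIFY clause plausibly looks like (the exact window framing is an assumption, not shown in this diff):

{% macro qualify_dedupe_full_duplicate_lp_rows(content_hash_col = '_content_hash', file_ts_col = 'littlepay_export_ts', line_number_col = '_line_number') %}

-- keep exactly one copy per content hash: prefer the most recent export,
-- then the highest line number within that export (framing assumed from the comments above)
QUALIFY ROW_NUMBER()
OVER (PARTITION BY {{ content_hash_col }} ORDER BY {{ file_ts_col }} DESC, {{ line_number_col }} DESC) = 1

{% endmacro %}

Unlike the removed qualify_dedupe_lp_files, this clause only collapses rows whose entire content is identical, so non-duplicate rows that appear only in an older export are never discarded.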
8 changes: 7 additions & 1 deletion warehouse/models/staging/payments/littlepay/_littlepay.yml
@@ -181,13 +181,19 @@ models:
         tests:
           - not_null
           - unique
-      - name: _payments_key
+      - &payments_key_fuzzy_uniqueness
+        name: _payments_key
         description: |
           Synthetic key composed of the elements that define a natural key within the source data (primary key according to the Littlepay schema).
         tests:
           - not_null
           - unique_proportion:
               at_least: 0.999
+      - &_content_hash
+        name: _content_hash
+        description: |
+          Hash of all data columns to uniquely identify row's content, mostly for debugging purposes.
+          Should ideally be handled by uniqueness of _payments_key but surfaced for troubleshooting.
   - name: stg_littlepay__customer_funding_source
     tests:
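The new &payments_key_fuzzy_uniqueness and &_content_hash YAML anchors let these column definitions, including the 0.999 unique_proportion tolerance, be dereferenced on the other Littlepay staging models in this file instead of being copy-pasted. A sketch of the reuse pattern (placement hypothetical; YAML aliases only resolve within the same file), using the stg_littlepay__customer_funding_source model that appears just below in the diff:

  - name: stg_littlepay__customer_funding_source
    columns:
      - *payments_key_fuzzy_uniqueness
      - *_content_hash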
8 changes: 4 additions & 4 deletions warehouse/models/staging/payments/littlepay/stg_littlepay__authorisations.sql
@@ -37,18 +37,17 @@ clean_columns_and_dedupe_files AS (
         {{ dbt_utils.generate_surrogate_key(['participant_id',
             'aggregation_id', 'acquirer_id', 'request_type', 'transaction_amount', 'currency_code',
             'retrieval_reference_number', 'littlepay_reference_number', 'external_reference_number',
-            'response_code', 'status', 'authorisation_date_time_utc']) }} AS content_hash,
+            'response_code', 'status', 'authorisation_date_time_utc']) }} AS _content_hash,
     FROM source
     -- drop extra header rows
     WHERE aggregation_id != "aggregation_id"
-    {{ qualify_dedupe_lp_files() }}
 ),
 
 add_keys_drop_full_dupes AS (
     SELECT
         *,
-        -- generate keys now that input columns have been trimmed & cast and files deduped
-        {{ dbt_utils.generate_surrogate_key(['littlepay_export_date', '_line_number', 'instance']) }} AS _key,
+        -- generate keys now that input columns have been trimmed & cast
+        {{ dbt_utils.generate_surrogate_key(['littlepay_export_ts', '_line_number', 'instance']) }} AS _key,
         {{ dbt_utils.generate_surrogate_key(['aggregation_id', 'authorisation_date_time_utc']) }} AS _payments_key,
     FROM clean_columns_and_dedupe_files
     {{ qualify_dedupe_full_duplicate_lp_rows() }}
@@ -90,6 +89,7 @@ stg_littlepay__authorisations AS (
         ts,
         _key,
         _payments_key,
+        _content_hash,
     FROM add_keys_drop_full_dupes
     LEFT JOIN same_timestamp_simple_dupes
         USING(_payments_key)
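For context on the "over-aggressive" label: the removed qualify_dedupe_lp_files kept only the rows from the single most recent export (by littlepay_export_ts, then ts) within each (instance, littlepay_export_date) partition. If Littlepay re-sent a file for the same date with some rows missing, the earlier export's extra rows were silently dropped. A toy BigQuery illustration of that failure mode (table and values hypothetical, partition columns collapsed to one):

-- two exports for the same export date; the re-sent 02:00 file is missing row 'b'
WITH rows_seen AS (
    SELECT 'a' AS aggregation_id, TIMESTAMP '2023-10-01 01:00:00' AS littlepay_export_ts
    UNION ALL SELECT 'b', TIMESTAMP '2023-10-01 01:00:00'
    UNION ALL SELECT 'a', TIMESTAMP '2023-10-01 02:00:00'
)

SELECT *
FROM rows_seen
WHERE TRUE  -- BigQuery requires a WHERE (or GROUP BY/HAVING) alongside QUALIFY
-- the removed macro's logic: only the latest export survives, so row 'b' is lost entirely
QUALIFY DENSE_RANK() OVER (ORDER BY littlepay_export_ts DESC) = 1

After this commit, every row is kept and only exact duplicates (identical _content_hash) are collapsed; residual near-duplicates are surfaced by the fuzzy unique_proportion test on _payments_key rather than silently deleted.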
