
Commit 045aac3

nj1973 and helensilva14 committed
feat: Prevent column validation exceptions caused by Oracle CLOB JSON columns (#1365)
* tests: Add some failing tests so I can work on adding JSON support
* feat: JSON columns are compared using length in column validations like string columns
* chore: Typos
* Update data_validation/config_manager.py
* Update data_validation/config_manager.py
* Update data_validation/config_manager.py

Co-authored-by: Helen Cristina <[email protected]>
1 parent e9f79ee commit 045aac3
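
At its core the fix treats JSON like string for column validations: instead of aggregating raw values (which raises exceptions for Oracle CLOB-backed JSON), both sides aggregate value lengths. A minimal illustrative sketch of the new type rule; the helper name here is hypothetical, the real check lives in require_pre_agg_calc_field below:

def aggregates_by_length(column_type: str, target_column_type: str) -> bool:
    """Illustrative stand-in for the new require_pre_agg_calc_field check."""
    length_types = ("string", "!string", "json", "!json")
    return all(t in length_types for t in (column_type, target_column_type))

# Oracle CLOB JSON typically introspects as string; PostgreSQL json/jsonb as json.
assert aggregates_by_length("string", "json")
assert not aggregates_by_length("json", "int64")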

4 files changed, +58 -15 lines changed

data_validation/config_manager.py (+34 -11)
@@ -748,11 +748,32 @@ def build_and_append_pre_agg_calc_config(
         return calculated_config
 
     def append_pre_agg_calc_field(
-        self, source_column, target_column, agg_type, column_type, column_position
+        self,
+        source_column: str,
+        target_column: str,
+        agg_type: str,
+        column_type: str,
+        target_column_type: str,
+        column_position: int,
     ) -> dict:
-        """Append calculated field for length(string | binary) or epoch_seconds(timestamp) for preprocessing before column validation aggregation."""
+        """Append calculated field for length() or epoch_seconds(timestamp) for preprocessing before column validation aggregation."""
         depth, cast_type = 0, None
-        if column_type in ["string", "!string"]:
+        if any(_ in ["json", "!json"] for _ in [column_type, target_column_type]):
+            # JSON data which needs casting to string before we apply a length function.
+            pre_calculated_config = self.build_and_append_pre_agg_calc_config(
+                source_column,
+                target_column,
+                "cast",
+                column_position,
+                "string",
+                depth,
+            )
+            source_column = target_column = pre_calculated_config[
+                consts.CONFIG_FIELD_ALIAS
+            ]
+            depth = 1
+            calc_func = "length"
+        elif column_type in ["string", "!string"]:
             calc_func = "length"
 
         elif column_type in ["binary", "!binary"]:
@@ -763,14 +784,12 @@ def append_pre_agg_calc_field(
             self.source_client.name == "bigquery"
             or self.target_client.name == "bigquery"
         ):
-            calc_func = "cast"
-            cast_type = "timestamp"
             pre_calculated_config = self.build_and_append_pre_agg_calc_config(
                 source_column,
                 target_column,
-                calc_func,
+                "cast",
                 column_position,
-                cast_type,
+                "timestamp",
                 depth,
             )
             source_column = target_column = pre_calculated_config[
@@ -852,14 +871,17 @@ def require_pre_agg_calc_field(
         agg_type: str,
         cast_to_bigint: bool,
     ) -> bool:
-        if column_type in ["string", "!string"] and target_column_type in [
-            "string",
-            "!string",
-        ]:
+        if all(
+            _ in ["string", "!string", "json", "!json"]
+            for _ in [column_type, target_column_type]
+        ):
+            # These data types are aggregated using their lengths.
             return True
         elif column_type in ["binary", "!binary"]:
             if agg_type == "count":
                 # Oracle BLOB is invalid for use with SQL COUNT function.
+                # The expression below returns True if client is Oracle which
+                # has the effect of triggering use of byte_length transformation.
                 return bool(
                     self.source_client.name == "oracle"
                     or self.target_client.name == "oracle"
@@ -955,6 +977,7 @@ def require_pre_agg_calc_field(
                     casefold_target_columns[column],
                     agg_type,
                     column_type,
+                    target_column_type,
                     column_position,
                 )
             else:
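
Taken together, the JSON branch chains two calculated fields before any aggregate is applied: a cast to string at depth 0, then a length at depth 1 (the length step is appended further down in append_pre_agg_calc_field, outside the hunks shown). A self-contained toy reconstruction of that chain; the dict keys are simplified stand-ins for the real consts values:

def sketch_json_pre_agg_chain(column: str) -> list:
    """Toy reconstruction of the calculated-field chain for a JSON column.
    The real work happens in ConfigManager.build_and_append_pre_agg_calc_config;
    key names here are simplified for illustration."""
    cast_field = {
        "calc_type": "cast",  # depth 0: JSON -> string
        "default_cast": "string",
        "source_calculated_columns": [column],
        "field_alias": f"cast__{column}",
        "depth": 0,
    }
    length_field = {
        "calc_type": "length",  # depth 1: length of the cast result
        "source_calculated_columns": [cast_field["field_alias"]],
        "field_alias": f"length__cast__{column}",
        "depth": 1,
    }
    # The column validation then aggregates (SUM/MIN/MAX) the depth-1 field,
    # exactly as it already does for plain string columns.
    return [cast_field, length_field]

print(sketch_json_pre_agg_chain("col_json"))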

tests/resources/oracle_test_tables.sql (+9)
@@ -105,7 +105,13 @@ CREATE TABLE pso_data_validator.dvt_ora2pg_types
 , col_blob BLOB
 , col_clob CLOB
 , col_nclob NCLOB
+, col_json CLOB
+, col_jsonb CLOB
 );
+ALTER TABLE pso_data_validator.dvt_ora2pg_types
+ADD CONSTRAINT dvt_ora2pg_types_chk1 CHECK (col_json IS JSON) ENABLE;
+ALTER TABLE pso_data_validator.dvt_ora2pg_types
+ADD CONSTRAINT dvt_ora2pg_types_chk2 CHECK (col_jsonb IS JSON) ENABLE;
 COMMENT ON TABLE pso_data_validator.dvt_ora2pg_types IS 'Oracle to PostgreSQL integration test table';
 
 -- Literals below match corresponding table in postgresql_test_tables.sql
@@ -121,6 +127,7 @@ INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 ,INTERVAL '1 2:03:44.0' DAY TO SECOND(3)
 ,UTL_RAW.CAST_TO_RAW('DVT'),UTL_RAW.CAST_TO_RAW('DVT')
 ,UTL_RAW.CAST_TO_RAW('DVT'),'DVT A','DVT A'
+,'{"dvt": 123, "status": "abc"}','{"dvt": 123, "status": "abc"}'
 );
 INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 (2,2222,123456789,123456789012345678,1234567890123456789012345
@@ -134,6 +141,7 @@ INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 ,INTERVAL '2 3:04:55.666' DAY TO SECOND(3)
 ,UTL_RAW.CAST_TO_RAW('DVT'),UTL_RAW.CAST_TO_RAW('DVT DVT')
 ,UTL_RAW.CAST_TO_RAW('DVT DVT'),'DVT B','DVT B'
+,'{"dvt": 234, "status": "def"}','{"dvt": 234, "status": "def"}'
 );
 INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 (3,3333,123456789,123456789012345678,1234567890123456789012345
@@ -147,6 +155,7 @@ INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 ,INTERVAL '3 4:05:06.7' DAY TO SECOND(3)
 ,UTL_RAW.CAST_TO_RAW('DVT'),UTL_RAW.CAST_TO_RAW('DVT DVT DVT')
 ,UTL_RAW.CAST_TO_RAW('DVT DVT DVT'),'DVT C','DVT C'
+,'{"dvt": 345, "status": "ghi"}','{"dvt": 345, "status": "ghi"}'
 );
 COMMIT;
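
A CLOB column guarded by an IS JSON check constraint is the standard pre-21c way to declare JSON in Oracle, which is exactly the shape this commit guards against. A hypothetical sanity check with the python-oracledb driver (connection details are placeholders):

import oracledb

conn = oracledb.connect(user="pso_data_validator", password="...", dsn="localhost/db")
with conn.cursor() as cur:
    # IS JSON is the same predicate the CHECK constraints above use,
    # so this should return 0 for the rows inserted above.
    cur.execute(
        "SELECT COUNT(*) FROM pso_data_validator.dvt_ora2pg_types"
        " WHERE col_json IS NOT JSON OR col_jsonb IS NOT JSON"
    )
    assert cur.fetchone()[0] == 0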

tests/resources/postgresql_test_tables.sql (+7 -2)
@@ -79,6 +79,8 @@ CREATE TABLE pso_data_validator.dvt_ora2pg_types
 , col_blob bytea
 , col_clob text
 , col_nclob text
+, col_json json
+, col_jsonb jsonb
 );
 COMMENT ON TABLE pso_data_validator.dvt_ora2pg_types IS 'Oracle to PostgreSQL integration test table';
 
@@ -94,7 +96,8 @@ INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 ,TIMESTAMP WITH TIME ZONE'1970-01-01 00:00:01.123456 +00:00'
 ,INTERVAL '1 2:03:44.0' DAY TO SECOND(3)
 ,CAST('DVT' AS BYTEA),CAST('DVT' AS BYTEA)
-,CAST('DVT' AS BYTEA),'DVT A','DVT A')
+,CAST('DVT' AS BYTEA),'DVT A','DVT A'
+,'{"dvt": 123, "status": "abc"}','{"dvt": 123, "status": "abc"}')
 ,(2,2222,123456789,123456789012345678,1234567890123456789012345
 ,123.12,123.11
 --,123400,0.002
@@ -105,7 +108,8 @@ INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 ,TIMESTAMP WITH TIME ZONE'1970-01-02 00:00:02.123456 -02:00'
 ,INTERVAL '2 3:04:55.666' DAY TO SECOND(3)
 ,CAST('DVT' AS BYTEA),CAST('DVT DVT' AS BYTEA)
-,CAST('DVT DVT' AS BYTEA),'DVT B','DVT B')
+,CAST('DVT DVT' AS BYTEA),'DVT B','DVT B'
+,'{"dvt": 234, "status": "def"}','{"dvt": 234, "status": "def"}')
 ,(3,3333,123456789,123456789012345678,1234567890123456789012345
 ,123.123,123.11
 --,123400,0.003
@@ -117,6 +121,7 @@ INSERT INTO pso_data_validator.dvt_ora2pg_types VALUES
 ,INTERVAL '3 4:05:06.7' DAY TO SECOND(3)
 ,CAST('DVT' AS BYTEA),CAST('DVT DVT DVT' AS BYTEA)
 ,CAST('DVT DVT DVT' AS BYTEA),'DVT C','DVT C'
+,'{"dvt": 345, "status": "ghi"}','{"dvt": 345, "status": "ghi"}'
 );
 
 /* Following table used for validating generating table partitions */
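
On the PostgreSQL side the columns are native json/jsonb, so a length function needs an explicit cast to text, mirroring the cast DVT now injects. A hypothetical spot-check with psycopg2 (placeholder DSN); note that jsonb re-serializes to a canonical text form, and the literals above are already canonical so lengths line up with the Oracle side:

import psycopg2

conn = psycopg2.connect("dbname=dvt user=postgres")  # placeholder DSN
with conn, conn.cursor() as cur:
    cur.execute(
        "SELECT SUM(LENGTH(col_json::text)), SUM(LENGTH(col_jsonb::text))"
        " FROM pso_data_validator.dvt_ora2pg_types"
    )
    print(cur.fetchone())  # should match the Oracle-side length sums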

tests/system/data_sources/test_oracle.py (+8 -2)
@@ -100,6 +100,8 @@
     "col_blob",
     "col_clob",
     "col_nclob",
+    "col_json",
+    "col_jsonb",
 ]
 
 
@@ -374,8 +376,8 @@ def test_row_validation_oracle_to_postgres():
     # TODO col_raw/col_long_raw are blocked by issue-773 (is it even reasonable to expect binary columns to work here?)
     # TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete.
     # TODO Change hash_cols below to include col_interval_ds when issue-1214 is complete.
+    # TODO Change hash_cols below to include col_clob/col_nclob/col_blob/col_json/col_jsonb when issue-1364 is complete.
     # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
-    # Excluded CLOB/NCLOB/BLOB columns because lob values cannot be concatenated
     hash_cols = ",".join(
         [
             _
@@ -393,6 +395,8 @@ def test_row_validation_oracle_to_postgres():
             "col_nvarchar_30",
             "col_nchar_2",
             "col_interval_ds",
+            "col_json",
+            "col_jsonb",
         )
     ]
 )
@@ -592,8 +596,8 @@ def test_custom_query_row_validation_oracle_to_postgres():
     # TODO col_raw/col_long_raw are blocked by issue-773 (is it even reasonable to expect binary columns to work here?)
     # TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete.
     # TODO Change hash_cols below to include col_interval_ds when issue-1214 is complete.
+    # TODO Change hash_cols below to include col_clob/col_nclob/col_blob/col_json/col_jsonb when issue-1364 is complete.
     # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
-    # Excluded CLOB/NCLOB/BLOB columns because lob values cannot be concatenated
     hash_cols = ",".join(
         [
             _
@@ -611,6 +615,8 @@ def test_custom_query_row_validation_oracle_to_postgres():
             "col_nvarchar_30",
             "col_nchar_2",
             "col_interval_ds",
+            "col_json",
+            "col_jsonb",
         )
     ]
 )
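
The system tests drive validations through the CLI argument parser, so a column validation covering the new columns amounts to an arg list like the following sketch (connection names and the exact flag set are illustrative, in the style of the existing tests):

# Hypothetical arg list in the style of the system tests; connection names
# are placeholders. Because the new JSON columns are aggregated by length,
# a plain --sum over them no longer raises on Oracle CLOB JSON.
args = [
    "validate",
    "column",
    "-sc=mock-conn",
    "-tc=mock-conn",
    "-tbls=pso_data_validator.dvt_ora2pg_types",
    "--sum=col_json,col_jsonb",
    "--min=col_json,col_jsonb",
]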
