@@ -70,14 +70,39 @@ def check_schema_for_dupes_in_field_names(stream_name, schema):
7070 Alerts user if there are duplicate field names in JSON schema.
7171
7272 For example, if JSON schema contains:
73- "Name" and "name"
73+ "Name" and "name" (BigQuery treats these as the same field name, so it considers them duplicates and throws an error)
7474 or
7575 "first name" and "first_name" (this example is also a dupe because "first name" will be converted to "first_name" by schema.py)
7676 :param stream_name: name of stream
7777 :param schema: JSON schema of the stream
7878 :return:
7979 """
8080 def build_field_list (schema ):
81+ """
82+
83+ :param schema: JSON schema (dict) whose "properties" are scanned for field names
84+ :return: a dictionary, where:
85+
86+ every key is the uppercased BigQuery-transformed field name (the field name cleaned up for loading into BigQuery, then uppercased)
87+
88+ every value is a list of the original field names from the JSON schema that map to that key
89+
90+ This dictionary is flat, not nested.
91+
92+ Nested JSON fields are represented in the dictionary with dot-separated names (parent.child).
93+
94+ This dict makes it easy to detect dupes and tell the user exactly where the dupe is located
95+ (what its parent field is).
96+
97+ Sample output:
98+
99+ f_dict / fields = {'OBJECT': ['object'], 'ID': ['id'],
100+ 'PERSON._SOURCE': ['person.$source'],
101+ 'PERSON.NAME': ['person.name', 'person.Name']}
102+
103+ dupes = {'PERSON.NAME': ['person.name', 'person.Name']}
104+
105+ """
81106 f_dict = {}
82107 for field_name , field_property in schema .get ("properties" , schema .get ("items" , {}).get ("properties" , {})).items ():
83108 if not ("items" in field_property and "properties" in field_property ["items" ]) \
@@ -98,7 +123,6 @@ def build_field_list(schema):
98123 else :
99124 f_dict [f"{ key } .{ k } " ].extend ([f"{ field_name } .{ i } " for i in v ])
100125
101- # sample f_dict: {"BQ_FIELD_NAME.BQ_NESTED_FIELD": ["json_schema_field_name.$nested_name", "json_schema_field_name.nested name"]}
102126 return f_dict
103127
104128 fields = build_field_list (schema )
0 commit comments