Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit bf88f71

Browse files
martinv13 and cre-os
authored Jan 22, 2025
Improve support for some XSD features (#17)
Improve support for some XSD features:
* `short_name` is actually required in some cases despite being an optional argument -> set a default value
* support of recursive XSD definitions -> minimal support (i.e. not fully correct, but does not fail) by dropping the fields that make the schema recursive
* support of `xsd:list` types
* support of `complexType`s with an attribute that has the same name as an element of the same `complexType`
* support of `sequence`s with unbounded `max_occurs` containing `element`s that also have unbounded `max_occurs`
* support of extra `integer` simple types

Closes #15
---------
Co-authored-by: cre-os <opensource@cre.fr>
1 parent ca66bb2 commit bf88f71

38 files changed

+918
-171
lines changed
 

‎.github/workflows/integration-tests-mssql.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99

1010
jobs:
1111
integration-tests:
12-
runs-on: ubuntu-latest
12+
runs-on: ubuntu-22.04
1313
services:
1414
mssql:
1515
image: mcr.microsoft.com/mssql/server:2019-latest

‎.github/workflows/integration-tests-mysql.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99

1010
jobs:
1111
integration-tests:
12-
runs-on: ubuntu-latest
12+
runs-on: ubuntu-22.04
1313
services:
1414
mysql:
1515
image: mysql:8.0

‎.github/workflows/integration-tests-postgres.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99

1010
jobs:
1111
integration-tests:
12-
runs-on: ubuntu-latest
12+
runs-on: ubuntu-22.04
1313
container: python:3.12-bookworm
1414
services:
1515
postgres:

‎docs/how_it_works.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,10 @@ original one.
123123

124124
### Recursive XSD
125125

126-
Recursive XML schemas are not supported, because most of the time they will result in cycles in foreign key constraints
127-
dependencies, which we cannot handle easily.
126+
Recursive XML schemas are not fully supported, because they result in cycles in table dependencies, which would make
127+
the process much more complex. Whenever a field which would introduce a dependency cycle is detected in the XSD, it is
128+
discarded with a warning, which means that the corresponding data in XML files will not be imported. The rest of the
129+
data should be processed correctly.
128130

129131
### Mixed content elements
130132

‎src/xml2db/document.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -171,17 +171,24 @@ def _extract_node(
171171
record["xml2db_row_number"] = row_number
172172

173173
# build record from fields for columns and n-1 relations
174-
for field_type, key, _ in model_table.fields:
174+
for field_type, key, field in model_table.fields:
175175
if field_type == "col":
176-
if key in content:
176+
content_key = (
177+
(f"{key[:-5]}__attr" if field.has_suffix else f"{key}__attr")
178+
if field.is_attr
179+
else key
180+
)
181+
if content_key in content:
177182
if model_table.columns[key].data_type in ["decimal", "float"]:
178-
val = [float(v) for v in content[key]]
183+
val = [float(v) for v in content[content_key]]
179184
elif model_table.columns[key].data_type == "integer":
180-
val = [int(v) for v in content[key]]
185+
val = [int(v) for v in content[content_key]]
181186
elif model_table.columns[key].data_type == "boolean":
182-
val = [v == "true" or v == "1" for v in content[key]]
187+
val = [
188+
v == "true" or v == "1" for v in content[content_key]
189+
]
183190
else:
184-
val = content[key]
191+
val = content[content_key]
185192

186193
if len(val) == 1:
187194
record[key] = val[0]
@@ -320,20 +327,29 @@ def _build_node(node_type: str, node_pk: int) -> tuple:
320327
record = data_index[node_type]["records"][node_pk]
321328
for field_type, rel_name, rel in tb.fields:
322329
if field_type == "col" and record[rel_name] is not None:
330+
content_key = (
331+
(
332+
f"{rel_name[:-5]}__attr"
333+
if rel.has_suffix
334+
else f"{rel_name}__attr"
335+
)
336+
if rel.is_attr
337+
else rel_name
338+
)
323339
if rel.data_type in [
324340
"decimal",
325341
"float",
326342
]: # remove trailing ".0" for decimal and float
327-
content[rel_name] = [
343+
content[content_key] = [
328344
value.rstrip("0").rstrip(".") if "." in value else value
329345
for value in str(record[rel_name]).split(",")
330346
]
331347
elif isinstance(record[rel_name], datetime.datetime):
332-
content[rel_name] = [
348+
content[content_key] = [
333349
record[rel_name].isoformat(timespec="milliseconds")
334350
]
335351
else:
336-
content[rel_name] = (
352+
content[content_key] = (
337353
list(csv.reader([str(record[rel_name])], escapechar="\\"))[
338354
0
339355
]

‎src/xml2db/model.py

Lines changed: 61 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class DataModel:
7070
def __init__(
7171
self,
7272
xsd_file: str,
73-
short_name: str = None,
73+
short_name: str = "DocumentRoot",
7474
long_name: str = None,
7575
base_url: str = None,
7676
model_config: dict = None,
@@ -226,8 +226,7 @@ def _build_model(self):
226226
"""
227227
# parse the XML schema recursively and hold a reference to the head table
228228
root_table = self._parse_tree(
229-
self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema,
230-
is_root_table=True,
229+
self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema
231230
)
232231
self.root_table = root_table.type_name
233232
# compute a text representation of the original data model and store it
@@ -273,9 +272,7 @@ def _build_model(self):
273272
for tb in self.fk_ordered_tables:
274273
tb.build_sqlalchemy_tables()
275274

276-
def _parse_tree(
277-
self, parent_node: xmlschema.XsdElement, is_root_table: bool = False
278-
):
275+
def _parse_tree(self, parent_node: xmlschema.XsdElement, nodes_path: list = None):
279276
"""Parse a node of an XML schema recursively and create a target data model without any simplification
280277
281278
We parse the XSD tree recursively to create for each node (basically a complex type in the XSD) an equivalent \
@@ -289,7 +286,7 @@ def _parse_tree(
289286
290287
Args:
291288
parent_node: the current XSD node being parsed
292-
is_root_table: True if this is the root table
289+
nodes_path: the list of node types on the path from the root node down to the current node
293290
"""
294291

295292
# find current node type and name and returns corresponding table if it already exists
@@ -301,12 +298,16 @@ def _parse_tree(
301298
if parent_type is None:
302299
parent_type = parent_node.local_name
303300

301+
nodes_path = (nodes_path if nodes_path else []) + [parent_type]
302+
304303
# if this type has already been encountered, stop here and return existing table
305304
if parent_type in self.tables:
306305
parent_table = self.tables[parent_type]
307306
return parent_table
308307

309-
# elements names and types should be bijective. If an element name is used for different types,
308+
# For database tables we use element names rather than XSD types, under the assumption that they are often
309+
# more meaningful given that they are the ones that appear in XML documents. However, the same name can be used
310+
# for different XSD types, so if an element name is used for different types,
310311
# we add a suffix to the name to make it unique again (using a dict to keep the name/type association)
311312
parent_name = (
312313
parent_node.local_name
@@ -324,7 +325,7 @@ def _parse_tree(
324325
parent_table = self._create_table_model(
325326
parent_name,
326327
parent_type,
327-
is_root_table,
328+
len(nodes_path) == 1,
328329
isinstance(parent_node, xmlschema.XMLSchema),
329330
)
330331
self.tables[parent_type] = parent_table
@@ -363,6 +364,13 @@ def recurse_parse_simple_type(elem_type):
363364
if elem_type.base_type
364365
else recurse_parse_simple_type(elem_type.member_types)
365366
)
367+
if elem_type.is_list():
368+
return (
369+
"string",
370+
0,
371+
None,
372+
elem_type.allow_empty,
373+
)
366374
if elem_type.is_restriction():
367375
dt = elem_type.base_type.local_name
368376
mil = elem_type.min_length
@@ -384,7 +392,12 @@ def recurse_parse_simple_type(elem_type):
384392
else None
385393
)
386394
ae = ae and bt_ae if ae is not None and bt_ae is not None else None
387-
if elem_type.enumeration is not None and dt in ["string", "NMTOKEN", "duration", "token"]:
395+
if elem_type.enumeration is not None and dt in [
396+
"string",
397+
"NMTOKEN",
398+
"duration",
399+
"token",
400+
]:
388401
mil = min([len(val) for val in elem_type.enumeration])
389402
mal = max([len(val) for val in elem_type.enumeration])
390403
return dt, mil, mal, ae
@@ -410,25 +423,31 @@ def get_occurs(particle):
410423
),
411424
]
412425

413-
# go through item attributes and add them as columns
426+
# go through item attributes and add them as columns, adding a suffix if an element with the same name exists
427+
children_names = None
414428
for attrib_name, attrib in parent_node.attributes.items():
429+
if children_names is None:
430+
children_names = [child.local_name for child in parent_node]
415431
(
416432
data_type,
417433
min_length,
418434
max_length,
419435
allow_empty,
420436
) = recurse_parse_simple_type([attrib.type])
437+
suffix = attrib_name in children_names
421438
parent_table.add_column(
422-
f"{attrib_name}",
439+
f"{attrib_name}{'_attr' if suffix else ''}",
423440
data_type,
424441
[0, 1],
425442
min_length,
426443
max_length,
427444
True,
445+
suffix,
428446
False,
429447
allow_empty,
430448
None,
431449
)
450+
432451
nested_containers = []
433452
# go through the children to add either arguments or relations to the current element
434453
for child in parent_node:
@@ -454,6 +473,7 @@ def get_occurs(particle):
454473
if child.parent
455474
and child.parent.max_occurs != 1
456475
and child.parent.model != "choice"
476+
and child.max_occurs == 1
457477
else None
458478
),
459479
)
@@ -482,32 +502,39 @@ def get_occurs(particle):
482502
max_length,
483503
False,
484504
False,
505+
False,
485506
allow_empty,
486507
nested_containers[-1][1],
487508
)
488509

489510
elif ct.is_complex():
490-
child_table = self._parse_tree(child)
491-
child_table.model_group = (
492-
"choice"
493-
if ct.model_group and ct.model_group.model == "choice"
494-
else "sequence"
495-
)
496-
occurs = get_occurs(child)
497-
if child.is_single():
498-
parent_table.add_relation_1(
499-
child.local_name,
500-
child_table,
501-
occurs,
502-
nested_containers[-1][1],
511+
# ignoring recursive definitions by skipping these fields
512+
if child.type.local_name in nodes_path:
513+
logger.warning(
514+
f"type '{child.type.local_name}' contains a recursive definition"
503515
)
504516
else:
505-
parent_table.add_relation_n(
506-
child.local_name,
507-
child_table,
508-
occurs,
509-
nested_containers[-1][1],
517+
child_table = self._parse_tree(child, nodes_path)
518+
child_table.model_group = (
519+
"choice"
520+
if ct.model_group and ct.model_group.model == "choice"
521+
else "sequence"
510522
)
523+
occurs = get_occurs(child)
524+
if occurs[1] == 1:
525+
parent_table.add_relation_1(
526+
child.local_name,
527+
child_table,
528+
occurs,
529+
nested_containers[-1][1],
530+
)
531+
else:
532+
parent_table.add_relation_n(
533+
child.local_name,
534+
child_table,
535+
occurs,
536+
nested_containers[-1][1],
537+
)
511538
else:
512539
raise ValueError("unknown case; please check")
513540
else:
@@ -534,6 +561,7 @@ def get_occurs(particle):
534561
min_length,
535562
max_length,
536563
False,
564+
False,
537565
True,
538566
allow_empty,
539567
None,
@@ -544,31 +572,19 @@ def get_occurs(particle):
544572
def _repr_tree(
545573
self,
546574
parent_table: Union[DataModelTableReused, DataModelTableDuplicated],
547-
visited_nodes: Union[set, None] = None,
548575
):
549576
"""Build a text representation of the data model tree
550577
551578
Args:
552579
parent_table: the current data model table object
553580
"""
554-
if visited_nodes is None:
555-
visited_nodes = set()
556-
else:
557-
visited_nodes = {item for item in visited_nodes}
558-
visited_nodes.add(parent_table.name)
559581
for field_type, name, field in parent_table.fields:
560582
if field_type == "col":
561583
yield f"{field.name}{field.occurs}: {field.data_type}"
562-
elif field_type == "rel1":
563-
mg = " (choice)" if field.other_table.model_group == "choice" else ""
564-
yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
565-
if field.other_table.name not in visited_nodes:
566-
for line in self._repr_tree(field.other_table, visited_nodes):
567-
yield f" {line}"
568-
elif field_type == "reln":
584+
else:
569585
mg = " (choice)" if field.other_table.model_group == "choice" else ""
570-
yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
571-
for line in self._repr_tree(field.other_table, visited_nodes):
586+
yield f"{field.name}{field.occurs}{mg}:"
587+
for line in self._repr_tree(field.other_table):
572588
yield f" {line}"
573589

574590
def get_entity_rel_diagram(self, text_context: bool = True) -> str:

0 commit comments

Comments
 (0)