From faf472c1689746adc43e0ae8ef6d6e3fcf87c023 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:07:42 +0200 Subject: [PATCH] fix: set type to optional (#7) Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling_core/types/doc/document.py | 6 +- docs/Document.json | 9 +- docs/Document.md | 41 ++++++- test/data/doc/doc-9.json | 167 +++++++++++++++++++++++++++++ 4 files changed, 214 insertions(+), 9 deletions(-) create mode 100644 test/data/doc/doc-9.json diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 781fa6b..581db09 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -263,7 +263,7 @@ class MinimalDocument( """Minimal model for a document.""" name: StrictStr = Field(alias="_name") - obj_type: StrictStr = Field("document", alias="type") + obj_type: Optional[StrictStr] = Field("document", alias="type") description: CCSDocumentDescription[ DescriptionAdvancedT, DescriptionAnalyticsT, @@ -291,7 +291,7 @@ class CCSDocument( ): """Model for a CCS-generated document.""" - obj_type: StrictStr = Field("pdf-document", alias="type") + obj_type: Optional[StrictStr] = Field("pdf-document", alias="type") bitmaps: Optional[list[BitmapObject]] = None equations: Optional[list[BaseCell]] = None footnotes: Optional[list[BaseText]] = None @@ -355,7 +355,7 @@ class ExportedCCSDocument( ): """Document model for Docling.""" - obj_type: StrictStr = Field( + obj_type: Optional[StrictStr] = Field( "pdf-document", alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191), diff --git a/docs/Document.json b/docs/Document.json index ffa00c6..7e6b3ce 100644 --- a/docs/Document.json +++ b/docs/Document.json @@ -1732,9 +1732,16 @@ "type": "string" }, "type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": "pdf-document", "title": "Type", - "type": "string", "x-es-ignore_above": 8191, "x-es-type": "keyword" }, diff --git a/docs/Document.md b/docs/Document.md index c5eccbe..7860b4f 100644 --- a/docs/Document.md +++ b/docs/Document.md @@ -36,11 +36,42 @@ **Title:** Type -| | | -| ------------ | ---------------- | -| **Type** | `string` | -| **Required** | No | -| **Default** | `"pdf-document"` | +| | | +| ------------------------- | ------------------------------------------------------------------------- | +| **Type** | `combining` | +| **Required** | No | +| **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") | +| **Default** | `"pdf-document"` | + +
+ +| Any of(Option) | +| ------------------------ | +| [item 0](#type_anyOf_i0) | +| [item 1](#type_anyOf_i1) | + +
+ +### 2.1. Property `ExportedCCSDocument > type > anyOf > item 0` + +| | | +| ------------ | -------- | +| **Type** | `string` | +| **Required** | No | + +
+
+ +### 2.2. Property `ExportedCCSDocument > type > anyOf > item 1` + +| | | +| ------------ | ------ | +| **Type** | `null` | +| **Required** | No | + +
+ +
diff --git a/test/data/doc/doc-9.json b/test/data/doc/doc-9.json new file mode 100644 index 0000000..ff644d0 --- /dev/null +++ b/test/data/doc/doc-9.json @@ -0,0 +1,167 @@ +{ + "_name": "2023 IBM International Business Machines Corp.", + "bitmaps": [], + "description": { + "logs": [ + { + "date": "2024-04-15T09:18:31.855017+00:00", + "agent": "CCS", + "comment": "parsing of documents", + "type": "parsing" + }, + { + "date": "2024-04-15T09:43:18.651+00:00", + "agent": "CXS", + "task": "task 12345", + "comment": "enrichment of documents", + "type": "text enrichment" + } + ], + "collection": { + "type": "Document", + "name": "ESG Reports", + "alias": [ + "esg-report" + ], + "version": "2.0.2" + }, + "languages": [ + "en" + ], + "advanced": { + "website": [ + "http://www.ibm.com/" + ], + "year": 2023 + }, + "subjects": [ + "Technology" + ], + "publication_date": "2023-01-01T12:00:00.000+00:00", + "affiliations": [ + { + "name": "International Business Machines Corp.", + "id": "ibm", + "source": "nyse" + } + ], + "title": "2023 ESG Report", + "type": "ESG report" + }, + "equations": [], + "figures": [], + "file-info": { + "#-pages": 1, + "document-hash": "776531b533f5970d81de684e84b25cb13bafbab2cf520fddf7bac2ba25233dcb", + "filename": "IBM_2023.pdf", + "page-hashes": [ + { + "hash": "49319ad44d1997ea392d043a3b5c5fb044c9da0f16ab917326b14da4482bb39e", + "model": "model", + "page": 1 + } + ] + }, + "footnotes": [], + "main-text": [ + { + "$ref": "#/figures/0", + "name": "picture", + "type": "figure" + }, + { + "text": "-------------------------___, _ IXI", + "name": "text", + "type": "paragraph", + "prov": [ + { + "bbox": [ + 896.1026000976562, + 30.518247604370117, + 972.6661987304688, + 77.09062957763672 + ], + "page": 1, + "span": [ + 0, + 35 + ] + } + ] + } + ], + "page-dimensions": [ + { + "height": 612, + "page": 1, + "width": 1008 + } + ], + "page-footers": [], + "page-headers": [], + "references": [], + "tables": [], + "conversion_settings": { + "model_pipeline": { + "clusters": [ + { + "type": "LayoutSegmentationModel", + "name": "LayoutSegmentationModel", + "version": "NA" + } + ], + "page": [], + "normalization": [], + "tables": [ + { + "type": "TableStructureModel", + "name": "TableStructureModel", + "version": "NA" + } + ] + } + }, + "version": 2, + "_s3_data": { + "pdf-document": [ + { + "mime": "application/pdf", + "path": "index-code/PDFDocuments/776531b533f5970d81de684e84b25cb13bafbab2cf520fddf7bac2ba25233dcb.pdf" + } + ], + "pdf-pages": [ + { + "mime": "application/pdf", + "path": "index-code/PDFPages/49319ad44d1997ea392d043a3b5c5fb044c9da0f16ab917326b14da4482bb39e.pdf", + "page": 1 + } + ], + "markdown-document": [ + { + "mime": "text/markdown", + "path": "index-code/MD/776531b533f5970d81de684e84b25cb13bafbab2cf520fddf7bac2ba25233dcb.md" + } + ], + "json-document": { + "mime": "application/json", + "path": "index-code/JSONDocuments/776531b533f5970d81de684e84b25cb13bafbab2cf520fddf7bac2ba25233dcb.json" + }, + "json-meta": { + "mime": "application/json", + "path": "index-code/JSONDocuments/776531b533f5970d81de684e84b25cb13bafbab2cf520fddf7bac2ba25233dcb.meta.json" + }, + "glm-json-document": { + "mime": "application/json", + "path": "index-code/JSONDocuments/776531b533f5970d81de684e84b25cb13bafbab2cf520fddf7bac2ba25233dcb.glm.json" + } + }, + "type": null, + "_content_hash": "84be138c500936cbbc70628ceb4e4f82", + "identifiers": [ + { + "_name": "esg_report#ibm_2023", + "type": "esg_report", + "value": "ibm_2023" + } + ] + } \ No newline at end of file