Skip to content

Commit ae96129

Browse files
authored
feat: add subscript & superscript formatting (#319)
* feat: add subscript & superscript formatting
  Signed-off-by: Panos Vagenas <[email protected]>
* switch to enum
  Signed-off-by: Panos Vagenas <[email protected]>
---------
Signed-off-by: Panos Vagenas <[email protected]>
1 parent d8a5256 commit ae96129

28 files changed

+895
-348
lines changed

docling_core/transforms/serializer/base.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -202,6 +202,16 @@ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
202202
"""Hook for strikethrough formatting serialization."""
203203
...
204204

205+
@abstractmethod
206+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
207+
"""Hook for subscript formatting serialization."""
208+
...
209+
210+
@abstractmethod
211+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
212+
"""Hook for superscript formatting serialization."""
213+
...
214+
205215
@abstractmethod
206216
def serialize_hyperlink(
207217
self,

docling_core/transforms/serializer/common.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
PictureDataType,
4646
PictureItem,
4747
PictureMoleculeData,
48+
Script,
4849
TableAnnotationType,
4950
TableItem,
5051
TextItem,
@@ -455,6 +456,10 @@ def post_process(
455456
res = self.serialize_underline(text=res)
456457
if formatting.strikethrough:
457458
res = self.serialize_strikethrough(text=res)
459+
if formatting.script == Script.SUB:
460+
res = self.serialize_subscript(text=res)
461+
elif formatting.script == Script.SUPER:
462+
res = self.serialize_superscript(text=res)
458463
if params.include_hyperlinks and hyperlink:
459464
res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
460465
return res
@@ -479,6 +484,16 @@ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
479484
"""Hook for strikethrough formatting serialization."""
480485
return text
481486

487+
@override
488+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
489+
"""Hook for subscript formatting serialization."""
490+
return text
491+
492+
@override
493+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
494+
"""Hook for superscript formatting serialization."""
495+
return text
496+
482497
@override
483498
def serialize_hyperlink(
484499
self,

docling_core/transforms/serializer/html.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,16 @@ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
847847
"""Apply HTML-specific strikethrough serialization."""
848848
return f"<del>{text}</del>"
849849

850+
@override
851+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
852+
"""Apply HTML-specific subscript serialization."""
853+
return f"<sub>{text}</sub>"
854+
855+
@override
856+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
857+
"""Apply HTML-specific superscript serialization."""
858+
return f"<sup>{text}</sup>"
859+
850860
@override
851861
def serialize_hyperlink(
852862
self,

docling_core/types/doc/document.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,13 +829,22 @@ def get_annotations(self) -> Sequence[BaseAnnotation]:
829829
return []
830830

831831

832+
class Script(str, Enum):
833+
"""Text script position."""
834+
835+
BASELINE = "baseline"
836+
SUB = "sub"
837+
SUPER = "super"
838+
839+
832840
class Formatting(BaseModel):
833841
"""Formatting."""
834842

835843
bold: bool = False
836844
italic: bool = False
837845
underline: bool = False
838846
strikethrough: bool = False
847+
script: Script = Script.BASELINE
839848

840849

841850
class TextItem(DocItem):

docs/DoclingDocument.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,10 @@
554554
"default": false,
555555
"title": "Strikethrough",
556556
"type": "boolean"
557+
},
558+
"script": {
559+
"$ref": "#/$defs/Script",
560+
"default": "baseline"
557561
}
558562
},
559563
"title": "Formatting",
@@ -1715,6 +1719,16 @@
17151719
"title": "RefItem",
17161720
"type": "object"
17171721
},
1722+
"Script": {
1723+
"description": "Text script position.",
1724+
"enum": [
1725+
"baseline",
1726+
"sub",
1727+
"super"
1728+
],
1729+
"title": "Script",
1730+
"type": "string"
1731+
},
17181732
"SectionHeaderItem": {
17191733
"additionalProperties": false,
17201734
"description": "SectionItem.",

0 commit comments

Comments
 (0)