Skip to content

Commit c5e7574

Browse files
hf-kklein and Konstantin authored
feat: sanitize Anwendungsfall.beschreibung by removing hyphens inside words (#233)
Seems like they're writing the XMLs in Word.

Co-authored-by: Konstantin <[email protected]>
1 parent 4f51fde commit c5e7574

File tree

4 files changed

+436
-78
lines changed

4 files changed

+436
-78
lines changed

src/fundamend/reader/ahbreader.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
_is_segment_group,
3232
_is_uebertragungsdatei,
3333
)
34-
from fundamend.utils import lstrip, remove_linebreaks_and_hyphens, strip
34+
from fundamend.utils import lstrip, remove_linebreaks_and_hyphens, remove_unnecessary_hyphens, strip
3535

3636
# pylint:disable=duplicate-code
3737
# yes, it's very similar to the MigReader
@@ -252,7 +252,9 @@ def _read_anwendungsfall(self, original_element: ET.Element) -> Anwendungsfall:
252252
format_element = next((child for child in original_element[0] if child.tag.startswith("M_")))
253253
return Anwendungsfall(
254254
pruefidentifikator=original_element.attrib["Pruefidentifikator"],
255-
beschreibung=remove_linebreaks_and_hyphens(original_element.attrib["Beschreibung"]),
255+
beschreibung=remove_unnecessary_hyphens(
256+
remove_linebreaks_and_hyphens(original_element.attrib["Beschreibung"])
257+
),
256258
kommunikation_von=original_element.attrib["Kommunikation_von"].strip(),
257259
format=EdifactFormat(lstrip("M_", format_element.tag)),
258260
elements=tuple(segments_and_groups),

src/fundamend/utils.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@
33
"""
44

55
import re
6-
from typing import Optional
6+
from typing import Optional, overload
77

88
from fundamend.models.kommunikationsrichtung import Kommunikationsrichtung
99

10+
_unnecessary_hyphen_pattern = re.compile(r"(?<=[a-zäüöß])-(?=[a-zäüöß])")
11+
"""if before AND after a hyphen there are only lower case letters, then we can probably remove it"""
12+
1013

1114
def lstrip(prefix: str, text: str) -> str:
1215
"""Strip the given prefix from the given text. If the text does not start with the prefix, return the text as is.
@@ -143,4 +146,27 @@ def parse_kommunikation_von(kommunikation_von: Optional[str]) -> list[Kommunikat
143146
return result
144147

145148

146-
__all__ = ["lstrip", "rstrip", "strip", "parse_kommunikation_von", "remove_linebreaks_and_hyphens"]
149+
@overload
150+
def remove_unnecessary_hyphens(candidate: str) -> str: ...
151+
@overload
152+
def remove_unnecessary_hyphens(candidate: None) -> None: ...
153+
def remove_unnecessary_hyphens(candidate: Optional[str]) -> Optional[str]:
154+
"""
155+
Removes likely unnecessary hyphens from the middle of words, i.e. hyphens directly between two lowercase letters.
156+
Example: "Ausfallarbeits-summenzeitreihe" returns "Ausfallarbeitssummenzeitreihe"
157+
or "Bestäti-gung" returns "Bestätigung". But "Sperr-/Entsperrauftrag" stays untouched.
158+
Handles multiple occurrences: "Bestäti-gung der Stornier-ung" returns "Bestätigung der Stornierung".
159+
"""
160+
if candidate is None:
161+
return None
162+
return _unnecessary_hyphen_pattern.sub("", candidate)
163+
164+
165+
__all__ = [
166+
"lstrip",
167+
"rstrip",
168+
"strip",
169+
"parse_kommunikation_von",
170+
"remove_linebreaks_and_hyphens",
171+
"remove_unnecessary_hyphens",
172+
]

0 commit comments

Comments
 (0)