Skip to content

Commit

Permalink
transform: encoding fix (argparse.FileType and non-UTF8 source).
Browse files Browse the repository at this point in the history
  • Loading branch information
peteradrichem committed Jan 18, 2025
1 parent 67449ec commit d63283c
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 16 deletions.
6 changes: 1 addition & 5 deletions src/xul/cmd/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@ def parse_cl() -> argparse.Namespace:
parser.add_argument("-V", "--version", action="version", version="%(prog)s " + __version__)
parser.add_argument("xslt_source", help="XSLT source (file, http://...)")
parser.add_argument(
"xml_source",
nargs="?",
default=sys.stdin,
type=argparse.FileType("r"),
help="XML source (file, <stdin>, http://...)",
"xml_source", nargs="?", default=sys.stdin, help="XML source (file, <stdin>, http://...)"
)
parser.add_argument("-f", "--file", dest="file", help="save result to file")

Expand Down
29 changes: 18 additions & 11 deletions src/xul/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
https://lxml.de/tutorial.html
"""

import sys
import io
from logging import getLogger
from typing import Optional, TextIO, Union

Expand All @@ -20,6 +20,19 @@
logger = getLogger(__name__)


def get_source_name(xml_source: Union[TextIO, str]) -> str:
    """Return a printable name for an XML source.

    Args:
        xml_source: A file name / URL string, or a text stream.

    Returns:
        The string itself for string sources, the stream's ``name`` (as a
        string) for ``io.TextIOWrapper`` streams that have one, the literal
        ``"StringIO"`` for ``io.StringIO`` objects, and ``str(xml_source)``
        for anything else.
    """
    if isinstance(xml_source, str):
        return xml_source
    if isinstance(xml_source, io.TextIOWrapper):
        # e.g. sys.stdin
        # TextIOWrapper.name is looked up on the underlying buffer, so it can
        # be missing (wrapper around a nameless buffer such as BytesIO) or
        # raise ValueError (detached wrapper).
        try:
            name = xml_source.name
        except (AttributeError, ValueError):
            name = None
        if name is not None:
            # The name is an int when the stream was opened from a file
            # descriptor; always hand back a string.
            return str(name)
    if isinstance(xml_source, io.StringIO):
        return "StringIO"
    # Unknown or nameless source: fall back to its string representation.
    return str(xml_source)


def build_etree(
xml_source: Union[TextIO, str],
parser: Optional[etree.XMLParser] = None,
Expand All @@ -45,6 +58,7 @@ def build_etree(
if not parser:
parser = etree.XMLParser(ns_clean=True)

file_name = get_source_name(xml_source)
try:
etree.clear_error_log()
return etree.parse(xml_source, parser)
Expand All @@ -61,14 +75,7 @@ def build_etree(
xmllogger = logger.warning
else:
xmllogger = logger.error

if xml_source in ("-", sys.stdin):
name = sys.stdin.name
xml_type = "object"
else:
name = xml_source
xml_type = "file"
xmllogger("%s is not a valid XML %s:", name, xml_type)
xmllogger("%s is not a valid XML source:", file_name)

# Parsers have an error_log property that lists the errors and warnings
# of the last parser run.
Expand All @@ -85,11 +92,11 @@ def build_etree(
# Catch UnicodeDecodeError exceptions, for example:
# "'utf-8' codec can't decode byte 0xff in position 0: invalid start byte"
except UnicodeDecodeError as e:
logger.error(e)
logger.error("%s: %s", file_name, e)
return None

# Catch OSError exceptions, for example:
# "failed to load external entity" (lxml.etree._raiseParseError)
# Error reading file '404.xml': failed to load external entity "404.xml"
except OSError as e:
logger.error(e)
return None

0 comments on commit d63283c

Please sign in to comment.