diff --git a/unblob/file_utils.py b/unblob/file_utils.py
index 58646522f2..adad1b96ad 100644
--- a/unblob/file_utils.py
+++ b/unblob/file_utils.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 from typing import Iterator, Tuple
 
-from dissect.cstruct import cstruct
+from dissect.cstruct import Instance, cstruct
 from pyperscan import Scan
 
 from .logging import format_hex
@@ -311,3 +311,34 @@ def read_until_past(file: File, pattern: bytes):
             return file.tell()
         if next_byte not in pattern:
             return file.tell() - 1
+
+
+def as_dict(obj):
+    """Convert an object into plain containers suitable for report metadata.
+
+    Dicts are returned unchanged, lists and tuples become lists of converted
+    items, cstruct ``Instance`` objects become a dict of their field values and
+    any other object exposing ``__dict__`` becomes a dict of its public
+    attributes.  Values without a ``__dict__`` (ints, strings, bytes, ``None``)
+    are returned as they are, so recursing into leaf values does not raise
+    ``AttributeError``.
+    """
+    if isinstance(obj, dict):
+        return obj
+    if isinstance(obj, (list, tuple)):
+        return [as_dict(item) for item in obj]
+    if isinstance(obj, Instance):
+        # Expand nested structures; other field values are stored untouched and
+        # left for the JSON encoder to serialize.
+        return {
+            name: as_dict(value) if isinstance(value, Instance) else value
+            for name, value in obj._values.items()  # noqa: SLF001
+        }
+    if not hasattr(obj, "__dict__"):
+        # Leaf value (int, str, bytes, None, ...): nothing to expand.
+        return obj
+    return {
+        name: as_dict(value)
+        for name, value in obj.__dict__.items()
+        if not name.startswith("_")
+    }
diff --git a/unblob/handlers/archive/sevenzip.py b/unblob/handlers/archive/sevenzip.py
index 040b409293..f8f7a446da 100644
--- a/unblob/handlers/archive/sevenzip.py
+++ b/unblob/handlers/archive/sevenzip.py
@@ -23,6 +23,7 @@
 from structlog import get_logger
 
 from unblob.extractors import Command
+from unblob.file_utils import as_dict
 
 from ...models import File, HexString, StructHandler, ValidChunk
 
@@ -70,4 +71,6 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
         # We read the signature header here to get the offset to the header database
         first_db_header = start_offset + len(header) + header.next_header_offset
         end_offset = first_db_header + header.next_header_size
-        return ValidChunk(start_offset=start_offset, end_offset=end_offset)
+        return ValidChunk(
+            start_offset=start_offset, end_offset=end_offset, metadata=as_dict(header)
+        )
diff --git a/unblob/models.py b/unblob/models.py
index 2b8431fa73..d101a080e8 100644
--- a/unblob/models.py
+++ b/unblob/models.py
@@ -88,6 +88,7 @@ class ValidChunk(Chunk):
 
     handler: "Handler" = attr.ib(init=False, eq=False)
     is_encrypted: bool = attr.ib(default=False)
+    metadata: dict = attr.ib(factory=dict)
 
     def extract(self, inpath: Path, outdir: Path):
         if self.is_encrypted:
@@ -108,6 +109,7 @@ def as_report(self, extraction_reports: List[Report]) -> ChunkReport:
             size=self.size,
             handler_name=self.handler.NAME,
             is_encrypted=self.is_encrypted,
+            metadata=self.metadata,
             extraction_reports=extraction_reports,
         )
 
@@ -188,7 +190,7 @@ def default(self, obj):
 
         if isinstance(obj, bytes):
             try:
-                return obj.decode()
+                return obj.decode("utf-8", errors="surrogateescape")
             except UnicodeDecodeError:
                 return str(obj)
 
diff --git a/unblob/report.py b/unblob/report.py
index 1b5bed1e71..3fe145ab25 100644
--- a/unblob/report.py
+++ b/unblob/report.py
@@ -4,7 +4,7 @@
 import traceback
 from enum import Enum
 from pathlib import Path
-from typing import List, Optional, Union, final
+from typing import Dict, List, Optional, Union, final
 
 import attr
 
@@ -181,6 +181,7 @@ class ChunkReport(Report):
     end_offset: int
     size: int
     is_encrypted: bool
+    metadata: Dict
     extraction_reports: List[Report]
 
 