From 0b150ba2e62b7df48ef35cabc8925a8366e620a3 Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Fri, 14 Apr 2023 14:07:04 +0200 Subject: [PATCH] feat(reporting): report meta-data information about chunks. Allow handlers to provide a dict value as part of a ValidChunk metadata attribute. That dictionary can contain any relevant metadata information from the perspective of the handler, but we advise handler writers to report parsed information such as header values. This metadata dict is later reported as part of our ChunkReports and is available in the JSON report file if the user requested one. The idea is to expose metadata to further analysis steps through the unblob report. For example, a binary analysis toolkit would read the load address and architecture from a uImage chunk to analyze the file extracted from that chunk with the right settings. A note on the 'as_dict' implementation. The initial idea was to implement it in dissect.cstruct (see https://github.com/fox-it/dissect.cstruct/pull/29), but due to expected changes in the project's API I chose to implement it in unblob so we're not dependent on another project. 
--- unblob/file_utils.py | 25 ++++++++++++++++++++++++- unblob/handlers/archive/sevenzip.py | 5 ++++- unblob/models.py | 4 +++- unblob/report.py | 3 ++- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/unblob/file_utils.py b/unblob/file_utils.py index 58646522f2..adad1b96ad 100644 --- a/unblob/file_utils.py +++ b/unblob/file_utils.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Iterator, Tuple -from dissect.cstruct import cstruct +from dissect.cstruct import Instance, cstruct from pyperscan import Scan from .logging import format_hex @@ -311,3 +311,26 @@ def read_until_past(file: File, pattern: bytes): return file.tell() if next_byte not in pattern: return file.tell() - 1 + + +def as_dict(obj): + """Convert a Python class instance to a dictionary.""" + if isinstance(obj, dict): + return obj + if isinstance(obj, list): + return [as_dict(item) for item in obj] + if isinstance(obj, Instance): + result = {} + for k, v in obj._values.items(): # noqa: SLF001 + result[k] = v + return result + + result = {} + for key, value in obj.__dict__.items(): + if key.startswith("_"): + continue + if isinstance(value, (list, tuple)): + result[key] = [as_dict(item) for item in value] + else: + result[key] = as_dict(value) + return result diff --git a/unblob/handlers/archive/sevenzip.py b/unblob/handlers/archive/sevenzip.py index 040b409293..f8f7a446da 100644 --- a/unblob/handlers/archive/sevenzip.py +++ b/unblob/handlers/archive/sevenzip.py @@ -23,6 +23,7 @@ from structlog import get_logger from unblob.extractors import Command +from unblob.file_utils import as_dict from ...models import File, HexString, StructHandler, ValidChunk @@ -70,4 +71,6 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] # We read the signature header here to get the offset to the header database first_db_header = start_offset + len(header) + header.next_header_offset end_offset = first_db_header + header.next_header_size - return 
ValidChunk(start_offset=start_offset, end_offset=end_offset) + return ValidChunk( + start_offset=start_offset, end_offset=end_offset, metadata=as_dict(header) + ) diff --git a/unblob/models.py b/unblob/models.py index 2b8431fa73..d101a080e8 100644 --- a/unblob/models.py +++ b/unblob/models.py @@ -88,6 +88,7 @@ class ValidChunk(Chunk): handler: "Handler" = attr.ib(init=False, eq=False) is_encrypted: bool = attr.ib(default=False) + metadata: dict = attr.ib(default={}) def extract(self, inpath: Path, outdir: Path): if self.is_encrypted: @@ -108,6 +109,7 @@ def as_report(self, extraction_reports: List[Report]) -> ChunkReport: size=self.size, handler_name=self.handler.NAME, is_encrypted=self.is_encrypted, + metadata=self.metadata, extraction_reports=extraction_reports, ) @@ -188,7 +190,7 @@ def default(self, obj): if isinstance(obj, bytes): try: - return obj.decode() + return obj.decode("utf-8", errors="surrogateescape") except UnicodeDecodeError: return str(obj) diff --git a/unblob/report.py b/unblob/report.py index 1b5bed1e71..3fe145ab25 100644 --- a/unblob/report.py +++ b/unblob/report.py @@ -4,7 +4,7 @@ import traceback from enum import Enum from pathlib import Path -from typing import List, Optional, Union, final +from typing import Dict, List, Optional, Union, final import attr @@ -181,6 +181,7 @@ class ChunkReport(Report): end_offset: int size: int is_encrypted: bool + metadata: Dict extraction_reports: List[Report]