Skip to content

Commit

Permalink
avoid running license detection on binary blobs
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan6419846 committed Jan 22, 2024
1 parent f13712f commit 84262eb
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 18 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* Skip symlinks for LDD analysis.
* Analyze nested archives.
* Analyze more archives, including RPM files.
* Speed up the analysis of packed archive files (which will be unpacked in a separate step), ELF binaries and fonts by not scanning the whole binary blob,
but only looking at the metadata (if available).
* Handle more types of ELF binaries.
* Move tools to dedicated submodule.

Expand Down
49 changes: 45 additions & 4 deletions license_tools/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,35 @@ def to_kwargs(cls, flags: int) -> dict[str, bool]:
)


def _run_on_archive_file(path: Path) -> None:
"""
Run archive-specific analysis.
:param path: The path to run on.
"""
if path.suffix == ".rpm" or (path.suffixes and ".rpm" in path.suffixes):
rpm_results = PackageResults.from_rpm(path)
if rpm_results.declared_license_expression_spdx:
print(f'{path} declares the {rpm_results.declared_license_expression_spdx} license in its metadata.\n')


def _get_dummy_file_results(path: Path, short_path: str) -> FileResults:
    """
    Build placeholder results for a file whose actual analysis is skipped.

    The returned object only carries the paths and a default-constructed
    `Licenses` instance, so the file can still be displayed in the results.

    :param path: The file path to analyze.
    :param short_path: The short path to use for display.
    :return: Minimal results.
    """
    placeholder = FileResults(path=path, short_path=short_path)
    # Attach an empty license container after construction.
    placeholder.licenses = Licenses()
    return placeholder


def run_on_file(
path: Path,
short_path: str,
Expand All @@ -125,17 +154,32 @@ def run_on_file(
:param retrieval_flags: Values to retrieve.
:return: The requested results.
"""
if archive_utils.can_extract(archive_path=path):
# Archive files which can be extracted further are not being analyzed on the
# file level. This should improve the extraction speed and avoid possible
# memory errors as this skips running string matching on possibly large
# archives which are binary blobs anyway and do not provide any real value.
# Instead, just have a quick look at their headers if they provide any useful
# values.
_run_on_archive_file(path=path)
return _get_dummy_file_results(path=path, short_path=short_path)

retrieval_kwargs = RetrievalFlags.to_kwargs(flags=retrieval_flags)

# This data is not yet part of the dataclasses above, as it is a custom analysis.
# Return early if we got a result here, as these binary files currently do not
# provide any additional useful insights in most of the cases, but tend to be
# larger binary blobs which just slow down the analysis.
if retrieval_kwargs.pop("retrieve_ldd_data"):
results = linking_tools.check_shared_objects(path=path)
if results:
print(short_path + "\n" + results)
return _get_dummy_file_results(path=path, short_path=short_path)
if retrieval_kwargs.pop("retrieve_font_data"):
results = font_tools.check_font(path=path)
if results:
print(short_path + "\n" + results + "\n")
return _get_dummy_file_results(path=path, short_path=short_path)

# Register this here as each parallel process has its own directory.
atexit.register(scancode_tools.cleanup, scancode_config.scancode_temp_dir)
Expand Down Expand Up @@ -232,10 +276,7 @@ def run_on_package_archive_file(
:param retrieval_flags: Values to retrieve.
:return: The requested results.
"""
if archive_path.suffix == ".rpm" or (archive_path.suffixes and ".rpm" in archive_path.suffixes):
rpm_results = PackageResults.from_rpm(archive_path)
if rpm_results.declared_license_expression_spdx:
print(f'{archive_path} declares the {rpm_results.declared_license_expression_spdx} license in its metadata.\n')
_run_on_archive_file(path=archive_path)

with TemporaryDirectory() as working_directory:
if not archive_utils.can_extract(archive_path):
Expand Down
6 changes: 3 additions & 3 deletions license_tools/tools/scancode_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,9 @@ class Licenses:
Information on all detected licenses.
"""

detected_license_expression: str | None
detected_license_expression_spdx: str | None
percentage_of_license_text: float
detected_license_expression: str | None = None
detected_license_expression_spdx: str | None = None
percentage_of_license_text: float = 0.0
license_detections: list[LicenseDetection] = dataclass_field(default_factory=list)
license_clues: list[LicenseClue] = dataclass_field(default_factory=list)

Expand Down
19 changes: 8 additions & 11 deletions tests/test_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,14 @@ def _run_mocked(
return_value: str | None = "",
) -> tuple[mock.Mock, mock.Mock, str]:
stdout = StringIO()
file_result = object()

class DummyFileResult:
licenses = None

def __init__(self, *args: Any, **kwargs: Any) -> None:
pass

file_result = DummyFileResult()
with mock.patch.object(
retrieval, "FileResults", return_value=file_result
) as results_mock, redirect_stdout(stdout), mock.patch(
Expand Down Expand Up @@ -155,11 +162,6 @@ def test_run_on_file__ldd_handling(self) -> None:
results_mock.assert_called_once_with(
path=SETUP_PATH,
short_path="setup.py",
retrieve_licenses=True,
retrieve_copyrights=True,
retrieve_emails=True,
retrieve_file_info=True,
retrieve_urls=True,
)
self.assertEqual("setup.py\n" + ldd_usr_bin_bc + "\n", stdout)

Expand Down Expand Up @@ -222,11 +224,6 @@ def test_run_on_file__font_handling(self) -> None:
results_mock.assert_called_once_with(
path=SETUP_PATH,
short_path="setup.py",
retrieve_licenses=True,
retrieve_copyrights=True,
retrieve_emails=True,
retrieve_file_info=True,
retrieve_urls=True,
)
self.assertEqual("setup.py\n" + font_awesome + "\n\n", stdout)

Expand Down

0 comments on commit 84262eb

Please sign in to comment.