Skip to content

Commit 25e41ce

Browse files
authored
fix: add license SPDX ID resolver (#1569)
1 parent dfded4b commit 25e41ce

File tree

2 files changed

+104
-21
lines changed

2 files changed

+104
-21
lines changed

api/src/shared/common/license_utils.py

Lines changed: 71 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ class MatchingLicense:
6666
(re.compile(r"opensource\.org/licenses/MIT/?", re.I), "MIT"),
6767
(re.compile(r"choosealicense\.com/licenses/mit/?", re.I), "MIT"),
6868
(re.compile(r"choosealicense\.com/licenses/apache-2\.0/?", re.I), "Apache-2.0"),
69-
# add Etalab / Québec, etc., once verified
7069
]
7170

7271

@@ -281,6 +280,44 @@ def resolve_fuzzy_match(
281280
return results
282281

283282

283+
def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
    """Return the first License whose stored URL equals the given normalized URL.

    Args:
        url_normalized: License URL that the caller has already normalized.
        db_session: Session used for the lookup; when it is falsy no query
            is issued.

    Returns:
        The first matching ``License`` row, or ``None`` when there is no
        session or no row matches.
    """
    if db_session:
        license_query = db_session.query(License)
        # Both sides go through the URL normalization helpers so the
        # comparison happens on like-for-like strings inside the database.
        wanted_url = normalize_url_str(url_normalized)
        stored_url = func.lower(func.trim(normalize_url(License.url)))
        return license_query.filter(wanted_url == stored_url).first()
    return None
293+
294+
295+
def extract_spdx_id_from_url(url_normalized: str) -> Optional[str]:
    """Extract an SPDX license ID from an SPDX-style URL if present.

    Recognizes URLs of the form used on spdx.org, for example::

        https://spdx.org/licenses/ODbL-1.0.html
        http://spdx.org/licenses/MIT
        spdx.org/licenses/MIT.html#licenseText

    The function is conservative: the host must be ``spdx.org`` (or one of
    its subdomains), and the path segment directly under ``/licenses/`` must
    look like an SPDX identifier. An optional ``.html`` suffix, trailing
    slash, query string and fragment are ignored.

    Args:
        url_normalized: The license URL, with or without a scheme. The
            casing of the input is preserved in the returned ID.

    Returns:
        The SPDX ID token as it appears in the URL, or ``None`` when the URL
        is empty, is not an spdx.org license URL, or the token contains
        characters that cannot be part of an SPDX ID.
    """
    if not url_normalized:
        return None

    # The host has to start either at the beginning of the string or right
    # after '//' so that look-alike hosts ('notspdx.org') and URLs that merely
    # embed 'spdx.org/licenses/' in their path are rejected. The token is
    # captured lazily so an optional '.html' suffix is not swallowed into it;
    # a trailing slash, query string or fragment may follow.
    match = re.search(
        r"(?:^|//)(?:[A-Za-z0-9-]+\.)*spdx\.org/licenses/"
        r"([^/?#]+?)(?:\.html)?/?(?:[?#].*)?$",
        url_normalized.strip(),
        re.I,
    )
    if match is None:
        return None

    spdx_id = match.group(1)
    # Basic sanity check: SPDX IDs consist of alphanumerics plus '-', '.' and
    # '+' (e.g. 'CC-BY-4.0', 'GPL-2.0+').
    if re.fullmatch(r"[A-Za-z0-9.+-]+", spdx_id) is None:
        return None

    return spdx_id
319+
320+
284321
def resolve_license(
285322
license_url: str,
286323
allow_fuzzy: bool = True,
@@ -290,11 +327,12 @@ def resolve_license(
290327
"""Resolve a license URL to one or more SPDX candidates using multiple strategies.
291328
292329
Strategies (in order of precedence):
293-
1) Exact match in DB(db.license) -> return [exact]
294-
2) Creative Commons resolver(cc-resolver) -> return [cc]
295-
3) Generic heuristics(pattern-heuristics) -> return [heuristic]
296-
4) Fuzzy (same host candidates) -> return [fuzzy...]
297-
5) No match -> return [none]
330+
1) Exact match in DB (``db.license``) -> return [exact]
331+
2) Creative Commons resolver (``cc-resolver``) -> return [cc]
332+
3) SPDX catalog URL resolver (``spdx.org/licenses``) -> return [spdx]
333+
4) Generic heuristics (pattern-based) -> return [heuristic]
334+
5) Fuzzy (same-host candidates) -> return [fuzzy...]
335+
6) No match -> return []
298336
299337
Args:
300338
license_url (str): The license URL to resolve.
@@ -350,7 +388,31 @@ def resolve_license(
350388
)
351389
]
352390

353-
# 3) Generic heuristics
391+
# 3) SPDX catalog URL (spdx.org/licenses/<ID>[.html])
392+
spdx_id = extract_spdx_id_from_url(url_normalized)
393+
if spdx_id:
394+
# Try to enrich from DB if a matching License row exists
395+
db_lic: License | None = (
396+
db_session.query(License).filter(func.lower(License.id) == func.lower(spdx_id)).one_or_none()
397+
)
398+
if db_lic is not None:
399+
return [
400+
MatchingLicense(
401+
license_id=db_lic.id,
402+
license_url=url_str,
403+
normalized_url=url_normalized,
404+
spdx_id=spdx_id,
405+
match_type="heuristic",
406+
confidence=0.98,
407+
matched_name=db_lic.name,
408+
matched_catalog_url=db_lic.url,
409+
matched_source="spdx-resolver",
410+
)
411+
]
412+
else:
413+
logging.warning("SPDX ID %s resolved from URL but not found in DB", spdx_id)
414+
415+
# 4) Generic heuristics
354416
heuristic_match = heuristic_spdx(url_str)
355417
if heuristic_match:
356418
return [
@@ -366,7 +428,7 @@ def resolve_license(
366428
)
367429
]
368430

369-
# 4) Fuzzy (same host candidates only)
431+
# 5) Fuzzy (same host candidates only)
370432
if allow_fuzzy and url_host and db_session is not None:
371433
fuzzy_results = resolve_fuzzy_match(
372434
url_str=url_str,
@@ -378,17 +440,5 @@ def resolve_license(
378440
if fuzzy_results:
379441
return fuzzy_results
380442

381-
# 5) No match
443+
# 6) No match
382444
return []
383-
384-
385-
def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
386-
"""Find exact match of normalized license URL in DB (License.url)."""
387-
if not db_session:
388-
return None
389-
# Compare normalized strings using SQL functions on License.url
390-
return (
391-
db_session.query(License)
392-
.filter(normalize_url_str(url_normalized) == func.lower(func.trim(normalize_url(License.url))))
393-
.first()
394-
)

api/tests/utils/test_license_utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,39 @@ def test_resolve_license_creative_commons(self, _mock_find):
155155
self.assertEqual(results[0].spdx_id, "CC-BY-4.0")
156156
self.assertEqual(results[0].match_type, "heuristic")
157157

158+
@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
def test_resolve_license_spdx_catalog_url_db_hit(self, _mock_find):
    """An spdx.org/licenses/<ID> URL resolves through the SPDX branch when the ID is in the DB."""
    catalog_url = "https://spdx.org/licenses/ODbL-1.0.html"
    stored_license = self._make_license("odbl-1.0", "https://spdx.org/licenses/ODbL-1.0.html", "ODbL 1.0")
    # The exact-URL lookup is patched to miss, so the only DB access that
    # matters is the lookup by ID; make that one return our license.
    id_lookup = self.session.query.return_value.filter.return_value
    id_lookup.one_or_none.return_value = stored_license

    results = resolve_license(catalog_url, db_session=self.session)

    self.assertEqual(len(results), 1)
    match = results[0]
    # NOTE(review): the reported spdx_id is lowercase although the URL spells
    # it 'ODbL-1.0'; presumably the URL is lowercased during normalization
    # before the ID is extracted -- confirm.
    self.assertEqual(match.spdx_id, "odbl-1.0")
    self.assertEqual(match.license_id, "odbl-1.0")
    self.assertEqual(match.match_type, "heuristic")
    self.assertEqual(match.matched_source, "spdx-resolver")
    self.assertEqual(match.matched_name, "ODbL 1.0")
    self.assertEqual(match.matched_catalog_url, "https://spdx.org/licenses/ODbL-1.0.html")
177+
178+
@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
def test_resolve_license_spdx_catalog_url_db_miss(self, _mock_find):
    """An SPDX ID parsed from the URL but absent from the DB yields no SPDX-based result."""
    catalog_url = "https://spdx.org/licenses/ODbL-1.0.html"
    # Make the lookup by ID come back empty.
    id_lookup = self.session.query.return_value.filter.return_value
    id_lookup.one_or_none.return_value = None

    results = resolve_license(catalog_url, db_session=self.session)

    # On a DB miss the SPDX branch only logs a warning and resolution moves on
    # to the later strategies; the expectation here is that none of them
    # produces a candidate either, so the overall result is empty.
    self.assertEqual(results, [])
190+
158191
@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
159192
def test_resolve_license_generic_heuristic(self, _mock_find):
160193
# Provide URL that matches heuristic patterns

0 commit comments

Comments
 (0)