@@ -66,7 +66,6 @@ class MatchingLicense:
6666 (re .compile (r"opensource\.org/licenses/MIT/?" , re .I ), "MIT" ),
6767 (re .compile (r"choosealicense\.com/licenses/mit/?" , re .I ), "MIT" ),
6868 (re .compile (r"choosealicense\.com/licenses/apache-2\.0/?" , re .I ), "Apache-2.0" ),
69- # add Etalab / Québec, etc., once verified
7069]
7170
7271
@@ -281,6 +280,44 @@ def resolve_fuzzy_match(
281280 return results
282281
283282
def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
    """Look up the License row whose stored URL equals the given URL after normalization.

    Args:
        url_normalized: The license URL to look for. It is passed through
            ``normalize_url_str`` once more before being compared.
        db_session: Database session used for the query. When it is falsy
            (e.g. ``None``) no lookup is attempted.

    Returns:
        The first ``License`` row whose normalized, trimmed, lower-cased
        ``url`` equals the normalized input, or ``None`` when there is no
        session or no row matches.
    """
    if not db_session:
        return None

    # Python-side value to look for.
    wanted_url = normalize_url_str(url_normalized)
    # SQL-side expression: License.url normalized, trimmed and lower-cased in the query.
    stored_url = func.lower(func.trim(normalize_url(License.url)))

    candidates = db_session.query(License).filter(wanted_url == stored_url)
    return candidates.first()
293+
294+
# URL pattern for the SPDX license catalog. The negative lookbehind requires
# that "spdx.org" is not the tail of a longer host label, so look-alike hosts
# such as "notspdx.org" or "my-spdx.org" are rejected, while "spdx.org" itself
# (with or without scheme) and its subdomains are still accepted.
_SPDX_LICENSE_URL_RE = re.compile(
    r"(?<![\w-])spdx\.org/licenses/([^/?#]+?)(?:\.html)?/?$",
    re.I,
)

# Characters allowed in the extracted identifier (e.g. 'CC-BY-4.0', 'GPL-2.0+').
_SPDX_ID_RE = re.compile(r"[A-Za-z0-9.+-]+")


def extract_spdx_id_from_url(url_normalized: str) -> Optional[str]:
    """Extract an SPDX license ID from an SPDX-style URL if present.

    Recognizes URLs of the form used on spdx.org, for example::

        https://spdx.org/licenses/ODbL-1.0.html
        http://spdx.org/licenses/MIT

    The function is conservative: the host must be ``spdx.org`` (not merely
    end with that text), the identifier must be the last path segment under
    ``/licenses/`` with no query string or fragment after it, and it must
    consist only of letters, digits, ``.``, ``+`` and ``-``. An optional
    ``.html`` suffix and an optional trailing slash are stripped.

    Args:
        url_normalized: The (normalized) license URL to inspect.

    Returns:
        The identifier exactly as it appears in the URL (case preserved), or
        ``None`` when the URL is empty or is not an SPDX catalog URL.
    """
    if not url_normalized:
        return None

    match = _SPDX_LICENSE_URL_RE.search(url_normalized)
    if match is None:
        return None

    spdx_id = match.group(1)
    # Sanity check: reject segments containing characters outside the allowed set.
    if not _SPDX_ID_RE.fullmatch(spdx_id):
        return None

    return spdx_id
319+
320+
284321def resolve_license (
285322 license_url : str ,
286323 allow_fuzzy : bool = True ,
@@ -290,11 +327,12 @@ def resolve_license(
290327 """Resolve a license URL to one or more SPDX candidates using multiple strategies.
291328
292329 Strategies (in order of precedence):
293- 1) Exact match in DB(db.license) -> return [exact]
294- 2) Creative Commons resolver(cc-resolver) -> return [cc]
295- 3) Generic heuristics(pattern-heuristics) -> return [heuristic]
296- 4) Fuzzy (same host candidates) -> return [fuzzy...]
297- 5) No match -> return [none]
330+ 1) Exact match in DB (``db.license``) -> return [exact]
331+ 2) Creative Commons resolver (``cc-resolver``) -> return [cc]
332+ 3) SPDX catalog URL resolver (``spdx-resolver``) -> return [spdx] when the extracted ID exists in the DB
333+ 4) Generic heuristics (pattern-based) -> return [heuristic]
334+ 5) Fuzzy (same-host candidates) -> return [fuzzy...]
335+ 6) No match -> return []
298336
299337 Args:
300338 license_url (str): The license URL to resolve.
@@ -350,7 +388,31 @@ def resolve_license(
350388 )
351389 ]
352390
353- # 3) Generic heuristics
391+ # 3) SPDX catalog URL (spdx.org/licenses/<ID>[.html])
392+ spdx_id = extract_spdx_id_from_url (url_normalized )
393+ if spdx_id :
394+ # Try to enrich from DB if a matching License row exists
395+ db_lic : License | None = (
396+ db_session .query (License ).filter (func .lower (License .id ) == func .lower (spdx_id )).one_or_none ()
397+ )
398+ if db_lic is not None :
399+ return [
400+ MatchingLicense (
401+ license_id = db_lic .id ,
402+ license_url = url_str ,
403+ normalized_url = url_normalized ,
404+ spdx_id = spdx_id ,
405+ match_type = "heuristic" ,
406+ confidence = 0.98 ,
407+ matched_name = db_lic .name ,
408+ matched_catalog_url = db_lic .url ,
409+ matched_source = "spdx-resolver" ,
410+ )
411+ ]
412+ else :
413+ logging .warning ("SPDX ID %s resolved from URL but not found in DB" , spdx_id )
414+
415+ # 4) Generic heuristics
354416 heuristic_match = heuristic_spdx (url_str )
355417 if heuristic_match :
356418 return [
@@ -366,7 +428,7 @@ def resolve_license(
366428 )
367429 ]
368430
369- # 4 ) Fuzzy (same host candidates only)
431+ # 5 ) Fuzzy (same host candidates only)
370432 if allow_fuzzy and url_host and db_session is not None :
371433 fuzzy_results = resolve_fuzzy_match (
372434 url_str = url_str ,
@@ -378,17 +440,5 @@ def resolve_license(
378440 if fuzzy_results :
379441 return fuzzy_results
380442
381- # 5 ) No match
443+ # 6 ) No match
382444 return []
383-
384-
385- def find_exact_match_license_url (url_normalized : str , db_session : Session | None ) -> License | None :
386- """Find exact match of normalized license URL in DB (License.url)."""
387- if not db_session :
388- return None
389- # Compare normalized strings using SQL functions on License.url
390- return (
391- db_session .query (License )
392- .filter (normalize_url_str (url_normalized ) == func .lower (func .trim (normalize_url (License .url ))))
393- .first ()
394- )
0 commit comments