Skip to content

Commit bf26b2f

Browse files
committed
feat: fallback title from webpage url
1 parent 3d2b5c5 commit bf26b2f

File tree

3 files changed

+43
-10
lines changed

3 files changed

+43
-10
lines changed

python/src/cairo_coder/core/rag_pipeline.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
ProcessedQuery,
2424
StreamEvent,
2525
StreamEventType,
26+
title_from_url,
2627
)
2728
from cairo_coder.dspy.document_retriever import DocumentRetrieverProgram
2829
from cairo_coder.dspy.generation_program import GenerationProgram, McpGenerationProgram
@@ -326,15 +327,6 @@ def _format_sources(self, documents: list[Document]) -> list[dict[str, Any]]:
326327
sources: list[dict[str, str]] = []
327328
seen_urls: set[str] = set()
328329

329-
# Helper to extract domain title
330-
def title_from_url(url: str) -> str:
331-
try:
332-
import urllib.parse as _up
333-
334-
host = _up.urlparse(url).netloc
335-
return host or url
336-
except Exception:
337-
return url
338330

339331
# 1) Vector store and other docs (skip Grok summary virtual doc)
340332
for doc in documents:

python/src/cairo_coder/core/types.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,31 @@ class ProcessedQuery:
7575
resources: list[DocumentSource] = field(default_factory=list)
7676

7777

78+
def title_from_url(url: str) -> str:
    """Derive a human-readable title from a URL.

    Preference order:
      1. The final path segment, with any file extension stripped,
         hyphens/underscores turned into spaces, and title-cased
         (e.g. ``.../my-page.html`` -> ``"My Page"``).
      2. The hostname, when the path yields nothing usable.
      3. The raw URL itself, as a last resort (also on any parse error).
    """
    try:
        # Imported lazily so module import stays cheap for callers
        # that never need this helper.
        import urllib.parse as _up

        parsed = _up.urlparse(url)

        # Attempt 1: build a title from the last path segment.
        trimmed_path = parsed.path.strip('/')
        if trimmed_path:
            segment = trimmed_path.rsplit('/', 1)[-1]
            # Drop a trailing file extension, if any.
            stem = segment.rsplit('.', 1)[0]
            if stem:
                return stem.replace('-', ' ').replace('_', ' ').title()

        # Attempt 2: hostname; attempt 3: the URL verbatim.
        return parsed.netloc or url
    except Exception:
        # Best-effort helper: never raise, just echo the input back.
        return url
102+
78103
@dataclass(frozen=True)
79104
class Document:
80105
"""
@@ -95,7 +120,8 @@ def source(self) -> str | None:
95120
@property
def title(self) -> str | None:
    """Get document title from metadata.

    When no explicit ``title`` key is present, fall back to a
    human-friendly name derived from the source link, and finally
    to a short prefix of the page content.
    """
    fallback = None
    if self.source_link:
        fallback = title_from_url(self.source_link)
    return self.metadata.get("title", fallback or self.page_content[:20])
99125

100126
@property
101127
def source_link(self) -> str | None:

python/tests/unit/test_rag_pipeline.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,21 @@ def test_format_sources_deduplicates_urls(self, rag_pipeline):
528528
urls = [s["metadata"].get("url", "") for s in sources]
529529
assert urls.count(url) == 1
530530

531+
def test_format_sources_title_fallback(self, rag_pipeline):
    """Missing document titles should fall back to a human-friendly domain."""
    url = "https://docs.example.com/path/to/page"
    doc = Document(
        page_content="Some content",
        metadata={"sourceLink": url},
    )

    formatted = rag_pipeline._format_sources([doc])

    # Exactly one source entry, titled from the URL's last path segment.
    assert len(formatted) == 1
    entry = formatted[0]["metadata"]
    assert entry["title"] == "Page"
    assert entry["url"] == url
545+
531546
def test_prepare_context_excludes_virtual_document_headers(self, rag_pipeline):
532547
"""Virtual documents should not have headers to prevent citation."""
533548
docs = [

0 commit comments

Comments
 (0)