Skip to content

Commit bf26b2f

Browse files
committed
feat: fallback title from webpage url
1 parent 3d2b5c5 commit bf26b2f

File tree

3 files changed

+43
-10
lines changed

3 files changed

+43
-10
lines changed

python/src/cairo_coder/core/rag_pipeline.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
ProcessedQuery,
2424
StreamEvent,
2525
StreamEventType,
26+
title_from_url,
2627
)
2728
from cairo_coder.dspy.document_retriever import DocumentRetrieverProgram
2829
from cairo_coder.dspy.generation_program import GenerationProgram, McpGenerationProgram
@@ -326,15 +327,6 @@ def _format_sources(self, documents: list[Document]) -> list[dict[str, Any]]:
326327
sources: list[dict[str, str]] = []
327328
seen_urls: set[str] = set()
328329

329-
# Helper to extract domain title
330-
def title_from_url(url: str) -> str:
331-
try:
332-
import urllib.parse as _up
333-
334-
host = _up.urlparse(url).netloc
335-
return host or url
336-
except Exception:
337-
return url
338330

339331
# 1) Vector store and other docs (skip Grok summary virtual doc)
340332
for doc in documents:

python/src/cairo_coder/core/types.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,31 @@ class ProcessedQuery:
7575
resources: list[DocumentSource] = field(default_factory=list)
7676

7777

78+
def title_from_url(url: str) -> str:
    """Derive a human-readable title from a URL.

    Preference order:
      1. The final path segment, with any file extension stripped,
         hyphens/underscores turned into spaces, and title-cased
         (e.g. ``.../my-page.html`` -> ``"My Page"``).
      2. The hostname, when the path yields nothing usable.
      3. The raw URL itself, as a last resort (also on any parse error).
    """
    try:
        # Imported lazily so module import stays cheap for callers
        # that never need this helper.
        import urllib.parse as _up

        parsed = _up.urlparse(url)

        # Attempt 1: build a title from the last path segment.
        trimmed_path = parsed.path.strip('/')
        if trimmed_path:
            segment = trimmed_path.rsplit('/', 1)[-1]
            # Drop a trailing file extension, if any.
            stem = segment.rsplit('.', 1)[0]
            if stem:
                return stem.replace('-', ' ').replace('_', ' ').title()

        # Attempt 2: hostname; attempt 3: the URL verbatim.
        return parsed.netloc or url
    except Exception:
        # Best-effort helper: never raise, just echo the input back.
        return url
102+
78103
@dataclass(frozen=True)
79104
class Document:
80105
"""
@@ -95,7 +120,8 @@ def source(self) -> str | None:
95120
@property
def title(self) -> str | None:
    """Get document title from metadata.

    When no explicit ``title`` key is present, fall back to a
    human-friendly name derived from the source link, and finally
    to a short prefix of the page content.
    """
    fallback = None
    if self.source_link:
        fallback = title_from_url(self.source_link)
    return self.metadata.get("title", fallback or self.page_content[:20])
99125

100126
@property
101127
def source_link(self) -> str | None:

python/tests/unit/test_rag_pipeline.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,21 @@ def test_format_sources_deduplicates_urls(self, rag_pipeline):
528528
urls = [s["metadata"].get("url", "") for s in sources]
529529
assert urls.count(url) == 1
530530

531+
def test_format_sources_title_fallback(self, rag_pipeline):
    """Missing document titles should fall back to a human-friendly domain."""
    url = "https://docs.example.com/path/to/page"
    doc = Document(
        page_content="Some content",
        metadata={"sourceLink": url},
    )

    formatted = rag_pipeline._format_sources([doc])

    # Exactly one source entry, titled from the URL's last path segment.
    assert len(formatted) == 1
    entry = formatted[0]["metadata"]
    assert entry["title"] == "Page"
    assert entry["url"] == url
545+
531546
def test_prepare_context_excludes_virtual_document_headers(self, rag_pipeline):
532547
"""Virtual documents should not have headers to prevent citation."""
533548
docs = [

0 commit comments

Comments
 (0)