RSS encoding fixes

dgtlmoon · dgtlmoon · commit 90d68f7ca74b · 2025-11-19T15:19:50.000+01:00
diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py
@@ -1,6 +1,7 @@
 from loguru import logger
 import hashlib
 import os
+import re
 import asyncio
 from changedetectionio import strtobool
 from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
@@ -76,9 +77,22 @@ def _run_sync(self,
         if not is_binary:
             # Don't run this for PDF (and requests identified as binary) takes a _long_ time
             if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
-                encoding = chardet.detect(r.content)['encoding']
-                if encoding:
-                    r.encoding = encoding
+                # For XML/RSS feeds, prefer the encoding attribute of the XML declaration.
+                # This is more reliable than chardet, which can misdetect UTF-8 as MacRoman.
+                content_type = r.headers.get('content-type', '').lower()
+                if 'xml' in content_type or 'rss' in content_type:
+                    # Look for <?xml version="1.0" encoding="UTF-8"?>
+                    xml_encoding_match = re.search(rb'<\?xml[^>]+encoding=["\']([^"\']+)["\']', r.content[:200])
+                    if xml_encoding_match:
+                        r.encoding = xml_encoding_match.group(1).decode('ascii')
+                    else:
+                        # Default to UTF-8 for XML if no encoding found
+                        r.encoding = 'utf-8'
+                else:
+                    # For other content types, use chardet
+                    encoding = chardet.detect(r.content)['encoding']
+                    if encoding:
+                        r.encoding = encoding
 
         self.headers = r.headers
 
diff --git a/changedetectionio/rss_tools.py b/changedetectionio/rss_tools.py
@@ -146,8 +146,6 @@ def repl(m):
 <strong>Summary:</strong><br>
 {%- endif -%}
 {{ entry.summary | safe }}
-{%- else -%}
-&lt;none&gt;
 {%- endif -%}
 
 """