Skip to content

Commit 90d68f7

Browse files
committed
RSS encoding fixes
1 parent b6b733a commit 90d68f7

File tree

2 files changed: 17 additions (+17) and 5 deletions (-5)

2 files changed: 17 additions (+17) and 5 deletions (-5)

changedetectionio/content_fetchers/requests.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from loguru import logger
22
import hashlib
33
import os
4+
import re
45
import asyncio
56
from changedetectionio import strtobool
67
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
@@ -76,9 +77,22 @@ def _run_sync(self,
7677
if not is_binary:
7778
# Don't run this for PDF (and requests identified as binary) takes a _long_ time
7879
if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
79-
encoding = chardet.detect(r.content)['encoding']
80-
if encoding:
81-
r.encoding = encoding
80+
# For XML/RSS feeds, check the XML declaration for encoding attribute
81+
# This is more reliable than chardet which can misdetect UTF-8 as MacRoman
82+
content_type = r.headers.get('content-type', '').lower()
83+
if 'xml' in content_type or 'rss' in content_type:
84+
# Look for <?xml version="1.0" encoding="UTF-8"?>
85+
xml_encoding_match = re.search(rb'<\?xml[^>]+encoding=["\']([^"\']+)["\']', r.content[:200])
86+
if xml_encoding_match:
87+
r.encoding = xml_encoding_match.group(1).decode('ascii')
88+
else:
89+
# Default to UTF-8 for XML if no encoding found
90+
r.encoding = 'utf-8'
91+
else:
92+
# For other content types, use chardet
93+
encoding = chardet.detect(r.content)['encoding']
94+
if encoding:
95+
r.encoding = encoding
8296

8397
self.headers = r.headers
8498

changedetectionio/rss_tools.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -146,8 +146,6 @@ def repl(m):
146146
<strong>Summary:</strong><br>
147147
{%- endif -%}
148148
{{ entry.summary | safe }}
149-
{%- else -%}
150-
&lt;none&gt;
151149
{%- endif -%}
152150
153151
"""

0 commit comments

Comments
 (0)