2 files changed: +17 -5 lines changed
Original file line number | Diff line number | Diff line change
11from loguru import logger
22import hashlib
33import os
4+ import re
45import asyncio
56from changedetectionio import strtobool
67from changedetectionio .content_fetchers .exceptions import BrowserStepsInUnsupportedFetcher , EmptyReply , Non200ErrorCodeReceived
@@ -76,9 +77,22 @@ def _run_sync(self,
7677 if not is_binary :
7778 # Don't run this for PDF (and requests identified as binary) takes a _long_ time
7879 if not r .headers .get ('content-type' ) or not 'charset=' in r .headers .get ('content-type' ):
79- encoding = chardet .detect (r .content )['encoding' ]
80- if encoding :
81- r .encoding = encoding
80+ # For XML/RSS feeds, check the XML declaration for encoding attribute
81+ # This is more reliable than chardet which can misdetect UTF-8 as MacRoman
82+ content_type = r .headers .get ('content-type' , '' ).lower ()
83+ if 'xml' in content_type or 'rss' in content_type :
84+ # Look for <?xml version="1.0" encoding="UTF-8"?>
85+ xml_encoding_match = re .search (rb'<\?xml[^>]+encoding=["\']([^"\']+)["\']' , r .content [:200 ])
86+ if xml_encoding_match :
87+ r .encoding = xml_encoding_match .group (1 ).decode ('ascii' )
88+ else :
89+ # Default to UTF-8 for XML if no encoding found
90+ r .encoding = 'utf-8'
91+ else :
92+ # For other content types, use chardet
93+ encoding = chardet .detect (r .content )['encoding' ]
94+ if encoding :
95+ r .encoding = encoding
8296
8397 self .headers = r .headers
8498
Original file line number | Diff line number | Diff line change
@@ -146,8 +146,6 @@ def repl(m):
146146<strong>Summary:</strong><br>
147147{%- endif -%}
148148{{ entry.summary | safe }}
149- {%- else -%}
150- <none>
151149{%- endif -%}
152150
153151"""
You can’t perform that action at this time.
0 commit comments