Skip to content

Commit be3c989

Browse files
authored
RSS Reader Mode parser improvements - Pick up all fields from RSS where possible, better auto-detect of the XML encoding if it wasn't set by the browser (#3646)
1 parent 0be5005 commit be3c989

File tree

3 files changed

+229
-63
lines changed

3 files changed

+229
-63
lines changed

changedetectionio/content_fetchers/requests.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from loguru import logger
22
import hashlib
33
import os
4+
import re
45
import asyncio
56
from changedetectionio import strtobool
67
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
@@ -76,9 +77,22 @@ def _run_sync(self,
7677
if not is_binary:
7778
# Don't run this for PDFs (or other responses identified as binary) because it takes a _long_ time
7879
if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
79-
encoding = chardet.detect(r.content)['encoding']
80-
if encoding:
81-
r.encoding = encoding
80+
# For XML/RSS feeds, check the XML declaration for encoding attribute
81+
# This is more reliable than chardet which can misdetect UTF-8 as MacRoman
82+
content_type = r.headers.get('content-type', '').lower()
83+
if 'xml' in content_type or 'rss' in content_type:
84+
# Look for <?xml version="1.0" encoding="UTF-8"?>
85+
xml_encoding_match = re.search(rb'<\?xml[^>]+encoding=["\']([^"\']+)["\']', r.content[:200])
86+
if xml_encoding_match:
87+
r.encoding = xml_encoding_match.group(1).decode('ascii')
88+
else:
89+
# Default to UTF-8 for XML if no encoding found
90+
r.encoding = 'utf-8'
91+
else:
92+
# For other content types, use chardet
93+
encoding = chardet.detect(r.content)['encoding']
94+
if encoding:
95+
r.encoding = encoding
8296

8397
self.headers = r.headers
8498

changedetectionio/rss_tools.py

Lines changed: 134 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,135 @@ def repl(m):
2929
return re.sub(pattern, repl, html_content)
3030

3131

32+
# Jinja2 template for formatting RSS/Atom feed entries
33+
# Covers all common feedparser entry fields including namespaced elements
34+
# Outputs HTML that will be converted to text via html_to_text
35+
# @todo - This could be a UI setting in the future
36+
RSS_ENTRY_TEMPLATE = """<article class="rss-item" id="{{ entry.id|replace('"', '')|replace(' ', '-') }}">{%- if entry.title -%}Title: {{ entry.title }}<br>{%- endif -%}
37+
{%- if entry.link -%}<strong>Link:</strong> <a href="{{ entry.link }}">{{ entry.link }}</a><br>
38+
{%- endif -%}
39+
{%- if entry.id -%}
40+
<strong>Guid:</strong> {{ entry.id }}<br>
41+
{%- endif -%}
42+
{%- if entry.published -%}
43+
<strong>PubDate:</strong> {{ entry.published }}<br>
44+
{%- endif -%}
45+
{%- if entry.updated and entry.updated != entry.published -%}
46+
<strong>Updated:</strong> {{ entry.updated }}<br>
47+
{%- endif -%}
48+
{%- if entry.author -%}
49+
<strong>Author:</strong> {{ entry.author }}<br>
50+
{%- elif entry.author_detail and entry.author_detail.name -%}
51+
<strong>Author:</strong> {{ entry.author_detail.name }}
52+
{%- if entry.author_detail.email %} ({{ entry.author_detail.email }}){% endif -%}
53+
<br>
54+
{%- endif -%}
55+
{%- if entry.contributors -%}
56+
<strong>Contributors:</strong> {% for contributor in entry.contributors -%}
57+
{{ contributor.name if contributor.name else contributor }}
58+
{%- if not loop.last %}, {% endif -%}
59+
{%- endfor %}<br>
60+
{%- endif -%}
61+
{%- if entry.publisher -%}
62+
<strong>Publisher:</strong> {{ entry.publisher }}<br>
63+
{%- endif -%}
64+
{%- if entry.rights -%}
65+
<strong>Rights:</strong> {{ entry.rights }}<br>
66+
{%- endif -%}
67+
{%- if entry.license -%}
68+
<strong>License:</strong> {{ entry.license }}<br>
69+
{%- endif -%}
70+
{%- if entry.language -%}
71+
<strong>Language:</strong> {{ entry.language }}<br>
72+
{%- endif -%}
73+
{%- if entry.tags -%}
74+
<strong>Tags:</strong> {% for tag in entry.tags -%}
75+
{{ tag.term if tag.term else tag }}
76+
{%- if not loop.last %}, {% endif -%}
77+
{%- endfor %}<br>
78+
{%- endif -%}
79+
{%- if entry.category -%}
80+
<strong>Category:</strong> {{ entry.category }}<br>
81+
{%- endif -%}
82+
{%- if entry.comments -%}
83+
<strong>Comments:</strong> <a href="{{ entry.comments }}">{{ entry.comments }}</a><br>
84+
{%- endif -%}
85+
{%- if entry.slash_comments -%}
86+
<strong>Comment Count:</strong> {{ entry.slash_comments }}<br>
87+
{%- endif -%}
88+
{%- if entry.enclosures -%}
89+
<strong>Enclosures:</strong><br>
90+
{%- for enclosure in entry.enclosures %}
91+
- <a href="{{ enclosure.href }}">{{ enclosure.href }}</a> ({{ enclosure.type if enclosure.type else 'unknown type' }}
92+
{%- if enclosure.length %}, {{ enclosure.length }} bytes{% endif -%}
93+
)<br>
94+
{%- endfor -%}
95+
{%- endif -%}
96+
{%- if entry.media_content -%}
97+
<strong>Media:</strong><br>
98+
{%- for media in entry.media_content %}
99+
- <a href="{{ media.url }}">{{ media.url }}</a>
100+
{%- if media.type %} ({{ media.type }}){% endif -%}
101+
{%- if media.width and media.height %} {{ media.width }}x{{ media.height }}{% endif -%}
102+
<br>
103+
{%- endfor -%}
104+
{%- endif -%}
105+
{%- if entry.media_thumbnail -%}
106+
<strong>Thumbnail:</strong> <a href="{{ entry.media_thumbnail[0].url if entry.media_thumbnail[0].url else entry.media_thumbnail[0] }}">{{ entry.media_thumbnail[0].url if entry.media_thumbnail[0].url else entry.media_thumbnail[0] }}</a><br>
107+
{%- endif -%}
108+
{%- if entry.media_description -%}
109+
<strong>Media Description:</strong> {{ entry.media_description }}<br>
110+
{%- endif -%}
111+
{%- if entry.itunes_duration -%}
112+
<strong>Duration:</strong> {{ entry.itunes_duration }}<br>
113+
{%- endif -%}
114+
{%- if entry.itunes_author -%}
115+
<strong>Podcast Author:</strong> {{ entry.itunes_author }}<br>
116+
{%- endif -%}
117+
{%- if entry.dc_identifier -%}
118+
<strong>Identifier:</strong> {{ entry.dc_identifier }}<br>
119+
{%- endif -%}
120+
{%- if entry.dc_source -%}
121+
<strong>DC Source:</strong> {{ entry.dc_source }}<br>
122+
{%- endif -%}
123+
{%- if entry.dc_type -%}
124+
<strong>Type:</strong> {{ entry.dc_type }}<br>
125+
{%- endif -%}
126+
{%- if entry.dc_format -%}
127+
<strong>Format:</strong> {{ entry.dc_format }}<br>
128+
{%- endif -%}
129+
{%- if entry.dc_relation -%}
130+
<strong>Related:</strong> {{ entry.dc_relation }}<br>
131+
{%- endif -%}
132+
{%- if entry.dc_coverage -%}
133+
<strong>Coverage:</strong> {{ entry.dc_coverage }}<br>
134+
{%- endif -%}
135+
{%- if entry.source and entry.source.title -%}
136+
<strong>Source:</strong> {{ entry.source.title }}
137+
{%- if entry.source.link %} (<a href="{{ entry.source.link }}">{{ entry.source.link }}</a>){% endif -%}
138+
<br>
139+
{%- endif -%}
140+
{%- if entry.dc_content -%}
141+
<strong>Content:</strong> {{ entry.dc_content | safe }}
142+
{%- elif entry.content and entry.content[0].value -%}
143+
<strong>Content:</strong> {{ entry.content[0].value | safe }}
144+
{%- elif entry.summary -%}
145+
<strong>Summary:</strong> {{ entry.summary | safe }}
146+
{%- endif -%}</article>
147+
"""
148+
149+
32150
def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
33151
"""
34-
Format RSS/Atom feed items in a readable text format using feedparser.
152+
Format RSS/Atom feed items in a readable text format using feedparser and Jinja2.
35153
36-
Converts RSS <item> or Atom <entry> elements to formatted text with:
37-
- <title> → <h1>Title</h1>
38-
- <link> → Link: [url]
39-
- <guid> → Guid: [id]
40-
- <pubDate> → PubDate: [date]
41-
- <description> or <content> → Raw HTML content (CDATA and entities automatically handled)
154+
Converts RSS <item> or Atom <entry> elements to formatted text with all available fields:
155+
- Basic fields: title, link, id/guid, published date, updated date
156+
- Author fields: author, author_detail, contributors, publisher
157+
- Content fields: content, summary, description
158+
- Metadata: tags, category, rights, license
159+
- Media: enclosures, media_content, media_thumbnail
160+
- Dublin Core elements: dc:creator, dc:date, dc:publisher, etc. (mapped by feedparser)
42161
43162
Args:
44163
rss_content: The RSS/Atom feed content
@@ -49,65 +168,19 @@ def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
49168
"""
50169
try:
51170
import feedparser
52-
from xml.sax.saxutils import escape as xml_escape
171+
from changedetectionio.jinja2_custom import safe_jinja
53172

54173
# Parse the feed - feedparser handles all RSS/Atom variants, CDATA, entity unescaping, etc.
55174
feed = feedparser.parse(rss_content)
56175

57-
formatted_items = []
58-
59-
# Determine feed type for appropriate labels when fields are missing
60-
# feedparser sets feed.version to things like 'rss20', 'atom10', etc.
176+
# Determine feed type for appropriate labels
61177
is_atom = feed.version and 'atom' in feed.version
62178

179+
formatted_items = []
63180
for entry in feed.entries:
64-
item_parts = []
65-
66-
# Title - feedparser handles CDATA and entity unescaping automatically
67-
if hasattr(entry, 'title') and entry.title:
68-
item_parts.append(f'<h1>{xml_escape(entry.title)}</h1>')
69-
70-
# Link
71-
if hasattr(entry, 'link') and entry.link:
72-
item_parts.append(f'Link: {xml_escape(entry.link)}<br>')
73-
74-
# GUID/ID
75-
if hasattr(entry, 'id') and entry.id:
76-
item_parts.append(f'Guid: {xml_escape(entry.id)}<br>')
77-
78-
# Date - feedparser normalizes all date field names to 'published'
79-
if hasattr(entry, 'published') and entry.published:
80-
item_parts.append(f'PubDate: {xml_escape(entry.published)}<br>')
81-
82-
# Description/Content - feedparser handles CDATA and entity unescaping automatically
83-
# Only add "Summary:" label for Atom <summary> tags
84-
content = None
85-
add_label = False
86-
87-
if hasattr(entry, 'content') and entry.content:
88-
# Atom <content> - no label, just content
89-
content = entry.content[0].value if entry.content[0].value else None
90-
elif hasattr(entry, 'summary'):
91-
# Could be RSS <description> or Atom <summary>
92-
# feedparser maps both to entry.summary
93-
content = entry.summary if entry.summary else None
94-
# Only add "Summary:" label for Atom feeds (which use <summary> tag)
95-
if is_atom:
96-
add_label = True
97-
98-
# Add content with or without label
99-
if content:
100-
if add_label:
101-
item_parts.append(f'Summary:<br>{content}')
102-
else:
103-
item_parts.append(content)
104-
else:
105-
# No content - just show <none>
106-
item_parts.append('&lt;none&gt;')
107-
108-
# Join all parts of this item
109-
if item_parts:
110-
formatted_items.append('\n'.join(item_parts))
181+
# Render the entry using Jinja2 template
182+
rendered = safe_jinja.render(RSS_ENTRY_TEMPLATE, entry=entry, is_atom=is_atom)
183+
formatted_items.append(rendered.strip())
111184

112185
# Wrap each item in a div with classes (first, last, item-N)
113186
items_html = []
@@ -122,7 +195,8 @@ def format_rss_items(rss_content: str, render_anchor_tag_content=False) -> str:
122195

123196
class_str = ' '.join(classes)
124197
items_html.append(f'<div class="{class_str}">{item}</div>')
125-
return '<html><body>\n'+"\n<br><br>".join(items_html)+'\n</body></html>'
198+
199+
return '<html><body>\n' + "\n<br>".join(items_html) + '\n</body></html>'
126200

127201
except Exception as e:
128202
logger.warning(f"Error formatting RSS items: {str(e)}")

changedetectionio/tests/test_rss_reader_mode.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,61 @@
77
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_rss_token_from_UI, \
88
extract_UUID_from_client, delete_all_watches
99

10+
def set_xmlns_purl_content(datastore_path, extra=""):
    """Write a sample RSS 2.0 feed using purl.org/Dublin Core/media namespaces.

    The feed contains a single <item> with <dc:content>, <dc:creator>,
    <media:content>, <media:thumbnail> and an <enclosure>, and is written to
    ``endpoint-content.txt`` inside ``datastore_path``.

    Args:
        datastore_path: Directory in which ``endpoint-content.txt`` is created.
        extra: Optional text inserted at the start of the <dc:content> CDATA
            section, so callers can produce a modified version of the feed.
    """
    # NOTE: this is an f-string, so a single pair of braces is required for
    # interpolation. The previous "{{extra}}" was an escaped brace pair that
    # wrote the literal text "{extra}" and ignored the parameter entirely.
    data = f"""<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="https://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<atom:link href="https://www.xxxxxxxtechxxxxx.com/feeds.xml" rel="self" type="application/rss+xml"/>
<title>
<![CDATA[ Latest from xxxxxxxtechxxxxx ]]>
</title>
<link>https://www.xxxxx.com</link>
<description>
<![CDATA[ All the latest content from the xxxxxxxtechxxxxx team ]]>
</description>
<lastBuildDate>Wed, 19 Nov 2025 15:00:00 +0000</lastBuildDate>
<language>en</language>
<item>
<title>
<![CDATA[ Sony Xperia 1 VII review: has Sony’s long-standing Xperia family lost what it takes to compete? ]]>
</title>
<dc:content>
<![CDATA[ {extra} a little harder, dc-content. blue often quite tough and purple usually very difficult.</p><p>On the plus side, you don't technically need to solve the final one, as you'll be able to answer that one by a process of elimination. What's more, you can make up to four mistakes, which gives you a little bit of breathing room.</p><p>It's a little more involved than something like Wordle, however, and there are plenty of opportunities for the game to trip you up with tricks. For instance, watch out for homophones and other word games that could disguise the answers.</p><p>It's playable for free via the <a href="https://www.nytimes.com/games/strands" target="_blank">NYT Games site</a> on desktop or mobile.</p></article></section> ]]>
</dc:content>
<link>https://www.xxxxxxx.com/gaming/nyt-connections-today-answers-hints-20-november-2025</link>
<description>
<![CDATA[ Looking for NYT Connections answers and hints? Here's all you need to know to solve today's game, plus my commentary on the puzzles. ]]>
</description>
<guid isPermaLink="false">N2C2T6DztpWdxSdKpSUx89</guid>
<enclosure url="https://cdn.mos.cms.futurecdn.net/RCGfdf3yhQ9W3MHbTRT6yk-1280-80.jpg" type="image/jpeg" length="0"/>
<pubDate>Wed, 19 Nov 2025 15:00:00 +0000</pubDate>
<category>
<![CDATA[ Gaming ]]>
</category>
<dc:creator>
<![CDATA[ Johnny Dee ]]>
</dc:creator>
<media:content type="image/jpeg" url="https://cdn.mos.cms.futurecdn.net/RCGfdf3yhQ9W3MHbTRT6yk-1280-80.jpg">
<media:credit>
<![CDATA[ New York Times ]]>
</media:credit>
<media:text>
<![CDATA[ NYT Connections homescreen on a phone, on a purple background ]]>
</media:text>
<media:title type="plain">
<![CDATA[ NYT Connections homescreen on a phone, on a purple background ]]>
</media:title>
</media:content>
<media:thumbnail url="https://cdn.mos.cms.futurecdn.net/RCGfdf3yhQ9W3MHbTRT6yk-1280-80.jpg"/>
</item>
</channel>
</rss>
"""

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(data)
63+
64+
1065

1166
def set_original_cdata_xml(datastore_path):
1267
test_return_data = """<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
@@ -98,3 +153,26 @@ def test_rss_reader_mode_with_css_filters(client, live_server, measure_memory_us
98153
assert 'The days of Terminator and The Matrix' in snapshot_contents
99154
delete_all_watches(client)
100155

156+
157+
def test_xmlns_purl_content(client, live_server, measure_memory_usage, datastore_path):
    """RSS reader mode renders a purl/Dublin Core namespaced feed served without a charset.

    The endpoint is asked to reply with a bare ``text/xml;`` content type (no
    ``charset=``), so the fetcher has to work out the encoding by itself. The
    first stored snapshot must still contain the item title (including its
    non-ASCII apostrophe) and the <dc:content> body.
    """
    set_xmlns_purl_content(datastore_path=datastore_path)

    # Endpoints rarely send the right header, usually just text/xml, so <rss is also checked for.
    # This also triggers the automatic CDATA text parser so the RSS comes back as a nice content list.
    # A well-behaved server would have sent content_type="text/xml; charset=UTF-8" instead;
    # because NO utf-8 is specified here, it has to be recovered in requests or elsewhere.
    feed_url = url_for('test_endpoint', content_type="text/xml;", _external=True)

    app_settings = live_server.app.config['DATASTORE'].data['settings']['application']
    app_settings['rss_reader_mode'] = True

    # Register the watch, limited to the last feed item, and queue a check
    watch_extras = {'include_filters': [".last"]}
    uuid = client.application.config.get('DATASTORE').add_watch(url=feed_url, extras=watch_extras)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)

    wait_for_all_checks(client)

    watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
    first_timestamp = list(watch.history.keys())[0]
    snapshot_contents = watch.get_history_snapshot(timestamp=first_timestamp)

    expected_title = "Title: Sony Xperia 1 VII review: has Sony’s long-standing Xperia family lost what it takes to compete?"
    assert expected_title in snapshot_contents
    assert "dc-content" in snapshot_contents

0 commit comments

Comments
 (0)