Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 1 addition & 102 deletions changedetectionio/blueprint/rss/__init__.py
Original file line number Diff line number Diff line change
@@ -1,102 +1 @@
import time
import datetime
import pytz
from flask import Blueprint, make_response, request, url_for
from loguru import logger
from feedgen.feed import FeedGenerator

from changedetectionio.store import ChangeDetectionStore
from changedetectionio.safe_jinja import render as jinja_render

def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the Flask blueprint that serves the RSS feed of changed watches.

    Args:
        datastore: Application datastore. Its ``data`` mapping supplies the
            application settings, the configured tags and the watches.

    Returns:
        The ``rss`` blueprint with the ``feed`` view registered on its root.
    """
    rss_blueprint = Blueprint('rss', __name__)

    # Import the login decorator if needed
    # from changedetectionio.auth_decorator import login_optionally_required
    @rss_blueprint.route("", methods=['GET'])
    def feed():
        """Render the RSS document for unviewed watches that have a change.

        Query args:
            token: Must equal the configured ``rss_access_token``.
            tag: Optional tag title (or tag uuid) used to limit the watches.

        Returns:
            An ``application/rss+xml`` response, or a 403 tuple when the
            token is missing or wrong.
        """
        now = time.time()
        app_settings = datastore.data['settings']['application']

        # Always requires token set. An unset/empty configured token must never
        # match a request that simply omits ?token= (None == None).
        app_rss_token = app_settings.get('rss_access_token')
        rss_url_token = request.args.get('token')
        if not app_rss_token or rss_url_token != app_rss_token:
            return "Access denied, bad token", 403

        from changedetectionio import diff
        limit_tag = request.args.get('tag', '').lower().strip()
        # Be sure limit_tag is a uuid: translate a tag title into its uuid
        for uuid, tag in app_settings.get('tags', {}).items():
            if limit_tag == tag.get('title', '').lower().strip():
                limit_tag = uuid

        # Sort by last_changed and add the uuid which is usually the key..
        sorted_watches = []

        # @todo needs a .itemsWithTag() or something - then we can use that in Jinja2 and throw this away
        for uuid, watch in datastore.data['watching'].items():
            # @todo tag notification_muted skip also (improve Watch model)
            if app_settings.get('rss_hide_muted_watches') and watch.get('notification_muted'):
                continue
            if limit_tag and limit_tag not in watch['tags']:
                continue
            watch['uuid'] = uuid
            sorted_watches.append(watch)

        sorted_watches.sort(key=lambda x: x.last_changed)

        fg = FeedGenerator()
        fg.title('changedetection.io')
        fg.description('Feed description')
        fg.link(href='https://changedetection.io')

        # @todo Make this configurable and also consider html-colored markup
        # @todo User could decide if <link> goes to the diff page, or to the watch link
        rss_template = "<html><body>\n<h4><a href=\"{{watch_url}}\">{{watch_title}}</a></h4>\n<p>{{html_diff}}</p>\n</body></html>\n"

        for watch in sorted_watches:
            dates = list(watch.history.keys())
            # Re #521 - Don't bother processing this one if there are fewer than 2 snapshots,
            # it means we never had a change detected.
            if len(dates) < 2:
                continue

            # Only changes the user has not looked at yet are published
            if watch.viewed:
                continue

            # Re #239 - GUID needs to be individual for each event
            # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228)
            guid = "{}/{}".format(watch['uuid'], watch.last_changed)
            fe = fg.add_entry()

            # Include a link to the diff page, they will have to login here to see if password protection is enabled.
            # Description is the page you watch, link takes you to the diff JS UI page.
            # Because we are called via whatever web server, flask should figure out the right path
            diff_link = {'href': url_for('ui.ui_views.diff_history_page', uuid=watch['uuid'], _external=True)}
            fe.link(link=diff_link)

            # @todo watch should be a getter - watch.get('title') (internally if URL else..)
            watch_title = watch.get('title') if watch.get('title') else watch.get('url')
            fe.title(title=watch_title)

            # Diff of the last two history keys
            html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]),
                                         newest_version_file_contents=watch.get_history_snapshot(dates[-1]),
                                         include_equal=False,
                                         line_feed_sep="<br>")

            content = jinja_render(template_str=rss_template, watch_title=watch_title, html_diff=html_diff, watch_url=watch.link)

            fe.content(content=content, type='CDATA')
            fe.guid(guid, permalink=False)

            # Convert the epoch value directly in UTC. fromtimestamp() without a tz
            # yields local wall-clock time, and stamping that as UTC would shift the
            # published date by the server's UTC offset.
            dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key), tz=pytz.UTC)
            fe.pubDate(dt)

        response = make_response(fg.rss_str())
        response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8')
        logger.trace(f"RSS generated in {time.time() - now:.3f}s")
        return response

    return rss_blueprint
RSS_FORMAT_TYPES = [('plaintext', 'Plain text'), ('html', 'HTML Color')]
147 changes: 147 additions & 0 deletions changedetectionio/blueprint/rss/blueprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@

from changedetectionio.safe_jinja import render as jinja_render
from changedetectionio.store import ChangeDetectionStore
from feedgen.feed import FeedGenerator
from flask import Blueprint, make_response, request, url_for, redirect
from loguru import logger
import datetime
import pytz
import re
import time


BAD_CHARS_REGEX=r'[\x00-\x08\x0B\x0C\x0E-\x1F]'

# Anything that is not text/UTF-8 should be stripped before it breaks feedgen (such as binary data etc)
def scan_invalid_chars_in_rss(content):
    """Report whether *content* contains a character matched by BAD_CHARS_REGEX.

    Only the first offending character is inspected: a warning is logged with
    its hex code, its position and up to 20 characters of context on each side
    (line breaks shown escaped).

    Returns:
        True when an offending character was found, otherwise False.
    """
    hit = re.search(BAD_CHARS_REGEX, content)
    if hit is None:
        return False

    pos = hit.start()
    # Grab context around the offending character
    window = content[max(0, pos - 20):min(len(content), pos + 21)]
    window = window.replace('\n', '\\n').replace('\r', '\\r')
    logger.warning(f"Invalid char 0x{ord(content[pos]):02x} at pos {pos}: ...{window}...")
    # First match is enough
    return True


def clean_entry_content(content):
    """Return *content* with every character matched by BAD_CHARS_REGEX removed."""
    return re.compile(BAD_CHARS_REGEX).sub('', content)

def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the Flask blueprint that serves the RSS feed of changed watches.

    Args:
        datastore: Application datastore. Its ``data`` mapping supplies the
            application settings, the configured tags and the watches.

    Returns:
        The ``rss`` blueprint with the ``feed`` view on its root and a
        redirect for the trailing-slash variant.
    """
    rss_blueprint = Blueprint('rss', __name__)

    # Some RSS reader situations ended up with rss/ (forward slash after RSS) due
    # to some earlier blueprint rerouting work, it should go to feed.
    @rss_blueprint.route("/", methods=['GET'])
    def extraslash():
        """Redirect ``rss/`` to the real feed URL, keeping the feed's query args."""
        # Without forwarding, the redirected request would arrive with no token
        # and always be rejected by feed().
        forwarded = {key: request.args[key] for key in ('token', 'tag') if key in request.args}
        return redirect(url_for('rss.feed', **forwarded))

    # Import the login decorator if needed
    # from changedetectionio.auth_decorator import login_optionally_required
    @rss_blueprint.route("", methods=['GET'])
    def feed():
        """Render the RSS document for unviewed watches that have a change.

        Query args:
            token: Must equal the configured ``rss_access_token``.
            tag: Optional tag title (or tag uuid) used to limit the watches.

        Returns:
            An ``application/rss+xml`` response, or a 403 tuple when the
            token is missing or wrong.
        """
        now = time.time()
        app_settings = datastore.data['settings']['application']

        # Always requires token set. An unset/empty configured token must never
        # match a request that simply omits ?token= (None == None).
        app_rss_token = app_settings.get('rss_access_token')
        rss_url_token = request.args.get('token')
        if not app_rss_token or rss_url_token != app_rss_token:
            return "Access denied, bad token", 403

        from changedetectionio import diff
        limit_tag = request.args.get('tag', '').lower().strip()
        # Be sure limit_tag is a uuid: translate a tag title into its uuid
        for uuid, tag in app_settings.get('tags', {}).items():
            if limit_tag == tag.get('title', '').lower().strip():
                limit_tag = uuid

        # Sort by last_changed and add the uuid which is usually the key..
        sorted_watches = []

        # @todo needs a .itemsWithTag() or something - then we can use that in Jinja2 and throw this away
        for uuid, watch in datastore.data['watching'].items():
            # @todo tag notification_muted skip also (improve Watch model)
            if app_settings.get('rss_hide_muted_watches') and watch.get('notification_muted'):
                continue
            if limit_tag and limit_tag not in watch['tags']:
                continue
            watch['uuid'] = uuid
            sorted_watches.append(watch)

        sorted_watches.sort(key=lambda x: x.last_changed)

        fg = FeedGenerator()
        fg.title('changedetection.io')
        fg.description('Feed description')
        fg.link(href='https://changedetection.io')

        html_colour_enable = app_settings.get('rss_content_format') == 'html'

        # @todo Make this configurable and also consider html-colored markup
        # @todo User could decide if <link> goes to the diff page, or to the watch link
        rss_template = "<html><body>\n<h4><a href=\"{{watch_url}}\">{{watch_title}}</a></h4>\n<p>{{html_diff}}</p>\n</body></html>\n"

        for watch in sorted_watches:
            dates = list(watch.history.keys())
            # Re #521 - Don't bother processing this one if there are fewer than 2 snapshots,
            # it means we never had a change detected.
            if len(dates) < 2:
                continue

            # Only changes the user has not looked at yet are published
            if watch.viewed:
                continue

            # Re #239 - GUID needs to be individual for each event
            # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228)
            guid = "{}/{}".format(watch['uuid'], watch.last_changed)
            fe = fg.add_entry()

            # Include a link to the diff page, they will have to login here to see if password protection is enabled.
            # Description is the page you watch, link takes you to the diff JS UI page.
            # Because we are called via whatever web server, flask should figure out the right path
            diff_link = {'href': url_for('ui.ui_views.diff_history_page', uuid=watch['uuid'], _external=True)}
            fe.link(link=diff_link)

            # @todo watch should be a getter - watch.get('title') (internally if URL else..)
            watch_title = watch.get('title') if watch.get('title') else watch.get('url')
            fe.title(title=watch_title)

            # Diff of the last two history keys; a missing snapshot file must not
            # take the whole feed down, so it is reported inside the entry instead.
            try:
                html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]),
                                             newest_version_file_contents=watch.get_history_snapshot(dates[-1]),
                                             include_equal=False,
                                             line_feed_sep="<br>",
                                             html_colour=html_colour_enable
                                             )
            except FileNotFoundError:
                html_diff = f"History snapshot file for watch {watch.get('uuid')}@{watch.last_changed} - '{watch.get('title')}' not found."

            content = jinja_render(template_str=rss_template, watch_title=watch_title, html_diff=html_diff, watch_url=watch.link)

            # Out of range chars could also break feedgen
            if scan_invalid_chars_in_rss(content):
                content = clean_entry_content(content)

            fe.content(content=content, type='CDATA')
            fe.guid(guid, permalink=False)

            # Convert the epoch value directly in UTC. fromtimestamp() without a tz
            # yields local wall-clock time, and stamping that as UTC would shift the
            # published date by the server's UTC offset.
            dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key), tz=pytz.UTC)
            fe.pubDate(dt)

        response = make_response(fg.rss_str())
        response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8')
        logger.trace(f"RSS generated in {time.time() - now:.3f}s")
        return response

    return rss_blueprint
5 changes: 4 additions & 1 deletion changedetectionio/blueprint/settings/templates/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@
{{ render_field(form.application.form.pager_size) }}
<span class="pure-form-message-inline">Number of items per page in the watch overview list, 0 to disable.</span>
</div>

<div class="pure-control-group">
{{ render_field(form.application.form.rss_content_format) }}
<span class="pure-form-message-inline">Love RSS? Does your reader support HTML? Set it here</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.extract_title_as_title) }}
<span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
Expand Down
2 changes: 1 addition & 1 deletion changedetectionio/flask_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def static_content(group, filename):
import changedetectionio.conditions.blueprint as conditions
app.register_blueprint(conditions.construct_blueprint(datastore), url_prefix='/conditions')

import changedetectionio.blueprint.rss as rss
import changedetectionio.blueprint.rss.blueprint as rss
app.register_blueprint(rss.construct_blueprint(datastore), url_prefix='/rss')

# watchlist UI buttons etc
Expand Down
4 changes: 4 additions & 0 deletions changedetectionio/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from loguru import logger
from wtforms.widgets.core import TimeInput

from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES
from changedetectionio.conditions.form import ConditionFormRow
from changedetectionio.strtobool import strtobool

Expand Down Expand Up @@ -739,6 +740,9 @@ class globalSettingsApplicationForm(commonSettingsForm):
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0,
message="Should be atleast zero (disabled)")])

rss_content_format = SelectField('RSS Content format', choices=RSS_FORMAT_TYPES)

removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
render_anchor_tag_content = BooleanField('Render anchor tag content', default=False)
shared_diff_access = BooleanField('Allow access to view diff page when password is enabled', default=False, validators=[validators.Optional()])
Expand Down
6 changes: 6 additions & 0 deletions changedetectionio/model/App.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from os import getenv

from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES

from changedetectionio.notification import (
default_notification_body,
default_notification_format,
Expand All @@ -9,6 +12,8 @@
_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'



class model(dict):
base_config = {
'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!",
Expand Down Expand Up @@ -48,6 +53,7 @@ class model(dict):
'password': False,
'render_anchor_tag_content': False,
'rss_access_token': None,
'rss_content_format': RSS_FORMAT_TYPES[0][0],
'rss_hide_muted_watches': True,
'schema_version' : 0,
'shared_diff_access': False,
Expand Down
71 changes: 71 additions & 0 deletions changedetectionio/tests/test_rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,22 @@ def set_original_cdata_xml():
f.write(test_return_data)



def set_html_content(content):
    """Write a small HTML page embedding *content* to the test endpoint file.

    The page is stored as UTF-8 encoded bytes in
    ``test-datastore/endpoint-content.txt``.
    """
    page = (
        "<html>\n"
        "<body>\n"
        "Some initial text<br>\n"
        f"<p>{content}</p>\n"
        "<br>\n"
        "So let's see what happens. <br>\n"
        "</body>\n"
        "</html>\n"
    )
    payload = page.encode('utf-8')

    # Binary mode so exactly the UTF-8 bytes above end up on disk
    with open("test-datastore/endpoint-content.txt", "wb") as endpoint_file:
        endpoint_file.write(payload)

def test_setup(client, live_server, measure_memory_usage):
live_server_setup(live_server)

Expand Down Expand Up @@ -164,3 +180,58 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):
assert b'Some other description' not in res.data # Should NOT be selected by the xpath

res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)


def test_rss_bad_chars_breaking(client, live_server):
    """The RSS feed must still be served when a snapshot holds control characters.

    - The watch is imported with the ``source:`` prefix, so no HTML conversion
      is applied (a conversion would kinda filter out the bad stuff)
    - The second snapshot starts with the characters of a JPEG header
    - Both snapshots are very long so that saving is performed by Brotli
      (and decoded back to bytes)

    Otherwise feedgen should support regular unicode
    """
    #live_server_setup(live_server)
    endpoint_path = "test-datastore/endpoint-content.txt"
    padding = "A" * 10_000

    # First snapshot: plain, long content
    with open(endpoint_path, "w") as endpoint_file:
        endpoint_file.write(padding)

    watched_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("imports.import_page"),
        data={"urls": "source:" + watched_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data
    wait_for_all_checks(client)

    # Set the bad content: JPEG header characters followed by the same padding
    jpeg_header = "\xff\xd8\xff\xe0\x00\x10XXXXXXXX\x00\x01\x02\x00\x00\x01\x00\x01\x00\x00"
    with open(endpoint_path, "w") as endpoint_file:
        endpoint_file.write(jpeg_header + padding)

    recheck_response = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    assert b'Queued 1 watch for rechecking.' in recheck_response.data
    wait_for_all_checks(client)
    rss_token = extract_rss_token_from_UI(client)

    watching = live_server.app.config['DATASTORE'].data['watching']
    uuid = next(iter(watching))
    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n == 2

    # Check RSS feed is still working
    feed_response = client.get(
        url_for("rss.feed", uuid=uuid, token=rss_token),
        follow_redirects=False  # Important! leave this off! it should not redirect
    )
    assert feed_response.status_code == 200

#assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n == 2
#assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n == 2





Loading