Skip to content

Commit

Permalink
Automatically add tags to bookmarks based on URL pattern (sissbruecke…
Browse files Browse the repository at this point in the history
…r#736)

* [WIP] DSL

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* dsl2

* full feature

* upd

* upd

* upd

* upd

* rename to auto_tagging_rules

* update migration after rebase

* add REST API tests

* improve settings view

---------

Co-authored-by: Sascha Ißbrücker <[email protected]>
  • Loading branch information
vslinko and sissbruecker authored May 17, 2024
1 parent e03f536 commit fa5f78c
Show file tree
Hide file tree
Showing 9 changed files with 369 additions and 0 deletions.
18 changes: 18 additions & 0 deletions bookmarks/migrations/0036_userprofile_auto_tagging_rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 5.0.3 on 2024-05-17 07:09

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the ``auto_tagging_rules`` text field to the ``userprofile`` model.

    The field is declared with ``blank=True``. The migration must run after
    ``0035_userprofile_tag_grouping`` of the ``bookmarks`` app.
    """

    dependencies = [
        ("bookmarks", "0035_userprofile_tag_grouping"),
    ]

    operations = [
        migrations.AddField(
            model_name="userprofile",
            name="auto_tagging_rules",
            field=models.TextField(blank=True),
        ),
    ]
2 changes: 2 additions & 0 deletions bookmarks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ class UserProfile(models.Model):
display_remove_bookmark_action = models.BooleanField(default=True, null=False)
permanent_notes = models.BooleanField(default=False, null=False)
custom_css = models.TextField(blank=True, null=False)
auto_tagging_rules = models.TextField(blank=True, null=False)
search_preferences = models.JSONField(default=dict, null=False)
enable_automatic_html_snapshots = models.BooleanField(default=True, null=False)
default_mark_unread = models.BooleanField(default=False, null=False)
Expand Down Expand Up @@ -445,6 +446,7 @@ class Meta:
"permanent_notes",
"default_mark_unread",
"custom_css",
"auto_tagging_rules",
]


Expand Down
70 changes: 70 additions & 0 deletions bookmarks/services/auto_tagging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from urllib.parse import urlparse, parse_qs
import re
import idna


def get_tags(script: str, url: str) -> set[str]:
    """Return the tags that the auto-tagging rules in *script* assign to *url*.

    Each line of *script* is one rule of the form
    ``<url-pattern> <tag> [<tag> ...]``. Everything after a ``#`` is treated
    as a comment, and lines with fewer than two whitespace-separated parts
    are ignored. A URL pattern is a domain, optionally followed by a path and
    a query string; a leading ``http://`` or ``https://`` is stripped.

    Both the script and the URL are lower-cased before matching, so matching
    is case-insensitive and the returned tags are lower-case.

    Args:
        script: The rule script, one rule per line.
        url: The URL to find tags for.

    Returns:
        The set of tags from every rule whose domain, path (if given) and
        query string (if given) match *url*. Empty if nothing matches or the
        URL has no host.
    """
    parsed_url = urlparse(url.lower())
    result: set[str] = set()

    # Match against the bare host name. ``netloc`` would also include
    # userinfo and the port ("user@example.com:8080"), which are not part of
    # the domain and cannot be IDNA-encoded.
    hostname = parsed_url.hostname
    if not hostname:
        return result

    for line in script.lower().split("\n"):
        # Strip trailing comments.
        line = line.partition("#")[0]

        parts = line.split()
        if len(parts) < 2:
            continue

        pattern = re.sub(r"^https?://", "", parts[0])

        # Split off the query string first so that a "/" inside it is not
        # mistaken for the start of the path, and so that a pattern without
        # a path ("example.com?a=b") is still parsed correctly.
        pattern, _, qs_pattern = pattern.partition("?")
        domain_pattern, slash, path_rest = pattern.partition("/")
        path_pattern = slash + path_rest

        # A rule without a domain (e.g. "/path tag") cannot be matched.
        if not domain_pattern:
            continue

        if not _domains_matches(domain_pattern, hostname):
            continue

        if path_pattern and not _path_matches(path_pattern, parsed_url.path):
            continue

        if qs_pattern and not _qs_matches(qs_pattern, parsed_url.query):
            continue

        result.update(parts[1:])

    return result


def _path_matches(expected_path: str, actual_path: str) -> bool:
return actual_path.startswith(expected_path)


def _domains_matches(expected_domain: str, actual_domain: str) -> bool:
    """Tell whether *actual_domain* is *expected_domain* or one of its subdomains.

    Both names are IDNA-encoded before comparison, so a Unicode domain and
    its punycode form are treated as the same domain.

    Args:
        expected_domain: The domain taken from an auto-tagging rule.
        actual_domain: The host name of the URL being tagged.

    Returns:
        ``True`` when the encoded names are equal or *actual_domain* ends
        with ``"." + expected_domain``; ``False`` otherwise, including when
        either name cannot be IDNA-encoded.
    """
    try:
        expected = idna.encode(expected_domain)
        actual = idna.encode(actual_domain)
    except UnicodeError:
        # idna.IDNAError derives from UnicodeError. A malformed rule or host
        # name must not abort tagging; it simply does not match.
        return False

    # Only match on label boundaries: "example.com" must match
    # "www.example.com" but not "notexample.com".
    return actual == expected or actual.endswith(b"." + expected)


def _qs_matches(expected_qs: str, actual_qs: str) -> bool:
expected_qs = parse_qs(expected_qs, keep_blank_values=True)
actual_qs = parse_qs(actual_qs, keep_blank_values=True)

for key in expected_qs:
if key not in actual_qs:
return False
for value in expected_qs[key]:
if value != "" and value not in actual_qs[key]:
return False

return True
10 changes: 10 additions & 0 deletions bookmarks/services/bookmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from bookmarks.models import Bookmark, BookmarkAsset, parse_tag_string
from bookmarks.services import tasks
from bookmarks.services import website_loader
from bookmarks.services import auto_tagging
from bookmarks.services.tags import get_or_create_tags

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -242,6 +243,15 @@ def _update_website_metadata(bookmark: Bookmark):

def _update_bookmark_tags(bookmark: Bookmark, tag_string: str, user: User):
    """Set the bookmark's tags from *tag_string* plus the user's auto-tagging rules.

    The tag names parsed from *tag_string* are extended with any tag that the
    user's auto-tagging rules produce for the bookmark URL and that is not
    already in the list. The bookmark's tags are then replaced with the
    resulting tags.
    """
    tag_names = parse_tag_string(tag_string)

    if user.profile.auto_tagging_rules:
        rules = user.profile.auto_tagging_rules
        for name in auto_tagging.get_tags(rules, bookmark.url):
            if name in tag_names:
                continue
            tag_names.append(name)

    bookmark.tags.set(get_or_create_tags(tag_names, user))

Expand Down
15 changes: 15 additions & 0 deletions bookmarks/templates/settings/general.html
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,21 @@ <h2>Profile</h2>
If disabled, tags will not be grouped.
</div>
</div>
<div class="form-group">
<details {% if form.auto_tagging_rules.value %}open{% endif %}>
<summary>Auto Tagging</summary>
<label for="{{ form.auto_tagging_rules.id_for_label }}" class="text-assistive">Auto Tagging</label>
<div class="mt-2">
{{ form.auto_tagging_rules|add_class:"form-input custom-css"|attr:"rows:6" }}
</div>
</details>
<div class="form-input-hint">
Automatically adds tags to bookmarks based on predefined rules.
Each line is a single rule that maps a URL to one or more tags. For example:
<pre>youtube.com video
reddit.com/r/Music music reddit</pre>
</div>
</div>
<div class="form-group">
<label for="{{ form.enable_favicons.id_for_label }}" class="form-checkbox">
{{ form.enable_favicons }}
Expand Down
179 changes: 179 additions & 0 deletions bookmarks/tests/test_auto_tagging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
from bookmarks.services import auto_tagging
from django.test import TestCase


class AutoTaggingTestCase(TestCase):
    """Tests for ``auto_tagging.get_tags`` rule matching."""

    def test_auto_tag_by_domain(self):
        """Only the rule whose domain equals the URL's domain applies."""
        rules = """
            example.com example
            test.com test
        """
        bookmark_url = "https://example.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"example"})

    def test_auto_tag_by_domain_ignores_case(self):
        """Domain patterns match regardless of letter case."""
        rules = """
            EXAMPLE.com example
        """
        bookmark_url = "https://example.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"example"})

    def test_auto_tag_by_domain_should_add_all_tags(self):
        """Every tag listed on a matching rule is returned."""
        rules = """
            example.com one two three
        """
        bookmark_url = "https://example.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"one", "two", "three"})

    def test_auto_tag_by_domain_work_with_idn_domains(self):
        """Unicode and punycode spellings of a domain match each other."""
        # Unicode rule against a punycode URL.
        rules = """
            रजिस्ट्री.भारत tag1
        """
        bookmark_url = "https://www.xn--81bg3cc2b2bk5hb.xn--h2brj9c/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"tag1"})

        # Punycode rule against a Unicode URL.
        rules = """
            xn--81bg3cc2b2bk5hb.xn--h2brj9c tag1
        """
        bookmark_url = "https://www.रजिस्ट्री.भारत/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"tag1"})

    def test_auto_tag_by_domain_and_path(self):
        """A rule with a path applies only when the URL path matches it."""
        rules = """
            example.com/one one
            example.com/two two
            test.com test
        """
        bookmark_url = "https://example.com/one/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"one"})

    def test_auto_tag_by_domain_and_path_ignores_case(self):
        """Path patterns match regardless of letter case."""
        rules = """
            example.com/One one
        """
        bookmark_url = "https://example.com/one/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"one"})

    def test_auto_tag_by_domain_and_path_matches_path_ltr(self):
        """Path patterns are matched from the start of the URL path."""
        rules = """
            example.com/one one
            example.com/two two
            test.com test
        """
        bookmark_url = "https://example.com/one/two"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"one"})

    def test_auto_tag_by_domain_ignores_domain_in_path(self):
        """A domain pattern does not match text that appears in the URL path."""
        rules = """
            example.com example
        """
        bookmark_url = "https://test.com/example.com"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, set())

    def test_auto_tag_by_domain_includes_subdomains(self):
        """A domain rule also applies to URLs on its subdomains."""
        rules = """
            example.com example
            test.example.com test
            some.example.com some
        """
        bookmark_url = "https://test.example.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"example", "test"})

    def test_auto_tag_by_domain_matches_domain_rtl(self):
        """Domain patterns are matched from the end of the host name."""
        rules = """
            example.com example
        """
        bookmark_url = "https://example.com.bad-website.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, set())

    def test_auto_tag_by_domain_ignores_schema(self):
        """The scheme in a rule pattern plays no part in matching."""
        rules = """
            https://example.com/ https
            http://example.com/ http
        """
        bookmark_url = "http://example.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"https", "http"})

    def test_auto_tag_by_domain_ignores_lines_with_no_tags(self):
        """A rule line that lists no tags contributes nothing."""
        rules = """
            example.com
        """
        bookmark_url = "https://example.com/"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, set())

    def test_auto_tag_by_domain_path_and_qs(self):
        """Query-string patterns require every listed key/value to be present."""
        rules = """
            example.com/page?a=b tag1 # true, matches a=b
            example.com/page?a=c&c=d tag2 # true, matches both a=c and c=d
            example.com/page?c=d&l=p tag3 # false, l=p doesn't exists
            example.com/page?a=bb tag4 # false bb != b
            example.com/page?a=b&a=c tag5 # true, matches both a=b and a=c
            example.com/page?a=B tag6 # true, matches a=b because case insensitive
            example.com/page?A=b tag7 # true, matches a=b because case insensitive
        """
        bookmark_url = "https://example.com/page/some?z=x&a=b&v=b&c=d&o=p&a=c"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"tag1", "tag2", "tag5", "tag6", "tag7"})

    def test_auto_tag_by_domain_path_and_qs_with_empty_value(self):
        """A query key with an empty value only requires the key to exist."""
        rules = """
            example.com/page?a= tag1
            example.com/page?b= tag2
        """
        bookmark_url = "https://example.com/page/some?a=value"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"tag1"})

    def test_auto_tag_by_domain_path_and_qs_works_with_encoded_url(self):
        """Raw and percent-encoded query values match an encoded URL."""
        rules = """
            example.com/page?a=йцу tag1
            example.com/page?a=%D0%B9%D1%86%D1%83 tag2
        """
        bookmark_url = "https://example.com/page?a=%D0%B9%D1%86%D1%83"

        found = auto_tagging.get_tags(rules, bookmark_url)

        self.assertEqual(found, {"tag1", "tag2"})
46 changes: 46 additions & 0 deletions bookmarks/tests/test_bookmarks_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,20 @@ def test_create_bookmark_is_not_shared_by_default(self):
bookmark = Bookmark.objects.get(url=data["url"])
self.assertFalse(bookmark.shared)

def test_create_bookmark_should_add_tags_from_auto_tagging(self):
    """Creating a bookmark combines the submitted tags with auto-tagging tags."""
    tag1 = self.setup_tag()
    tag2 = self.setup_tag()

    self.authenticate()
    # Configure a rule that maps the bookmark's domain to the second tag.
    user_profile = self.get_or_create_test_user().profile
    user_profile.auto_tagging_rules = f"example.com {tag2.name}"
    user_profile.save()

    payload = {"url": "https://example.com/", "tag_names": [tag1.name]}
    list_url = reverse("bookmarks:bookmark-list")
    self.post(list_url, payload, status.HTTP_201_CREATED)

    created = Bookmark.objects.get(url=payload["url"])
    self.assertCountEqual(created.tags.all(), [tag1, tag2])

def test_get_bookmark(self):
self.authenticate()
bookmark = self.setup_bookmark()
Expand Down Expand Up @@ -512,6 +526,22 @@ def test_update_bookmark_shared_flag(self):
updated_bookmark = Bookmark.objects.get(id=bookmark.id)
self.assertEqual(updated_bookmark.shared, True)

def test_update_bookmark_adds_tags_from_auto_tagging(self):
    """A PUT update combines the submitted tags with auto-tagging tags."""
    existing = self.setup_bookmark()
    tag1 = self.setup_tag()
    tag2 = self.setup_tag()

    self.authenticate()
    # Configure a rule that maps the updated URL's domain to the second tag.
    user_profile = self.get_or_create_test_user().profile
    user_profile.auto_tagging_rules = f"example.com {tag2.name}"
    user_profile.save()

    payload = {"url": "https://example.com/", "tag_names": [tag1.name]}
    detail_url = reverse("bookmarks:bookmark-detail", args=[existing.id])
    self.put(detail_url, payload, expected_status_code=status.HTTP_200_OK)

    reloaded = Bookmark.objects.get(id=existing.id)
    self.assertCountEqual(reloaded.tags.all(), [tag1, tag2])

def test_patch_bookmark(self):
self.authenticate()
bookmark = self.setup_bookmark()
Expand Down Expand Up @@ -583,6 +613,22 @@ def test_patch_with_empty_payload_does_not_modify_bookmark(self):
self.assertEqual(updated_bookmark.description, bookmark.description)
self.assertListEqual(updated_bookmark.tag_names, bookmark.tag_names)

def test_patch_bookmark_adds_tags_from_auto_tagging(self):
    """A PATCH of the tag names combines them with auto-tagging tags."""
    existing = self.setup_bookmark()
    tag1 = self.setup_tag()
    tag2 = self.setup_tag()

    self.authenticate()
    # Configure a rule for example.com that yields the second tag.
    user_profile = self.get_or_create_test_user().profile
    user_profile.auto_tagging_rules = f"example.com {tag2.name}"
    user_profile.save()

    payload = {"tag_names": [tag1.name]}
    detail_url = reverse("bookmarks:bookmark-detail", args=[existing.id])
    self.patch(detail_url, payload, expected_status_code=status.HTTP_200_OK)

    reloaded = Bookmark.objects.get(id=existing.id)
    self.assertCountEqual(reloaded.tags.all(), [tag1, tag2])

def test_delete_bookmark(self):
self.authenticate()
bookmark = self.setup_bookmark()
Expand Down
Loading

0 comments on commit fa5f78c

Please sign in to comment.