Skip to content

Commit fa5f78c

Browse files
Automatically add tags to bookmarks based on URL pattern (sissbruecker#736)
* [WIP] DSL * upd * upd * upd * upd * upd * upd * upd * upd * upd * upd * upd * dsl2 * full feature * upd * upd * upd * upd * rename to auto_tagging_rules * update migration after rebase * add REST API tests * improve settings view --------- Co-authored-by: Sascha Ißbrücker <[email protected]>
1 parent e03f536 commit fa5f78c

File tree

9 files changed

+369
-0
lines changed

9 files changed

+369
-0
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 5.0.3 on 2024-05-17 07:09
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the ``auto_tagging_rules`` text field to the ``userprofile`` model."""

    # Must run after the migration that introduced tag grouping.
    dependencies = [("bookmarks", "0035_userprofile_tag_grouping")]

    operations = [
        migrations.AddField(
            model_name="userprofile",
            name="auto_tagging_rules",
            field=models.TextField(blank=True),
        )
    ]

bookmarks/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,7 @@ class UserProfile(models.Model):
415415
display_remove_bookmark_action = models.BooleanField(default=True, null=False)
416416
permanent_notes = models.BooleanField(default=False, null=False)
417417
custom_css = models.TextField(blank=True, null=False)
418+
auto_tagging_rules = models.TextField(blank=True, null=False)
418419
search_preferences = models.JSONField(default=dict, null=False)
419420
enable_automatic_html_snapshots = models.BooleanField(default=True, null=False)
420421
default_mark_unread = models.BooleanField(default=False, null=False)
@@ -445,6 +446,7 @@ class Meta:
445446
"permanent_notes",
446447
"default_mark_unread",
447448
"custom_css",
449+
"auto_tagging_rules",
448450
]
449451

450452

bookmarks/services/auto_tagging.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from urllib.parse import urlparse, parse_qs
2+
import re
3+
import idna
4+
5+
6+
def get_tags(script: str, url: str) -> set[str]:
    """Collect the tags of every auto-tagging rule that matches ``url``.

    ``script`` holds one rule per line in the form
    ``<url pattern> <tag> [<tag> ...]``. Everything after a ``#`` is treated
    as a comment, and lines with fewer than two fields are ignored. A pattern
    is a domain, optionally prefixed with ``http://`` / ``https://`` and
    optionally followed by a path and a query string, for example
    ``example.com/page?a=b``. The script and the URL are both lower-cased
    before matching, so matching is case-insensitive and the returned tags
    are lower-case.

    Args:
        script: The rules text, one rule per line.
        url: The URL to match the rules against.

    Returns:
        The set of tag names taken from all matching rules. The set is empty
        when no rule matches or when the URL has no host.
    """
    result = set()
    parsed_url = urlparse(url.lower())

    # Use ``hostname`` instead of ``netloc``: the latter still carries
    # credentials and the port ("user@example.com:8080"), which the IDNA
    # encoder used for domain matching rejects.
    hostname = parsed_url.hostname
    if not hostname:
        # Every rule is anchored on a domain, so nothing can match.
        return result

    for line in script.lower().split("\n"):
        rule, _, _ = line.partition("#")
        parts = rule.split()
        if len(parts) < 2:
            continue

        pattern = re.sub(r"^https?://", "", parts[0])

        # Split off the query first so that a "/" inside the query string is
        # not mistaken for the start of the path, and so that a pattern
        # without a path ("example.com?a=b") still yields a clean domain.
        location, _, qs_pattern = pattern.partition("?")
        domain_pattern, has_path, path_rest = location.partition("/")
        path_pattern = f"/{path_rest}" if has_path else None

        if not domain_pattern:
            # A rule such as "/path tag" names no domain; skip it.
            continue

        if not _domains_matches(domain_pattern, hostname):
            continue

        if path_pattern and not _path_matches(path_pattern, parsed_url.path):
            continue

        if qs_pattern and not _qs_matches(qs_pattern, parsed_url.query):
            continue

        result.update(parts[1:])

    return result
46+
47+
48+
def _path_matches(expected_path: str, actual_path: str) -> bool:
49+
return actual_path.startswith(expected_path)
50+
51+
52+
def _domains_matches(expected_domain: str, actual_domain: str) -> bool:
    """Tell whether ``actual_domain`` is ``expected_domain`` or a subdomain of it.

    Both names are converted to their IDNA (ASCII) form before comparing, so
    a Unicode domain and its ``xn--`` spelling are treated as the same name.

    Args:
        expected_domain: The domain taken from a rule.
        actual_domain: The domain of the URL being tagged.

    Returns:
        True if the encoded actual domain equals the encoded expected domain
        or ends with ``"." + expected``. False otherwise, including when
        ``expected_domain`` is empty.
    """

    def to_ascii(domain: str) -> bytes:
        try:
            return idna.encode(domain)
        except idna.IDNAError:
            # The name contains something IDNA rejects. Compare it verbatim
            # instead of letting one odd rule or URL abort the whole lookup.
            return domain.encode("utf-8")

    expected = to_ascii(expected_domain)
    actual = to_ascii(actual_domain)
    if not expected:
        return False

    # Match on label boundaries: "example.com" has to accept
    # "www.example.com" but must not accept "notexample.com", which a plain
    # suffix comparison would let through.
    return actual == expected or actual.endswith(b"." + expected)
57+
58+
59+
def _qs_matches(expected_qs: str, actual_qs: str) -> bool:
60+
expected_qs = parse_qs(expected_qs, keep_blank_values=True)
61+
actual_qs = parse_qs(actual_qs, keep_blank_values=True)
62+
63+
for key in expected_qs:
64+
if key not in actual_qs:
65+
return False
66+
for value in expected_qs[key]:
67+
if value != "" and value not in actual_qs[key]:
68+
return False
69+
70+
return True

bookmarks/services/bookmarks.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from bookmarks.models import Bookmark, BookmarkAsset, parse_tag_string
1111
from bookmarks.services import tasks
1212
from bookmarks.services import website_loader
13+
from bookmarks.services import auto_tagging
1314
from bookmarks.services.tags import get_or_create_tags
1415

1516
logger = logging.getLogger(__name__)
@@ -242,6 +243,15 @@ def _update_website_metadata(bookmark: Bookmark):
242243

243244
def _update_bookmark_tags(bookmark: Bookmark, tag_string: str, user: User):
    # Start from the tags given explicitly in the tag string.
    tag_names = parse_tag_string(tag_string)

    # Add tags produced by the user's auto-tagging rules for this URL,
    # skipping names that are already in the list.
    if user.profile.auto_tagging_rules:
        matched_names = auto_tagging.get_tags(
            user.profile.auto_tagging_rules, bookmark.url
        )
        tag_names.extend(
            name for name in matched_names if name not in tag_names
        )

    # Replace the bookmark's current tags with the combined list.
    bookmark.tags.set(get_or_create_tags(tag_names, user))
247257

bookmarks/templates/settings/general.html

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,21 @@ <h2>Profile</h2>
118118
If disabled, tags will not be grouped.
119119
</div>
120120
</div>
121+
<div class="form-group">
122+
<details {% if form.auto_tagging_rules.value %}open{% endif %}>
123+
<summary>Auto Tagging</summary>
124+
<label for="{{ form.auto_tagging_rules.id_for_label }}" class="text-assistive">Auto Tagging</label>
125+
<div class="mt-2">
126+
{{ form.auto_tagging_rules|add_class:"form-input custom-css"|attr:"rows:6" }}
127+
</div>
128+
</details>
129+
<div class="form-input-hint">
130+
Automatically adds tags to bookmarks based on predefined rules.
131+
Each line is a single rule that maps a URL to one or more tags. For example:
132+
<pre>youtube.com video
133+
reddit.com/r/Music music reddit</pre>
134+
</div>
135+
</div>
121136
<div class="form-group">
122137
<label for="{{ form.enable_favicons.id_for_label }}" class="form-checkbox">
123138
{{ form.enable_favicons }}

bookmarks/tests/test_auto_tagging.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
from bookmarks.services import auto_tagging
2+
from django.test import TestCase
3+
4+
5+
class AutoTaggingTestCase(TestCase):
    """Exercise ``auto_tagging.get_tags`` with domain, path and query rules."""

    def _assert_tags(self, rules, url, expected):
        """Assert that applying ``rules`` to ``url`` yields exactly ``expected``."""
        self.assertEqual(auto_tagging.get_tags(rules, url), expected)

    def test_auto_tag_by_domain(self):
        rules = """
            example.com example
            test.com test
        """
        self._assert_tags(rules, "https://example.com/", {"example"})

    def test_auto_tag_by_domain_ignores_case(self):
        rules = """
            EXAMPLE.com example
        """
        self._assert_tags(rules, "https://example.com/", {"example"})

    def test_auto_tag_by_domain_should_add_all_tags(self):
        rules = """
            example.com one two three
        """
        self._assert_tags(rules, "https://example.com/", {"one", "two", "three"})

    def test_auto_tag_by_domain_work_with_idn_domains(self):
        # Unicode rule against a punycode URL.
        rules = """
            रजिस्ट्री.भारत tag1
        """
        self._assert_tags(
            rules, "https://www.xn--81bg3cc2b2bk5hb.xn--h2brj9c/", {"tag1"}
        )

        # Punycode rule against a Unicode URL.
        rules = """
            xn--81bg3cc2b2bk5hb.xn--h2brj9c tag1
        """
        self._assert_tags(rules, "https://www.रजिस्ट्री.भारत/", {"tag1"})

    def test_auto_tag_by_domain_and_path(self):
        rules = """
            example.com/one one
            example.com/two two
            test.com test
        """
        self._assert_tags(rules, "https://example.com/one/", {"one"})

    def test_auto_tag_by_domain_and_path_ignores_case(self):
        rules = """
            example.com/One one
        """
        self._assert_tags(rules, "https://example.com/one/", {"one"})

    def test_auto_tag_by_domain_and_path_matches_path_ltr(self):
        rules = """
            example.com/one one
            example.com/two two
            test.com test
        """
        self._assert_tags(rules, "https://example.com/one/two", {"one"})

    def test_auto_tag_by_domain_ignores_domain_in_path(self):
        rules = """
            example.com example
        """
        self._assert_tags(rules, "https://test.com/example.com", set())

    def test_auto_tag_by_domain_includes_subdomains(self):
        rules = """
            example.com example
            test.example.com test
            some.example.com some
        """
        self._assert_tags(rules, "https://test.example.com/", {"example", "test"})

    def test_auto_tag_by_domain_matches_domain_rtl(self):
        rules = """
            example.com example
        """
        self._assert_tags(rules, "https://example.com.bad-website.com/", set())

    def test_auto_tag_by_domain_ignores_schema(self):
        rules = """
            https://example.com/ https
            http://example.com/ http
        """
        self._assert_tags(rules, "http://example.com/", {"https", "http"})

    def test_auto_tag_by_domain_ignores_lines_with_no_tags(self):
        rules = """
            example.com
        """
        self._assert_tags(rules, "https://example.com/", set())

    def test_auto_tag_by_domain_path_and_qs(self):
        rules = """
            example.com/page?a=b tag1 # true, matches a=b
            example.com/page?a=c&c=d tag2 # true, matches both a=c and c=d
            example.com/page?c=d&l=p tag3 # false, l=p doesn't exists
            example.com/page?a=bb tag4 # false bb != b
            example.com/page?a=b&a=c tag5 # true, matches both a=b and a=c
            example.com/page?a=B tag6 # true, matches a=b because case insensitive
            example.com/page?A=b tag7 # true, matches a=b because case insensitive
        """
        self._assert_tags(
            rules,
            "https://example.com/page/some?z=x&a=b&v=b&c=d&o=p&a=c",
            {"tag1", "tag2", "tag5", "tag6", "tag7"},
        )

    def test_auto_tag_by_domain_path_and_qs_with_empty_value(self):
        rules = """
            example.com/page?a= tag1
            example.com/page?b= tag2
        """
        self._assert_tags(rules, "https://example.com/page/some?a=value", {"tag1"})

    def test_auto_tag_by_domain_path_and_qs_works_with_encoded_url(self):
        rules = """
            example.com/page?a=йцу tag1
            example.com/page?a=%D0%B9%D1%86%D1%83 tag2
        """
        self._assert_tags(
            rules,
            "https://example.com/page?a=%D0%B9%D1%86%D1%83",
            {"tag1", "tag2"},
        )

bookmarks/tests/test_bookmarks_api.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,20 @@ def test_create_bookmark_is_not_shared_by_default(self):
440440
bookmark = Bookmark.objects.get(url=data["url"])
441441
self.assertFalse(bookmark.shared)
442442

443+
def test_create_bookmark_should_add_tags_from_auto_tagging(self):
    # One tag is submitted with the request, the other comes from a rule.
    submitted_tag = self.setup_tag()
    rule_tag = self.setup_tag()

    self.authenticate()
    profile = self.get_or_create_test_user().profile
    profile.auto_tagging_rules = f"example.com {rule_tag.name}"
    profile.save()

    data = {"url": "https://example.com/", "tag_names": [submitted_tag.name]}
    endpoint = reverse("bookmarks:bookmark-list")
    self.post(endpoint, data, status.HTTP_201_CREATED)

    # The stored bookmark must carry both tags.
    created = Bookmark.objects.get(url=data["url"])
    self.assertCountEqual(created.tags.all(), [submitted_tag, rule_tag])
456+
443457
def test_get_bookmark(self):
444458
self.authenticate()
445459
bookmark = self.setup_bookmark()
@@ -512,6 +526,22 @@ def test_update_bookmark_shared_flag(self):
512526
updated_bookmark = Bookmark.objects.get(id=bookmark.id)
513527
self.assertEqual(updated_bookmark.shared, True)
514528

529+
def test_update_bookmark_adds_tags_from_auto_tagging(self):
    bookmark = self.setup_bookmark()
    # One tag is submitted with the request, the other comes from a rule.
    submitted_tag = self.setup_tag()
    rule_tag = self.setup_tag()

    self.authenticate()
    profile = self.get_or_create_test_user().profile
    profile.auto_tagging_rules = f"example.com {rule_tag.name}"
    profile.save()

    data = {"url": "https://example.com/", "tag_names": [submitted_tag.name]}
    detail_url = reverse("bookmarks:bookmark-detail", args=[bookmark.id])
    self.put(detail_url, data, expected_status_code=status.HTTP_200_OK)

    # After the PUT the bookmark must carry both tags.
    refreshed = Bookmark.objects.get(id=bookmark.id)
    self.assertCountEqual(refreshed.tags.all(), [submitted_tag, rule_tag])
544+
515545
def test_patch_bookmark(self):
516546
self.authenticate()
517547
bookmark = self.setup_bookmark()
@@ -583,6 +613,22 @@ def test_patch_with_empty_payload_does_not_modify_bookmark(self):
583613
self.assertEqual(updated_bookmark.description, bookmark.description)
584614
self.assertListEqual(updated_bookmark.tag_names, bookmark.tag_names)
585615

616+
def test_patch_bookmark_adds_tags_from_auto_tagging(self):
    bookmark = self.setup_bookmark()
    # One tag is submitted with the request, the other comes from a rule.
    submitted_tag = self.setup_tag()
    rule_tag = self.setup_tag()

    self.authenticate()
    profile = self.get_or_create_test_user().profile
    profile.auto_tagging_rules = f"example.com {rule_tag.name}"
    profile.save()

    # Only the tag names are patched; the URL is left as set up.
    data = {"tag_names": [submitted_tag.name]}
    detail_url = reverse("bookmarks:bookmark-detail", args=[bookmark.id])
    self.patch(detail_url, data, expected_status_code=status.HTTP_200_OK)

    refreshed = Bookmark.objects.get(id=bookmark.id)
    self.assertCountEqual(refreshed.tags.all(), [submitted_tag, rule_tag])
631+
586632
def test_delete_bookmark(self):
587633
self.authenticate()
588634
bookmark = self.setup_bookmark()

0 commit comments

Comments
 (0)