forked from sissbruecker/linkding
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Automatically add tags to bookmarks based on URL pattern (sissbruecker#736)
* [WIP] DSL * upd * upd * upd * upd * upd * upd * upd * upd * upd * upd * upd * dsl2 * full feature * upd * upd * upd * upd * rename to auto_tagging_rules * update migration after rebase * add REST API tests * improve settings view --------- Co-authored-by: Sascha Ißbrücker <[email protected]>
- Loading branch information
1 parent
e03f536
commit fa5f78c
Showing
9 changed files
with
369 additions
and
0 deletions.
There are no files selected for viewing
18 changes: 18 additions & 0 deletions
18
bookmarks/migrations/0036_userprofile_auto_tagging_rules.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Generated by Django 5.0.3 on 2024-05-17 07:09 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add the ``auto_tagging_rules`` text field to the ``userprofile`` model."""

    # This migration is ordered after the listed ``bookmarks`` migration.
    dependencies = [("bookmarks", "0035_userprofile_tag_grouping")]

    operations = [
        migrations.AddField(
            model_name="userprofile",
            name="auto_tagging_rules",
            # The rules are stored as plain text; the field is declared
            # with blank=True.
            field=models.TextField(blank=True),
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from urllib.parse import urlparse, parse_qs | ||
import re | ||
import idna | ||
|
||
|
||
def get_tags(script: str, url: str):
    """Return the set of tags whose auto-tagging rules match ``url``.

    ``script`` holds one rule per line in the form
    ``<url pattern> <tag> [<tag> ...]``. Everything after a ``#`` on a line
    is treated as a comment, and lines without at least one tag are skipped.
    A URL pattern consists of a domain, optionally followed by a path and a
    query string; a leading ``http://`` or ``https://`` is ignored. Both the
    script and the URL are lower-cased before matching, so matching (and the
    returned tags) are case-insensitive.

    Args:
        script: The rule script, one rule per line.
        url: The URL to find tags for.

    Returns:
        A set with the tags of every rule that matches ``url``. The set is
        empty when nothing matches or when ``url`` has no host.
    """
    parsed_url = urlparse(url.lower())
    result = set()

    # Use hostname rather than netloc: netloc still carries credentials and
    # the port ("user@example.com:8080"), which is not a valid domain name.
    hostname = parsed_url.hostname
    if not hostname:
        # Without a host no domain pattern can match.
        return result

    for line in script.lower().split("\n"):
        # Drop the trailing comment, if any.
        line = line.partition("#")[0]

        parts = line.split()
        if len(parts) < 2:
            # Blank line, comment-only line, or a pattern without tags.
            continue

        pattern = re.sub(r"^https?://", "", parts[0])

        # Split off the query first so that patterns without a path, such as
        # "example.com?a=b", do not leave the query inside the domain part.
        pattern, _, qs_pattern = pattern.partition("?")

        slash = pattern.find("/")
        if slash == -1:
            domain_pattern, path_pattern = pattern, ""
        else:
            domain_pattern, path_pattern = pattern[:slash], pattern[slash:]

        if not _domains_matches(domain_pattern, hostname):
            continue

        if path_pattern and not _path_matches(path_pattern, parsed_url.path):
            continue

        if qs_pattern and not _qs_matches(qs_pattern, parsed_url.query):
            continue

        result.update(parts[1:])

    return result
|
||
|
||
def _path_matches(expected_path: str, actual_path: str) -> bool: | ||
return actual_path.startswith(expected_path) | ||
|
||
|
||
def _domains_matches(expected_domain: str, actual_domain: str) -> bool:
    """Return True when ``actual_domain`` is ``expected_domain`` or a subdomain of it.

    Both names are IDNA-encoded first, so a Unicode name and its punycode
    form compare equal.

    Args:
        expected_domain: The domain taken from a rule pattern.
        actual_domain: The domain of the URL being checked.

    Returns:
        True on an exact match or when ``actual_domain`` ends with
        ``"." + expected_domain``; False otherwise, including when either
        name cannot be IDNA-encoded.
    """
    try:
        expected = idna.encode(expected_domain)
        actual = idna.encode(actual_domain)
    except idna.IDNAError:
        # An empty or otherwise invalid name can never match; treat it as a
        # non-match instead of letting one bad rule or URL raise.
        return False

    # Match on a label boundary: a bare endswith() would also accept
    # "badexample.com" for the pattern "example.com".
    return actual == expected or actual.endswith(b"." + expected)
|
||
|
||
def _qs_matches(expected_qs: str, actual_qs: str) -> bool: | ||
expected_qs = parse_qs(expected_qs, keep_blank_values=True) | ||
actual_qs = parse_qs(actual_qs, keep_blank_values=True) | ||
|
||
for key in expected_qs: | ||
if key not in actual_qs: | ||
return False | ||
for value in expected_qs[key]: | ||
if value != "" and value not in actual_qs[key]: | ||
return False | ||
|
||
return True |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
from bookmarks.services import auto_tagging | ||
from django.test import TestCase | ||
|
||
|
||
class AutoTaggingTestCase(TestCase):
    """Tests for ``auto_tagging.get_tags``: which tags a rule script yields for a URL."""

    def test_auto_tag_by_domain(self):
        # Only the rule whose domain equals the URL's domain contributes.
        rules = """
            example.com example
            test.com test
        """
        bookmark_url = "https://example.com/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"example"})

    def test_auto_tag_by_domain_ignores_case(self):
        # Upper-case characters in the pattern do not prevent a match.
        rules = """
            EXAMPLE.com example
        """
        bookmark_url = "https://example.com/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"example"})

    def test_auto_tag_by_domain_should_add_all_tags(self):
        # Every tag listed after the pattern is returned.
        rules = """
            example.com one two three
        """
        bookmark_url = "https://example.com/"

        self.assertEqual(
            auto_tagging.get_tags(rules, bookmark_url), {"one", "two", "three"}
        )

    def test_auto_tag_by_domain_work_with_idn_domains(self):
        # Unicode pattern against a punycode URL ...
        rules = """
            रजिस्ट्री.भारत tag1
        """
        bookmark_url = "https://www.xn--81bg3cc2b2bk5hb.xn--h2brj9c/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"tag1"})

        # ... and punycode pattern against a Unicode URL.
        rules = """
            xn--81bg3cc2b2bk5hb.xn--h2brj9c tag1
        """
        bookmark_url = "https://www.रजिस्ट्री.भारत/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"tag1"})

    def test_auto_tag_by_domain_and_path(self):
        # The path part of a pattern narrows the match.
        rules = """
            example.com/one one
            example.com/two two
            test.com test
        """
        bookmark_url = "https://example.com/one/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"one"})

    def test_auto_tag_by_domain_and_path_ignores_case(self):
        # Upper-case characters in the path pattern do not prevent a match.
        rules = """
            example.com/One one
        """
        bookmark_url = "https://example.com/one/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"one"})

    def test_auto_tag_by_domain_and_path_matches_path_ltr(self):
        # "/two" appears later in the URL path but not at its start, so only
        # the "/one" rule applies.
        rules = """
            example.com/one one
            example.com/two two
            test.com test
        """
        bookmark_url = "https://example.com/one/two"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"one"})

    def test_auto_tag_by_domain_ignores_domain_in_path(self):
        # A domain name that shows up in the URL path is not a domain match.
        rules = """
            example.com example
        """
        bookmark_url = "https://test.com/example.com"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), set())

    def test_auto_tag_by_domain_includes_subdomains(self):
        # A rule for the parent domain also applies to its subdomain; a rule
        # for a sibling subdomain does not.
        rules = """
            example.com example
            test.example.com test
            some.example.com some
        """
        bookmark_url = "https://test.example.com/"

        self.assertEqual(
            auto_tagging.get_tags(rules, bookmark_url), {"example", "test"}
        )

    def test_auto_tag_by_domain_matches_domain_rtl(self):
        # The pattern appearing at the start of the host is not a match.
        rules = """
            example.com example
        """
        bookmark_url = "https://example.com.bad-website.com/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), set())

    def test_auto_tag_by_domain_ignores_schema(self):
        # Both rules apply regardless of the scheme written in the pattern.
        rules = """
            https://example.com/ https
            http://example.com/ http
        """
        bookmark_url = "http://example.com/"

        self.assertEqual(
            auto_tagging.get_tags(rules, bookmark_url), {"https", "http"}
        )

    def test_auto_tag_by_domain_ignores_lines_with_no_tags(self):
        # A pattern without any tag yields nothing.
        rules = """
            example.com
        """
        bookmark_url = "https://example.com/"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), set())

    def test_auto_tag_by_domain_path_and_qs(self):
        # The trailing "# ..." text on each rule line documents the expected
        # outcome for that rule.
        rules = """
            example.com/page?a=b tag1 # true, matches a=b
            example.com/page?a=c&c=d tag2 # true, matches both a=c and c=d
            example.com/page?c=d&l=p tag3 # false, l=p doesn't exists
            example.com/page?a=bb tag4 # false bb != b
            example.com/page?a=b&a=c tag5 # true, matches both a=b and a=c
            example.com/page?a=B tag6 # true, matches a=b because case insensitive
            example.com/page?A=b tag7 # true, matches a=b because case insensitive
        """
        bookmark_url = "https://example.com/page/some?z=x&a=b&v=b&c=d&o=p&a=c"

        self.assertEqual(
            auto_tagging.get_tags(rules, bookmark_url),
            {"tag1", "tag2", "tag5", "tag6", "tag7"},
        )

    def test_auto_tag_by_domain_path_and_qs_with_empty_value(self):
        # "a=" matches because the URL has an "a" parameter; "b=" does not
        # because the URL has no "b" parameter.
        rules = """
            example.com/page?a= tag1
            example.com/page?b= tag2
        """
        bookmark_url = "https://example.com/page/some?a=value"

        self.assertEqual(auto_tagging.get_tags(rules, bookmark_url), {"tag1"})

    def test_auto_tag_by_domain_path_and_qs_works_with_encoded_url(self):
        # The raw and the percent-encoded form of the value both match the
        # percent-encoded URL.
        rules = """
            example.com/page?a=йцу tag1
            example.com/page?a=%D0%B9%D1%86%D1%83 tag2
        """
        bookmark_url = "https://example.com/page?a=%D0%B9%D1%86%D1%83"

        self.assertEqual(
            auto_tagging.get_tags(rules, bookmark_url), {"tag1", "tag2"}
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.