From d32914b7dd1381f47e7e96d76458e986c3550cf0 Mon Sep 17 00:00:00 2001 From: funilrys Date: Mon, 1 May 2023 21:35:04 +0200 Subject: [PATCH] Improve adblock decoder. Indeed, before this patch we weren't decoding the following cases: * |http://example.com/* * |http://example.org^ This patch fixes PyFunceble/adblock-decoder#3. Contributors: * @smed79 --- .../converter/adblock_input_line2subject.py | 85 ++++++++++++++++++- .../test_adblock_input_line2subject.py | 58 ++++++++++++- 2 files changed, 139 insertions(+), 4 deletions(-) diff --git a/PyFunceble/converter/adblock_input_line2subject.py b/PyFunceble/converter/adblock_input_line2subject.py index 238b96c8..ac4e1c2d 100644 --- a/PyFunceble/converter/adblock_input_line2subject.py +++ b/PyFunceble/converter/adblock_input_line2subject.py @@ -51,7 +51,7 @@ limitations under the License. """ -from typing import Any, List, Optional, Set, Union +from typing import Any, List, Optional, Set, Tuple, Union from PyFunceble.converter.base import ConverterBase from PyFunceble.converter.url2netloc import Url2Netloc @@ -68,6 +68,7 @@ class AdblockInputLine2Subject(ConverterBase): _aggressive: bool = False _regex_helper: Optional[RegexHelper] = None + url2netloc: Optional[Url2Netloc] = None def __init__( self, @@ -75,6 +76,7 @@ def __init__( aggressive: bool = False, *, regex_helper: Optional[RegexHelper] = None, + url2netloc: Optional[Url2Netloc] = None, ) -> None: if aggressive is not None: self.aggressive = aggressive @@ -84,6 +86,11 @@ def __init__( else: self._regex_helper = regex_helper + if url2netloc is None: + self.url2netloc = Url2Netloc() + else: + self.url2netloc = url2netloc + super().__init__(data_to_convert=data_to_convert) @ConverterBase.data_to_convert.setter @@ -144,8 +151,7 @@ def should_be_ignored(line: str) -> bool: return any(line.startswith(x) for x in starting_chars) - @staticmethod - def extract_base(subject: Union[str, List[str]]) -> Union[str, List[str]]: + def extract_base(self, subject: Union[str, List[str]]) -> Union[str, List[str]]: """ Extracts the base of the given subject (supposely URL). @@ -160,10 +166,38 @@ def extract_base(subject: Union[str, List[str]]) -> Union[str, List[str]]: subject = subject.replace("*", "").replace("~", "") try: + # TODO: Fix this. return Url2Netloc(subject).get_converted() except ValueError: return subject + def split_seprators(self, line: str) -> Tuple[Set[str], str]: + """ + Splits the separators providing the 2 possible parts: domains and body. + + :param line: + The line to convert. + + Example: + + Given: :code:`"||example.com$script,domain=example.org` returns + :code:`({"example.org"}, {"script,domain=example.org"})` + """ + + separators = ["##", "#?#", "#@#", "#$#", "$"] + + targets, options = set(), set() + + for separator in separators: + if separator not in line: + continue + + target, option = line.rsplit(separator, 1) + targets.add(target) + options.add(option) + + return targets, options + def _decode_multiple_subject(self, decoded: str) -> Set[str]: """ Implementation of the decoding of the case that multiple @@ -427,6 +461,50 @@ def _decode_v6(self, line: str, *, aggressive: bool = False) -> Set[str]: return {x for x in result if "." in x} + def _decode_v7(self, line: str, *, aggressive: bool = False) -> Set[str]: + """ + Implementation of our seventh decoding mode. + + In this mode we try to decode the explicit URL: + + |http://example.org/.* + |https://example.org/.* + + :param line: + The line to decode. + """ + + local_line = line.strip() + result = set() + + if ( + local_line.startswith("||") + or (local_line.startswith("|") and local_line.endswith("|")) + or (not line.startswith("|")) + ): + return result + + if local_line.startswith("|"): + local_line = local_line.replace("|", "", 1) + + if local_line.endswith("^"): + local_line = local_line.rstrip("^") + + targets, options = self.split_seprators(local_line) + + for target in targets: + result.update(self._decode_multiple_subject(target)) + + if aggressive: + for option in options: + result.update(self._decode_options(option.split(","))) + + if not options: + # Wish me luck :-) + result.update(self._decode_multiple_subject(local_line)) + + return {x for x in result if "." in x} + def get_converted(self) -> List[str]: """ Provides the converted data. @@ -452,6 +530,7 @@ def convert(self, data: Any, *, aggressive: bool = False) -> List[str]: result.update(self._decode_v3(data, aggressive=aggressive)) result.update(self._decode_v5(data, aggressive=aggressive)) result.update(self._decode_v6(data, aggressive=aggressive)) + result.update(self._decode_v7(data, aggressive=aggressive)) result.update(self._decode_v4(data, aggressive=aggressive)) diff --git a/tests/converter/test_adblock_input_line2subject.py b/tests/converter/test_adblock_input_line2subject.py index 66b521f7..f38d00dc 100644 --- a/tests/converter/test_adblock_input_line2subject.py +++ b/tests/converter/test_adblock_input_line2subject.py @@ -54,6 +54,7 @@ from typing import List from PyFunceble.converter.adblock_input_line2subject import AdblockInputLine2Subject +from PyFunceble.converter.url2netloc import Url2Netloc from PyFunceble.helpers.regex import RegexHelper @@ -274,6 +275,55 @@ class TestAdblockInputLine2Subject(unittest.TestCase): "aggressive": ["example.com", "example.net", "example.org"], }, }, + { + "subject": "|http://example.org/hello-world^$scripts,image", + "expected": {"aggressive": ["example.org"], "standard": ["example.org"]}, + }, + { + "subject": "|http://example.org/*", + "expected": {"aggressive": ["example.org"], "standard": ["example.org"]}, + }, + { + "subject": "|http://example.org^", + "expected": {"aggressive": ["example.org"], "standard": ["example.org"]}, + }, + { + "subject": "|http://example.org", + "expected": {"aggressive": ["example.org"], "standard": ["example.org"]}, + }, + { + "subject": "|https://example.org/^$domain=example.com", + "expected": { + "aggressive": ["example.com", "example.org"], + "standard": ["example.org"], + }, + }, + { + "subject": "|ftp://example.org$domain=example.com|example.net", + "expected": { + "aggressive": ["example.com", "example.net", "example.org"], + "standard": ["example.org"], + }, + }, + { + "subject": "|http://example.com$script,image,domain=example.org|foo.example.net", + "expected": { + "aggressive": ["example.com", "example.org", "foo.example.net"], + "standard": ["example.com"], + }, + }, + { + "subject": "|http://example.com,https://example.de$script,image,domain=example.org|foo.example.net", + "expected": { + "aggressive": [ + "example.com", + "example.de", + "example.org", + "foo.example.net", + ], + "standard": ["example.com", "example.de"], + }, + }, ] def setUp(self) -> None: @@ -296,12 +346,18 @@ def test_init_with_helper(self) -> None: """ regex_helper = RegexHelper() - self.converter = AdblockInputLine2Subject(regex_helper=regex_helper) + url2netloc = Url2Netloc() + self.converter = AdblockInputLine2Subject( + regex_helper=regex_helper, url2netloc=url2netloc + ) # pylint: disable=protected-access self.assertIsInstance(self.converter._regex_helper, RegexHelper) self.assertEqual(id(regex_helper), id(self.converter._regex_helper)) + self.assertIsInstance(self.converter.url2netloc, Url2Netloc) + self.assertEqual(id(url2netloc), id(self.converter.url2netloc)) + def test_set_data_to_convert_no_string(self) -> None: """ Tests the method which let us set the data to work with for the case