From d32914b7dd1381f47e7e96d76458e986c3550cf0 Mon Sep 17 00:00:00 2001
From: funilrys <contact@funilrys.com>
Date: Mon, 1 May 2023 21:35:04 +0200
Subject: [PATCH] Improve adblock decoder.

Before this patch, the following cases were not decoded:

  * |http://example.com/*
  * |http://example.org^

This patch fixes PyFunceble/adblock-decoder#3.

Contributors:
  * @smed79
---
 .../converter/adblock_input_line2subject.py   | 85 ++++++++++++++++++-
 .../test_adblock_input_line2subject.py        | 58 ++++++++++++-
 2 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/PyFunceble/converter/adblock_input_line2subject.py b/PyFunceble/converter/adblock_input_line2subject.py
index 238b96c8..ac4e1c2d 100644
--- a/PyFunceble/converter/adblock_input_line2subject.py
+++ b/PyFunceble/converter/adblock_input_line2subject.py
@@ -51,7 +51,7 @@
     limitations under the License.
 """
 
-from typing import Any, List, Optional, Set, Union
+from typing import Any, List, Optional, Set, Tuple, Union
 
 from PyFunceble.converter.base import ConverterBase
 from PyFunceble.converter.url2netloc import Url2Netloc
@@ -68,6 +68,7 @@ class AdblockInputLine2Subject(ConverterBase):
     _aggressive: bool = False
 
     _regex_helper: Optional[RegexHelper] = None
+    url2netloc: Optional[Url2Netloc] = None
 
     def __init__(
         self,
@@ -75,6 +76,7 @@ def __init__(
         aggressive: bool = False,
         *,
         regex_helper: Optional[RegexHelper] = None,
+        url2netloc: Optional[Url2Netloc] = None,
     ) -> None:
         if aggressive is not None:
             self.aggressive = aggressive
@@ -84,6 +86,11 @@ def __init__(
         else:
             self._regex_helper = regex_helper
 
+        if url2netloc is None:
+            self.url2netloc = Url2Netloc()
+        else:
+            self.url2netloc = url2netloc
+
         super().__init__(data_to_convert=data_to_convert)
 
     @ConverterBase.data_to_convert.setter
@@ -144,8 +151,7 @@ def should_be_ignored(line: str) -> bool:
 
         return any(line.startswith(x) for x in starting_chars)
 
-    @staticmethod
-    def extract_base(subject: Union[str, List[str]]) -> Union[str, List[str]]:
+    def extract_base(self, subject: Union[str, List[str]]) -> Union[str, List[str]]:
         """
         Extracts the base of the given subject (supposely URL).
 
@@ -160,10 +166,38 @@ def extract_base(subject: Union[str, List[str]]) -> Union[str, List[str]]:
         subject = subject.replace("*", "").replace("~", "")
 
         try:
+            # TODO: Fix this - presumably reuse self.url2netloc (to confirm).
             return Url2Netloc(subject).get_converted()
         except ValueError:
             return subject
 
+    def split_seprators(self, line: str) -> Tuple[Set[str], Set[str]]:
+        """
+        Splits the given line around the last occurrence of each separator.
+
+        :param line:
+            The line to convert.
+        :return:
+            The :code:`(targets, options)` sets; both empty without separator.
+
+        Example: :code:`"||example.com$script,domain=example.org"` returns
+        :code:`({"||example.com"}, {"script,domain=example.org"})`.
+        """
+        separators = ["##", "#?#", "#@#", "#$#", "$"]
+        targets, options = set(), set()
+
+        for separator in separators:
+            if separator not in line:
+                continue
+
+            target, option = line.rsplit(separator, 1)
+            if separator == "$" and target.endswith("#") and option.startswith("#"):
+                continue  # This "$" is the middle of "#$#", already handled.
+            targets.add(target)
+            options.add(option)
+
+        return targets, options
+
     def _decode_multiple_subject(self, decoded: str) -> Set[str]:
         """
         Implementation of the decoding of the case that multiple
@@ -427,6 +461,50 @@ def _decode_v6(self, line: str, *, aggressive: bool = False) -> Set[str]:
 
         return {x for x in result if "." in x}
 
+    def _decode_v7(self, line: str, *, aggressive: bool = False) -> Set[str]:
+        """
+        Implementation of our seventh decoding mode.
+
+        In this mode we try to decode the explicit URL:
+
+            |http://example.org/.*
+            |https://example.org/.*
+
+        :param line:
+            The line to decode.
+        :param aggressive:
+            Whether to also decode the subjects found in the options.
+        :return:
+            The decoded subjects; entries without any dot are dropped.
+        """
+
+        local_line = line.strip()
+        result = set()
+
+        # Only rules anchored with a single leading pipe are handled here:
+        # "||...", "|...|" and unanchored rules belong to other decoders.
+        anchored = local_line.startswith("|") and not local_line.startswith("||")
+        if not anchored or local_line.endswith("|"):
+            return result
+
+        # Drop the leading anchor and any trailing "^" separator marker.
+        local_line = local_line[1:].rstrip("^")
+
+        targets, options = self.split_seprators(local_line)
+
+        for target in targets:
+            result.update(self._decode_multiple_subject(target))
+
+        if aggressive:
+            for option in options:
+                result.update(self._decode_options(option.split(",")))
+
+        if not options:
+            # No separator was found: try to decode the line as a whole.
+            result.update(self._decode_multiple_subject(local_line))
+
+        return {x for x in result if "." in x}
+
     def get_converted(self) -> List[str]:
         """
         Provides the converted data.
@@ -452,6 +530,7 @@ def convert(self, data: Any, *, aggressive: bool = False) -> List[str]:
             result.update(self._decode_v3(data, aggressive=aggressive))
             result.update(self._decode_v5(data, aggressive=aggressive))
             result.update(self._decode_v6(data, aggressive=aggressive))
+            result.update(self._decode_v7(data, aggressive=aggressive))
 
         result.update(self._decode_v4(data, aggressive=aggressive))
 
diff --git a/tests/converter/test_adblock_input_line2subject.py b/tests/converter/test_adblock_input_line2subject.py
index 66b521f7..f38d00dc 100644
--- a/tests/converter/test_adblock_input_line2subject.py
+++ b/tests/converter/test_adblock_input_line2subject.py
@@ -54,6 +54,7 @@
 from typing import List
 
 from PyFunceble.converter.adblock_input_line2subject import AdblockInputLine2Subject
+from PyFunceble.converter.url2netloc import Url2Netloc
 from PyFunceble.helpers.regex import RegexHelper
 
 
@@ -274,6 +275,55 @@ class TestAdblockInputLine2Subject(unittest.TestCase):
                 "aggressive": ["example.com", "example.net", "example.org"],
             },
         },
+        {
+            "subject": "|http://example.org/hello-world^$scripts,image",
+            "expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
+        },
+        {
+            "subject": "|http://example.org/*",
+            "expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
+        },
+        {
+            "subject": "|http://example.org^",
+            "expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
+        },
+        {
+            "subject": "|http://example.org",
+            "expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
+        },
+        {
+            "subject": "|https://example.org/^$domain=example.com",
+            "expected": {
+                "aggressive": ["example.com", "example.org"],
+                "standard": ["example.org"],
+            },
+        },
+        {
+            "subject": "|ftp://example.org$domain=example.com|example.net",
+            "expected": {
+                "aggressive": ["example.com", "example.net", "example.org"],
+                "standard": ["example.org"],
+            },
+        },
+        {
+            "subject": "|http://example.com$script,image,domain=example.org|foo.example.net",
+            "expected": {
+                "aggressive": ["example.com", "example.org", "foo.example.net"],
+                "standard": ["example.com"],
+            },
+        },
+        {
+            "subject": "|http://example.com,https://example.de$script,image,domain=example.org|foo.example.net",
+            "expected": {
+                "aggressive": [
+                    "example.com",
+                    "example.de",
+                    "example.org",
+                    "foo.example.net",
+                ],
+                "standard": ["example.com", "example.de"],
+            },
+        },
     ]
 
     def setUp(self) -> None:
@@ -296,12 +346,18 @@ def test_init_with_helper(self) -> None:
         """
 
         regex_helper = RegexHelper()
-        self.converter = AdblockInputLine2Subject(regex_helper=regex_helper)
+        url2netloc = Url2Netloc()
+        self.converter = AdblockInputLine2Subject(
+            regex_helper=regex_helper, url2netloc=url2netloc
+        )
 
         # pylint: disable=protected-access
         self.assertIsInstance(self.converter._regex_helper, RegexHelper)
         self.assertEqual(id(regex_helper), id(self.converter._regex_helper))
 
+        self.assertIsInstance(self.converter.url2netloc, Url2Netloc)
+        self.assertEqual(id(url2netloc), id(self.converter.url2netloc))
+
     def test_set_data_to_convert_no_string(self) -> None:
         """
         Tests the method which let us set the data to work with for the case