From f9a9f8651fdd9139e0b7ce5b6024feb6f71249ca Mon Sep 17 00:00:00 2001
From: Erik Schamper <1254028+Schamper@users.noreply.github.com>
Date: Fri, 4 Aug 2023 14:14:20 +0200
Subject: [PATCH] Preserve newlines when removing comments (#43)

---
 dissect/cstruct/parser.py |  8 ++++----
 tests/test_parser.py      | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_parser.py

diff --git a/dissect/cstruct/parser.py b/dissect/cstruct/parser.py
index 64b2447..0394a00 100644
--- a/dissect/cstruct/parser.py
+++ b/dissect/cstruct/parser.py
@@ -299,11 +299,11 @@ def _remove_comments(string: str) -> str:
         # second group captures comments (//single-line or /* multi-line */)
         regex = re.compile(pattern, re.MULTILINE | re.DOTALL)
 
-        def _replacer(match):
+        def _replacer(match: re.Match) -> str:
             # if the 2nd group (capturing comments) is not None,
             # it means we have captured a non-quoted (real) comment string.
-            if match.group(2) is not None:
-                return ""  # so we will return empty to remove the comment
+            if comment := match.group(2):
+                return "\n" * comment.count("\n")  # drop the comment text but keep its newlines
             else:  # otherwise, we will return the 1st group
                 return match.group(1)  # captured quoted-string
 
@@ -314,7 +314,7 @@ def _lineno(tok: Token) -> int:
         """Quick and dirty line number calculator"""
 
         match = tok.match
-        return match.string.count("\n", 0, match.start())
+        return match.string.count("\n", 0, match.start()) + 1
 
     def _config_flag(self, tokens: TokenConsumer) -> None:
         flag_token = tokens.consume()
diff --git a/tests/test_parser.py b/tests/test_parser.py
new file mode 100644
index 0000000..bdc408b
--- /dev/null
+++ b/tests/test_parser.py
@@ -0,0 +1,26 @@
+from unittest.mock import Mock
+
+from dissect.cstruct.parser import TokenParser
+
+
+def test_preserve_comment_newlines():
+    cdef = """
+    // normal comment
+    #define normal_anchor
+    /*
+     * Multi
+     * line
+     * comment
+     */
+    #define multi_anchor
+    """
+    data = TokenParser._remove_comments(cdef)  # anchors must stay on their original lines afterwards
+    print(repr(data))  # debug aid: dump the comment-stripped text
+
+    mock_token = Mock()  # stand-in providing only match.string and match.start(), which _lineno reads
+    mock_token.match.string = data
+    mock_token.match.start.return_value = data.index("#define normal_anchor")
+    assert TokenParser._lineno(mock_token) == 3  # line 1 is blank (after the opening quotes), line 2 the // comment
+
+    mock_token.match.start.return_value = data.index("#define multi_anchor")
+    assert TokenParser._lineno(mock_token) == 9  # the /* ... */ block spanned lines 4-8 of cdef