Support for matching license header within multiline comment block (#361)

zflat · William Wedler · web-flow · commit 37e2707f72a2 · 2022-06-17T22:46:58.000-07:00
Co-authored-by: William Wedler <william.wedler@resquared.com>
diff --git a/ament_copyright/ament_copyright/parser.py b/ament_copyright/ament_copyright/parser.py
@@ -115,11 +115,14 @@ def parse(self):
 
         # get first comment block without leading comment tokens
         block, _ = get_comment_block(self.content, index)
-        if not block:
-            return
         copyrights, remaining_block = search_copyright_information(block)
-        if not copyrights:
-            return None
+
+        if len(copyrights) == 0:
+            block = get_multiline_comment_block(self.content, index)
+            copyrights, remaining_block = search_copyright_information(block)
+
+        if len(copyrights) == 0:
+            return
 
         self.copyrights = copyrights
 
@@ -178,6 +181,8 @@ def determine_filetype(path):
 
 
 def search_copyright_information(content):
+    if content is None:
+        return [], content
     # regex for matching years or year ranges (yyyy-yyyy) separated by colons
     year = r'\d{4}'
     year_range = '%s-%s' % (year, year)
@@ -279,6 +284,49 @@ def get_comment_block(content, index):
     return '\n'.join(lines), start_index + len(comment_token) + 1
 
 
+def get_multiline_comment_block(content, index):
+    """
+    Get the text inside the first `/* */` block or, failing that, `<!-- -->` block.
+
+    The opening token must start a line and the closing token must end one.
+    For a block spanning several lines, the line holding the opening token
+    (and the last line if there are more than two) is dropped and a prefix
+    shared by all remaining lines, e.g. ' * ', is stripped.
+
+    :param content: the text to search
+    :param index: the offset in content where the search starts
+    :returns: the comment text, or None if no complete block is found
+    """
+    patterns = [('^(/[*])', '([*]/)$'),
+                ('^(<!--)', '(-->)$')]
+    # also accept BOM if present; startswith() is safe for empty content
+    has_bom = index == 0 and content.startswith('\ufeff')
+    for start_pattern, end_pattern in patterns:
+        if has_bom:
+            start_pattern = start_pattern[0] + '\ufeff' + start_pattern[1:]
+        # find the first match of the comment start token
+        start_match = re.compile(start_pattern, re.MULTILINE).search(content, index)
+        if not start_match:
+            continue
+        # find the first comment end token after the start token; searching
+        # from index could pick up an unrelated end token preceding the block
+        end_regex = re.compile(end_pattern, re.MULTILINE)
+        end_match = end_regex.search(content, start_match.end(1))
+        if not end_match:
+            continue
+        # collect the lines between the tokens
+        block_lines = content[start_match.start(1):end_match.start(1)].splitlines()
+        if len(block_lines) > 1:
+            # drop the opening line, and the closing line when it is separate
+            block_lines = block_lines[1:-1] if len(block_lines) > 2 else block_lines[1:]
+        # a single-line header has no common prefix to strip out
+        if len(block_lines) > 1:
+            line_prefix = os.path.commonprefix(block_lines)
+            block_lines = [line[len(line_prefix):] for line in block_lines]
+        return '\n'.join(block_lines)
+    return None
+
+
 def scan_past_empty_lines(content, index):
     while is_empty_line(content, index):
         index = get_index_of_next_line(content, index)
diff --git a/ament_copyright/test/cases/apache2_license_multiline_comment/case.cpp b/ament_copyright/test/cases/apache2_license_multiline_comment/case.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2018 Open Source Robotics Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include <memory>
diff --git a/ament_copyright/test/cases/apache2_license_multiline_comment/case.xml b/ament_copyright/test/cases/apache2_license_multiline_comment/case.xml
@@ -0,0 +1,15 @@
+<!--
+  Copyright (C) 2018 Open Source Robotics Foundation
+  
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  
+      http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
diff --git a/ament_copyright/test/test_copyright.py b/ament_copyright/test/test_copyright.py
@@ -28,6 +28,16 @@ def test_apache2_standard():
     assert rc == 0, 'Found errors'
 
 
+def test_apache2_cpp_multiline():
+    case = os.path.join(cases_path, 'apache2_license_multiline_comment/case.cpp')
+    assert main(argv=[case]) == 0, 'Found errors'
+
+
+def test_apache2_xml_multiline():
+    case = os.path.join(cases_path, 'apache2_license_multiline_comment/case.xml')
+    assert main(argv=[case]) == 0, 'Found errors'
+
+
 def test_boost1_cpp():
     rc = main(argv=[os.path.join(cases_path, 'boost1/case2.cpp')])
     assert rc == 0, 'Found errors'
diff --git a/ament_copyright/test/test_parser.py b/ament_copyright/test/test_parser.py
@@ -13,7 +13,12 @@
 # limitations under the License.
 
 from ament_copyright import UNKNOWN_IDENTIFIER
-from ament_copyright.parser import FileDescriptor, search_copyright_information, split_template
+from ament_copyright.parser import FileDescriptor
+from ament_copyright.parser import get_comment_block
+from ament_copyright.parser import get_multiline_comment_block
+from ament_copyright.parser import scan_past_empty_lines
+from ament_copyright.parser import search_copyright_information
+from ament_copyright.parser import split_template
 
 
 def test_search_copyright_information_incorrect_typo():
@@ -50,7 +55,6 @@ def test_search_copyright_information_capitalization1():
     """
     copyrights, remaining_block = search_copyright_information(
         '  Copyright 2020 Open Source Robotics Foundation, Inc.')
-    print(copyrights[0].name)
     assert copyrights[0].name == 'Open Source Robotics Foundation, Inc.'
     assert len(copyrights) == 1
 
@@ -235,3 +239,138 @@ class TempLicense(object):
     dut = FileDescriptor(0, '/')
     dut.identify_license(content, 'file_headers', {'temp': temp_license})
     assert dut.license_identifier == 'temp'
+
+
+def test_get_comment_block_slashes():
+    """Check that only the first run of `//` comment lines is returned as the block."""
+    commented_content = """
+// aaa
+// bbb
+// ccc
+
+// Comment not part of the header
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header, _ = get_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'
+
+
+def test_get_comment_block_slashes2():
+    """Check that a multiline comment block after leading `//` lines is found."""
+    commented_content = """
+// aaa
+// bbb
+// ccc
+
+///
+/**
+ddd
+*/
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header = get_multiline_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'ddd'
+
+
+def test_get_comment_block_doxygen():
+    """Check that a block of doxygen-style `///` line comments is extracted."""
+    commented_content = """
+/// aaa
+/// bbb
+/// ccc
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header, _ = get_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'
+
+
+def test_get_comment_block_pound():
+    """Check that a block of python-style `#` line comments is extracted."""
+    commented_content = """
+# aaa
+# bbb
+# ccc
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header, _ = get_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'
+
+
+def test_get_multiline_comment_block_cstyle():
+    """Check that only the first of two c-style multiline blocks is returned."""
+    commented_content = """
+/**
+ * aaa
+ * bbb
+ * ccc
+ */
+
+
+/**
+ * Comment not part of the header
+ */
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header = get_multiline_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'
+
+
+def test_get_multiline_comment_block_cstyle2():
+    """Check that `//` comments after a c-style multiline block are left out."""
+    commented_content = """
+/**
+ * aaa
+ * bbb
+ * ccc
+ */
+
+// Comment not part of
+// the header
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header = get_multiline_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'
+
+
+def test_get_multiline_comment_block_xmlstyle():
+    """Check that the indented body of an xml-style `<!-- -->` block is extracted."""
+    commented_content = """
+<!--
+  aaa
+  bbb
+  ccc
+ -->
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header = get_multiline_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'
+
+
+def test_get_multiline_comment_block_xmlstyle_prefixed():
+    """Check that a `#` prefix shared by all lines of an xml-style block is removed."""
+    commented_content = """
+<!--
+  # aaa
+  # bbb
+  # ccc
+ -->
+    """
+    # move past the blank line that opens the literal
+    start = scan_past_empty_lines(commented_content, 0)
+    header = get_multiline_comment_block(commented_content, start)
+    assert header is not None
+    assert header == 'aaa\nbbb\nccc'