Merge branch 'matthewwithanm:develop' into G-Rath-patch-1

G-Rath · web-flow · commit b54501ed45bc · 2025-07-15T07:52:10.000+12:00
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python 3.8
       uses: actions/setup-python@v2
       with:
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
diff --git a/README.rst b/README.rst
@@ -157,12 +157,22 @@ strip_document
   within the document are unaffected.
   Defaults to ``STRIP``.
 
-beautiful_soup_parser
-  Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
-  as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
-  environment. Defaults to ``html.parser``.
+strip_pre
+  Controls whether leading/trailing blank lines are removed from ``<pre>``
+  tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
+  ``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
+  Defaults to ``STRIP``.
+
+bs4_options
+  Specify additional configuration options for the ``BeautifulSoup`` object
+  used to interpret the HTML markup. String and list values (such as ``lxml``
+  or ``html5lib``) are treated as ``features`` arguments to control parser
+  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
+  are treated as full kwargs to be used for the BeautifulSoup constructor,
+  allowing specification of any parameter. For parameter details, see the
+  BeautifulSoup_ documentation.
 
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -11,6 +11,10 @@
 re_all_whitespace = re.compile(r'[\t \r\n]+')
 re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 re_html_heading = re.compile(r'h(\d+)')
+re_pre_lstrip1 = re.compile(r'^ *\n')
+re_pre_rstrip1 = re.compile(r'\n *$')
+re_pre_lstrip = re.compile(r'^[ \n]*\n')
+re_pre_rstrip = re.compile(r'[ \n]*$')
 
 # Pattern for creating convert_<tag> function names from tag names
 re_make_convert_fn_name = re.compile(r'[\[\]:-]')
@@ -37,6 +41,9 @@
 # confused with a list item
 re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
 
+# Find consecutive backtick sequences in a string
+re_backtick_runs = re.compile(r'`+')
+
 # Heading styles
 ATX = 'atx'
 ATX_CLOSED = 'atx_closed'
@@ -51,10 +58,25 @@
 ASTERISK = '*'
 UNDERSCORE = '_'
 
-# Document strip styles
+# Document/pre strip styles
 LSTRIP = 'lstrip'
 RSTRIP = 'rstrip'
 STRIP = 'strip'
+STRIP_ONE = 'strip_one'
+
+
+def strip1_pre(text):
+    """Strip one leading and one trailing blank line (spaces only) from a <pre> string."""
+    text = re_pre_lstrip1.sub('', text)
+    text = re_pre_rstrip1.sub('', text)
+    return text
+
+
+def strip_pre(text):
+    """Strip all leading blank lines and all trailing spaces/newlines from a <pre> string."""
+    text = re_pre_lstrip.sub('', text)
+    text = re_pre_rstrip.sub('', text)
+    return text
 
 
 def chomp(text):
@@ -154,7 +176,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
-        beautiful_soup_parser = 'html.parser'
+        bs4_options = 'html.parser'
         bullets = '*+-'  # An iterable of bullet types.
         code_language = ''
         code_language_callback = None
@@ -168,6 +190,7 @@ class DefaultOptions:
         newline_style = SPACES
         strip = None
         strip_document = STRIP
+        strip_pre = STRIP
         strong_em_symbol = ASTERISK
         sub_symbol = ''
         sup_symbol = ''
@@ -188,11 +211,15 @@ def __init__(self, **options):
             raise ValueError('You may specify either tags to strip or tags to'
                              ' convert, but not both.')
 
+        # If a string or list is passed to bs4_options, assume it is a 'features' specification
+        if not isinstance(self.options['bs4_options'], dict):
+            self.options['bs4_options'] = {'features': self.options['bs4_options']}
+
         # Initialize the conversion function cache
         self.convert_fn_cache = {}
 
     def convert(self, html):
-        soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
+        soup = BeautifulSoup(html, **self.options['bs4_options'])
         return self.convert_soup(soup)
 
     def convert_soup(self, soup):
@@ -456,10 +483,24 @@ def convert_br(self, el, text, parent_tags):
             return '  \n'
 
     def convert_code(self, el, text, parent_tags):
-        if 'pre' in parent_tags:
+        if '_noformat' in parent_tags:
             return text
-        converter = abstract_inline_conversion(lambda self: '`')
-        return converter(self, el, text, parent_tags)
+
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
+
+        # Find the maximum number of consecutive backticks in the text, then
+        # delimit the code span with one more backtick than that
+        max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
+        markup_delimiter = '`' * (max_backticks + 1)
+
+        # If the text contains any backticks, pad it with a space on each side
+        # so that a backtick at either end cannot merge with the delimiter
+        if max_backticks > 0:
+            text = " " + text + " "
+
+        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
 
     convert_del = abstract_inline_conversion(lambda self: '~~')
 
@@ -652,6 +693,15 @@ def convert_pre(self, el, text, parent_tags):
         if self.options['code_language_callback']:
             code_language = self.options['code_language_callback'](el) or code_language
 
+        if self.options['strip_pre'] == STRIP:
+            text = strip_pre(text)  # remove all leading/trailing newlines
+        elif self.options['strip_pre'] == STRIP_ONE:
+            text = strip1_pre(text)  # remove one leading/trailing newline
+        elif self.options['strip_pre'] is None:
+            pass  # leave leading and trailing newlines as-is
+        else:
+            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
+
         return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
 
     def convert_q(self, el, text, parent_tags):
@@ -685,13 +735,13 @@ def convert_figcaption(self, el, text, parent_tags):
     def convert_td(self, el, text, parent_tags):
         colspan = 1
         if 'colspan' in el.attrs and el['colspan'].isdigit():
-            colspan = int(el['colspan'])
+            colspan = max(1, min(1000, int(el['colspan'])))
         return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
 
     def convert_th(self, el, text, parent_tags):
         colspan = 1
         if 'colspan' in el.attrs and el['colspan'].isdigit():
-            colspan = int(el['colspan'])
+            colspan = max(1, min(1000, int(el['colspan'])))
         return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
 
     def convert_tr(self, el, text, parent_tags):
@@ -712,7 +762,7 @@ def convert_tr(self, el, text, parent_tags):
         full_colspan = 0
         for cell in cells:
             if 'colspan' in cell.attrs and cell['colspan'].isdigit():
-                full_colspan += int(cell["colspan"])
+                full_colspan += max(1, min(1000, int(cell['colspan'])))
             else:
                 full_colspan += 1
         if ((is_headrow
diff --git a/markdownify/main.py b/markdownify/main.py
@@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
     parser.add_argument('-w', '--wrap', action='store_true',
                         help="Wrap all text paragraphs at --wrap-width characters.")
     parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('-p', '--beautiful-soup-parser',
-                        dest='beautiful_soup_parser',
+    parser.add_argument('--bs4-options',
                         default='html.parser',
-                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
-                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
-                             "environment.")
+                        help="Specifies the parser that BeautifulSoup should use to parse "
+                             "the HTML markup. Examples include 'html.parser', 'lxml', and "
+                             "'html5lib'.")
 
     args = parser.parse_args(argv)
     print(markdownify(**vars(args)))
diff --git a/tests/test_args.py b/tests/test_args.py
@@ -2,7 +2,7 @@
 Test whitelisting/blacklisting of specific tags.
 
 """
-from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
 from .utils import md
 
 
@@ -32,3 +32,16 @@ def test_strip_document():
     assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
     assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
     assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
+
+
+def test_strip_pre():
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>") == "```\n  Hello\n```"
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=STRIP) == "```\n  Hello\n```"
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=STRIP_ONE) == "```\n  \n  Hello  \n  \n```"
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=None) == "```\n  \n  \n  Hello  \n  \n  \n```"
+
+
+def test_bs4_options():
+    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -101,6 +101,9 @@ def test_code():
     assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
     assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
     assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
+    assert md('foo<code>`bar`</code>baz') == 'foo`` `bar` ``baz'
+    assert md('foo<code>``bar``</code>baz') == 'foo``` ``bar`` ```baz'
+    assert md('foo<code> `bar` </code>baz') == 'foo `` `bar` `` baz'
 
 
 def test_dl():
@@ -370,4 +373,4 @@ def test_spaces():
     assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
     assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
     assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
-    assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo \n```\n\nbar'
+    assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'