Skip to content

Commit b54501e

Browse files
authored
Merge branch 'matthewwithanm:develop' into G-Rath-patch-1
2 parents a158b19 + 7edbc5a commit b54501e

File tree

7 files changed

+98
-23
lines changed

7 files changed

+98
-23
lines changed

.github/workflows/python-app.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ubuntu-latest
1616

1717
steps:
18-
- uses: actions/checkout@v2
18+
- uses: actions/checkout@v4
1919
- name: Set up Python 3.8
2020
uses: actions/setup-python@v2
2121
with:

.github/workflows/python-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
runs-on: ubuntu-latest
1414

1515
steps:
16-
- uses: actions/checkout@v2
16+
- uses: actions/checkout@v4
1717
- name: Set up Python
1818
uses: actions/setup-python@v2
1919
with:

README.rst

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,22 @@ strip_document
157157
within the document are unaffected.
158158
Defaults to ``STRIP``.
159159

160-
beautiful_soup_parser
161-
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
162-
as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
163-
environment. Defaults to ``html.parser``.
160+
strip_pre
161+
Controls whether leading/trailing blank lines are removed from ``<pre>``
162+
tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
163+
``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
164+
Defaults to ``STRIP``.
165+
166+
bs4_options
167+
Specify additional configuration options for the ``BeautifulSoup`` object
168+
used to interpret the HTML markup. String and list values (such as ``lxml``
169+
or ``html5lib``) are treated as ``features`` arguments to control parser
170+
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
171+
are treated as full kwargs to be used for the BeautifulSoup constructor,
172+
allowing specification of any parameter. For parameter details, see the
173+
`BeautifulSoup`_ documentation.
164174

165-
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
175+
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
166176

167177
Options may be specified as kwargs to the ``markdownify`` function, or as a
168178
nested ``Options`` class in ``MarkdownConverter`` subclasses.

markdownify/__init__.py

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
re_all_whitespace = re.compile(r'[\t \r\n]+')
1212
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
1313
re_html_heading = re.compile(r'h(\d+)')
14+
re_pre_lstrip1 = re.compile(r'^ *\n')
15+
re_pre_rstrip1 = re.compile(r'\n *$')
16+
re_pre_lstrip = re.compile(r'^[ \n]*\n')
17+
re_pre_rstrip = re.compile(r'[ \n]*$')
1418

1519
# Pattern for creating convert_<tag> function names from tag names
1620
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
@@ -37,6 +41,9 @@
3741
# confused with a list item
3842
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
3943

44+
# Find consecutive backtick sequences in a string
45+
re_backtick_runs = re.compile(r'`+')
46+
4047
# Heading styles
4148
ATX = 'atx'
4249
ATX_CLOSED = 'atx_closed'
@@ -51,10 +58,25 @@
5158
ASTERISK = '*'
5259
UNDERSCORE = '_'
5360

54-
# Document strip styles
61+
# Document/pre strip styles
5562
LSTRIP = 'lstrip'
5663
RSTRIP = 'rstrip'
5764
STRIP = 'strip'
65+
STRIP_ONE = 'strip_one'
66+
67+
68+
def strip1_pre(text):
69+
"""Strip one leading and trailing newline from a <pre> string."""
70+
text = re_pre_lstrip1.sub('', text)
71+
text = re_pre_rstrip1.sub('', text)
72+
return text
73+
74+
75+
def strip_pre(text):
76+
"""Strip all leading and trailing newlines from a <pre> string."""
77+
text = re_pre_lstrip.sub('', text)
78+
text = re_pre_rstrip.sub('', text)
79+
return text
5880

5981

6082
def chomp(text):
@@ -154,7 +176,7 @@ def _next_block_content_sibling(el):
154176
class MarkdownConverter(object):
155177
class DefaultOptions:
156178
autolinks = True
157-
beautiful_soup_parser = 'html.parser'
179+
bs4_options = 'html.parser'
158180
bullets = '*+-' # An iterable of bullet types.
159181
code_language = ''
160182
code_language_callback = None
@@ -168,6 +190,7 @@ class DefaultOptions:
168190
newline_style = SPACES
169191
strip = None
170192
strip_document = STRIP
193+
strip_pre = STRIP
171194
strong_em_symbol = ASTERISK
172195
sub_symbol = ''
173196
sup_symbol = ''
@@ -188,11 +211,15 @@ def __init__(self, **options):
188211
raise ValueError('You may specify either tags to strip or tags to'
189212
' convert, but not both.')
190213

214+
# If a string or list is passed to bs4_options, assume it is a 'features' specification
215+
if not isinstance(self.options['bs4_options'], dict):
216+
self.options['bs4_options'] = {'features': self.options['bs4_options']}
217+
191218
# Initialize the conversion function cache
192219
self.convert_fn_cache = {}
193220

194221
def convert(self, html):
195-
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
222+
soup = BeautifulSoup(html, **self.options['bs4_options'])
196223
return self.convert_soup(soup)
197224

198225
def convert_soup(self, soup):
@@ -456,10 +483,24 @@ def convert_br(self, el, text, parent_tags):
456483
return ' \n'
457484

458485
def convert_code(self, el, text, parent_tags):
459-
if 'pre' in parent_tags:
486+
if '_noformat' in parent_tags:
460487
return text
461-
converter = abstract_inline_conversion(lambda self: '`')
462-
return converter(self, el, text, parent_tags)
488+
489+
prefix, suffix, text = chomp(text)
490+
if not text:
491+
return ''
492+
493+
# Find the maximum number of consecutive backticks in the text, then
494+
# delimit the code span with one more backtick than that
495+
max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
496+
markup_delimiter = '`' * (max_backticks + 1)
497+
498+
# If the maximum number of backticks is greater than zero, add a space
499+
# to avoid interpretation of inside backticks as literals
500+
if max_backticks > 0:
501+
text = " " + text + " "
502+
503+
return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
463504

464505
convert_del = abstract_inline_conversion(lambda self: '~~')
465506

@@ -652,6 +693,15 @@ def convert_pre(self, el, text, parent_tags):
652693
if self.options['code_language_callback']:
653694
code_language = self.options['code_language_callback'](el) or code_language
654695

696+
if self.options['strip_pre'] == STRIP:
697+
text = strip_pre(text) # remove all leading/trailing newlines
698+
elif self.options['strip_pre'] == STRIP_ONE:
699+
text = strip1_pre(text) # remove one leading/trailing newline
700+
elif self.options['strip_pre'] is None:
701+
pass # leave leading and trailing newlines as-is
702+
else:
703+
raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
704+
655705
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
656706

657707
def convert_q(self, el, text, parent_tags):
@@ -685,13 +735,13 @@ def convert_figcaption(self, el, text, parent_tags):
685735
def convert_td(self, el, text, parent_tags):
686736
colspan = 1
687737
if 'colspan' in el.attrs and el['colspan'].isdigit():
688-
colspan = int(el['colspan'])
738+
colspan = max(1, min(1000, int(el['colspan'])))
689739
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
690740

691741
def convert_th(self, el, text, parent_tags):
692742
colspan = 1
693743
if 'colspan' in el.attrs and el['colspan'].isdigit():
694-
colspan = int(el['colspan'])
744+
colspan = max(1, min(1000, int(el['colspan'])))
695745
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
696746

697747
def convert_tr(self, el, text, parent_tags):
@@ -712,7 +762,7 @@ def convert_tr(self, el, text, parent_tags):
712762
full_colspan = 0
713763
for cell in cells:
714764
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
715-
full_colspan += int(cell["colspan"])
765+
full_colspan += max(1, min(1000, int(cell['colspan'])))
716766
else:
717767
full_colspan += 1
718768
if ((is_headrow

markdownify/main.py

100644100755
Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
7070
parser.add_argument('-w', '--wrap', action='store_true',
7171
help="Wrap all text paragraphs at --wrap-width characters.")
7272
parser.add_argument('--wrap-width', type=int, default=80)
73-
parser.add_argument('-p', '--beautiful-soup-parser',
74-
dest='beautiful_soup_parser',
73+
parser.add_argument('--bs4-options',
7574
default='html.parser',
76-
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77-
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78-
"environment.")
75+
help="Specifies the parser that BeautifulSoup should use to parse "
76+
"the HTML markup. Examples include 'html.parser', 'lxml', and "
77+
"'html5lib'.")
7978

8079
args = parser.parse_args(argv)
8180
print(markdownify(**vars(args)))

tests/test_args.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Test whitelisting/blacklisting of specific tags.
33
44
"""
5-
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
5+
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
66
from .utils import md
77

88

@@ -32,3 +32,16 @@ def test_strip_document():
3232
assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
3333
assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
3434
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
35+
36+
37+
def test_strip_pre():
38+
assert markdownify("<pre> \n \n Hello \n \n </pre>") == "```\n Hello\n```"
39+
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=STRIP) == "```\n Hello\n```"
40+
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=STRIP_ONE) == "```\n \n Hello \n \n```"
41+
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=None) == "```\n \n \n Hello \n \n \n```"
42+
43+
44+
def test_bs4_options():
45+
assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
46+
assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
47+
assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"

tests/test_conversions.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ def test_code():
101101
assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
102102
assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
103103
assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
104+
assert md('foo<code>`bar`</code>baz') == 'foo`` `bar` ``baz'
105+
assert md('foo<code>``bar``</code>baz') == 'foo``` ``bar`` ```baz'
106+
assert md('foo<code> `bar` </code>baz') == 'foo `` `bar` `` baz'
104107

105108

106109
def test_dl():
@@ -370,4 +373,4 @@ def test_spaces():
370373
assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
371374
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
372375
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
373-
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo \n```\n\nbar'
376+
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'

0 commit comments

Comments
 (0)