# Runs of tabs, spaces, carriage returns and line feeds
re_all_whitespace = re.compile(r'[\t \r\n]+')
# A whitespace run that contains at least one CR or LF
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
# 'h' followed by digits; the digits are captured in group 1
re_html_heading = re.compile(r'h(\d+)')

# Patterns used to trim <pre> text.
# The "1" variants remove a single blank line; the others remove every
# leading blank line / all trailing spaces and newlines.
re_pre_lstrip1 = re.compile(r'^ *\n')
# NOTE: anchored with \Z rather than $.  '$' also matches just before a
# final newline, so with .sub() the pattern r'\n *$' matched twice on
# text ending in '\n\n' and removed two newlines instead of one.
re_pre_rstrip1 = re.compile(r'\n *\Z')
re_pre_lstrip = re.compile(r'^[ \n]*\n')
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
3741# confused with a list item
# Group 1: 1-9 digits at the start of the string or after whitespace.
# Group 2: a '.' or ')' followed by whitespace or the end of the string.
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Find consecutive backtick sequences in a string
re_backtick_runs = re.compile(r'`+')
46+
# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
ASTERISK = '*'
UNDERSCORE = '_'

# Document/pre strip styles
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'
66+
67+
def strip1_pre(text):
    """Trim a <pre> string with re_pre_lstrip1, then re_pre_rstrip1.

    The first pattern removes a leading newline (with any spaces before
    it); the second removes a trailing newline (with any spaces after it).
    """
    without_leading = re_pre_lstrip1.sub('', text)
    return re_pre_rstrip1.sub('', without_leading)
73+
74+
def strip_pre(text):
    """Trim a <pre> string with re_pre_lstrip, then re_pre_rstrip.

    The first pattern removes every leading blank line; the second removes
    all trailing spaces and newlines.
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)
5880
5981
6082def chomp (text ):
@@ -154,7 +176,7 @@ def _next_block_content_sibling(el):
154176class MarkdownConverter (object ):
155177 class DefaultOptions :
156178 autolinks = True
157- beautiful_soup_parser = 'html.parser'
179+ bs4_options = 'html.parser'
158180 bullets = '*+-' # An iterable of bullet types.
159181 code_language = ''
160182 code_language_callback = None
@@ -168,6 +190,7 @@ class DefaultOptions:
168190 newline_style = SPACES
169191 strip = None
170192 strip_document = STRIP
193+ strip_pre = STRIP
171194 strong_em_symbol = ASTERISK
172195 sub_symbol = ''
173196 sup_symbol = ''
@@ -188,11 +211,15 @@ def __init__(self, **options):
188211 raise ValueError ('You may specify either tags to strip or tags to'
189212 ' convert, but not both.' )
190213
214+ # If a string or list is passed to bs4_options, assume it is a 'features' specification
215+ if not isinstance (self .options ['bs4_options' ], dict ):
216+ self .options ['bs4_options' ] = {'features' : self .options ['bs4_options' ]}
217+
191218 # Initialize the conversion function cache
192219 self .convert_fn_cache = {}
193220
def convert(self, html):
    """Parse *html* with BeautifulSoup and return convert_soup() of the result.

    The 'bs4_options' option is unpacked as keyword arguments for the
    BeautifulSoup constructor, so it must be a mapping by the time this
    method runs.
    """
    # The key must be exactly 'bs4_options' (no trailing space), the same
    # name that DefaultOptions declares; any other spelling is a KeyError.
    soup = BeautifulSoup(html, **self.options['bs4_options'])
    return self.convert_soup(soup)
197224
198225 def convert_soup (self , soup ):
@@ -456,10 +483,24 @@ def convert_br(self, el, text, parent_tags):
456483 return ' \n '
457484
def convert_code(self, el, text, parent_tags):
    """Render the text of a <code> element as a backtick-delimited code span.

    When '_noformat' is among the parent tags the text is returned as-is.
    Otherwise chomp() splits the text into a prefix, a suffix and the
    remaining text; an empty remainder yields ''.
    """
    if '_noformat' in parent_tags:
        return text

    prefix, suffix, text = chomp(text)
    if not text:
        return ''

    # The delimiter is one backtick longer than the longest backtick run
    # found inside the text
    longest_run = max(map(len, re_backtick_runs.findall(text)), default=0)
    fence = '`' * (longest_run + 1)

    # When the text itself contains backticks, pad it with a space on each
    # side so the inner backticks are not read as part of the delimiter
    body = " " + text + " " if longest_run > 0 else text

    return '{}{}{}{}{}'.format(prefix, fence, body, fence, suffix)
463504
464505 convert_del = abstract_inline_conversion (lambda self : '~~' )
465506
@@ -652,6 +693,15 @@ def convert_pre(self, el, text, parent_tags):
652693 if self .options ['code_language_callback' ]:
653694 code_language = self .options ['code_language_callback' ](el ) or code_language
654695
696+ if self .options ['strip_pre' ] == STRIP :
697+ text = strip_pre (text ) # remove all leading/trailing newlines
698+ elif self .options ['strip_pre' ] == STRIP_ONE :
699+ text = strip1_pre (text ) # remove one leading/trailing newline
700+ elif self .options ['strip_pre' ] is None :
701+ pass # leave leading and trailing newlines as-is
702+ else :
703+ raise ValueError ('Invalid value for strip_pre: %s' % self .options ['strip_pre' ])
704+
655705 return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
656706
657707 def convert_q (self , el , text , parent_tags ):
@@ -685,13 +735,13 @@ def convert_figcaption(self, el, text, parent_tags):
def convert_td(self, el, text, parent_tags):
    """Render a table data cell.

    Returns a space, the stripped cell text with newlines replaced by
    spaces, and then ' |' repeated once per spanned column.  The span comes
    from the element's 'colspan' attribute, clamped to 1..1000; a missing
    or unusable value counts as 1.
    """
    colspan = 1
    if 'colspan' in el.attrs and el['colspan'].isdigit():
        try:
            colspan = max(1, min(1000, int(el['colspan'])))
        except ValueError:
            # str.isdigit() accepts characters (e.g. superscript digits)
            # that int() cannot parse; treat those as an ordinary cell
            colspan = 1
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
690740
def convert_th(self, el, text, parent_tags):
    """Render a table header cell.

    Returns a space, the stripped cell text with newlines replaced by
    spaces, and then ' |' repeated once per spanned column.  The span comes
    from the element's 'colspan' attribute, clamped to 1..1000; a missing
    or unusable value counts as 1.
    """
    colspan = 1
    if 'colspan' in el.attrs and el['colspan'].isdigit():
        try:
            colspan = max(1, min(1000, int(el['colspan'])))
        except ValueError:
            # str.isdigit() accepts characters (e.g. superscript digits)
            # that int() cannot parse; treat those as an ordinary cell
            colspan = 1
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
696746
697747 def convert_tr (self , el , text , parent_tags ):
@@ -712,7 +762,7 @@ def convert_tr(self, el, text, parent_tags):
712762 full_colspan = 0
713763 for cell in cells :
714764 if 'colspan' in cell .attrs and cell ['colspan' ].isdigit ():
715- full_colspan += int (cell [" colspan" ] )
765+ full_colspan += max ( 1 , min ( 1000 , int (cell [' colspan' ])) )
716766 else :
717767 full_colspan += 1
718768 if ((is_headrow
0 commit comments