@@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
5757 the text if it looks like an HTML tag. markup_fn is necessary to allow for
5858 references to self.strong_em_symbol etc.
5959 """
60- def implementation (self , el , text , convert_as_inline ):
60+ def implementation (self , el , text , parent_tags ):
6161 markup_prefix = markup_fn (self )
6262 if markup_prefix .startswith ('<' ) and markup_prefix .endswith ('>' ):
6363 markup_suffix = '</' + markup_prefix [1 :]
6464 else :
6565 markup_suffix = markup_prefix
66- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
66+ if '_noformat' in parent_tags :
6767 return text
6868 prefix , suffix , text = chomp (text )
6969 if not text :
@@ -170,24 +170,18 @@ def convert(self, html):
170170 return self .convert_soup (soup )
171171
def convert_soup(self, soup):
    """Convert an already-parsed soup object and return the result.

    Conversion starts at ``soup`` itself with an empty parent-tag context;
    the return value is whatever ``self.process_tag`` produces for it.
    """
    # A fresh set is passed so the root never shares context with a
    # previous conversion.
    return self.process_tag(soup, parent_tags=set())
174174
def process_element(self, node, parent_tags=None):
    """Dispatch a single child node to the matching processor.

    ``NavigableString`` nodes go to ``self.process_text``; every other node
    goes to ``self.process_tag``. ``parent_tags`` is forwarded unchanged as
    a keyword argument in both cases.
    """
    if isinstance(node, NavigableString):
        return self.process_text(node, parent_tags=parent_tags)
    return self.process_tag(node, parent_tags=parent_tags)
180180
181- def process_tag (self , node , convert_as_inline ):
182- text = ''
183-
184- # For Markdown headings and table cells, convert children as inline
185- # (so that block element children do not produce newlines).
186- convert_children_as_inline = (
187- convert_as_inline # propagated from parent
188- or html_heading_re .match (node .name ) is not None # headings
189- or node .name in ['td' , 'th' ] # table cells
190- )
181+ def process_tag (self , node , parent_tags = None ):
182+ # For the top-level element, initialize the parent context with an empty set.
183+ if parent_tags is None :
184+ parent_tags = set ()
191185
192186 # Collect child elements to process, ignoring whitespace-only text elements
193187 # adjacent to the inner/outer boundaries of block elements.
@@ -220,8 +214,27 @@ def _can_ignore(el):
220214
221215 children_to_convert = [el for el in node .children if not _can_ignore (el )]
222216
217+ # Create a copy of this tag's parent context, then update it to include this tag
218+ # to propagate down into the children.
219+ parent_tags_for_children = set (parent_tags )
220+ parent_tags_for_children .add (node .name )
221+
222+ # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
223+ if (
224+ html_heading_re .match (node .name ) is not None # headings
225+ or node .name in {'td' , 'th' } # table cells
226+ ):
227+ parent_tags_for_children .add ('_inline' )
228+
229+ # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
230+ if node .name in {'pre' , 'code' , 'kbd' , 'samp' }:
231+ parent_tags_for_children .add ('_noformat' )
232+
223233 # Convert the children elements into a list of result strings.
224- child_strings = [self .process_element (el , convert_children_as_inline ) for el in children_to_convert ]
234+ child_strings = [
235+ self .process_element (el , parent_tags = parent_tags_for_children )
236+ for el in children_to_convert
237+ ]
225238
226239 # Remove empty string values.
227240 child_strings = [s for s in child_strings if s ]
@@ -256,11 +269,11 @@ def _can_ignore(el):
256269 convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node .name )
257270 convert_fn = getattr (self , convert_fn_name , None )
258271 if convert_fn and self .should_convert_tag (node .name ):
259- text = convert_fn (node , text , convert_as_inline )
272+ text = convert_fn (node , text , parent_tags = parent_tags )
260273
261274 return text
262275
263- def convert__document_ (self , el , text , convert_as_inline ):
276+ def convert__document_ (self , el , text , parent_tags ):
264277 """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
265278 if self .options ['strip_document' ] == LSTRIP :
266279 text = text .lstrip ('\n ' ) # remove leading separation newlines
@@ -275,19 +288,23 @@ def convert__document_(self, el, text, convert_as_inline):
275288
276289 return text
277290
278- def process_text (self , el ):
291+ def process_text (self , el , parent_tags = None ):
292+ # For the top-level element, initialize the parent context with an empty set.
293+ if parent_tags is None :
294+ parent_tags = set ()
295+
279296 text = six .text_type (el ) or ''
280297
281298 # normalize whitespace if we're not inside a preformatted element
282- if not el . find_parent ( 'pre' ) :
299+ if 'pre' not in parent_tags :
283300 if self .options ['wrap' ]:
284301 text = all_whitespace_re .sub (' ' , text )
285302 else :
286303 text = newline_whitespace_re .sub ('\n ' , text )
287304 text = whitespace_re .sub (' ' , text )
288305
289306 # escape special characters if we're not inside a preformatted or code element
290- if not el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
307+ if '_noformat' not in parent_tags :
291308 text = self .escape (text )
292309
293310 # remove leading whitespace at the start or just after a
@@ -310,8 +327,8 @@ def __getattr__(self, attr):
310327 if m :
311328 n = int (m .group (1 ))
312329
313- def convert_tag (el , text , convert_as_inline ):
314- return self ._convert_hn (n , el , text , convert_as_inline )
330+ def convert_tag (el , text , parent_tags ):
331+ return self ._convert_hn (n , el , text , parent_tags )
315332
316333 convert_tag .__name__ = 'convert_h%s' % n
317334 setattr (self , convert_tag .__name__ , convert_tag )
@@ -358,8 +375,8 @@ def underline(self, text, pad_char):
358375 text = (text or '' ).rstrip ()
359376 return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
360377
361- def convert_a (self , el , text , convert_as_inline ):
362- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
378+ def convert_a (self , el , text , parent_tags ):
379+ if '_noformat' in parent_tags :
363380 return text
364381 prefix , suffix , text = chomp (text )
365382 if not text :
@@ -380,10 +397,10 @@ def convert_a(self, el, text, convert_as_inline):
380397
381398 convert_b = abstract_inline_conversion (lambda self : 2 * self .options ['strong_em_symbol' ])
382399
383- def convert_blockquote (self , el , text , convert_as_inline ):
400+ def convert_blockquote (self , el , text , parent_tags ):
384401 # handle some early-exit scenarios
385402 text = (text or '' ).strip ()
386- if convert_as_inline :
403+ if '_inline' in parent_tags :
387404 return ' ' + text + ' '
388405 if not text :
389406 return "\n "
@@ -396,25 +413,25 @@ def _indent_for_blockquote(match):
396413
397414 return '\n ' + text + '\n \n '
398415
def convert_br(self, el, text, parent_tags):
    """Return the Markdown for a ``<br>`` element.

    Inside an inline context (``'_inline'`` in ``parent_tags``) the break
    is dropped and an empty string is returned. Otherwise the result is a
    backslash followed by a newline when ``options['newline_style']``
    (compared case-insensitively) equals ``BACKSLASH``, and two spaces
    followed by a newline for any other style.
    """
    if '_inline' in parent_tags:
        return ""

    if self.options['newline_style'].lower() == BACKSLASH:
        return '\\\n'
    return '  \n'
407424
def convert_code(self, el, text, parent_tags):
    """Return the Markdown for a ``<code>`` element.

    When a ``pre`` tag is among the ancestors (``'pre'`` in
    ``parent_tags``) the text is returned untouched. Otherwise the text is
    handed to an inline converter built by ``abstract_inline_conversion``
    with a backtick as its markup.
    """
    if 'pre' in parent_tags:
        return text
    converter = abstract_inline_conversion(lambda self: '`')
    return converter(self, el, text, parent_tags)
413430
414431 convert_del = abstract_inline_conversion (lambda self : '~~' )
415432
def convert_div(self, el, text, parent_tags):
    """Return the Markdown for a ``<div>`` element.

    The text is stripped of surrounding whitespace first. In an inline
    context (``'_inline'`` in ``parent_tags``) it is returned padded with a
    single space on each side; otherwise it is wrapped in blank lines, or
    an empty string is returned when nothing is left after stripping.
    """
    # Both branches need the stripped text, so strip once up front.
    text = text.strip()
    if '_inline' in parent_tags:
        return ' ' + text + ' '
    return '\n\n%s\n\n' % text if text else ''
@@ -427,9 +444,9 @@ def convert_div(self, el, text, convert_as_inline):
427444
428445 convert_kbd = convert_code
429446
430- def convert_dd (self , el , text , convert_as_inline ):
447+ def convert_dd (self , el , text , parent_tags ):
431448 text = (text or '' ).strip ()
432- if convert_as_inline :
449+ if '_inline' in parent_tags :
433450 return ' ' + text + ' '
434451 if not text :
435452 return '\n '
@@ -445,11 +462,11 @@ def _indent_for_dd(match):
445462
446463 return '%s\n ' % text
447464
448- def convert_dt (self , el , text , convert_as_inline ):
465+ def convert_dt (self , el , text , parent_tags ):
449466 # remove newlines from term text
450467 text = (text or '' ).strip ()
451468 text = all_whitespace_re .sub (' ' , text )
452- if convert_as_inline :
469+ if '_inline' in parent_tags :
453470 return ' ' + text + ' '
454471 if not text :
455472 return '\n '
@@ -459,9 +476,9 @@ def convert_dt(self, el, text, convert_as_inline):
459476
460477 return '\n %s\n ' % text
461478
462- def _convert_hn (self , n , el , text , convert_as_inline ):
479+ def _convert_hn (self , n , el , text , parent_tags ):
463480 """ Method name prefixed with _ to prevent <hn> to call this """
464- if convert_as_inline :
481+ if '_inline' in parent_tags :
465482 return text
466483
467484 # prevent MemoryErrors in case of very large n
@@ -478,46 +495,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
478495 return '\n \n %s %s %s\n \n ' % (hashes , text , hashes )
479496 return '\n \n %s %s\n \n ' % (hashes , text )
480497
def convert_hr(self, el, text, parent_tags):
    """Return a Markdown horizontal rule surrounded by blank lines.

    All arguments are ignored; the output is constant.
    """
    return '\n\n---\n\n'
483500
484501 convert_i = convert_em
485502
def convert_img(self, el, text, parent_tags):
    """Return the Markdown for an ``<img>`` element.

    The ``alt``, ``src`` and ``title`` attributes are read from
    ``el.attrs``; a missing or empty attribute is treated as ``''``.

    In an inline context (``'_inline'`` in ``parent_tags``) only the alt
    text is returned, unless the name of the image's direct parent is
    listed in ``options['keep_inline_images_in']``. In every other case
    the result is ``![alt](src)`` or ``![alt](src "title")``.
    """
    alt = el.attrs.get('alt', None) or ''
    src = el.attrs.get('src', None) or ''
    title = el.attrs.get('title', None) or ''
    # Double quotes delimit the title, so quotes inside it are escaped.
    title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
    if ('_inline' in parent_tags
            and el.parent.name not in self.options['keep_inline_images_in']):
        return alt

    # The template must contain three placeholders: an empty template with
    # a three-item tuple raises TypeError.
    return '![%s](%s%s)' % (alt, src, title_part)
496513
def convert_list(self, el, text, parent_tags):
    """Return the Markdown for a ``<ul>`` or ``<ol>`` element.

    Converting a list to inline is undefined, so an ``'_inline'`` entry in
    ``parent_tags`` is ignored here.

    A list with an ``li`` ancestor (``'li'`` in ``parent_tags``) is nested:
    it is returned with one leading newline and trailing whitespace removed.
    A top-level list gets two leading newlines, plus one trailing newline
    when ``_next_block_content_sibling(el)`` finds a following sibling that
    is not itself a ``ul`` or ``ol``.
    """
    if 'li' in parent_tags:
        # The nested result never uses the sibling lookup, so return early
        # and skip it.
        return '\n' + text.rstrip()

    next_sibling = _next_block_content_sibling(el)
    before_paragraph = bool(
        next_sibling and next_sibling.name not in ['ul', 'ol']
    )
    return '\n\n' + text + ('\n' if before_paragraph else '')
516527
517528 convert_ul = convert_list
518529 convert_ol = convert_list
519530
520- def convert_li (self , el , text , convert_as_inline ):
531+ def convert_li (self , el , text , parent_tags ):
521532 # handle some early-exit scenarios
522533 text = (text or '' ).strip ()
523534 if not text :
@@ -554,8 +565,8 @@ def _indent_for_li(match):
554565
555566 return '%s\n ' % text
556567
557- def convert_p (self , el , text , convert_as_inline ):
558- if convert_as_inline :
568+ def convert_p (self , el , text , parent_tags ):
569+ if '_inline' in parent_tags :
559570 return ' ' + text .strip () + ' '
560571 text = text .strip ()
561572 if self .options ['wrap' ]:
@@ -577,7 +588,7 @@ def convert_p(self, el, text, convert_as_inline):
577588 text = '\n ' .join (new_lines )
578589 return '\n \n %s\n \n ' % text if text else ''
579590
580- def convert_pre (self , el , text , convert_as_inline ):
591+ def convert_pre (self , el , text , parent_tags ):
581592 if not text :
582593 return ''
583594 code_language = self .options ['code_language' ]
@@ -587,10 +598,10 @@ def convert_pre(self, el, text, convert_as_inline):
587598
588599 return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
589600
def convert_script(self, el, text, parent_tags):
    """Drop ``<script>`` content: always return an empty string."""
    return ''
592603
def convert_style(self, el, text, parent_tags):
    """Drop ``<style>`` content: always return an empty string."""
    return ''
595606
596607 convert_s = convert_del
@@ -603,28 +614,28 @@ def convert_style(self, el, text, convert_as_inline):
603614
604615 convert_sup = abstract_inline_conversion (lambda self : self .options ['sup_symbol' ])
605616
def convert_table(self, el, text, parent_tags):
    """Return the converted table text, stripped and wrapped in blank lines.

    ``el`` and ``parent_tags`` are not consulted.
    """
    return '\n\n' + text.strip() + '\n\n'
608619
def convert_caption(self, el, text, parent_tags):
    """Return the stripped caption text followed by a blank line.

    ``el`` and ``parent_tags`` are not consulted.
    """
    return text.strip() + '\n\n'
611622
def convert_figcaption(self, el, text, parent_tags):
    """Return the stripped figure-caption text wrapped in blank lines.

    ``el`` and ``parent_tags`` are not consulted.
    """
    return '\n\n' + text.strip() + '\n\n'
614625
def convert_td(self, el, text, parent_tags):
    """Return one table-row segment for a ``<td>`` cell.

    The cell text is stripped, its newlines are replaced by spaces, and it
    is followed by one `` |`` separator per spanned column.

    The ``colspan`` attribute is honoured only when it is a plain digit
    string that ``int()`` can parse; anything else counts as 1. The value
    is clamped to the range 1..1000, so ``colspan="0"`` still closes the
    cell and an absurdly large value cannot build a huge string.
    """
    raw_colspan = el.attrs.get('colspan') or ''
    colspan = 1
    if raw_colspan.isdigit():
        try:
            colspan = int(raw_colspan)
        except ValueError:
            # isdigit() accepts characters such as superscript digits that
            # int() refuses; treat those like a missing attribute.
            colspan = 1
    colspan = min(max(colspan, 1), 1000)
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
620631
def convert_th(self, el, text, parent_tags):
    """Return one table-row segment for a ``<th>`` cell.

    The cell text is stripped, its newlines are replaced by spaces, and it
    is followed by one `` |`` separator per spanned column.

    The ``colspan`` attribute is honoured only when it is a plain digit
    string that ``int()`` can parse; anything else counts as 1. The value
    is clamped to the range 1..1000, so ``colspan="0"`` still closes the
    cell and an absurdly large value cannot build a huge string.
    """
    raw_colspan = el.attrs.get('colspan') or ''
    colspan = 1
    if raw_colspan.isdigit():
        try:
            colspan = int(raw_colspan)
        except ValueError:
            # isdigit() accepts characters such as superscript digits that
            # int() refuses; treat those like a missing attribute.
            colspan = 1
    colspan = min(max(colspan, 1), 1000)
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
626637
627- def convert_tr (self , el , text , convert_as_inline ):
638+ def convert_tr (self , el , text , parent_tags ):
628639 cells = el .find_all (['td' , 'th' ])
629640 is_first_row = el .find_previous_sibling () is None
630641 is_headrow = (
0 commit comments