@@ -53,13 +53,13 @@ def abstract_inline_conversion(markup_fn):
5353 the text if it looks like an HTML tag. markup_fn is necessary to allow for
5454 references to self.strong_em_symbol etc.
5555 """
56- def implementation (self , el , text , convert_as_inline ):
56+ def implementation (self , el , text , parent_tags ):
5757 markup_prefix = markup_fn (self )
5858 if markup_prefix .startswith ('<' ) and markup_prefix .endswith ('>' ):
5959 markup_suffix = '</' + markup_prefix [1 :]
6060 else :
6161 markup_suffix = markup_prefix
62- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
62+ if '_noformat' in parent_tags :
6363 return text
6464 prefix , suffix , text = chomp (text )
6565 if not text :
@@ -166,18 +166,13 @@ def convert(self, html):
166166 return self .convert_soup (soup )
167167
def convert_soup(self, soup):
    """Convert an already-parsed soup object and return the resulting text.

    The soup is handed to ``process_tag`` as the root node, so it starts
    out with an empty set of parent tags.
    """
    root_parent_tags = set()
    return self.process_tag(soup, parent_tags=root_parent_tags)
170170
171- def process_tag (self , node , convert_as_inline ):
172- text = ''
171+ def process_tag (self , node , parent_tags = None ):
172+ if parent_tags is None :
173+ parent_tags = set ()
173174
174- # For Markdown headings and table cells, convert children as inline
175- # (so that block element children do not produce newlines).
176- convert_children_as_inline = (
177- convert_as_inline # propagated from parent
178- or html_heading_re .match (node .name ) is not None # headings
179- or node .name in ['td' , 'th' ] # table cells
180- )
175+ text = ''
181176
182177 # Collect child elements to process, ignoring whitespace-only text elements
183178 # adjacent to the inner/outer boundaries of block elements.
@@ -208,28 +203,48 @@ def _can_ignore(el):
208203
209204 children_to_convert = [child for child in node .children if not _can_ignore (child )]
210205
206+ node_name = node .name
207+
211208 # Convert the children first
212- for el in children_to_convert :
213- if isinstance (el , NavigableString ):
214- text += self .process_text (el )
215- else :
216- text_strip = text .rstrip ('\n ' )
217- newlines_left = len (text ) - len (text_strip )
218- next_text = self .process_tag (el , convert_children_as_inline )
219- next_text_strip = next_text .lstrip ('\n ' )
220- newlines_right = len (next_text ) - len (next_text_strip )
221- newlines = '\n ' * max (newlines_left , newlines_right )
222- text = text_strip + newlines + next_text_strip
209+ if children_to_convert :
210+ # for children tags, start with a copy of the parent tag set
211+ parent_tags_for_children = set (parent_tags )
212+
213+ # add this tag's name as a parent
214+ parent_tags_for_children .add (node_name )
215+
216+ # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
217+ if (
218+ html_heading_re .match (node_name ) is not None # headings
219+ or node_name in {'td' , 'th' } # table cells
220+ ):
221+ parent_tags_for_children .add ('_inline' )
222+
223+ # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
224+ if node_name in {'pre' , 'code' , 'kbd' , 'samp' }:
225+ parent_tags_for_children .add ('_noformat' )
226+
227+ for el in children_to_convert :
228+ if isinstance (el , NavigableString ):
229+ text += self .process_text (el , parent_tags = parent_tags_for_children )
230+ else :
231+ text_strip = text .rstrip ('\n ' )
232+ newlines_left = len (text ) - len (text_strip )
233+ next_text = self .process_tag (el , parent_tags = parent_tags_for_children )
234+ next_text_strip = next_text .lstrip ('\n ' )
235+ newlines_right = len (next_text ) - len (next_text_strip )
236+ newlines = '\n ' * max (newlines_left , newlines_right )
237+ text = text_strip + newlines + next_text_strip
223238
224239 # apply this tag's final conversion function
225- convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node . name )
240+ convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node_name )
226241 convert_fn = getattr (self , convert_fn_name , None )
227- if convert_fn and self .should_convert_tag (node . name ):
228- text = convert_fn (node , text , convert_as_inline )
242+ if convert_fn and self .should_convert_tag (node_name ):
243+ text = convert_fn (node , text , parent_tags = parent_tags )
229244
230245 return text
231246
232- def convert__document_ (self , el , text , convert_as_inline ):
247+ def convert__document_ (self , el , text , parent_tags ):
233248 """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
234249 if self .options ['strip_document' ] == LSTRIP :
235250 text = text .lstrip ('\n ' ) # remove leading separation newlines
@@ -244,19 +259,19 @@ def convert__document_(self, el, text, convert_as_inline):
244259
245260 return text
246261
247- def process_text (self , el ):
262+ def process_text (self , el , parent_tags ):
248263 text = six .text_type (el ) or ''
249264
250265 # normalize whitespace if we're not inside a preformatted element
251- if not el . find_parent ( 'pre' ) :
266+ if 'pre' not in parent_tags :
252267 if self .options ['wrap' ]:
253268 text = all_whitespace_re .sub (' ' , text )
254269 else :
255270 text = newline_whitespace_re .sub ('\n ' , text )
256271 text = whitespace_re .sub (' ' , text )
257272
258273 # escape special characters if we're not inside a preformatted or code element
259- if not el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
274+ if '_noformat' not in parent_tags :
260275 text = self .escape (text )
261276
262277 # remove leading whitespace at the start or just after a
@@ -279,8 +294,8 @@ def __getattr__(self, attr):
279294 if m :
280295 n = int (m .group (1 ))
281296
282- def convert_tag (el , text , convert_as_inline ):
283- return self ._convert_hn (n , el , text , convert_as_inline )
297+ def convert_tag (el , text , parent_tags ):
298+ return self ._convert_hn (n , el , text , parent_tags )
284299
285300 convert_tag .__name__ = 'convert_h%s' % n
286301 setattr (self , convert_tag .__name__ , convert_tag )
@@ -327,8 +342,8 @@ def underline(self, text, pad_char):
327342 text = (text or '' ).rstrip ()
328343 return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
329344
330- def convert_a (self , el , text , convert_as_inline ):
331- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
345+ def convert_a (self , el , text , parent_tags ):
346+ if '_noformat' in parent_tags :
332347 return text
333348 prefix , suffix , text = chomp (text )
334349 if not text :
@@ -349,10 +364,10 @@ def convert_a(self, el, text, convert_as_inline):
349364
350365 convert_b = abstract_inline_conversion (lambda self : 2 * self .options ['strong_em_symbol' ])
351366
352- def convert_blockquote (self , el , text , convert_as_inline ):
367+ def convert_blockquote (self , el , text , parent_tags ):
353368 # handle some early-exit scenarios
354369 text = (text or '' ).strip ()
355- if convert_as_inline :
370+ if '_inline' in parent_tags :
356371 return ' ' + text + ' '
357372 if not text :
358373 return "\n "
@@ -365,25 +380,25 @@ def _indent_for_blockquote(match):
365380
366381 return '\n ' + text + '\n \n '
367382
def convert_br(self, el, text, parent_tags):
    """Render a ``<br>`` element as a Markdown hard line break.

    Returns an empty string when the '_inline' pseudo-tag is present in
    ``parent_tags``. Otherwise the ``newline_style`` option (compared
    case-insensitively against ``BACKSLASH``) selects between a backslash
    followed by a newline and two trailing spaces followed by a newline.
    ``el`` and ``text`` are not used.
    """
    if '_inline' in parent_tags:
        return ""

    if self.options['newline_style'].lower() == BACKSLASH:
        return '\\\n'
    # Two trailing spaces before the newline are what make this a hard
    # break in Markdown; a single space would be an ordinary soft wrap.
    return '  \n'
376391
def convert_code(self, el, text, parent_tags):
    """Convert an inline code element.

    When 'pre' is among ``parent_tags`` the text is returned unchanged.
    Otherwise the work is delegated to a converter built by
    ``abstract_inline_conversion`` with a backtick as the markup.
    """
    inside_pre = 'pre' in parent_tags
    if inside_pre:
        return text
    backtick_converter = abstract_inline_conversion(lambda self: '`')
    return backtick_converter(self, el, text, parent_tags)
382397
383398 convert_del = abstract_inline_conversion (lambda self : '~~' )
384399
def convert_div(self, el, text, parent_tags):
    """Convert a ``<div>`` into a block separated by blank lines.

    With the '_inline' pseudo-tag in ``parent_tags`` the stripped text is
    returned padded with one space on each side instead. Outside inline
    context, text that is empty after stripping yields an empty string.
    ``el`` is not used.
    """
    if '_inline' in parent_tags:
        return ' ' + text.strip() + ' '
    text = text.strip()
    # Blank lines on both sides separate the block from its neighbours.
    return '\n\n%s\n\n' % text if text else ''
@@ -396,9 +411,9 @@ def convert_div(self, el, text, convert_as_inline):
396411
397412 convert_kbd = convert_code
398413
399- def convert_dd (self , el , text , convert_as_inline ):
414+ def convert_dd (self , el , text , parent_tags ):
400415 text = (text or '' ).strip ()
401- if convert_as_inline :
416+ if '_inline' in parent_tags :
402417 return ' ' + text + ' '
403418 if not text :
404419 return '\n '
@@ -414,11 +429,11 @@ def _indent_for_dd(match):
414429
415430 return '%s\n ' % text
416431
417- def convert_dt (self , el , text , convert_as_inline ):
432+ def convert_dt (self , el , text , parent_tags ):
418433 # remove newlines from term text
419434 text = (text or '' ).strip ()
420435 text = all_whitespace_re .sub (' ' , text )
421- if convert_as_inline :
436+ if '_inline' in parent_tags :
422437 return ' ' + text + ' '
423438 if not text :
424439 return '\n '
@@ -428,9 +443,9 @@ def convert_dt(self, el, text, convert_as_inline):
428443
429444 return '\n %s\n ' % text
430445
431- def _convert_hn (self , n , el , text , convert_as_inline ):
446+ def _convert_hn (self , n , el , text , parent_tags ):
432447 """ Method name prefixed with _ to prevent <hn> to call this """
433- if convert_as_inline :
448+ if '_inline' in parent_tags :
434449 return text
435450
436451 # prevent MemoryErrors in case of very large n
@@ -447,46 +462,41 @@ def _convert_hn(self, n, el, text, convert_as_inline):
447462 return '\n \n %s %s %s\n \n ' % (hashes , text , hashes )
448463 return '\n \n %s %s\n \n ' % (hashes , text )
449464
def convert_hr(self, el, text, parent_tags):
    """Return a ``---`` rule surrounded by blank lines.

    None of the arguments influence the result.
    """
    return '\n\n---\n\n'
452467
453468 convert_i = convert_em
454469
def convert_img(self, el, text, parent_tags):
    """Convert an ``<img>`` element to Markdown image syntax.

    Reads the ``alt``, ``src`` and ``title`` attributes from ``el.attrs``
    (missing or empty values become ''). In inline context ('_inline' in
    ``parent_tags``) only the alt text is returned, unless the name of the
    element's direct parent is listed in the ``keep_inline_images_in``
    option. ``text`` is not used.
    """
    alt = el.attrs.get('alt', None) or ''
    src = el.attrs.get('src', None) or ''
    title = el.attrs.get('title', None) or ''
    # Double quotes inside the title are backslash-escaped so they do not
    # terminate the quoted title part.
    title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
    if ('_inline' in parent_tags
            and el.parent.name not in self.options['keep_inline_images_in']):
        return alt

    # The template needs three placeholders; formatting an empty string
    # with three arguments raises TypeError.
    return '![%s](%s%s)' % (alt, src, title_part)
465480
def convert_list(self, el, text, parent_tags):
    """Wrap the converted items of a list element with separating newlines.

    Converting a list to inline is undefined, so an '_inline' pseudo-tag in
    ``parent_tags`` is deliberately ignored here.

    A list with 'li' among its parent tags is nested: it gets a single
    leading newline and its trailing whitespace is removed. A top-level
    list gets two leading newlines, plus one trailing newline when the next
    block-content sibling exists and is not itself a 'ul' or 'ol'.
    """
    if 'li' in parent_tags:
        # remove trailing newline if we're in a nested list
        # NOTE(review): the sibling lookup below is skipped on this path
        # because its result is unused here; this assumes
        # _next_block_content_sibling has no side effects - confirm.
        return '\n' + text.rstrip()

    next_sibling = _next_block_content_sibling(el)
    before_paragraph = bool(
        next_sibling and next_sibling.name not in ('ul', 'ol')
    )
    return '\n\n' + text + ('\n' if before_paragraph else '')
485495
486496 convert_ul = convert_list
487497 convert_ol = convert_list
488498
489- def convert_li (self , el , text , convert_as_inline ):
499+ def convert_li (self , el , text , parent_tags ):
490500 # handle some early-exit scenarios
491501 text = (text or '' ).strip ()
492502 if not text :
@@ -523,8 +533,8 @@ def _indent_for_li(match):
523533
524534 return '%s\n ' % text
525535
526- def convert_p (self , el , text , convert_as_inline ):
527- if convert_as_inline :
536+ def convert_p (self , el , text , parent_tags ):
537+ if '_inline' in parent_tags :
528538 return ' ' + text .strip () + ' '
529539 text = text .strip ()
530540 if self .options ['wrap' ]:
@@ -546,7 +556,7 @@ def convert_p(self, el, text, convert_as_inline):
546556 text = '\n ' .join (new_lines )
547557 return '\n \n %s\n \n ' % text if text else ''
548558
549- def convert_pre (self , el , text , convert_as_inline ):
559+ def convert_pre (self , el , text , parent_tags ):
550560 if not text :
551561 return ''
552562 code_language = self .options ['code_language' ]
@@ -556,10 +566,10 @@ def convert_pre(self, el, text, convert_as_inline):
556566
557567 return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
558568
def convert_script(self, el, text, parent_tags):
    """Drop the element: always returns an empty string, whatever the arguments."""
    return ''
561571
def convert_style(self, el, text, parent_tags):
    """Drop the element: always returns an empty string, whatever the arguments."""
    return ''
564574
565575 convert_s = convert_del
@@ -572,28 +582,28 @@ def convert_style(self, el, text, convert_as_inline):
572582
573583 convert_sup = abstract_inline_conversion (lambda self : self .options ['sup_symbol' ])
574584
def convert_table(self, el, text, parent_tags):
    """Return the stripped table text with a blank line before and after.

    ``el`` and ``parent_tags`` are not used.
    """
    return '\n\n' + text.strip() + '\n\n'
577587
def convert_caption(self, el, text, parent_tags):
    """Return the stripped caption text followed by a blank line.

    ``el`` and ``parent_tags`` are not used.
    """
    return text.strip() + '\n\n'
580590
def convert_figcaption(self, el, text, parent_tags):
    """Return the stripped figure caption with a blank line before and after.

    ``el`` and ``parent_tags`` are not used.
    """
    return '\n\n' + text.strip() + '\n\n'
583593
def convert_td(self, el, text, parent_tags):
    """Render a table data cell as ``' <text> |'``.

    The text is stripped and its newlines are replaced by spaces so the
    cell stays on one line. The trailing ``' |'`` is repeated once per
    spanned column: the ``colspan`` attribute is honoured when it consists
    only of decimal digits, otherwise a span of 1 is used.
    ``parent_tags`` is not used.
    """
    colspan = 1
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    if 'colspan' in el.attrs and el.attrs['colspan'].isdecimal():
        colspan = int(el.attrs['colspan'])
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
589599
def convert_th(self, el, text, parent_tags):
    """Render a table header cell as ``' <text> |'``.

    The text is stripped and its newlines are replaced by spaces so the
    cell stays on one line. The trailing ``' |'`` is repeated once per
    spanned column: the ``colspan`` attribute is honoured when it consists
    only of decimal digits, otherwise a span of 1 is used.
    ``parent_tags`` is not used.
    """
    colspan = 1
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    if 'colspan' in el.attrs and el.attrs['colspan'].isdecimal():
        colspan = int(el.attrs['colspan'])
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
595605
596- def convert_tr (self , el , text , convert_as_inline ):
606+ def convert_tr (self , el , text , parent_tags ):
597607 cells = el .find_all (['td' , 'th' ])
598608 is_first_row = el .find_previous_sibling () is None
599609 is_headrow = (
0 commit comments