1- from bs4 import BeautifulSoup , NavigableString , Comment , Doctype
1+ from bs4 import BeautifulSoup , Comment , Doctype , NavigableString , Tag
22from textwrap import fill
33import re
44import six
@@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el):
7979 if html_heading_re .match (el .name ) is not None :
8080 return True
8181 return el .name in ('p' , 'blockquote' ,
82+ 'article' , 'div' , 'section' ,
8283 'ol' , 'ul' , 'li' ,
8384 'table' , 'thead' , 'tbody' , 'tfoot' ,
8485 'tr' , 'td' , 'th' )
@@ -89,6 +90,41 @@ def should_remove_whitespace_outside(el):
8990 return should_remove_whitespace_inside (el ) or (el and el .name == 'pre' )
9091
9192
def _is_block_content_element(el):
    """
    Decide whether ``el`` counts as content in a block context.

    Returns True for tags and for text nodes containing non-whitespace
    characters. Returns False for whitespace-only text, comments, doctypes,
    and anything else (including None).
    """
    # Comment and Doctype are subclasses of NavigableString, so they have to
    # be ruled out before the generic text-node check below.
    if isinstance(el, (Comment, Doctype)):
        return False
    if isinstance(el, NavigableString):
        # A text node is content only if something remains after stripping.
        return bool(el.strip())
    # Tags are always content; every remaining type is not.
    return isinstance(el, Tag)
108+
109+
def _prev_block_content_sibling(el):
    """
    Walk backwards through the siblings of ``el`` and return the nearest one
    that is a block content element.

    Returns None when ``el`` is None or when no preceding sibling qualifies.
    """
    candidate = None if el is None else el.previous_sibling
    while candidate is not None:
        if _is_block_content_element(candidate):
            return candidate
        candidate = candidate.previous_sibling
    return None
117+
118+
def _next_block_content_sibling(el):
    """
    Walk forwards through the siblings of ``el`` and return the nearest one
    that is a block content element.

    Returns None when ``el`` is None or when no following sibling qualifies.
    """
    candidate = None if el is None else el.next_sibling
    while candidate is not None:
        if _is_block_content_element(candidate):
            return candidate
        candidate = candidate.next_sibling
    return None
126+
127+
92128class MarkdownConverter (object ):
93129 class DefaultOptions :
94130 autolinks = True
@@ -143,29 +179,38 @@ def process_tag(self, node, convert_as_inline):
143179 or node .name in ['td' , 'th' ] # table cells
144180 )
145181
146- # Remove whitespace-only textnodes just before, after or
147- # inside block-level elements.
182+ # Collect child elements to process, ignoring whitespace-only text elements
183+ # adjacent to the inner/outer boundaries of block elements.
148184 should_remove_inside = should_remove_whitespace_inside (node )
149- for el in node .children :
150- # Only extract (remove) whitespace-only text node if any of the
151- # conditions is true:
152- # - el is the first element in its parent (block-level)
153- # - el is the last element in its parent (block-level)
154- # - el is adjacent to a block-level node
155- can_extract = (should_remove_inside and (not el .previous_sibling
156- or not el .next_sibling )
157- or should_remove_whitespace_outside (el .previous_sibling )
158- or should_remove_whitespace_outside (el .next_sibling ))
159- if (isinstance (el , NavigableString )
160- and six .text_type (el ).strip () == ''
161- and can_extract ):
162- el .extract ()
163185
164- # Convert the children first
165- for el in node .children :
166- if isinstance (el , Comment ) or isinstance (el , Doctype ):
167- continue
def _can_ignore(el):
    """
    Return True if child ``el`` should be left out of the conversion.

    Reads ``should_remove_inside`` from the enclosing scope. Raises
    ValueError for a child that is neither a Tag nor a NavigableString.
    """
    # Tags are always processed.
    if isinstance(el, Tag):
        return False

    # Comment and Doctype elements are always ignored. They are subclasses
    # of NavigableString, so this test must precede the text-node handling.
    if isinstance(el, (Comment, Doctype)):
        return True

    if not isinstance(el, NavigableString):
        raise ValueError('Unexpected element type: %s' % type(el))

    # Non-whitespace text nodes are always processed.
    if six.text_type(el).strip() != '':
        return False

    # Inside block elements (excluding <pre>), whitespace-only text with no
    # sibling on one side is ignored.
    if should_remove_inside and (not el.previous_sibling or not el.next_sibling):
        return True

    # Outside block elements (including <pre>), whitespace-only text next to
    # such an element is ignored; all other whitespace is kept.
    return bool(
        should_remove_whitespace_outside(el.previous_sibling)
        or should_remove_whitespace_outside(el.next_sibling)
    )
208+
209+ children_to_convert = [child for child in node .children if not _can_ignore (child )]
210+
211+ # Convert the children first
212+ for el in children_to_convert :
213+ if isinstance (el , NavigableString ):
169214 text += self .process_text (el )
170215 else :
171216 text_strip = text .rstrip ('\n ' )
@@ -337,6 +382,16 @@ def convert_code(self, el, text, convert_as_inline):
337382
338383 convert_del = abstract_inline_conversion (lambda self : '~~' )
339384
def convert_div(self, el, text, convert_as_inline):
    """
    Convert a generic block container whose children are already converted.

    Parameters:
        el: the element being converted (not used by this converter).
        text: the converted text of the element's children.
        convert_as_inline: when true, render the content inline.

    Returns:
        Inline mode: the stripped text padded with a single space on each
        side. Block mode: the stripped text surrounded by blank lines, or
        an empty string when nothing remains after stripping.
    """
    content = text.strip()
    if convert_as_inline:
        return ' ' + content + ' '
    # The separators are plain blank lines: no spaces after the newlines, so
    # the output has no whitespace-only lines and no indented first line.
    return '\n\n%s\n\n' % content if content else ''
390+
391+ convert_article = convert_div
392+
393+ convert_section = convert_div
394+
340395 convert_em = abstract_inline_conversion (lambda self : self .options ['strong_em_symbol' ])
341396
342397 convert_kbd = convert_code
@@ -415,7 +470,8 @@ def convert_list(self, el, text, convert_as_inline):
415470
416471 nested = False
417472 before_paragraph = False
418- if el .next_sibling and el .next_sibling .name not in ['ul' , 'ol' ]:
473+ next_sibling = _next_block_content_sibling (el )
474+ if next_sibling and next_sibling .name not in ['ul' , 'ol' ]:
419475 before_paragraph = True
420476 while el :
421477 if el .name == 'li' :
@@ -539,22 +595,23 @@ def convert_th(self, el, text, convert_as_inline):
539595
540596 def convert_tr (self , el , text , convert_as_inline ):
541597 cells = el .find_all (['td' , 'th' ])
598+ is_first_row = el .find_previous_sibling () is None
542599 is_headrow = (
543600 all ([cell .name == 'th' for cell in cells ])
544601 or (el .parent .name == 'thead'
545602 # avoid multiple tr in thead
546603 and len (el .parent .find_all ('tr' )) == 1 )
547604 )
548605 is_head_row_missing = (
549- (not el . previous_sibling and not el .parent .name == 'tbody' )
550- or (not el . previous_sibling and el .parent .name == 'tbody' and len (el .parent .parent .find_all (['thead' ])) < 1 )
606+ (is_first_row and not el .parent .name == 'tbody' )
607+ or (is_first_row and el .parent .name == 'tbody' and len (el .parent .parent .find_all (['thead' ])) < 1 )
551608 )
552609 overline = ''
553610 underline = ''
554611 if ((is_headrow
555612 or (is_head_row_missing
556613 and self .options ['table_infer_header' ]))
557- and not el . previous_sibling ):
614+ and is_first_row ):
558615 # first row and:
559616 # - is headline or
560617 # - headline is missing and header inference is enabled
@@ -568,10 +625,10 @@ def convert_tr(self, el, text, convert_as_inline):
568625 underline += '| ' + ' | ' .join (['---' ] * full_colspan ) + ' |' + '\n '
569626 elif ((is_head_row_missing
570627 and not self .options ['table_infer_header' ])
571- or (not el . previous_sibling
628+ or (is_first_row
572629 and (el .parent .name == 'table'
573630 or (el .parent .name == 'tbody'
574- and not el .parent .previous_sibling )))):
631+ and not el .parent .find_previous_sibling () )))):
575632 # headline is missing and header inference is disabled or:
576633 # first row, not headline, and:
577634 # - the parent is table or
0 commit comments