Skip to content

Commit

Permalink
Support tri-backquote style code block and fix ordered list indent
Browse files (browse the repository at this point in the history)
  • Loading branch information
wetor committed Nov 4, 2024
1 parent 8917f5c commit 6630122
Show file tree
Hide file tree
Showing 11 changed files with 159 additions and 14 deletions.
39 changes: 30 additions & 9 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def __init__(
self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
self.hide_strikethrough = False # covered in cli
self.mark_code = config.MARK_CODE
self.backquote_code_style = config.BACKQUOTE_CODE_STYLE
self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
self.wrap_links = config.WRAP_LINKS # covered in cli
self.wrap_tables = config.WRAP_TABLES
Expand Down Expand Up @@ -111,6 +112,8 @@ def __init__(
self.blockquote = 0
self.pre = False
self.startpre = False
self.pre_indent = ""
self.list_code_indent = ""
self.code = False
self.quote = False
self.br_toggle = ""
Expand Down Expand Up @@ -629,6 +632,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
self.lastWasList = False

if tag == "li":
self.list_code_indent = ""
self.pbr()
if start:
if self.list:
Expand All @@ -644,15 +648,16 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
# TODO: line up <ol><li>s > 9 correctly.
parent_list = None
for list in self.list:
self.o(
" " if parent_list == "ol" and list.name == "ul" else " "
)
self.list_code_indent += " " if parent_list == "ol" else " "
parent_list = list.name
self.o(self.list_code_indent)

if li.name == "ul":
self.list_code_indent += " "
self.o(self.ul_item_mark + " ")
elif li.name == "ol":
li.num += 1
self.list_code_indent += " "
self.o(str(li.num) + ". ")
self.start = True

Expand Down Expand Up @@ -715,8 +720,11 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
if start:
self.startpre = True
self.pre = True
self.pre_indent = ""
else:
self.pre = False
if self.backquote_code_style:
self.out("\n" + self.pre_indent + "```")
if self.mark_code:
self.out("\n[/code]")
self.p()
Expand Down Expand Up @@ -786,17 +794,23 @@ def o(
bq += " "

if self.pre:
if not self.list:
if self.list:
bq += self.list_code_indent

if not self.backquote_code_style:
bq += " "
# else: list content is already partially indented
bq += " " * len(self.list)

data = data.replace("\n", "\n" + bq)
self.pre_indent = bq

if self.startpre:
self.startpre = False
if self.list:
if self.backquote_code_style:
self.out("\n" + self.pre_indent + "```")
self.p_p = 0
elif self.list:
# use existing initial indentation
data = data.lstrip("\n")
data = data.lstrip("\n" + self.pre_indent)

if self.start:
self.space = False
Expand Down Expand Up @@ -952,8 +966,15 @@ def optwrap(self, text: str) -> str:
# because of the presence of a link in it
if not self.wrap_links:
self.inline_links = False
start_code = False
for para in text.split("\n"):
if len(para) > 0:
# If the text is between tri-backquote pairs, it's a code block;
# don't wrap
if self.backquote_code_style and para.lstrip().startswith("```"):
start_code = not start_code
if start_code:
result += para + "\n"
elif len(para) > 0:
if not skipwrap(
para, self.wrap_links, self.wrap_list_items, self.wrap_tables
):
Expand Down
8 changes: 8 additions & 0 deletions html2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,13 @@ class bcolors:
default=config.MARK_CODE,
help="Mark program code blocks with [code]...[/code]",
)
p.add_argument(
"--backquote-code-style",
action="store_true",
dest="backquote_code_style",
default=config.BACKQUOTE_CODE_STYLE,
help="Multi line code block using tri-backquote style",
)
p.add_argument(
"--decode-errors",
dest="decode_errors",
Expand Down Expand Up @@ -318,6 +325,7 @@ class bcolors:
h.skip_internal_links = args.skip_internal_links
h.links_each_paragraph = args.links_each_paragraph
h.mark_code = args.mark_code
h.backquote_code_style = args.backquote_code_style
h.wrap_links = args.wrap_links
h.wrap_list_items = args.wrap_list_items
h.wrap_tables = args.wrap_tables
Expand Down
1 change: 1 addition & 0 deletions html2text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = False
BACKQUOTE_CODE_STYLE = False
DECODE_ERRORS = "strict"
DEFAULT_IMAGE_ALT = ""
PAD_TABLES = False
Expand Down
31 changes: 31 additions & 0 deletions test/backquote_code_style.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<p><pre>
def func(x):
if x &lt; 1:
return 'a'
return 'b'
</pre></p>

<ul>
<li>unordered</li>
<li>...</li>
<ol>
<li>ordered</li>
<li>code:
<pre>a
b
c</pre>
</li>
<li>...</li>
<ol>
<li>ordered</li>
<li>code:
<pre>d
e
f</pre>
</li>
<li>...</li>
</ol>
<li>end</li>
</ol>
<li>end</li>
</ul>
32 changes: 32 additions & 0 deletions test/backquote_code_style.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

```
def func(x):
if x < 1:
return 'a'
return 'b'
```

* unordered
* ...
1. ordered
2. code:
```
a
b
c
```

3. ...
1. ordered
2. code:
```
d
e
f
```

3. ...
4. end
* end

25 changes: 25 additions & 0 deletions test/mixed_nested_lists.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,28 @@
<li>end</li>
</ul>


<ul>
<li>unordered</li>
<li>...</li>
<ol>
<li>ordered</li>
<li>code:
<pre>a
b
c</pre>
</li>
<li>...</li>
<ol>
<li>ordered</li>
<li>code:
<pre>d
e
f</pre>
</li>
<li>...</li>
</ol>
<li>end</li>
</ol>
<li>end</li>
</ul>
22 changes: 22 additions & 0 deletions test/mixed_nested_lists.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,25 @@
1. ordered
2. ...
* end

* unordered
* ...
1. ordered
2. code:

a
b
c

3. ...
1. ordered
2. code:

d
e
f

3. ...
4. end
* end

4 changes: 2 additions & 2 deletions test/normal.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ text to separate lists

1. now with numbers
2. the prisoner
1. not an _italic number_
2. a **bold human** being
1. not an _italic number_
2. a **bold human** being
3. end

**bold**
Expand Down
4 changes: 2 additions & 2 deletions test/normal_escape_snob.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ text to separate lists

1. now with numbers
2. the prisoner
1. not an _italic number_
2. a **bold human** being
1. not an _italic number_
2. a **bold human** being
3. end

**bold**
Expand Down
2 changes: 1 addition & 1 deletion test/preformatted_in_list.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
* Run this command:

ls -l *.html

* ?
Expand Down
5 changes: 5 additions & 0 deletions test/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ def generate_testdata():
cmdline_args.append("--mark-code")
func_args = skip

if base_fn.startswith("backquote_code_style"):
module_args["backquote_code_style"] = True
cmdline_args.append("--backquote-code-style")
func_args = skip

if base_fn.startswith("pad_table"):
module_args["pad_tables"] = True
cmdline_args.append("--pad-tables")
Expand Down

0 comments on commit 6630122

Please sign in to comment.