Skip to content

Commit 40187f5

Browse files
committed
Merge pull request aaronsw#55 from mgontav/fix-img-links
<a> tags with no text, or containing only <img> tags, are no longer discarded. Thanks @mgontav
2 parents 2174151 + 84ce93b commit 40187f5

File tree

10 files changed

+71
-3
lines changed

10 files changed

+71
-3
lines changed

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The AUTHORS/Contributors are (and/or have been):
1111
* Ivan Gromov <[email protected]>
1212
* Jocelyn Delalande <[email protected]>
1313
* Matt Dorn <[email protected]>
14+
* Miguel Tavares <[email protected]>
1415

1516
Maintainer:
1617

ChangeLog.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
2015.02.18
2+
==========
3+
----
4+
5+
* Fix #38: Anchor tags with empty text or with `<img>` tags inside are no longer stripped.
6+
7+
18
2014.12.29
29
==========
310
----

html2text/__init__.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
8484
self.a = []
8585
self.astack = []
8686
self.maybe_automatic_link = None
87+
self.empty_link = False
8788
self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
8889
self.acount = 0
8990
self.list = []
@@ -370,16 +371,21 @@ def handle_tag(self, tag, attrs, start):
370371
attrs['href'].startswith('#')):
371372
self.astack.append(attrs)
372373
self.maybe_automatic_link = attrs['href']
374+
self.empty_link = True
373375
if self.protect_links:
374376
attrs['href'] = '<'+attrs['href']+'>'
375377
else:
376378
self.astack.append(None)
377379
else:
378380
if self.astack:
379381
a = self.astack.pop()
380-
if self.maybe_automatic_link:
382+
if self.maybe_automatic_link and not self.empty_link:
381383
self.maybe_automatic_link = None
382384
elif a:
385+
if self.empty_link:
386+
self.o("[")
387+
self.empty_link = False
388+
self.maybe_automatic_link = None
383389
if self.inline_links:
384390
self.o("](" + escape_md(a['href']) + ")")
385391
else:
@@ -399,6 +405,19 @@ def handle_tag(self, tag, attrs, start):
399405
attrs['href'] = attrs['src']
400406
alt = attrs.get('alt') or ''
401407

408+
# If we have a link to create, output the start
409+
if not self.maybe_automatic_link is None:
410+
href = self.maybe_automatic_link
411+
if self.images_to_alt and escape_md(alt) == href and \
412+
self.absolute_url_matcher.match(href):
413+
self.o("<" + escape_md(alt) + ">")
414+
self.empty_link = False
415+
return
416+
else:
417+
self.o("[")
418+
self.maybe_automatic_link = None
419+
self.empty_link = False
420+
402421
# If we have images_to_alt, we discard the image itself,
403422
# considering only the alt text.
404423
if self.images_to_alt:
@@ -637,10 +656,12 @@ def handle_data(self, data):
637656
href = self.maybe_automatic_link
638657
if href == data and self.absolute_url_matcher.match(href):
639658
self.o("<" + data + ">")
659+
self.empty_link = False
640660
return
641661
else:
642662
self.o("[")
643663
self.maybe_automatic_link = None
664+
self.empty_link = False
644665

645666
if not self.code and not self.pre:
646667
data = escape_md_section(data, snob=self.escape_snob)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def run(self):
3434

3535
setup(
3636
name="html2text",
37-
version="2014.12.29",
37+
version="2015.02.18",
3838
description="Turn HTML into equivalent Markdown-structured text.",
3939
author="Aaron Swartz",
4040
author_email="[email protected]",

test/empty-link.html

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<h1>Processing empty hyperlinks</h1>
2+
3+
<p>This test checks whether empty hyperlinks still appear in the markdown result.</p>
4+
5+
<a href="http://some.link"></a>
6+
<a href="http://some.link"><p></p></a>

test/empty-link.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Processing empty hyperlinks
2+
3+
This test checks whether empty hyperlinks still appear in the markdown result.
4+
5+
[](http://some.link)
6+
7+
[](http://some.link)
8+

test/images_to_alt.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
<a href="http://example.com">
22
<img src="http://example.com/img.png" alt="ALT TEXT" />
33
</a>
4+
<br>
5+
<a href="http://example.com"><img src="http://example.com/img.png" alt="ALT TEXT" /></a>
6+
<br>
7+
<a href="http://example.com"><img src="http://example.com/img.png" alt="http://example.com" /></a>

test/images_to_alt.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1-
[ ALT TEXT ](http://example.com)
1+
[ ALT TEXT ](http://example.com)
2+
[ALT TEXT](http://example.com)
3+
<http://example.com>
24

test/img-tag-with-link.html

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<h1>Processing images with links</h1>
2+
3+
<p>This test checks images with associated links.</p>
4+
5+
<a href="http://some.link"><img src="http://placehold.it/350x150#(banana)" width="350" height="150" alt="(banana)"></a>
6+
<a href="http://some.link"><img src="http://placehold.it/350x150#[banana]" width="350" height="150" alt="[banana]"></a>
7+
<a href="http://some.link"><img src="http://placehold.it/350x150#{banana}" width="350" height="150" alt="{banana}"></a>
8+
<a href="http://some.link"><img src="http://placehold.it/350x150#([{}])" width="350" height="150" alt="([{}])"></a>
9+
<a href="http://some.link"><img src="http://placehold.it/350x150#([{}])" width="350" height="150" alt></a>

test/img-tag-with-link.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Processing images with links
2+
3+
This test checks images with associated links.
4+
5+
[![\(banana\)](http://placehold.it/350x150#\(banana\))](http://some.link)
6+
[![\[banana\]](http://placehold.it/350x150#\[banana\])](http://some.link)
7+
[![{banana}](http://placehold.it/350x150#{banana})](http://some.link)
8+
[![\(\[{}\]\)](http://placehold.it/350x150#\(\[{}\]\))](http://some.link)
9+
[![](http://placehold.it/350x150#\(\[{}\]\))](http://some.link)
10+

0 commit comments

Comments
 (0)