Skip to content

Commit ddb3943

Browse files
authored
Strip HTML tags but keep content while rendering
This makes the renderer print the content of informational HTML tags while stripping the tags themselves. Tags such as script, iframe, and style, which are unlikely to ever hold presentable content, are exempt from this: their content is skipped during rendering along with the tags themselves. <br>, a hard-break tag, is supported as a replacement for the Markdown hard break (two spaces before a newline). This also adds tests for this behavior to general_text.md. Fixes #6, a longstanding issue with inline HTML in blockquotes.
1 parent fc76187 commit ddb3943

File tree

5 files changed

+213
-29
lines changed

5 files changed

+213
-29
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ require (
77
github.com/Masterminds/sprig/v3 v3.2.2
88
github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a
99
github.com/google/uuid v1.3.0 // indirect
10+
github.com/grokify/html-strip-tags-go v0.0.1
1011
github.com/hexops/gotextdiff v1.0.3
1112
github.com/huandu/xstrings v1.3.2 // indirect
1213
github.com/imdario/mergo v0.3.12 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a/go.mod h1:JDGc
2121
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
2222
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
2323
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
24+
github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
25+
github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
2426
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
2527
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
2628
github.com/huandu/xstrings v1.3.1/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=

internal/renderer/renderer.go

Lines changed: 98 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,12 @@ package renderer
2020
import (
2121
"bytes"
2222
"fmt"
23+
"html"
2324
"io"
2425
"regexp"
25-
"strings"
2626

2727
"github.com/gomarkdown/markdown/ast"
28+
"github.com/grokify/html-strip-tags-go"
2829
"github.com/olekukonko/tablewriter"
2930
)
3031

@@ -51,6 +52,25 @@ var (
5152
// matches a FULL string that contains no non-whitespace characters
5253
var emptyLineRegex = regexp.MustCompile(`\A[\s]*\z`)
5354

55+
// tagPairRegexString is a fmt.Sprintf template for a regular expression that
// matches an opening HTML tag, everything up to the nearest matching closing
// tag, and the closing tag itself. Both %s verbs take the tag name.
//
// It is fairly tolerant to handle weird HTML:
//   - (?i): tag names are matched case-insensitively (<SCRIPT> == <script>);
//   - (?s): the tag body may span several lines;
//   - attributes may be bare, unquoted (letters, digits 0-9 and '-'),
//     single-quoted or double-quoted, and quoted values may be empty.
var tagPairRegexString = `(?is)<[\n\f ]*%s([\n\f ]+[^\n\f \/>"'=]+[\n\f ]*(=[\n\f ]*([a-zA-Z0-9\-]+|"[^\n\f"]*"|'[^\n\f']*'))?)*[\n\f ]*>.*?<[\n\f ]*/[\n\f ]*%s[\n\f ]*>`

// HTML block tags whose contents should not be rendered. One compiled
// expression per tag, in the order the tag names are listed below.
var htmlNoRenderRegex = func() []*regexp.Regexp {
	tags := []string{
		"fieldset",
		"form",
		"iframe",
		"script",
		"style",
		"canvas",
		"dialog",
		"progress",
	}
	compiled := make([]*regexp.Regexp, 0, len(tags))
	for _, tag := range tags {
		compiled = append(compiled, regexp.MustCompile(fmt.Sprintf(tagPairRegexString, tag, tag)))
	}
	return compiled
}()

var (
	// matches any run of consecutive CR / LF characters
	lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)
	// matches <br>, <br/>, <br /> and similar, in any letter case
	hardBreakTag = regexp.MustCompile(`(?i)< *br */? *>`)
	// matches a named HTML entity (e.g. &amp;) that is not preceded by a
	// backslash; the preceding character, if any, is part of the match
	escapedHtmlChar = regexp.MustCompile(`(?:^|[^\\\\])&[[:alnum:]]+;`)
)
73+
5474
// Renderer implements markdown.Renderer.
5575
type Renderer struct{}
5676

@@ -82,7 +102,7 @@ func (r Renderer) link(w io.Writer, node *ast.Link, entering bool) {
82102
w.Write(linkPrefix)
83103
w.Write(node.Destination)
84104
w.Write(space)
85-
r.text(w, node)
105+
r.text(w, node, true)
86106
}
87107
}
88108
}
@@ -92,7 +112,7 @@ func (r Renderer) image(w io.Writer, node *ast.Image, entering bool) {
92112
w.Write(linkPrefix)
93113
w.Write(node.Destination)
94114
w.Write(space)
95-
r.text(w, node)
115+
r.text(w, node, true)
96116
}
97117
}
98118

@@ -125,7 +145,7 @@ func (r Renderer) subscript(w io.Writer, node *ast.Subscript, entering bool) {
125145
if entering {
126146
if node := node.AsLeaf(); node != nil {
127147
w.Write(subOpen)
128-
w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
148+
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
129149
w.Write(subClose)
130150
}
131151
}
@@ -134,7 +154,7 @@ func (r Renderer) superscript(w io.Writer, node *ast.Superscript, entering bool)
134154
if entering {
135155
if node := node.AsLeaf(); node != nil {
136156
w.Write(supOpen)
137-
w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
157+
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
138158
w.Write(supClose)
139159
}
140160
}
@@ -151,7 +171,7 @@ func (r Renderer) heading(w io.Writer, node *ast.Heading, entering bool) {
151171
heading[i] = '#'
152172
}
153173
w.Write(heading)
154-
r.text(w, node)
174+
r.text(w, node, true)
155175
} else {
156176
w.Write(lineBreak)
157177
}
@@ -277,8 +297,16 @@ func (r Renderer) paragraph(w io.Writer, node *ast.Paragraph, entering bool) (no
277297
// only render links text in the paragraph if they're
278298
// combined with some other text on page
279299
switch child := child.(type) {
280-
case *ast.Text, *ast.Code, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
281-
r.text(w, child)
300+
case *ast.Text, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
301+
r.text(w, child, true)
302+
case *ast.Code:
303+
r.text(w, child, false)
304+
case *ast.Hardbreak:
305+
w.Write(lineBreak)
306+
case *ast.HTMLSpan:
307+
if hardBreakTag.Match(child.AsLeaf().Literal) {
308+
w.Write(lineBreak)
309+
}
282310
case *ast.Subscript:
283311
r.subscript(w, child, true)
284312
case *ast.Superscript:
@@ -326,7 +354,7 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
326354
} else if !isTerm {
327355
w.Write(itemPrefix)
328356
}
329-
r.text(w, item)
357+
r.text(w, item, true)
330358
w.Write(lineBreak)
331359
if l >= 2 {
332360
if list, ok := item.Children[1].(*ast.List); ok {
@@ -337,22 +365,43 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
337365
}
338366
}
339367

340-
var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)
341-
342-
func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
368+
func textWithNewlineReplacement(node ast.Node, replacement []byte, unescapeHtml bool) []byte {
343369
buf := bytes.Buffer{}
344370
delimiter := getNodeDelimiter(node)
345371
// special case for footnotes: we want them in the text
346372
if node, ok := node.(*ast.Link); ok && node.Footnote != nil {
347373
fmt.Fprintf(&buf, "[^%d]", node.NoteID)
348374
}
349-
if node := node.AsLeaf(); node != nil {
375+
if leaf := node.AsLeaf(); leaf != nil {
350376
// replace all newlines in text with preferred symbols; this may
351377
// be spaces for general text, allowing for soft wrapping, which
352378
// is recommended as per Gemini spec p. 5.4.1, or line breaks
353379
// with a blockquote symbols for blockquotes, or just nothing
354380
buf.Write(delimiter)
355-
buf.Write(lineBreakCharacters.ReplaceAll(node.Literal, replacement))
381+
switch node := node.(type) {
382+
case *ast.Hardbreak:
383+
buf.Write(lineBreak)
384+
// If the blockquote ends with a double space, the parser will
385+
// not create a Hardbreak at the end, so this works.
386+
if _, ok := leaf.Parent.(*ast.BlockQuote); !ok {
387+
buf.Write(quotePrefix)
388+
}
389+
case *ast.HTMLSpan:
390+
if hardBreakTag.Match(leaf.Literal) {
391+
buf.Write(lineBreak)
392+
}
393+
buf.Write(leaf.Content)
394+
case *ast.HTMLBlock:
395+
buf.Write([]byte(extractHtml(node, quotePrefix)))
396+
default:
397+
textWithoutBreaks := lineBreakCharacters.ReplaceAll(leaf.Literal, replacement)
398+
if unescapeHtml {
399+
unescapedText := escapedHtmlChar.ReplaceAll(textWithoutBreaks, []byte(html.UnescapeString(string(textWithoutBreaks))))
400+
buf.Write(unescapedText)
401+
} else {
402+
buf.Write(textWithoutBreaks)
403+
}
404+
}
356405
buf.Write(delimiter)
357406
}
358407
if node := node.AsContainer(); node != nil {
@@ -362,24 +411,38 @@ func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
362411
switch child := child.(type) {
363412
case *ast.List:
364413
default:
365-
buf.Write(textWithNewlineReplacement(child, replacement))
414+
buf.Write(textWithNewlineReplacement(child, replacement, unescapeHtml))
366415
}
367416
}
368417
buf.Write(delimiter)
369418
}
370419
return buf.Bytes()
371420
}
372421

373-
func (r Renderer) text(w io.Writer, node ast.Node) {
374-
w.Write(textWithNewlineReplacement(node, space))
422+
func (r Renderer) text(w io.Writer, node ast.Node, unescapeHtml bool) {
423+
w.Write(textWithNewlineReplacement(node, space, unescapeHtml))
375424
}
376425

377426
func (r Renderer) blockquoteText(w io.Writer, node ast.Node) {
378-
w.Write(textWithNewlineReplacement(node, quoteBrPrefix))
427+
w.Write(textWithNewlineReplacement(node, quoteBrPrefix, true))
379428
}
380429

381430
func extractText(node ast.Node) string {
382-
return string(textWithNewlineReplacement(node, space))
431+
return string(textWithNewlineReplacement(node, space, true))
432+
}
433+
434+
func extractHtml(node *ast.HTMLBlock, linePrefix []byte) string {
435+
// Only render contents of allowed tags
436+
literal := node.Literal
437+
for _, re := range htmlNoRenderRegex {
438+
literal = re.ReplaceAllLiteral(literal, []byte{})
439+
}
440+
if len(literal) > 0 {
441+
literalWithBreaks := hardBreakTag.ReplaceAll(lineBreakCharacters.ReplaceAll(literal, space), append([]byte(lineBreak), linePrefix...))
442+
literalStripped := strip.StripTags(string(literalWithBreaks))
443+
return html.UnescapeString(literalStripped)
444+
}
445+
return ""
383446
}
384447

385448
func (r Renderer) tableHead(t *tablewriter.Table, node *ast.TableHeader) {
@@ -440,6 +503,17 @@ func (r Renderer) table(w io.Writer, node *ast.Table, entering bool) {
440503
}
441504
}
442505

506+
func (r Renderer) htmlBlock(w io.Writer, node *ast.HTMLBlock, entering bool) {
507+
if entering {
508+
htmlString := extractHtml(node, []byte{})
509+
if len(htmlString) > 0 {
510+
w.Write([]byte(htmlString))
511+
w.Write(lineBreak)
512+
w.Write(lineBreak)
513+
}
514+
}
515+
}
516+
443517
// RenderNode implements Renderer.RenderNode().
444518
func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.WalkStatus {
445519
// entering in gomarkdown was made to have elements of type switch
@@ -487,6 +561,11 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
487561
r.table(w, node, entering)
488562
noNewLine = false
489563
fetchLinks = true
564+
case *ast.HTMLBlock:
565+
// Do not render if already rendered as part of a blockquote
566+
if _, ok := node.Parent.(*ast.BlockQuote); !ok {
567+
r.htmlBlock(w, node, entering)
568+
}
490569
}
491570
if !noNewLine && !entering {
492571
w.Write(lineBreak)

testdata/general_text.gmi

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Single newlines (like in this multi-line paragraph) will get replaced by a space
66

77
Inline formatting bits (like this **bold** text, *emphasized* text, ~~strikethrough~~ text, `preformatted text`) are kept to make sure Gemini readers still have the stylistic context of your text.
88

9+
Adding two spaces at the end of a line will insert a hard
10+
break. You can also create a hard break using a backslash at the end
11+
of a line. Hard breaks at the end of a paragraph are ignored.
12+
913
## Blockquotes
1014

1115
Newlines in blockquote paragraphs, unlike usual paragraphs, aren't replaced with a space. This facilitates appending authorship information to the quote, or using blockquotes to write poems.
@@ -22,6 +26,9 @@ Newlines in blockquote paragraphs, unlike usual paragraphs, aren't replaced with
2226

2327
> — also Timur Demin, in the process of writing this test file
2428

29+
> Hard breaks are also supported in blockquotes,
30+
> for compatibility. Hard breaks at the end of a blockquote are ignored.
31+
2532
## Code
2633

2734
gmnhg will use Gemtext preformatted blocks for that. Markdown alt-text for preformatted blocks is supported, and is used to render alt-text as specified by Gemini spec p. 5.4.3.
@@ -74,15 +81,45 @@ Since clients like Lagrange treat the fourth and the rest of #-s as heading cont
7481

7582
###### Heading 6
7683

77-
## Misc
84+
## HTML
85+
86+
Inline HTML is currently stripped, but HTML contents remain on-screen. This may change in the future. HTML tags can be escaped with \ as in <span></span> or enclosed with ``.
87+
88+
HTML tags are stripped from HTML blocks. (Note that HTML blocks must begin and end with a supported HTML block tag, and must have blank lines before and after the block.)
89+
90+
### Break tags
91+
92+
Hard breaks
93+
using <br> are supported.
94+
95+
Hard breaks using <br> are supported
96+
inside HTML blocks.
97+
98+
### HTML entities
7899

79-
Inline HTML is currently stripped, but HTML contents remain on-screen. This may change in the future.
100+
HTML escaped entities like & and < are unescaped, even when they show up inside an inline HTML section. Escaping them with a leading backslash is possible outside of HTML blocks: &amp;, &lt;. Any escaped characters inside a code span (such as `&lt; or &gt;`) will not be unescaped.
80101

81-
> There's currently a bug in gmnhg which prevents it from
82-
> stripping HTML in certain scenarios. HTML is noticeably still present
83-
> inside <span>blockquotes</span>.
102+
HTML escaped entities like < and > are also unescaped inside HTML blocks. Backslash escapes have no effect: \&.
84103

85-
=> https://github.com/tdemin/gmnhg/issues/6 bug in gmnhg
104+
### Forbidden tags
105+
106+
Tags that are unable to output Gemini-compatible text are completely removed from the output.
107+
108+
Note that the contents of "forbidden" tags will be rendered if they are placed inline, although the tags themselves will be stripped. Placing HTML block elements inline in this manner violates the spec of common Markdown flavors, but gmnhg handles it the best it can.
109+
110+
### HTML in blockquotes
111+
112+
> HTML spans are stripped from
113+
> inside blockquotes.
114+
115+
> Non HTML block text before the block.
116+
> HTML blocks are stripped from inside blockquotes.
117+
> Non HTML block text after the block.
118+
119+
> Standalone blockquoted HTML blocks
120+
> are also stripped of their tags.
121+
122+
## Misc
86123

87124
---
88125

0 commit comments

Comments
 (0)