From 1bc044a150890f3c544f4281f1684d19841a6ced Mon Sep 17 00:00:00 2001 From: blagoySimandoff Date: Mon, 9 Oct 2023 20:37:31 +0100 Subject: [PATCH 1/3] Removed an unnecessary (and faulty) package called chardet and used a go official library for encoding detection in place of it. --- extensions/auto_encoding_detection.go | 32 +++++++++++++++++++++++++++ response.go | 13 ++++------- 2 files changed, 36 insertions(+), 9 deletions(-) create mode 100644 extensions/auto_encoding_detection.go diff --git a/extensions/auto_encoding_detection.go b/extensions/auto_encoding_detection.go new file mode 100644 index 000000000..9021a758d --- /dev/null +++ b/extensions/auto_encoding_detection.go @@ -0,0 +1,32 @@ +package extensions + +import ( + "bufio" + "io" + + "golang.org/x/net/html/charset" + "golang.org/x/text/encoding/htmlindex" +) + +func detectContentCharset(body io.Reader) string { + r := bufio.NewReader(body) + if data, err := r.Peek(1024); err == nil { + if _, name, ok := charset.DetermineEncoding(data, ""); ok { + return name + } + } + return "utf-8" +} +func DecodeHTMLBody(body io.Reader, charset string) (io.Reader, error) { + if charset == "" { + charset = detectContentCharset(body) + } + e, err := htmlindex.Get(charset) + if err != nil { + return nil, err + } + if name, _ := htmlindex.Name(e); name != "utf-8" { + body = e.NewDecoder().Reader(body) + } + return body, nil +} diff --git a/response.go b/response.go index 049d8801e..230b4379e 100644 --- a/response.go +++ b/response.go @@ -22,7 +22,6 @@ import ( "net/http" "strings" - "github.com/saintfish/chardet" "golang.org/x/net/html/charset" ) @@ -74,7 +73,7 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error return nil } contentType := strings.ToLower(r.Headers.Get("Content-Type")) - + fmt.Println(detectCharset) if strings.Contains(contentType, "image/") || strings.Contains(contentType, "video/") || strings.Contains(contentType, "audio/") || @@ -88,12 +87,9 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error if !detectCharset { return nil } - d := chardet.NewTextDetector() - r, err := d.DetectBest(r.Body) - if err != nil { - return err - } - contentType = "text/plain; charset=" + r.Charset + _, nameOfEncoding, _ := charset.DetermineEncoding(r.Body, contentType) + fmt.Println(nameOfEncoding) //name of charset/encoding + contentType = "text/plain; charset=" + nameOfEncoding } if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") { return nil @@ -105,7 +101,6 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error r.Body = tmpBody return nil } - func encodeBytes(b []byte, contentType string) ([]byte, error) { r, err := charset.NewReader(bytes.NewReader(b), contentType) if err != nil { From 70b533345f2be15a42d962e74e51e4267e4f47d3 Mon Sep 17 00:00:00 2001 From: blagoySimandoff Date: Mon, 9 Oct 2023 20:39:40 +0100 Subject: [PATCH 2/3] removed a test file --- extensions/auto_encoding_detection.go | 32 --------------------------- 1 file changed, 32 deletions(-) delete mode 100644 extensions/auto_encoding_detection.go diff --git a/extensions/auto_encoding_detection.go b/extensions/auto_encoding_detection.go deleted file mode 100644 index 9021a758d..000000000 --- a/extensions/auto_encoding_detection.go +++ /dev/null @@ -1,32 +0,0 @@ -package extensions - -import ( - "bufio" - "io" - - "golang.org/x/net/html/charset" - "golang.org/x/text/encoding/htmlindex" -) - -func detectContentCharset(body io.Reader) string { - r := bufio.NewReader(body) - if data, err := r.Peek(1024); err == nil { - if _, name, ok := charset.DetermineEncoding(data, ""); ok { - return name - } - } - return "utf-8" -} -func DecodeHTMLBody(body io.Reader, charset string) (io.Reader, error) { - if charset == "" { - charset = detectContentCharset(body) - } - e, err := htmlindex.Get(charset) - if err != nil { - return nil, err - } - if name, _ := htmlindex.Name(e); name != "utf-8" { - body = e.NewDecoder().Reader(body) - } - return body, nil -} From c297a5f66759319a9f823d05b03d002157e4416a Mon Sep 17 00:00:00 2001 From: blagoySimandoff Date: Mon, 9 Oct 2023 20:59:25 +0100 Subject: [PATCH 3/3] removed print statements --- response.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/response.go b/response.go index 230b4379e..f04344625 100644 --- a/response.go +++ b/response.go @@ -73,7 +73,6 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error return nil } contentType := strings.ToLower(r.Headers.Get("Content-Type")) - fmt.Println(detectCharset) if strings.Contains(contentType, "image/") || strings.Contains(contentType, "video/") || strings.Contains(contentType, "audio/") || @@ -87,8 +86,7 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error if !detectCharset { return nil } - _, nameOfEncoding, _ := charset.DetermineEncoding(r.Body, contentType) - fmt.Println(nameOfEncoding) //name of charset/encoding + _, nameOfEncoding, _ := charset.DetermineEncoding(r.Body, contentType) //name of charset/encoding contentType = "text/plain; charset=" + nameOfEncoding } if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") {