Skip to content

Commit

Permalink
Don't decompress gzip if data doesn't look like gzip
Browse files Browse the repository at this point in the history
Prevents incorrect response being returned in cases like
/sitemap.xml.gz is requested, but uncompressed 404 page is served
instead.
  • Loading branch information
WGH- committed Oct 16, 2023
1 parent 1b000c4 commit fcff719
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 2 deletions.
35 changes: 35 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ const testXML = `<?xml version="1.0" encoding="UTF-8"?>
<paragraph type="description">This is a test paragraph</paragraph>
</page>`

const custom404 = `404 not found`

func newUnstartedTestServer() *httptest.Server {
mux := http.NewServeMux()

Expand Down Expand Up @@ -86,6 +88,10 @@ func newUnstartedTestServer() *httptest.Server {
ww.Write([]byte(testXML))
})

mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, custom404, http.StatusNotFound)
})

mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
if r.Method == "POST" {
w.Header().Set("Content-Type", "text/html")
Expand Down Expand Up @@ -1477,6 +1483,35 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) {
testCollectorOnXMLWithXML(t, "/test.xml.gz")
}

func TestCollectorNonexistentXMLGZ(t *testing.T) {
// This is a regression test for colly
// attempting to decompress all .xml.gz URLs
// even if they're not compressed.
ts := newTestServer()
defer ts.Close()

c := NewCollector(ParseHTTPErrorResponse())

onResponseCalled := false

c.OnResponse(func(resp *Response) {
onResponseCalled = true
if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want {
t.Errorf("wrong response body got=%q want=%q", got, want)
}
})

c.OnError(func(resp *Response, err error) {
t.Errorf("called on OnError: err=%v", err)
})

c.Visit(ts.URL + "/nonexistent.xml.gz")

if !onResponseCalled {
t.Error("OnResponse was not called")
}
}

func TestCollectorVisitWithTrace(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down
17 changes: 15 additions & 2 deletions http_backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package colly

import (
"bufio"
"crypto/sha1"
"encoding/gob"
"encoding/hex"
Expand Down Expand Up @@ -202,11 +203,23 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c
}
contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
bodyReader, err = gzip.NewReader(bodyReader)
// Even if URL contains .xml.gz, it doesn't mean that we get gzip
// compressed data back. We might get 404 error page instead,
// for example. So check gzip magic bytes.
bufReader := bufio.NewReader(bodyReader)
bodyReader = bufReader
magic, err := bufReader.Peek(2)
if err != nil {
return nil, err
}
defer bodyReader.(*gzip.Reader).Close()
// gzip magic, as specified in RFC 1952
if magic[0] == 0x1f && magic[1] == 0x8b {
bodyReader, err = gzip.NewReader(bufReader)
if err != nil {
return nil, err
}
defer bodyReader.(*gzip.Reader).Close()
}
}
body, err := io.ReadAll(bodyReader)
if err != nil {
Expand Down

0 comments on commit fcff719

Please sign in to comment.