x-pack/filebeat/input/gcs: add support for CSV decoding (elastic#40979)
efd6 authored Oct 8, 2024
1 parent 6339a42 commit 4be27b6
Showing 14 changed files with 1,224 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.next.asciidoc
@@ -322,6 +322,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff]
- Change request trace logging to include headers instead of complete request. {pull}41072[41072]
- Improved GCS input documentation. {pull}41143[41143]
- Add CSV decoding capability to azureblobstorage input {pull}40978[40978]
- Add CSV decoding capability to gcs input {pull}40979[40979]

*Auditbeat*

57 changes: 56 additions & 1 deletion x-pack/filebeat/docs/inputs/input-gcs.asciidoc
@@ -253,6 +253,61 @@ highly nested json data. If this is set to `false` the *gcs.storage.object.json_
applicable for json objects and has no effect on other types of objects. This attribute can be specified both at the root level of the configuration as well as at the bucket level.
The bucket level values will always take priority and override the root level values if both are specified.

[id="input-{type}-encoding"]
[float]
==== `encoding`

The file encoding to use for reading data that contains international
characters. This only applies to non-JSON logs. See <<_encoding_3>>.

[id="input-{type}-decoding"]
[float]
==== `decoding`

The file decoding option is used to specify a codec that will be used to
decode the file contents. This can apply to any file stream data.

The currently supported codecs are:

1. <<attrib-decoding-csv-gcs,CSV>>: This codec decodes RFC 4180 CSV data streams.

[id="attrib-decoding-csv-gcs"]
[float]
==== The CSV codec
The CSV codec is used to decode RFC 4180 CSV data streams.
Enabling the codec without any other options uses the default codec options.

[source,yaml]
----
decoding.codec.csv.enabled: true
----

The CSV codec supports five sub-attributes that control aspects of CSV decoding.
The `comma` attribute specifies the field separator character used by the CSV
format; if it is not specified, the comma character '`,`' is used. The `comment`
attribute specifies the character that should be interpreted as a comment mark;
if it is specified, lines starting with that character are ignored. Both
`comma` and `comment` must be single characters. The `lazy_quotes` attribute
controls how quoting in fields is handled: if `lazy_quotes` is true, a quote may
appear in an unquoted field and a non-doubled quote may appear in a quoted field.
The `trim_leading_space` attribute specifies that leading white space should be
ignored, even if the `comma` character is white space. For complete details
of the preceding configuration attribute behaviors, see the CSV decoder
https://pkg.go.dev/encoding/csv#Reader[documentation]. The `fields_names`
attribute can be used to specify the column names for the data. If it is
absent, the field names are obtained from the first non-comment line of
data. The number of fields in each record must match the number of field names.

An example config is shown below:

[source,yaml]
----
decoding.codec.csv.enabled: true
decoding.codec.csv.comma: "\t"
decoding.codec.csv.comment: "#"
----
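
For illustration, here is a hedged sketch of how the CSV codec options might be
combined with the rest of a `gcs` input configuration; the project ID, bucket
name, and credentials path are placeholders:

[source,yaml]
----
filebeat.inputs:
- type: gcs
  project_id: my-project                                  # placeholder
  auth.credentials_file.path: /path/to/credentials.json   # placeholder
  buckets:
  - name: csv-logs                                        # placeholder
    decoding.codec.csv.enabled: true
    decoding.codec.csv.comma: "\t"
    decoding.codec.csv.fields_names: [timestamp, level, message]
----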

[id="attrib-file_selectors-gcs"]
[float]
==== `file_selectors`
@@ -408,4 +463,4 @@ In this configuration even though we have specified `max_workers = 10`, `poll =
will override these values with their own respective values, which are defined as part of their sub-attributes.


NOTE: Any feedback that will help us further optimize this input is welcome. Please feel free to open a GitHub issue for any bugs or feature requests.
10 changes: 10 additions & 0 deletions x-pack/filebeat/input/gcs/config.go
@@ -16,6 +16,7 @@ import (
"golang.org/x/oauth2/google"

"github.com/elastic/beats/v7/libbeat/common/match"
"github.com/elastic/beats/v7/libbeat/reader/parser"
)

// MaxWorkers, Poll, PollInterval, BucketTimeOut, ParseJSON, FileSelectors, TimeStampEpoch & ExpandEventListFromField
@@ -41,6 +42,8 @@ type config struct {
Buckets []bucket `config:"buckets" validate:"required"`
// FileSelectors - Defines a list of regex patterns that can be used to filter out objects from the bucket.
FileSelectors []fileSelectorConfig `config:"file_selectors"`
// ReaderConfig is the default parser and decoder configuration.
ReaderConfig readerConfig `config:",inline"`
// TimeStampEpoch - Defines the epoch time in seconds, which is used to filter out objects that are older than the specified timestamp.
TimeStampEpoch *int64 `config:"timestamp_epoch"`
// ExpandEventListFromField - Defines the field name that will be used to expand the event into separate events.
@@ -58,6 +61,7 @@ type bucket struct {
PollInterval *time.Duration `config:"poll_interval,omitempty"`
ParseJSON *bool `config:"parse_json,omitempty"`
FileSelectors []fileSelectorConfig `config:"file_selectors"`
ReaderConfig readerConfig `config:",inline"`
TimeStampEpoch *int64 `config:"timestamp_epoch"`
ExpandEventListFromField string `config:"expand_event_list_from_field"`
}
@@ -68,6 +72,12 @@ type fileSelectorConfig struct {
// TODO: Add support for reader config in future
}

// readerConfig defines the options for reading the content of a GCS object.
type readerConfig struct {
Parsers parser.Config `config:",inline"`
Decoding decoderConfig `config:"decoding"`
}

type authConfig struct {
CredentialsJSON *jsonCredentialsConfig `config:"credentials_json,omitempty"`
CredentialsFile *fileCredentialsConfig `config:"credentials_file,omitempty"`
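
Because `readerConfig` is unpacked inline into both `config` and `bucket`, the
`decoding` (and `parsers`) options can be set at the root of the input as well
as per bucket; per the documentation above, bucket-level values take priority.
A minimal hedged YAML sketch (bucket names are placeholders):

[source,yaml]
----
- type: gcs
  decoding.codec.csv.enabled: true   # root-level default for all buckets
  buckets:
  - name: bucket-a                   # placeholder; inherits the root-level codec
  - name: bucket-b                   # placeholder
    decoding.codec.csv.comma: ";"    # bucket-level override
----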
47 changes: 47 additions & 0 deletions x-pack/filebeat/input/gcs/decoding.go
@@ -0,0 +1,47 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package gcs

import (
"fmt"
"io"
)

// decoder is an interface for decoding data from an io.Reader.
type decoder interface {
	// decode reads and decodes data from an io reader based on the codec type.
	// It returns the decoded data and an error if the data cannot be decoded.
	decode() ([]byte, error)
	// next advances the decoder to the next data item and returns true if there is more data to be decoded.
	next() bool
	// more returns whether there are more records to read.
	more() bool
	// close closes the decoder and releases any resources associated with it.
	// It returns an error if the decoder cannot be closed.
	close() error
}

// valueDecoder is a decoder that can decode directly to a JSON serialisable value.
type valueDecoder interface {
	decoder

	// decodeValue returns the decoded data as both its JSON encoding and
	// its equivalent map value.
	decodeValue() ([]byte, map[string]any, error)
}

// newDecoder creates a new decoder based on the codec type.
// It returns a decoder type and an error if the codec type is not supported.
// If the reader config codec option is not set, it returns a nil decoder and nil error.
func newDecoder(cfg decoderConfig, r io.Reader) (decoder, error) {
switch {
case cfg.Codec == nil:
return nil, nil
case cfg.Codec.CSV != nil:
return newCSVDecoder(cfg, r)
default:
return nil, fmt.Errorf("unsupported config value: %v", cfg)
}
}
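
To show how this interface is meant to be driven, here is a minimal hedged
sketch of a consumer loop; `drain` and `publish` are hypothetical and stand in
for whatever the caller does with each decoded record:

[source,go]
----
// drain reads every record from src using the configured codec and hands
// the JSON-encoded records to publish. A sketch only, not part of this change.
func drain(cfg decoderConfig, src io.Reader, publish func([]byte)) error {
	dec, err := newDecoder(cfg, src)
	if err != nil {
		return err
	}
	if dec == nil {
		return nil // no codec configured; the caller would use its plain reader path
	}
	for dec.next() {
		b, err := dec.decode()
		if err != nil {
			dec.close()
			return err
		}
		publish(b)
	}
	return dec.close()
}
----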
54 changes: 54 additions & 0 deletions x-pack/filebeat/input/gcs/decoding_config.go
@@ -0,0 +1,54 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package gcs

import (
"fmt"
"unicode/utf8"
)

// decoderConfig contains the configuration options for instantiating a decoder.
type decoderConfig struct {
Codec *codecConfig `config:"codec"`
}

// codecConfig contains the configuration options for different codecs used by a decoder.
type codecConfig struct {
CSV *csvCodecConfig `config:"csv"`
}

// csvCodecConfig contains the configuration options for the CSV codec.
type csvCodecConfig struct {
Enabled bool `config:"enabled"`

// Fields is the set of field names. If it is present
// it is used to specify the object names of returned
// values and the FieldsPerRecord field in the csv.Reader.
// Otherwise, names are obtained from the first
// line of the CSV data.
Fields []string `config:"fields_names"`

// The fields below have the same meaning as the
// fields of the same name in csv.Reader.
Comma *configRune `config:"comma"`
Comment configRune `config:"comment"`
LazyQuotes bool `config:"lazy_quotes"`
TrimLeadingSpace bool `config:"trim_leading_space"`
}

// configRune is a single-character configuration option.
type configRune rune

// Unpack implements single-rune unpacking for configuration options,
// rejecting values that are not exactly one character. An empty string
// leaves the rune unset.
func (r *configRune) Unpack(s string) error {
if s == "" {
return nil
}
n := utf8.RuneCountInString(s)
if n != 1 {
return fmt.Errorf("single character option given more than one character: %q", s)
}
_r, _ := utf8.DecodeRuneInString(s)
*r = configRune(_r)
return nil
}
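
For concreteness, a hedged sketch of how `Unpack` behaves on a few inputs,
exercised standalone outside the config machinery:

[source,go]
----
var r configRune
fmt.Println(r.Unpack("\t")) // <nil>; r is now '\t'
fmt.Println(r.Unpack(""))   // <nil>; r is left unchanged
fmt.Println(r.Unpack("ab")) // error: single character option given more than one character: "ab"
----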
139 changes: 139 additions & 0 deletions x-pack/filebeat/input/gcs/decoding_csv.go
@@ -0,0 +1,139 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package gcs

import (
"bytes"
"encoding/csv"
"fmt"
"io"
"slices"
)

// csvDecoder is a decoder for CSV data.
type csvDecoder struct {
r *csv.Reader

header []string
current []string
coming []string

err error
}

// newCSVDecoder creates a new CSV decoder.
func newCSVDecoder(config decoderConfig, r io.Reader) (decoder, error) {
d := csvDecoder{r: csv.NewReader(r)}
d.r.ReuseRecord = true
if config.Codec.CSV.Comma != nil {
d.r.Comma = rune(*config.Codec.CSV.Comma)
}
d.r.Comment = rune(config.Codec.CSV.Comment)
d.r.LazyQuotes = config.Codec.CSV.LazyQuotes
d.r.TrimLeadingSpace = config.Codec.CSV.TrimLeadingSpace
if len(config.Codec.CSV.Fields) != 0 {
d.r.FieldsPerRecord = len(config.Codec.CSV.Fields)
d.header = config.Codec.CSV.Fields
} else {
h, err := d.r.Read()
if err != nil {
return nil, err
}
d.header = slices.Clone(h)
}
var err error
d.coming, err = d.r.Read()
if err != nil {
return nil, err
}
d.current = make([]string, 0, len(d.header))
return &d, nil
}

func (d *csvDecoder) more() bool { return len(d.coming) == len(d.header) }

// next advances the decoder to the next data item and returns true if
// there is more data to be decoded.
func (d *csvDecoder) next() bool {
if !d.more() && d.err != nil {
return false
}
d.current = d.current[:len(d.header)]
copy(d.current, d.coming)
d.coming, d.err = d.r.Read()
if d.err == io.EOF {
d.coming = nil
}
return true
}

// decode returns the JSON encoded value of the current CSV line. next must
// have been called before any calls to decode.
func (d *csvDecoder) decode() ([]byte, error) {
err := d.check()
if err != nil {
return nil, err
}
var buf bytes.Buffer
buf.WriteByte('{')
for i, n := range d.header {
if i != 0 {
buf.WriteByte(',')
}
buf.WriteByte('"')
buf.WriteString(n)
buf.WriteString(`":"`)
buf.WriteString(d.current[i])
buf.WriteByte('"')
}
buf.WriteByte('}')
d.current = d.current[:0]
return buf.Bytes(), nil
}

// decodeValue returns the value of the current CSV line interpreted as
// an object with fields based on the header held by the receiver. next must
// have been called before any calls to decode.
func (d *csvDecoder) decodeValue() ([]byte, map[string]any, error) {
err := d.check()
if err != nil {
return nil, nil, err
}
m := make(map[string]any, len(d.header))
for i, n := range d.header {
m[n] = d.current[i]
}
d.current = d.current[:0]
b, err := d.decode()
if err != nil {
return nil, nil, err
}
return b, m, nil
}

func (d *csvDecoder) check() error {
if d.err != nil {
if d.err == io.EOF && d.coming == nil {
return nil
}
return d.err
}
if len(d.current) == 0 {
return fmt.Errorf("decode called before next")
}
// By the time we are here, current must be the same
// length as header; if it was not read, it would be
// zero, but if it was, it must match by the contract
// of the csv.Reader.
return nil
}

// close closes the csv decoder and releases the resources.
func (d *csvDecoder) close() error {
if d.err == io.EOF {
return nil
}
return d.err
}
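
To make the decoder's output concrete, here is a hedged end-to-end sketch using
the types above; the input literal and the printing are illustrative only:

[source,go]
----
cfg := decoderConfig{Codec: &codecConfig{CSV: &csvCodecConfig{Enabled: true}}}
d, err := newDecoder(cfg, strings.NewReader("a,b\n1,2\n3,4\n"))
if err != nil {
	log.Fatal(err)
}
for d.next() {
	b, err := d.decode()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s\n", b) // {"a":"1","b":"2"} then {"a":"3","b":"4"}
}
if err := d.close(); err != nil {
	log.Fatal(err)
}
----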
