Skip to content
This repository has been archived by the owner on Aug 21, 2023. It is now read-only.

Commit

Permalink
fix the bug that output csv files are invalid when csv-delimiter is e…
Browse files Browse the repository at this point in the history
…mpty (#219)

* fix Makefile

* fix csv bug

* add lightning csv integration test

* add separator test

* simplify code

* fix ci test

* tmp

* block some tests

* fix bug

* address comments

* fix ut and make check

* fix integration test

* fix

* fix

* try to fix

* try to fix

* Revert "try to fix"

This reverts commit ecdbeaa.

* try to fix

* address comments

* address comments
  • Loading branch information
lichunzhu authored Dec 14, 2020
1 parent c90415b commit 11d8d5d
Show file tree
Hide file tree
Showing 17 changed files with 290 additions and 48 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ PACKAGE_DIRECTORIES := $(PACKAGES) | sed 's/github.com\/pingcap\/dumpling\/*//'
DUMPLING_PKG := github.com/pingcap/dumpling
CHECKER := awk '{ print } END { if (NR > 0) { exit 1 } }'

LDFLAGS += -X "github.com/pingcap/dumpling/v4/cli.ReleaseVersion=$(shell git describe --tags --dirty)"
LDFLAGS += -X "github.com/pingcap/dumpling/v4/cli.ReleaseVersion=$(shell git describe --tags --dirty='-dev')"
LDFLAGS += -X "github.com/pingcap/dumpling/v4/cli.BuildTimestamp=$(shell date -u '+%Y-%m-%d %I:%M:%S')"
LDFLAGS += -X "github.com/pingcap/dumpling/v4/cli.GitHash=$(shell git rev-parse HEAD)"
LDFLAGS += -X "github.com/pingcap/dumpling/v4/cli.GitBranch=$(shell git rev-parse --abbrev-ref HEAD)"
Expand All @@ -24,7 +24,7 @@ bin/%: cmd/%/main.go $(wildcard v4/**/*.go)
$(GOBUILD) $(RACEFLAG) -tags codes -o $@ $<

test: failpoint-enable
$(GOTEST) $(RACEFLAG) -tags leak ./... || ( make failpoint-disable && exit 1 )
$(GOTEST) $(RACEFLAG) -coverprofile=coverage.txt -covermode=atomic -tags leak ./... || ( make failpoint-disable && exit 1 )
@make failpoint-disable

integration_test: failpoint-enable bin/dumpling
Expand Down
6 changes: 3 additions & 3 deletions tests/basic/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ export DUMPLING_TEST_PORT=4000
# Test for --sql option.
run_sql "drop database if exists \`$DB_NAME\`;"
run_sql "create database \`$DB_NAME\`;"
run_sql "create sequence \`$DB_NAME\`.\`SEQUENCE_NAME\` increment by 1;"
run_sql "create sequence \`$DB_NAME\`.\`$SEQUENCE_NAME\` increment by 1;"

run_dumpling --sql "select nextval(\`$DB_NAME\`.\`SEQUENCE_NAME\`)"
run_dumpling --sql "select nextval(\`$DB_NAME\`.\`$SEQUENCE_NAME\`)"

actual=$(grep -w "(.*)[,|;]" ${DUMPLING_OUTPUT_DIR}/result.000000000.sql | cut -c2-2)
echo "expected 1, actual ${actual}"
[ "$actual" = 1 ]

run_dumpling --sql "select nextval(\`$DB_NAME\`.\`SEQUENCE_NAME\`)"
run_dumpling --sql "select nextval(\`$DB_NAME\`.\`$SEQUENCE_NAME\`)"

actual=$(grep -w "(.*)[,|;]" ${DUMPLING_OUTPUT_DIR}/result.000000000.sql | cut -c2-2)
echo "expected 2, actual ${actual}"
Expand Down
51 changes: 51 additions & 0 deletions tests/e2e_csv/conf/diff_config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# diff Configuration.

log-level = "info"

chunk-size = 1000

check-thread-count = 4

sample-percent = 100

use-rowid = false

use-checksum = true

fix-sql-file = "fix.sql"

# tables need to check.
[[check-tables]]
schema = "e2e_csv"
tables = ["escape", "t"]

[[table-config]]
schema = "e2e_csv"
table = "t"

[[table-config.source-tables]]
instance-id = "source-1"
schema = "e2e_csv"
table = "t"

[[table-config]]
schema = "e2e_csv"
table = "escape"

[[table-config.source-tables]]
instance-id = "source-1"
schema = "e2e_csv"
table = "escape"

[[source-db]]
host = "127.0.0.1"
port = 3306
user = "root"
password = ""
instance-id = "source-1"

[target-db]
host = "127.0.0.1"
port = 4000
user = "root"
password = ""
29 changes: 29 additions & 0 deletions tests/e2e_csv/conf/lightning.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
### tidb-lightning config

[lightning]
server-mode = false
level = "error"
check-requirements = false

[tikv-importer]
backend = "tidb"
on-duplicate = "error"

[mydumper]
data-source-dir = "/tmp/dumpling_test_result/sql_res.e2e_csv"

[mydumper.csv]
separator = 'separator-place-holder'
delimiter = "delimiter-place-holder"
header = true
not-null = false
null = '\N'
backslash-escape = backslash-escape-place-holder
trim-last-separator = false

[tidb]
host = "127.0.0.1"
port = 4000
user = "root"
password = ""
status-port = 10080
1 change: 1 addition & 0 deletions tests/e2e_csv/data/e2e_csv-schema-create.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE DATABASE `e2e_csv` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_bin */ /*!80016 DEFAULT ENCRYPTION='N' */;
1 change: 1 addition & 0 deletions tests/e2e_csv/data/e2e_csv.escape-schema.sql
1 change: 1 addition & 0 deletions tests/e2e_csv/data/e2e_csv.escape.sql
1 change: 1 addition & 0 deletions tests/e2e_csv/data/e2e_csv.t-schema.sql
1 change: 1 addition & 0 deletions tests/e2e_csv/data/e2e_csv.t.sql
84 changes: 84 additions & 0 deletions tests/e2e_csv/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash
#
# Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

set -eu
cur=$(cd `dirname $0`; pwd)

DB_NAME="e2e_csv"

# drop database on mysql
export DUMPLING_TEST_PORT=3306
run_sql "drop database if exists $DB_NAME;"

run_sql_file "$DUMPLING_BASE_NAME/data/e2e_csv-schema-create.sql"
export DUMPLING_TEST_DATABASE="e2e_csv"
run_sql_file "$DUMPLING_BASE_NAME/data/e2e_csv.escape-schema.sql"
run_sql_file "$DUMPLING_BASE_NAME/data/e2e_csv.escape.sql"
run_sql_file "$DUMPLING_BASE_NAME/data/e2e_csv.t-schema.sql"

mkdir -p $DUMPLING_TEST_DIR/data
# lightning will omit empty lines without delimiters now, skip these cases
sed "s/('')/-- ('')/g" "$DUMPLING_BASE_NAME/data/e2e_csv.t.sql" | sed "s/(' ')/-- (' ')/g" > $DUMPLING_TEST_DIR/data/e2e_csv.t.sql
run_sql_file "$DUMPLING_TEST_DIR/data/e2e_csv.t.sql"

run() {
echo "*** running subtest case ***"
echo "escape_backslash is $escape_backslash"
echo "csv_delimiter is $csv_delimiter"
echo "csv_separator is $csv_separator"

# drop database on tidb
export DUMPLING_TEST_PORT=4000
export DUMPLING_TEST_DATABASE=""
run_sql "drop database if exists $DB_NAME;"

# dumping
export DUMPLING_TEST_PORT=3306
export DUMPLING_TEST_DATABASE=$DB_NAME
run_dumpling --filetype="csv" --escape-backslash=$escape_backslash --csv-delimiter="$csv_delimiter" --csv-separator="$csv_separator"

# construct lightning configuration
mkdir -p $DUMPLING_TEST_DIR/conf
cp "$cur/conf/lightning.toml" $DUMPLING_TEST_DIR/conf

sed -i -e "s/separator-place-holder/$csv_separator/g" $DUMPLING_TEST_DIR/conf/lightning.toml
csv_delimiter_holder=$csv_delimiter
if [ "$csv_delimiter" = '"' ]; then
# We want to replace delimiter-place-holder in lightning.toml to \",
# but sed will identify \" as ", so we need to use \\\" here.
csv_delimiter_holder='\\\"'
fi
sed -i -e "s/delimiter-place-holder/$csv_delimiter_holder/g" $DUMPLING_TEST_DIR/conf/lightning.toml
escape_backslash_holder="true"
if [ "$escape_backslash" = "false" ] && [ "$csv_delimiter" != "" ]; then
escape_backslash_holder="false"
fi
sed -i -e "s/backslash-escape-place-holder/$escape_backslash_holder/g" $DUMPLING_TEST_DIR/conf/lightning.toml

cat "$DUMPLING_TEST_DIR/conf/lightning.toml"
# use lightning import data to tidb
run_lightning $DUMPLING_TEST_DIR/conf/lightning.toml

# check mysql and tidb data
check_sync_diff $cur/conf/diff_config.toml
}

escape_backslash_arr="true false"
csv_delimiter_arr="\" '"
csv_separator_arr=', a aa |*|'

for escape_backslash in $escape_backslash_arr
do
for csv_separator in $csv_separator_arr
do
for csv_delimiter in $csv_delimiter_arr
do
run
done
if [ "$escape_backslash" = "true" ]; then
csv_delimiter=""
run
fi
done
done
4 changes: 2 additions & 2 deletions tests/naughty_strings/data/naughty_strings.escape-schema.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CREATE TABLE `escape` (
`a` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
`a` text CHARACTER SET utf8mb4 COLLATE utf8mb4_bin,
`b` varchar(13) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;

16 changes: 9 additions & 7 deletions tests/naughty_strings/data/naughty_strings.escape.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/*!40101 SET NAMES binary*/;
INSERT INTO `escape` VALUES
(''''),
('"'),
(''''''),
('""'),
('''"'''),
('"''''''''"''"'),
('"''"''"''''''''"');
('''', '"'),
('"', ''''''),
('''''', '""'),
('""', '''"'''),
('''"''', '"''''''"''"'),
('"''''''''"''"', '"''"''''''''''"'),
('"''"''"''''''''"', ''),
('a",b,"a', 'a,"c",a'),
('bbaa|*|aabb', 'bba|*|a|*|abb');
16 changes: 9 additions & 7 deletions tests/naughty_strings/expect/naughty_strings.escape.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/*!40101 SET NAMES binary*/;
INSERT INTO `escape` VALUES
('\''),
('\"'),
('\'\''),
('\"\"'),
('\'\"\''),
('\"\'\'\'\'\"\'\"'),
('\"\'\"\'\"\'\'\'\'\"');
('\'','\"'),
('\"','\'\''),
('\'\'','\"\"'),
('\"\"','\'\"\''),
('\'\"\'','\"\'\'\'\"\'\"'),
('\"\'\'\'\'\"\'\"','\"\'\"\'\'\'\'\'\"'),
('\"\'\"\'\"\'\'\'\'\"',''),
('a\",b,\"a','a,\"c\",a'),
('bbaa|*|aabb','bba|*|a|*|abb');
3 changes: 3 additions & 0 deletions v4/export/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,9 @@ func (conf *Config) ParseFromFlags(flags *pflag.FlagSet) error {
if conf.Threads <= 0 {
return errors.Errorf("--threads is set to %d. It should be greater than 0", conf.Threads)
}
if len(conf.CsvSeparator) == 0 {
return errors.New("--csv-separator is set to \"\". It must not be an empty string")
}

if conf.SessionParams == nil {
conf.SessionParams = make(map[string]interface{})
Expand Down
80 changes: 61 additions & 19 deletions v4/export/sql_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ var colTypeRowReceiverMap = map[string]func() RowReceiverStringer{}
var (
nullValue = "NULL"
quotationMark = []byte{'\''}
twoQuotationMarks = []byte{'\'', '\''}
doubleQuotationMark = []byte{'"'}
)

Expand Down Expand Up @@ -49,19 +50,7 @@ var dataTypeBin = []string{
"BIT",
}

func getEscapeQuotation(escapeBackSlash bool, escapeQuotation []byte) []byte { // revive:disable-line:flag-parameter
if escapeBackSlash {
return nil
}
return escapeQuotation
}

func escape(s []byte, bf *bytes.Buffer, escapeQuotation []byte) {
if len(escapeQuotation) > 0 {
bf.Write(bytes.ReplaceAll(s, escapeQuotation, append(escapeQuotation, escapeQuotation...)))
return
}

func escapeBackslashSQL(s []byte, bf *bytes.Buffer) {
var (
escape byte
last = 0
Expand Down Expand Up @@ -94,10 +83,63 @@ func escape(s []byte, bf *bytes.Buffer, escapeQuotation []byte) {
last = i + 1
}
}
if last == 0 {
bf.Write(s[last:])
}

func escapeBackslashCSV(s []byte, bf *bytes.Buffer, opt *csvOption) {
var (
escape byte
last = 0
specCmt byte = 0
)
if len(opt.delimiter) > 0 {
specCmt = opt.delimiter[0] // if csv has a delimiter, we should use backslash to comment the delimiter in field value
} else if len(opt.separator) > 0 {
specCmt = opt.separator[0] // if csv's delimiter is "", we should escape the separator to avoid error
}

for i := 0; i < len(s); i++ {
escape = 0

switch s[i] {
case 0: /* Must be escaped for 'mysql' */
escape = '0'
case '\r':
escape = 'r'
case '\n': /* escaped for line terminators */
escape = 'n'
case '\\':
escape = '\\'
case specCmt:
escape = specCmt
}

if escape != 0 {
bf.Write(s[last:i])
bf.WriteByte('\\')
bf.WriteByte(escape)
last = i + 1
}
}
bf.Write(s[last:])
}

func escapeSQL(s []byte, bf *bytes.Buffer, escapeBackslash bool) { // revive:disable-line:flag-parameter
if escapeBackslash {
escapeBackslashSQL(s, bf)
} else {
bf.Write(bytes.ReplaceAll(s, quotationMark, twoQuotationMarks))
}
}

func escapeCSV(s []byte, bf *bytes.Buffer, escapeBackslash bool, opt *csvOption) { // revive:disable-line:flag-parameter
switch {
case escapeBackslash:
escapeBackslashCSV(s, bf, opt)
case len(opt.delimiter) > 0:
bf.Write(bytes.ReplaceAll(s, opt.delimiter, append(opt.delimiter, opt.delimiter...)))
default:
bf.Write(s)
} else if last < len(s) {
bf.Write(s[last:])
}
}

Expand Down Expand Up @@ -208,7 +250,7 @@ func (s *SQLTypeString) BindAddress(arg []interface{}) {
func (s *SQLTypeString) WriteToBuffer(bf *bytes.Buffer, escapeBackslash bool) {
if s.RawBytes != nil {
bf.Write(quotationMark)
escape(s.RawBytes, bf, getEscapeQuotation(escapeBackslash, quotationMark))
escapeSQL(s.RawBytes, bf, escapeBackslash)
bf.Write(quotationMark)
} else {
bf.WriteString(nullValue)
Expand All @@ -219,7 +261,7 @@ func (s *SQLTypeString) WriteToBuffer(bf *bytes.Buffer, escapeBackslash bool) {
func (s *SQLTypeString) WriteToBufferInCsv(bf *bytes.Buffer, escapeBackslash bool, opt *csvOption) {
if s.RawBytes != nil {
bf.Write(opt.delimiter)
escape(s.RawBytes, bf, getEscapeQuotation(escapeBackslash, opt.delimiter))
escapeCSV(s.RawBytes, bf, escapeBackslash, opt)
bf.Write(opt.delimiter)
} else {
bf.WriteString(opt.nullValue)
Expand Down Expand Up @@ -249,7 +291,7 @@ func (s *SQLTypeBytes) WriteToBuffer(bf *bytes.Buffer, _ bool) {
func (s *SQLTypeBytes) WriteToBufferInCsv(bf *bytes.Buffer, escapeBackslash bool, opt *csvOption) {
if s.RawBytes != nil {
bf.Write(opt.delimiter)
escape(s.RawBytes, bf, getEscapeQuotation(escapeBackslash, opt.delimiter))
escapeCSV(s.RawBytes, bf, escapeBackslash, opt)
bf.Write(opt.delimiter)
} else {
bf.WriteString(opt.nullValue)
Expand Down
Loading

0 comments on commit 11d8d5d

Please sign in to comment.