Skip to content

Commit 8653bd9

Browse files
authored
Merge pull request #79 from Quafadas/noescape
Improve test cases and durability of CSV parsing, particularly for non-RFC-compliant CSV files.
2 parents 4311887 + 33445a6 commit 8653bd9

File tree

2 files changed

+211
-7
lines changed

2 files changed

+211
-7
lines changed

scautable/src/csvParser.scala

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,54 @@ private[scautable] object CSVParser:
1212
var inQuotes = false
1313
val cellBuffer = new StringBuilder
1414
val result = scala.collection.mutable.ListBuffer.empty[String]
15+
var i = 0
16+
17+
while i < line.length do
18+
val char = line.charAt(i)
1519

16-
for char <- line do
1720
char match
1821
case `quote` if !inQuotes =>
1922
// Start of quoted section
2023
inQuotes = true
2124

2225
case `quote` if inQuotes =>
23-
// End of quoted section (peek ahead to handle escaped quotes)
24-
if cellBuffer.nonEmpty && cellBuffer.last == quote then
25-
cellBuffer.deleteCharAt(cellBuffer.length - 1) // Handle escaped quote
26-
cellBuffer.append(char)
27-
else inQuotes = false
26+
// Check for RFC 4180 double-quote escaping
27+
if i + 1 < line.length && line.charAt(i + 1) == quote then
28+
// RFC 4180: doubled quote within quotes becomes a single quote
29+
cellBuffer.append(quote)
30+
i += 1 // Skip the next quote
31+
else
32+
// End of quoted section
33+
inQuotes = false
34+
35+
case '\\' if inQuotes && i + 1 < line.length =>
36+
// Handle backslash-escaped characters
37+
val nextChar = line.charAt(i + 1)
38+
nextChar match
39+
case 'n' =>
40+
// Escaped linefeed
41+
cellBuffer.append('\n')
42+
i += 1
43+
case 'r' =>
44+
// Escaped carriage return
45+
cellBuffer.append('\r')
46+
i += 1
47+
case '\\' =>
48+
// Escaped backslash
49+
cellBuffer.append('\\')
50+
i += 1
51+
case `delimiter` =>
52+
// Escaped delimiter
53+
cellBuffer.append(delimiter)
54+
i += 1
55+
case `quote` =>
56+
// Escaped quote character
57+
cellBuffer.append(quote)
58+
i += 1
59+
case _ =>
60+
// Unknown escape sequence - treat backslash literally
61+
cellBuffer.append('\\')
62+
// Don't increment i, let the next character be processed normally
2863

2964
case `delimiter` if !inQuotes =>
3065
// Delimiter outside quotes ends the current cell
@@ -34,7 +69,9 @@ private[scautable] object CSVParser:
3469
case _ =>
3570
// Add character to the current cell
3671
cellBuffer.append(char)
37-
end for
72+
73+
i += 1
74+
end while
3875

3976
// Append the last cell, if any
4077
result.append(cellBuffer.toString)
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
package io.github.quafadas.scautable
2+
3+
import munit.FunSuite
4+
5+
/** Unit tests for `CSVParser.parseLine`.
  *
  * Each test feeds a single line of text to the parser and compares the returned cells against the expected list.
  * The cases cover plain and quoted fields, doubled-quote escaping (RFC 4180 style), backslash escape sequences
  * inside quoted fields, empty fields, and the optional `delimiter` / `quote` arguments.
  */
class CSVParserSuite extends FunSuite:

  // ---- Plain and quoted fields ----

  test("parseLine should handle simple unquoted fields") {
    val cells = CSVParser.parseLine("field1,field2,field3")
    assertEquals(cells, List("field1", "field2", "field3"))
  }

  test("parseLine should handle quoted fields") {
    val cells = CSVParser.parseLine("\"field1\",\"field2\",\"field3\"")
    assertEquals(cells, List("field1", "field2", "field3"))
  }

  test("parseLine should handle mixed quoted and unquoted fields") {
    val cells = CSVParser.parseLine("field1,\"field2\",field3")
    assertEquals(cells, List("field1", "field2", "field3"))
  }

  test("parseLine should handle fields with commas inside quotes") {
    // Delimiters that appear between quotes belong to the cell and must not split it.
    val cells = CSVParser.parseLine("field1,\"field2,with,commas\",field3")
    assertEquals(cells, List("field1", "field2,with,commas", "field3"))
  }

  // ---- Quote escaping: doubled quotes (RFC 4180) and backslash-quote ----

  test("parseLine should handle RFC 4180 compliant double-quote escaping") {
    // A doubled quote inside a quoted field stands for one literal quote.
    val cells = CSVParser.parseLine("\"this is my \"\"Test String\"\"\"")
    assertEquals(cells, List("this is my \"Test String\""))
  }

  test("parseLine should handle backslash-escaped quotes") {
    // Non-RFC alternative: a backslash in front of the quote marks it as literal.
    val cells = CSVParser.parseLine("\"this is my \\\"Test String\\\"\"")
    assertEquals(cells, List("this is my \"Test String\""))
  }

  test("parseLine should handle multiple fields with RFC 4180 escaping") {
    val cells = CSVParser.parseLine("field1,\"field2 with \"\"quotes\"\"\",field3")
    assertEquals(cells, List("field1", "field2 with \"quotes\"", "field3"))
  }

  test("parseLine should handle multiple fields with backslash escaping") {
    val cells = CSVParser.parseLine("field1,\"field2 with \\\"quotes\\\"\",field3")
    assertEquals(cells, List("field1", "field2 with \"quotes\"", "field3"))
  }

  // ---- Empty fields and embedded newlines ----

  test("parseLine should handle empty fields") {
    val cells = CSVParser.parseLine("field1,,field3")
    assertEquals(cells, List("field1", "", "field3"))
  }

  test("parseLine should handle empty quoted fields") {
    val cells = CSVParser.parseLine("field1,\"\",field3")
    assertEquals(cells, List("field1", "", "field3"))
  }

  test("parseLine should handle newlines inside quoted fields") {
    // The input holds real newline characters (not the two-character sequence backslash + n).
    val cells = CSVParser.parseLine("field1,\"field2\nwith\nnewlines\",field3")
    assertEquals(cells, List("field1", "field2\nwith\nnewlines", "field3"))
  }

  // ---- Custom delimiter / quote characters ----

  test("parseLine should handle custom delimiter") {
    val cells = CSVParser.parseLine("field1;field2;field3", delimiter = ';')
    assertEquals(cells, List("field1", "field2", "field3"))
  }

  test("parseLine should handle custom quote character") {
    // Doubling applies to whichever character is configured as the quote.
    val cells = CSVParser.parseLine("field1,'field2 with ''spaces''',field3", quote = '\'')
    assertEquals(cells, List("field1", "field2 with 'spaces'", "field3"))
  }

  // ---- Heavier escaping scenarios ----

  test("parseLine should handle complex RFC 4180 case") {
    // Several doubled quotes, including a nested pair and a comma, in one field.
    val cells = CSVParser.parseLine("\"He said \"\"Hello, \"\"World\"\"!\"\"\"")
    assertEquals(cells, List("He said \"Hello, \"World\"!\""))
  }

  test("parseLine should handle complex backslash escape case") {
    // Same content as the RFC 4180 case, written with backslash-escaped quotes.
    val cells = CSVParser.parseLine("\"He said \\\"Hello, \\\"World\\\"!\\\"\"")
    assertEquals(cells, List("He said \"Hello, \"World\"!\""))
  }

  test("parseLine should handle mixed escaping in different fields") {
    // First field uses doubled quotes, second field uses backslash-escaped quotes.
    val cells =
      CSVParser.parseLine("\"field with \"\"RFC escaping\"\"\",\"field with \\\"backslash escaping\\\"\"")
    assertEquals(cells, List("field with \"RFC escaping\"", "field with \"backslash escaping\""))
  }

  test("parseLine should handle trailing empty field") {
    val cells = CSVParser.parseLine("field1,field2,")
    assertEquals(cells, List("field1", "field2", ""))
  }

  test("parseLine should handle leading empty field") {
    val cells = CSVParser.parseLine(",field2,field3")
    assertEquals(cells, List("", "field2", "field3"))
  }

  // ---- Backslash escape sequences inside quoted fields (ESCAPE specification) ----

  test("parseLine should handle backslash-escaped linefeeds") {
    // The two characters backslash + n in the input become one linefeed in the cell.
    val cells = CSVParser.parseLine("field1,\"field2 with \\n newline\",field3")
    assertEquals(cells, List("field1", "field2 with \n newline", "field3"))
  }

  test("parseLine should handle backslash-escaped carriage returns") {
    val cells = CSVParser.parseLine("field1,\"field2 with \\r return\",field3")
    assertEquals(cells, List("field1", "field2 with \r return", "field3"))
  }

  test("parseLine should handle backslash-escaped delimiters") {
    val cells = CSVParser.parseLine("field1,\"field2 with \\, comma\",field3")
    assertEquals(cells, List("field1", "field2 with , comma", "field3"))
  }

  test("parseLine should handle backslash-escaped backslashes") {
    // Two backslashes in the input collapse to a single backslash in the cell.
    val cells = CSVParser.parseLine("field1,\"field2 with \\\\ backslash\",field3")
    assertEquals(cells, List("field1", "field2 with \\ backslash", "field3"))
  }

  test("parseLine should handle custom delimiter with backslash-escaped delimiter") {
    val cells = CSVParser.parseLine("field1;\"field2 with \\; semicolon\";field3", delimiter = ';')
    assertEquals(cells, List("field1", "field2 with ; semicolon", "field3"))
  }

  test("parseLine should handle multiple escape sequences in one field") {
    val cells = CSVParser.parseLine("field1,\"field2 with \\n\\r\\, and \\\" escapes\",field3")
    assertEquals(cells, List("field1", "field2 with \n\r, and \" escapes", "field3"))
  }

  test("parseLine should handle backslash at end of field (not escaping anything)") {
    // The input ends the quoted field with two backslashes followed by the closing quote,
    // so this exercises an escaped backslash directly before the end of the quoted section.
    val cells = CSVParser.parseLine("field1,\"field2 ends with \\\\\",field3")
    assertEquals(cells, List("field1", "field2 ends with \\", "field3"))
  }

  test("parseLine should handle invalid escape sequences by treating backslash literally") {
    // Backslash + z is not a recognised escape: both characters are expected in the output.
    val cells = CSVParser.parseLine("field1,\"field2 with \\z invalid escape\",field3")
    assertEquals(cells, List("field1", "field2 with \\z invalid escape", "field3"))
  }

0 commit comments

Comments
 (0)