Skip to content

Commit e86618a

Browse files
srielaucloud-fan
authored and committed
[SPARK-55991] Fix Unicode-related SQL text corruption with parameters
### What changes were proposed in this pull request? Fix the parameter substitution code to be mindful of Unicode supplementary characters. ### Why are the changes needed? Emojis (and other supplementary characters) cause corruption of the SQL text when parameter markers are substituted, due to offset issues (code point vs. character offsets). ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added new test cases. ### Was this patch authored or co-authored using generative AI tooling? Yes, Claude Opus 4.6 high. Closes #54798 from srielau/emoji. Authored-by: Serge Rielau <serge@rielau.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com> (cherry picked from commit 4d79768) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 50d9886 commit e86618a

File tree

2 files changed

+48
-8
lines changed

2 files changed

+48
-8
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParamsParser.scala

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,22 +188,25 @@ class SubstituteParamsParser extends Logging {
188188
* Apply a list of substitutions to the SQL text.
189189
* Inserts a space separator when a parameter is immediately preceded by a quote
190190
* to avoid back-to-back quotes after substitution.
191+
*
192+
* ANTLR's CodePointCharStream reports token positions in Unicode code points, but
193+
* Java/Scala String indices are in UTF-16 code units. Supplementary characters
194+
* (e.g. emojis) occupy 1 code point but 2 code units, so we must convert.
191195
*/
192196
private def applySubstitutions(sqlText: String, substitutions: List[Substitution]): String = {
193-
// Sort substitutions by start position in reverse order to avoid offset issues
194197
val sortedSubstitutions = substitutions.sortBy(-_.start)
195198

196199
var result = sqlText
197200
sortedSubstitutions.foreach { substitution =>
198-
val prefix = result.substring(0, substitution.start)
201+
val startCU = result.offsetByCodePoints(0, substitution.start)
202+
val endCU = result.offsetByCodePoints(0, substitution.end)
203+
val prefix = result.substring(0, startCU)
199204
val replacement = substitution.replacement
200-
val suffix = result.substring(substitution.end)
205+
val suffix = result.substring(endCU)
201206

202-
// Check if replacement is immediately preceded by a quote and doesn't already
203-
// start with whitespace
204-
val needsSpace = substitution.start > 0 &&
205-
(result(substitution.start - 1) == '\'' || result(substitution.start - 1) == '"') &&
206-
replacement.nonEmpty && !replacement(0).isWhitespace
207+
val needsSpace = startCU > 0 &&
208+
(result.charAt(startCU - 1) == '\'' || result.charAt(startCU - 1) == '"') &&
209+
replacement.nonEmpty && !replacement.charAt(0).isWhitespace
207210

208211
val space = if (needsSpace) " " else ""
209212
result = s"$prefix$space$replacement$suffix"

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ParameterSubstitutionSuite.scala

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,43 @@ class ParameterSubstitutionSuite extends SparkFunSuite {
121121
}
122122
}
123123

124+
test("ParameterHandler - named parameter with emoji in SQL") {
125+
val emoji = new String(Character.toChars(0x1F4AA)) // supplementary char (2 UTF-16 code units)
126+
val context = NamedParameterContext(Map("team" -> Literal("abc")))
127+
val sql = s"SELECT '${emoji}' AS a FROM T WHERE :team IS NULL"
128+
val (result, _) = ParameterHandler.substituteParameters(sql, context)
129+
assert(result === s"SELECT '${emoji}' AS a FROM T WHERE 'abc' IS NULL")
130+
}
131+
132+
test("ParameterHandler - positional parameter with emoji in SQL") {
133+
val emoji = new String(Character.toChars(0x1F4AA))
134+
val context = PositionalParameterContext(Seq(Literal("abc")))
135+
val sql = s"SELECT '${emoji}' AS a FROM T WHERE ? IS NULL"
136+
val (result, _) = ParameterHandler.substituteParameters(sql, context)
137+
assert(result === s"SELECT '${emoji}' AS a FROM T WHERE 'abc' IS NULL")
138+
}
139+
140+
test("ParameterHandler - multiple params with emoji in SQL and replacement values") {
141+
val flexed = new String(Character.toChars(0x1F4AA))
142+
val tada = new String(Character.toChars(0x1F389))
143+
val context = NamedParameterContext(Map(
144+
"p1" -> Literal(tada),
145+
"p2" -> Literal(42)
146+
))
147+
val sql = s"SELECT '${flexed}', :p1, '${flexed}', :p2"
148+
val (result, _) = ParameterHandler.substituteParameters(sql, context)
149+
assert(result === s"SELECT '${flexed}', '${tada}', '${flexed}', 42")
150+
}
151+
152+
test("ParameterHandler - positional params with multiple emojis") {
153+
val flexed = new String(Character.toChars(0x1F4AA))
154+
val tada = new String(Character.toChars(0x1F389))
155+
val context = PositionalParameterContext(Seq(Literal(tada), Literal(99)))
156+
val sql = s"SELECT '${flexed}', ?, '${flexed}${flexed}', ?"
157+
val (result, _) = ParameterHandler.substituteParameters(sql, context)
158+
assert(result === s"SELECT '${flexed}', '${tada}', '${flexed}${flexed}', 99")
159+
}
160+
124161
test("Large parameter set") {
125162

126163
val largeParamMap = (1 to 100).map(i => s"param$i" -> Literal(i)).toMap

0 commit comments

Comments
 (0)