Skip to content

Commit a9b70d1

Browse files
gmalette and ianks committed
Escape HTML Entities
Co-Authored-By: Ian Ker-Seymer <[email protected]>
1 parent c079793 commit a9b70d1

File tree

3 files changed

+120
-7
lines changed

3 files changed

+120
-7
lines changed

ext/json/ext/generator/generator.c

+54-7
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ typedef struct JSON_Generator_StateStruct {
2222
bool ascii_only;
2323
bool script_safe;
2424
bool strict;
25+
bool escape_html_entities;
2526
} JSON_Generator_State;
2627

2728
#ifndef RB_UNLIKELY
@@ -32,7 +33,7 @@ static VALUE mJSON, cState, cFragment, mString_Extend, eGeneratorError, eNesting
3233

3334
// File-scope handles shared by the generator. The sym_* values are Symbols:
// Init_generator fills them with ID2SYM(rb_intern("...")) and
// configure_state_i compares incoming option keys against them.
// sym_escape_html_entities is the key for the escape_html_entities option.
// NOTE(review): the i_* IDs are presumably interned method names; their
// initialization is not visible in this hunk.
static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode;
3435
static VALUE sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_nl, sym_max_nesting, sym_allow_nan,
35-
sym_ascii_only, sym_depth, sym_buffer_initial_length, sym_script_safe, sym_escape_slash, sym_strict, sym_as_json;
36+
sym_ascii_only, sym_depth, sym_buffer_initial_length, sym_script_safe, sym_escape_slash, sym_strict, sym_as_json, sym_escape_html_entities;
3637

3738

3839
#define GET_STATE_TO(self, state) \
@@ -251,11 +252,11 @@ static const unsigned char script_safe_escape_table[256] = {
251252
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
252253
};
253254

254-
static inline unsigned char search_script_safe_escape(search_state *search)
255+
static inline unsigned char search_with_escape_table(search_state *search, unsigned char *table)
255256
{
256257
while (search->ptr < search->end) {
257258
unsigned char ch = (unsigned char)*search->ptr;
258-
unsigned char ch_len = script_safe_escape_table[ch];
259+
unsigned char ch_len = table[ch];
259260

260261
if (RB_UNLIKELY(ch_len)) {
261262
if (ch_len & ESCAPE_MASK) {
@@ -279,14 +280,39 @@ static inline unsigned char search_script_safe_escape(search_state *search)
279280
return 0;
280281
}
281282

282-
static void convert_UTF8_to_script_safe_JSON(search_state *search)
283+
// Drives the escaping loop for `search` using the caller-supplied 256-entry
// lookup table: every non-zero length returned by search_with_escape_table is
// handed to escape_UTF8_char, and the loop stops once the search returns 0.
// This replaces the former script-safe-only converter so that several tables
// (script_safe, escape_html_entities) can share one implementation.
static inline void convert_UTF8_to_JSON_with_table(search_state *search, const unsigned char table[256])
283284
{
284285
unsigned char ch_len;
285-
while ((ch_len = search_script_safe_escape(search))) {
286+
// The cast drops const only because search_with_escape_table is declared
// with a non-const `unsigned char *table` parameter; the table is only read
// there (`table[ch]`).
while ((ch_len = search_with_escape_table(search, (unsigned char *)table))) {
286287
escape_UTF8_char(search, ch_len);
287288
}
288289
}
289290

291+
// Byte-indexed lookup table used by generate_json_string when
// state->escape_html_entities is set. search_with_escape_table reads
// table[byte]: an entry of 0 skips its special-case branch, while a non-zero
// entry is examined further (starting with an ESCAPE_MASK test).
// In the ASCII range the flagged bytes are the control characters plus
// '"', '&', '/', '<', '>' and '\\'.
// NOTE(review): the precise meaning of the values 9 and 11 depends on
// ESCAPE_MASK, which is defined outside this view -- confirm before editing.
static const unsigned char escape_html_entities_escape_table[256] = {
292+
// ASCII Control Characters
293+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
294+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
295+
// ASCII Characters
296+
0, 0, 9, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"', '&', and '/'
297+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 9, 0, // '<' and '>'
298+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
299+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
300+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302+
// Continuation byte
303+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
304+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
305+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
306+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
307+
// First byte of a 2-byte code point
308+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
309+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
310+
// First byte of a 3-byte code point
311+
3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
312+
// First byte of a 4+ byte code point
313+
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
314+
};
315+
290316
static const unsigned char ascii_only_escape_table[256] = {
291317
// ASCII Control Characters
292318
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
@@ -977,9 +1003,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
9771003
case ENC_CODERANGE_7BIT:
9781004
case ENC_CODERANGE_VALID:
9791005
if (RB_UNLIKELY(state->ascii_only)) {
980-
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
1006+
convert_UTF8_to_ASCII_only_JSON(&search, state->escape_html_entities ? escape_html_entities_escape_table : (state->script_safe ? script_safe_escape_table : ascii_only_escape_table));
1007+
} else if (RB_UNLIKELY(state->escape_html_entities)) {
1008+
convert_UTF8_to_JSON_with_table(&search, escape_html_entities_escape_table);
9811009
} else if (RB_UNLIKELY(state->script_safe)) {
982-
convert_UTF8_to_script_safe_JSON(&search);
1010+
convert_UTF8_to_JSON_with_table(&search, script_safe_escape_table);
9831011
} else {
9841012
convert_UTF8_to_JSON(&search);
9851013
}
@@ -1609,6 +1637,19 @@ static VALUE cState_buffer_initial_length_set(VALUE self, VALUE buffer_initial_l
16091637
return Qnil;
16101638
}
16111639

1640+
// Reader for the escape_html_entities flag of a State object.
// Returns Qtrue when the flag is set and Qfalse otherwise. Init_generator
// registers this function under both "escape_html_entities" and
// "escape_html_entities?".
static VALUE cState_escape_html_entities(VALUE self)
1641+
{
1642+
// GET_STATE brings the `state` pointer for `self` into scope.
GET_STATE(self);
1643+
return state->escape_html_entities ? Qtrue : Qfalse;
1644+
}
1645+
1646+
// Writer for the escape_html_entities flag of a State object (registered in
// Init_generator as "escape_html_entities=").
// Stores the truthiness of `val` (via RTEST) in the state and returns `val`
// unchanged.
static VALUE cState_escape_html_entities_set(VALUE self, VALUE val)
1647+
{
1648+
// GET_STATE brings the `state` pointer for `self` into scope.
GET_STATE(self);
1649+
state->escape_html_entities = RTEST(val);
1650+
return val;
1651+
}
1652+
16121653
static int configure_state_i(VALUE key, VALUE val, VALUE _arg)
16131654
{
16141655
JSON_Generator_State *state = (JSON_Generator_State *)_arg;
@@ -1627,6 +1668,8 @@ static int configure_state_i(VALUE key, VALUE val, VALUE _arg)
16271668
else if (key == sym_escape_slash) { state->script_safe = RTEST(val); }
16281669
else if (key == sym_strict) { state->strict = RTEST(val); }
16291670
else if (key == sym_as_json) { state->as_json = RTEST(val) ? rb_convert_type(val, T_DATA, "Proc", "to_proc") : Qfalse; }
1671+
else if (key == sym_escape_html_entities) { state->escape_html_entities = RTEST(val); }
1672+
16301673
return ST_CONTINUE;
16311674
}
16321675

@@ -1740,6 +1783,9 @@ void Init_generator(void)
17401783
rb_define_method(cState, "depth=", cState_depth_set, 1);
17411784
rb_define_method(cState, "buffer_initial_length", cState_buffer_initial_length, 0);
17421785
rb_define_method(cState, "buffer_initial_length=", cState_buffer_initial_length_set, 1);
1786+
rb_define_method(cState, "escape_html_entities", cState_escape_html_entities, 0);
1787+
rb_define_method(cState, "escape_html_entities?", cState_escape_html_entities, 0);
1788+
rb_define_method(cState, "escape_html_entities=", cState_escape_html_entities_set, 1);
17431789
rb_define_method(cState, "generate", cState_generate, -1);
17441790
rb_define_alias(cState, "generate_new", "generate"); // :nodoc:
17451791

@@ -1813,6 +1859,7 @@ void Init_generator(void)
18131859
sym_escape_slash = ID2SYM(rb_intern("escape_slash"));
18141860
sym_strict = ID2SYM(rb_intern("strict"));
18151861
sym_as_json = ID2SYM(rb_intern("as_json"));
1862+
sym_escape_html_entities = ID2SYM(rb_intern("escape_html_entities"));
18161863

18171864
usascii_encindex = rb_usascii_encindex();
18181865
utf8_encindex = rb_utf8_encindex();

lib/json/ext/generator/state.rb

+1
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ def to_h
6666
strict: strict?,
6767
depth: depth,
6868
buffer_initial_length: buffer_initial_length,
69+
escape_html_entities: escape_html_entities?,
6970
}
7071

7172
instance_variables.each do |iv|

test/json/json_generator_test.rb

+65
Original file line number | Diff line number | Diff line change
@@ -215,6 +215,7 @@ def test_pretty_state
215215
:object_nl => "\n",
216216
:space => " ",
217217
:space_before => "",
218+
:escape_html_entities => false,
218219
}.sort_by { |n,| n.to_s }, state.to_h.sort_by { |n,| n.to_s })
219220
end
220221

@@ -234,6 +235,7 @@ def test_safe_state
234235
:object_nl => "",
235236
:space => "",
236237
:space_before => "",
238+
:escape_html_entities => false,
237239
}.sort_by { |n,| n.to_s }, state.to_h.sort_by { |n,| n.to_s })
238240
end
239241

@@ -253,6 +255,7 @@ def test_fast_state
253255
:object_nl => "",
254256
:space => "",
255257
:space_before => "",
258+
:escape_html_entities => false,
256259
}.sort_by { |n,| n.to_s }, state.to_h.sort_by { |n,| n.to_s })
257260
end
258261

@@ -481,6 +484,68 @@ def test_backslash
481484
assert_equal json, generate(data, script_safe: true)
482485
end
483486

487+
# Checks the output of generate with escape_html_entities enabled for '/',
# U+2028/U+2029, '&', '<', '>' and for non-ASCII text that must stay as-is.
# Expected strings are single-quoted, so "\\u0026" below is the six characters
# \u0026 and '\u2028' is a literal backslash escape, not the raw character.
def test_escape_html_entities
488+
data = [ '/' ]
489+
json = '["\/"]'
490+
assert_equal json, generate(data, :escape_html_entities => true)
491+
# Raw U+2028 / U+2029 characters are emitted as \u2028 / \u2029 escapes.
492+
data = [ "\u2028\u2029" ]
493+
json = '["\u2028\u2029"]'
494+
assert_equal json, generate(data, :escape_html_entities => true)
495+
# '&' is emitted as \u0026.
496+
data = ['&']
497+
json = '["\\u0026"]'
498+
assert_equal json, generate(data, escape_html_entities: true)
499+
# '<' is emitted as \u003c.
500+
data = ['<']
501+
json = '["\\u003c"]'
502+
assert_equal json, generate(data, escape_html_entities: true)
503+
# '>' is emitted as \u003e.
504+
data = ['>']
505+
json = '["\\u003e"]'
506+
assert_equal json, generate(data, escape_html_entities: true)
507+
# Other multi-byte characters pass through unescaped. U+5029 and U+7028
# share their trailing UTF-8 bytes with U+2029 / U+2028 -- presumably chosen
# to guard against false matches on those bytes.
508+
data = ["倩", "瀨"]
509+
json = '["倩","瀨"]'
510+
assert_equal json, generate(data, escape_html_entities: true)
511+
end
512+
513+
# Checks that when both escape_html_entities and script_safe are enabled the
# HTML-entity escapes ('&', '<', '>') are still produced, alongside the '/'
# escape.
def test_escape_html_entities_priority_over_script_safe
514+
data = ['&']
515+
json = '["\\u0026"]'
516+
assert_equal json, generate(data, escape_html_entities: true, script_safe: true)
517+
# '<' is emitted as \u003c.
518+
data = ['<']
519+
json = '["\\u003c"]'
520+
assert_equal json, generate(data, escape_html_entities: true, script_safe: true)
521+
# '>' is emitted as \u003e.
522+
data = ['>']
523+
json = '["\\u003e"]'
524+
assert_equal json, generate(data, escape_html_entities: true, script_safe: true)
525+
# '/' is emitted as \/.
526+
data = ['/']
527+
json = '["\/"]'
528+
assert_equal json, generate(data, escape_html_entities: true, script_safe: true)
529+
# All four characters in a single string.
530+
data = ['&<>/']
531+
json = '["\\u0026\\u003c\\u003e\/"]'
532+
assert_equal json, generate(data, escape_html_entities: true, script_safe: true)
533+
end
534+
535+
# Checks the combination of ascii_only and escape_html_entities: non-ASCII
# characters and the HTML-sensitive characters are all emitted as \uXXXX
# escapes, while plain ASCII text is left untouched.
def test_ascii_only_with_escape_html_entities
536+
data = ['é&<>']
537+
json = '["\\u00e9\\u0026\\u003c\\u003e"]'
538+
assert_equal json, generate(data, ascii_only: true, escape_html_entities: true)
539+
# Plain ASCII letters and digits are not escaped.
540+
data = ['abc123']
541+
json = '["abc123"]'
542+
assert_equal json, generate(data, ascii_only: true, escape_html_entities: true)
543+
# Three-byte characters are emitted as their \uXXXX code points.
544+
data = ['倩瀨']
545+
json = '["\\u5029\\u7028"]'
546+
assert_equal json, generate(data, ascii_only: true, escape_html_entities: true)
547+
end
548+
484549
def test_string_subclass
485550
s = Class.new(String) do
486551
def to_s; self; end

0 commit comments

Comments
 (0)