Skip to content

Commit 3bc6ea5

Browse files
committed
Auto merge of #152954 - Kmeakin:km/unicode-data/case-mapping, r=Mark-Simulacrum
Unicode data: reduce size of to_lower/to_upper tables. Reduces the combined size of to_lower and to_upper from 25,364 bytes to 3,110 bytes. Explained in detail in the doc comments.
2 parents 595f14b + 902199b commit 3bc6ea5

File tree

9 files changed

+3848
-996
lines changed

9 files changed

+3848
-996
lines changed

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
1919
pub(crate) mod printable;
2020

2121
#[allow(unreachable_pub)]
22-
mod unicode_data;
22+
pub mod unicode_data;
2323

2424
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
2525
/// `char` and `str` methods are based on.

library/core/src/unicode/unicode_data.rs

Lines changed: 385 additions & 822 deletions
Large diffs are not rendered by default.

library/coretests/tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
#![feature(uint_bit_width)]
121121
#![feature(uint_carryless_mul)]
122122
#![feature(uint_gather_scatter_bits)]
123+
#![feature(unicode_internals)]
123124
#![feature(unsize)]
124125
#![feature(unwrap_infallible)]
125126
#![feature(widening_mul)]

library/coretests/tests/unicode.rs

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,102 @@
1+
use core::unicode::unicode_data;
2+
use std::ops::RangeInclusive;
3+
4+
mod test_data;
5+
16
#[test]
27
pub fn version() {
38
let (major, _minor, _update) = core::char::UNICODE_VERSION;
49
assert!(major >= 10);
510
}
11+
12+
#[track_caller]
13+
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
14+
let mut start = '\u{80}';
15+
for range in ranges {
16+
for c in start..*range.start() {
17+
assert!(!lookup(c), "{c:?}");
18+
}
19+
for c in range.clone() {
20+
assert!(lookup(c), "{c:?}");
21+
}
22+
start = char::from_u32(*range.end() as u32 + 1).unwrap();
23+
}
24+
for c in start..=char::MAX {
25+
assert!(!lookup(c), "{c:?}");
26+
}
27+
}
28+
29+
#[track_caller]
30+
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
31+
let mut start = '\u{80}';
32+
for &(key, val) in ranges {
33+
for c in start..key {
34+
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
35+
}
36+
assert_eq!(lookup(key), val, "{key:?}");
37+
start = char::from_u32(key as u32 + 1).unwrap();
38+
}
39+
for c in start..=char::MAX {
40+
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
41+
}
42+
}
43+
44+
#[test]
45+
#[cfg_attr(miri, ignore)] // Miri is too slow
46+
fn alphabetic() {
47+
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
48+
}
49+
50+
#[test]
51+
#[cfg_attr(miri, ignore)] // Miri is too slow
52+
fn case_ignorable() {
53+
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
54+
}
55+
56+
#[test]
57+
#[cfg_attr(miri, ignore)] // Miri is too slow
58+
fn cased() {
59+
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
60+
}
61+
62+
#[test]
63+
#[cfg_attr(miri, ignore)] // Miri is too slow
64+
fn grapheme_extend() {
65+
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
66+
}
67+
68+
#[test]
69+
#[cfg_attr(miri, ignore)] // Miri is too slow
70+
fn lowercase() {
71+
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
72+
}
73+
74+
#[test]
75+
#[cfg_attr(miri, ignore)] // Miri is too slow
76+
fn n() {
77+
test_boolean_property(test_data::N, unicode_data::n::lookup);
78+
}
79+
80+
#[test]
81+
#[cfg_attr(miri, ignore)] // Miri is too slow
82+
fn uppercase() {
83+
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
84+
}
85+
86+
#[test]
87+
#[cfg_attr(miri, ignore)] // Miri is too slow
88+
fn white_space() {
89+
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
90+
}
91+
92+
#[test]
93+
#[cfg_attr(miri, ignore)] // Miri is too slow
94+
fn to_lowercase() {
95+
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
96+
}
97+
98+
#[test]
99+
#[cfg_attr(miri, ignore)] // Miri is too slow
100+
fn to_uppercase() {
101+
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
102+
}

library/coretests/tests/unicode/test_data.rs

Lines changed: 2902 additions & 0 deletions
Large diffs are not rendered by default.

src/bootstrap/src/core/build_steps/run.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,8 @@ impl Step for GenerateCompletions {
358358
}
359359
}
360360

361+
/// The build step for generating the tables in `library/core/src/unicode/unicode_data.rs`
362+
/// and the tests in `library/coretests/tests/unicode/test_data.rs`.
361363
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
362364
pub struct UnicodeTableGenerator;
363365

@@ -375,7 +377,9 @@ impl Step for UnicodeTableGenerator {
375377

376378
fn run(self, builder: &Builder<'_>) {
377379
let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
380+
// Generated files that are checked into git:
378381
cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
382+
cmd.arg(builder.src.join("library/coretests/tests/unicode/test_data.rs"));
379383
cmd.run(builder);
380384
}
381385
}

0 commit comments

Comments
 (0)