icu_provider_source/characters/
mod.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::ops::Deref;
use std::collections::HashSet;

use crate::cldr_serde;
use crate::IterableDataProviderCached;
use crate::SourceDataProvider;
use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu::locale::provider::*;
use icu_provider::prelude::*;
use itertools::Itertools;

/// Generates a `DataProvider` impl and an `IterableDataProviderCached` impl on
/// `SourceDataProvider` for one exemplar-characters data marker, reading the
/// named field out of the locale's CLDR `characters.json`.
macro_rules! exemplar_chars_impls {
    ($data_marker_name:ident, $cldr_serde_field_name:ident) => {
        impl DataProvider<$data_marker_name> for SourceDataProvider {
            fn load(&self, req: DataRequest) -> Result<DataResponse<$data_marker_name>, DataError> {
                self.check_req::<$data_marker_name>(req)?;

                // Read and parse the requested locale's characters.json from the
                // CLDR "misc" tree.
                let data: &cldr_serde::exemplar_chars::Resource = self
                    .cldr()?
                    .misc()
                    .read_and_parse(req.id.locale, "characters.json")?;

                Ok(DataResponse {
                    metadata: Default::default(),
                    payload: DataPayload::from_owned(string_to_prop_unicodeset(
                        // A locale may omit this category entirely; fall back to
                        // the serialized empty set "[]".
                        data.main
                            .value
                            .characters
                            .$cldr_serde_field_name
                            .as_deref()
                            .unwrap_or("[]"),
                    )),
                })
            }
        }

        impl IterableDataProviderCached<$data_marker_name> for SourceDataProvider {
            fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError> {
                // Every locale present in the CLDR "misc" tree is advertised for
                // this marker (missing categories load as the empty set above).
                Ok(self
                    .cldr()?
                    .misc()
                    .list_locales()?
                    .map(DataIdentifierCow::from_locale)
                    .collect())
            }
        }
    };
}

// One provider impl per exemplar-character category defined by CLDR
// (main, auxiliary, punctuation, numbers, index).
exemplar_chars_impls!(ExemplarCharactersMainV1Marker, main);
exemplar_chars_impls!(ExemplarCharactersAuxiliaryV1Marker, auxiliary);
exemplar_chars_impls!(ExemplarCharactersPunctuationV1Marker, punctuation);
exemplar_chars_impls!(ExemplarCharactersNumbersV1Marker, numbers);
exemplar_chars_impls!(ExemplarCharactersIndexV1Marker, index);

/// Convert a serialized CLDR-JSON exemplar-characters string (e.g. "[a b {ch}]")
/// into the runtime `ExemplarCharactersV1` data struct.
fn string_to_prop_unicodeset(s: &str) -> ExemplarCharactersV1<'static> {
    let parsed = parse_exemplar_char_string(s);
    // Sort the deduplicated strings before handing them to the inversion-list
    // builder, matching the order `from_iter` is given in the rest of this crate.
    let sorted_strs = parsed.iter().map(Deref::deref).sorted();
    ExemplarCharactersV1(CodePointInversionListAndStringList::from_iter(sorted_strs))
}

/// In the occurrence of subsequences that are used to represent character literals,
/// like "\\\\:" or "\\\\\\\\[", excise the subsequence from the input string
/// and prepopulate the set with the corresponding characters like ":" and "[".
/// But since Unicode code point escape sequences, like "\\\\\\\\U00011000" can & should
/// be handled in a later step by the TOML parser, leave those subsequences alone.
///
/// `set` receives the unescaped character literals; `input` is rewritten in place
/// with the handled escape subsequences removed.
fn preprocess_char_literal_notation(set: &mut HashSet<String>, input: &mut String) {
    let mut result = input.to_string();

    // These are backslash substrings sometimes used to escape character literals like punctuation.
    let possible_slash_strs = ["\\\\\\\\", "\\\\\\", "\\\\"];

    // Iterate in order of largest to smallest. Guarantee this with `.sorted().rev()`.
    for slash_str in possible_slash_strs.iter().sorted().rev() {
        let mut slash_result = result.clone();

        // Iterate right-to-left (`rmatch_indices`) so that indices computed over
        // `result` remain valid positions in `slash_result`: removals only ever
        // happen to the right of the indices not yet visited.
        for match_tuple in result.rmatch_indices(slash_str) {
            let slash_idx = match_tuple.0;

            // find returns a byte index, so temporarily use a byte index just for size check
            let maybe_next_char_idx = slash_idx + slash_str.len();
            if maybe_next_char_idx < slash_result.len() {
                // `slash_idx` points at ASCII backslashes, so `maybe_next_char_idx`
                // is a valid char boundary.
                let char_literal = slash_result[maybe_next_char_idx..].chars().next().unwrap();
                let char_literal_str = char_literal.to_string();

                // Skip if we're looking at a Unicode code point escape sequence (ex: "\\\\Uxxxxxxxx")
                // rather than a Unix/bash-style escaped character literal (ex: "\\\\:", "\\\\-").
                // Also skip if we're seeing a spurious result, ex: we are looking for a double backslash
                // (ex: "\\\\" in the presence of quad backslashes like "\\\\\\\\Uxxxxxxxx") that should
                // be left alone.
                // Also skip if there is whitespace after the backslashes. Let's assume that this
                // is part of a token of all backslashes. Allow that to be fully parsed and
                // handled later in `unescape_exemplar_chars()`.
                if char_literal_str == "U"
                    || char_literal_str == "u"
                    || char_literal_str == "\\"
                    || char_literal.is_whitespace()
                {
                    continue;
                }

                let char_literal_byte_len = char_literal_str.len();
                set.insert(char_literal_str);

                // Remove the slash and the char literal following it from the original string.
                let mut new_slash_result = slash_result[..slash_idx].to_string();
                new_slash_result
                    .push_str(&slash_result[(maybe_next_char_idx + char_literal_byte_len)..]);
                slash_result = new_slash_result;
            }
        }
        result.clear();
        result.push_str(&slash_result);
    }
    input.clear();
    input.push_str(&result);
}

/// Predicate fn that returns whether a character should be used in `.split()` to tokenize
/// the exemplar characters JSON string.
fn is_exemplar_string_split_char(c: char) -> bool {
    // Split on whitespace and on the open brace, but deliberately NOT on '}':
    // keeping the close brace inside the tokens lets the caller detect where a
    // `{...}` sequence ends after splitting.
    matches!(c, '{') || c.is_whitespace()
}

/// Unescape a (sub-)string of exemplar character data.
///
/// Removes backslash over-escaping and decodes `\uXXXX` / `\UXXXXXXXX` escape
/// sequences into the corresponding code points. Because JSON has no
/// `\UXXXXXXXX` escape, the decoding is done by round-tripping the string
/// through the TOML parser.
///
/// # Panics
/// Panics if, after the workarounds below, the string still cannot be parsed
/// by the TOML parser.
fn unescape_exemplar_chars(char_block: &str) -> String {
    // Exit early with degenerate case that interferes with TOML parser workaround.
    // Also handle a char block solely consisting of all backslashes (ex: "\\\\\\\\") as a backslash literal.
    // NOTE(review): the two quote comparisons below are the same code point
    // (U+0022); one may originally have been a curly quote lost in transit —
    // confirm against upstream before changing.
    if char_block.chars().all(|ch| ch == '\\') {
        return "\\".to_string();
    } else if char_block
        .chars()
        .all(|ch| ch == '\"' || ch == '"' || ch == '\\')
    {
        return char_block.replace('\\', "");
    }

    // Workaround for literal values like "\\-" that cause problems for the TOML parser.
    // In such cases, remove the '\\' character preceding the non-Unicode-escape-sequence character.
    let mut ch_vec = char_block.chars().collect::<Vec<char>>();
    let mut ch_indices_to_remove: Vec<usize> = vec![];
    // Indices are collected in descending order (`.rev()`) so the `remove()`
    // calls below never shift a position that is still pending removal.
    for (idx, ch) in ch_vec.iter().enumerate().rev() {
        if ch == &'\\' {
            // A backslash introducing a \uXXXX / \UXXXXXXXX escape must survive
            // for the TOML parser. Any other backslash — including a dangling
            // trailing one, which previously caused an `unwrap()` panic — is
            // over-escaping and is dropped.
            if !matches!(ch_vec.get(idx + 1), Some(&'u') | Some(&'U')) {
                ch_indices_to_remove.push(idx);
            }
        }
    }
    for idx in ch_indices_to_remove {
        ch_vec.remove(idx);
    }
    let mut ch_for_toml = ch_vec.iter().collect::<String>();

    // Workaround for double quotation mark literal values, which can appear in a string as a backslash followed
    // by the quotation mark itself (\"), but for the purposes of the TOML parser, should be escaped to be 2
    // slashes followed by the quotation mark (\\").
    // Start by removing all preceding backslashes from quotation marks, and finally add back 2 backslashes.
    // Remove up to 3 consecutive backslashes preceding a quotation mark. Preprocessing should have already
    // removed 4-/6-/8-fold preceding backslashes before a character.
    for _i in 1..=3 {
        ch_for_toml = ch_for_toml.replace("\\\"", "\"");
    }
    ch_for_toml = ch_for_toml.replace('\"', "\\\"");

    // Unescape the escape sequences like \uXXXX and \UXXXXXXXX into the proper code points.
    // Also, workaround errant extra backslash escaping.
    // Because JSON does not support \UXXXXXXXX Unicode code point escaping, use the TOML parser.
    let ch_for_toml = format!("x=\"{ch_for_toml}\"");

    let ch_lite_t_val: toml::Value =
        toml::from_str(&ch_for_toml).unwrap_or_else(|_| panic!("{char_block:?}"));
    let ch_lite = if let toml::Value::Table(t) = ch_lite_t_val {
        if let Some(toml::Value::String(s)) = t.get("x") {
            s.to_owned()
        } else {
            panic!();
        }
    } else {
        panic!();
    };

    ch_lite.trim().to_string()
}

/// Parse the input string, and insert the represented exemplar "characters" (each of
/// which could either be an individual code point or a code point sequence) into the set.
///
/// A substring `<begin>-<end>` is expanded into the inclusive code point range
/// `begin..=end`, with any surrounding characters handled recursively. A '-' that
/// is the first or last character of the string is treated as a literal hyphen.
fn insert_chars_from_string(set: &mut HashSet<String>, input: &str) {
    // Strip leading backslashes left over from escape notation, but keep a lone
    // backslash (a backslash literal) intact.
    let s = if input.chars().count() > 1 && input.starts_with('\\') {
        input
            .chars()
            .skip_while(|ch| ch == &'\\')
            .collect::<String>()
    } else {
        input.to_string()
    };
    // A range of consecutive code point characters can be represented as <char_start>-<char_end>.
    if let Some((begin, end)) = s.split_once('-') {
        // The '-' denotes a range only when there is a character on each side of it.
        // (Previously a trailing '-' caused an `unwrap()` panic; now it falls
        // through to the literal-characters branch below.)
        if !begin.is_empty() && !end.is_empty() {
            let begin_char = begin.chars().next_back().unwrap();
            let end_char = end.chars().next().unwrap();

            for code_point in (begin_char as u32)..=(end_char as u32) {
                let char_str = char::from_u32(code_point)
                    .expect("Character range should not span non-Unicode-scalar-value code points")
                    .to_string();
                set.insert(char_str);
            }

            // After handling the range substring, recursively handle any chars/ranges
            // in the remaining parts of the string.
            let rem_begin_str = &begin[..(begin.len() - begin_char.len_utf8())];
            let rem_end_str = &end[end_char.len_utf8()..];
            insert_chars_from_string(set, rem_begin_str);
            insert_chars_from_string(set, rem_end_str);
            return;
        }
    }
    // Not a range: insert each character individually.
    for ch in s.chars() {
        set.insert(ch.to_string());
    }
}

/// Parse the input CLDR JSON string representing exemplar character data and return a
/// set of strings representing each code point or string represented by the CLDR JSON
/// serialized form.
///
/// The input must be bracket-delimited, e.g. `"[a b {ch} x-z]"` (debug-asserted).
fn parse_exemplar_char_string(s: &str) -> HashSet<String> {
    debug_assert!(s.starts_with('['));
    debug_assert!(s.ends_with(']'));
    // Strip the leading '[' and trailing ']' (both ASCII, so byte-index math is safe).
    let mut transformed_input = s.split_at(1).1.split_at(s.len() - 2).0.to_string();

    if transformed_input.is_empty() {
        return HashSet::new();
    }

    // Initialize result collection of parsed element strings of exemplar character data.
    // Note: We want to use the hashset to dedup in case of space (U+0020) literal being included in exemplar char set.
    let mut dedup_chars = HashSet::<String>::new();

    // CLDR JSON uses an "over"-escaped notation to indicate a character literal, including
    // for characters that overlap with notational syntax characters. Since these are special
    // cases, handle them first before proceeding.
    preprocess_char_literal_notation(&mut dedup_chars, &mut transformed_input);

    // Tokenize on whitespace and '{' (but not '}'), so each token is either a run of
    // characters, or the tail of a `{...}` sequence ending in '}' plus trailing chars.
    transformed_input
        .split(is_exemplar_string_split_char)
        .filter(|t| !t.is_empty())
        .for_each(|token| {
            let mut string_and_chars = token.split('}');

            if let Some(maybe_char_string) = string_and_chars.next() {
                if !maybe_char_string.is_empty() {
                    if token.contains('}') {
                        // If we see a '}', then we assume it was the ending of a string
                        // denoted by `{...}` in a well-formed input.
                        // We need to unescape first so that we turn a substring like "...{ɛ\\u0300}..."
                        // into "...ɛ̀..."
                        let unescaped_char_string = unescape_exemplar_chars(maybe_char_string);
                        dedup_chars.insert(unescaped_char_string);
                    } else {
                        // If we don't see '}', it means we have a string that was whitespace delimited
                        let unescaped_char_block = unescape_exemplar_chars(maybe_char_string);
                        insert_chars_from_string(&mut dedup_chars, &unescaped_char_block);
                    }
                }

                // since we already split on '{' in order to create `token`, then only the first
                // subarray split could contain '}'. all other subarray splits should be considered
                // as strings of one or more consecutive characters
                for char_block in string_and_chars.filter(|t| !t.is_empty()) {
                    let unescaped_char_block = unescape_exemplar_chars(char_block);
                    insert_chars_from_string(&mut dedup_chars, &unescaped_char_block);
                }
            }
        });

    dedup_chars
}

#[cfg(test)]
mod tests {
    use super::*;
    use icu::locale::langid;

    // Numbers category: escaped '-' plus digits and numeric punctuation.
    #[test]
    fn test_parse_exemplar_chars() {
        let af_numbers = "[  \\- ‑ , % ‰ + 0 1 2 3 4 5 6 7 8 9]";
        let expected: HashSet<String> = [
            "-", "‑", ",", "%", "‰", "+", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        ]
        .iter()
        .copied()
        .map(std::string::String::from)
        .collect();
        let actual = parse_exemplar_char_string(af_numbers);

        assert_eq!(actual, expected);
    }

    // `{...}` sequences like {dž} become multi-code-point set elements.
    #[test]
    fn test_parse_exemplar_char_sequences() {
        let sr_main = "[a b c č ć d {dž} đ e f g h i j k l {lj} m n {nj} o p r s š t u v z ž]";
        let expected: HashSet<String> = [
            "a", "b", "c", "č", "ć", "d", "dž", "đ", "e", "f", "g", "h", "i", "j", "k", "l", "lj",
            "m", "n", "nj", "o", "p", "r", "s", "š", "t", "u", "v", "z", "ž",
        ]
        .iter()
        .copied()
        .map(std::string::String::from)
        .collect();
        let actual = parse_exemplar_char_string(sr_main);

        assert_eq!(actual, expected);
    }

    // A <start>-<end> range expands to every code point in between, inclusive.
    #[test]
    fn test_parse_exemplar_char_ranges() {
        let ja_main_subset_range = "[万-下]";
        let expected: HashSet<String> = ["万", "丈", "三", "上", "下"]
            .iter()
            .copied()
            .map(std::string::String::from)
            .collect();
        let actual = parse_exemplar_char_string(ja_main_subset_range);

        assert_eq!(actual, expected);
    }

    // Characters adjacent to a range (no whitespace) are parsed individually.
    #[test]
    fn test_parse_exemplar_char_ranges_no_whitespace() {
        let range_amid_chars = "[a万-下z]";
        let expected: HashSet<String> = ["万", "丈", "三", "上", "下", "a", "z"]
            .iter()
            .copied()
            .map(std::string::String::from)
            .collect();
        let actual = parse_exemplar_char_string(range_amid_chars);

        assert_eq!(actual, expected);
    }

    // Splitting handles ASCII space, NBSP (U+00A0), and narrow NBSP (U+202F),
    // and chars run together without separators are split individually.
    #[test]
    fn test_parse_splits() {
        let sr_main = "[a b cčć d{dž}đ    e\u{00A0}f  \u{202F}   ghijkl{lj}mn{nj}oprsštuvzž]";
        let expected: HashSet<String> = [
            "a", "b", "c", "č", "ć", "d", "dž", "đ", "e", "f", "g", "h", "i", "j", "k", "l", "lj",
            "m", "n", "nj", "o", "p", "r", "s", "š", "t", "u", "v", "z", "ž",
        ]
        .iter()
        .copied()
        .map(std::string::String::from)
        .collect();
        let actual = parse_exemplar_char_string(sr_main);

        assert_eq!(actual, expected);
    }

    // \uXXXX escapes are decoded into the corresponding code points.
    #[test]
    fn test_parse_unescape() {
        let ar_eg_auxiliary = "[ـ\\u200C\\u200D\\u200E\\u200F پ چ ژ ڜ ڢ ڤ ڥ ٯ ڧ ڨ ک گ ی]";
        let expected: HashSet<String> = [
            "ـ", "\u{200C}", "\u{200D}", "\u{200E}", "\u{200F}", "پ", "چ", "ژ", "ڜ", "ڢ", "ڤ", "ڥ",
            "ٯ", "ڧ", "ڨ", "ک", "گ", "ی",
        ]
        .iter()
        .copied()
        .map(std::string::String::from)
        .collect();
        let actual = parse_exemplar_char_string(ar_eg_auxiliary);

        assert_eq!(actual, expected);
    }

    // Quote literals survive parsing.
    // NOTE(review): the string literals below look garbled (doubled straight
    // quotes where a curly quote may have been intended) — confirm against
    // the upstream ICU4X source.
    #[test]
    fn test_parse_quotes() {
        let quotes = "[\""]";
        let expected: HashSet<String> = ["\"", """]
            .iter()
            .copied()
            .map(std::string::String::from)
            .collect();
        let actual = parse_exemplar_char_string(quotes);

        assert_eq!(actual, expected);
    }

    // Over-escaped punctuation (quadruple backslashes) unescapes to the bare
    // characters, with no backslashes leaking into the parsed set.
    #[test]
    fn test_parse_escaped_punctuation() {
        let ja_punctuation = "[‾ __ \\\\\\\\-- ‐ ‑ — ― 〜 ・ ・ ,, 、、 ;; \\\\\\\\:: !! ?? .. ‥ … 。。 ' ‘ ’ \\\\\\\"" “ ” (( )) \\\\\\\\[[ \\\\\\\\]] \\\\\\\\{{ \\\\\\\\}} 〈 〉 《 》 「「 」」 『 』 【 】 〔 〕 ‖ § ¶ @@ ** // \\\\\\\\\ \\\\\\\\&& ## %% ‰ † ‡ ′ ″ 〃 ※]";

        let actual = parse_exemplar_char_string(ja_punctuation);

        let any_backslashes = actual.iter().any(|parsed_str| parsed_str.contains('\\'));

        assert!(!any_backslashes);
        assert!(actual.contains("-"));
        assert!(actual.contains(":"));
        assert!(actual.contains("\""));
    }

    // \UXXXXXXXX code point escapes must be preserved through preprocessing
    // and decoded by the TOML step (used for supplementary-plane scripts).
    #[test]
    fn test_parse_escaped_punctuation_preserve_code_point_notation() {
        let ccp_main = "[\\\\\\\\U00011100 \\\\\\\\U00011101 \\\\\\\\U00011102 𑄃 𑄄 𑄅 𑄆 𑄇 𑄈 𑄉 𑄊 𑄋 𑄌 𑄍 𑄎 𑄏 𑄐 𑄑 𑄒 𑄓 𑄔 𑄕 𑄖 𑄗 𑄘 𑄙 𑄚 𑄛 𑄜 𑄝 𑄞 𑄟 𑄠 𑄡 𑄢 𑄣 𑄤 𑄥 𑄦 \\\\\\\\U00011127 \\\\\\\\U00011128 \\\\\\\\U00011129 \\\\\\\\U0001112A \\\\\\\\U0001112B 𑄬 \\\\\\\\U0001112D \\\\\\\\U0001112E \\\\\\\\U0001112F \\\\\\\\U00011130 \\\\\\\\U00011131 \\\\\\\\U00011132 \\\\\\\\U00011133 \\\\\\\\U00011134]";

        let actual = parse_exemplar_char_string(ccp_main);

        assert!(actual.contains("\u{11100}"));
        assert!(actual.contains("𑄃"));
    }

    // A token of all backslashes parses as a single backslash literal, and the
    // space separators themselves are not inserted into the set.
    #[test]
    fn test_parse_escaped_punctuation_allow_backslash_literal() {
        let es_puncutation = "[\\\\\\\\- ‐ ‑ – — , ; \\\\\\\\: ! ¡ ? ¿ . … ' ‘ ’ \\\\\\\" “ ” « » ( ) \\\\\\\\[ \\\\\\\\] § @ * / \\\\\\\\ \\\\& # † ‡ ′ ″]";

        let actual = parse_exemplar_char_string(es_puncutation);

        assert!(actual.contains("\\"));
        assert!(!actual.contains(" "));
    }

    // \uXXXX escapes inside `{...}` sequences are decoded in place.
    #[test]
    fn test_parse_unescape_in_strings() {
        let bn_main = "[\\\\u09BC ৺ অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ ং ঃ \\\\u0981 ক {ক\\\\u09CDষ} খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড {ড\\u09BC} ঢ {ঢ\\\\u09BC} ণ ত ৎ থ দ ধ ন প ফ ব ভ ম য {য\\\\u09BC} র ল শ ষ স হ ঽ া ি ী \\\\u09C1 \\\\u09C2 \\\\u09C3 \\\\u09C4 \\\\u09E2 \\\\u09E3 ে ৈ ো ৌ \\\\u09CD ৗ]";

        let actual = parse_exemplar_char_string(bn_main);

        assert!(actual.contains("\u{0981}"));
        assert!(actual.contains("ক\u{09CD}ষ"));
    }

    // Runs of characters without spaces split into individual set elements.
    #[test]
    fn test_parse_consecutive_main_chars_without_spaces() {
        let en_aux = "[áàăâåäãā æ ç éèĕêëē íìĭîïī ñ óòŏôöøō œ úùŭûüū ÿ]";

        let actual = parse_exemplar_char_string(en_aux);

        assert!(actual.contains("æ"));
        assert!(actual.contains("ç"));
        assert!(actual.contains("á"));
        assert!(!actual.contains("áàăâåäãā"));
    }

    // Consecutive punctuation chars (including quotes) split individually.
    #[test]
    fn test_parse_consecutive_punct_chars_subset() {
        let input = "[\"“”]";
        let actual = parse_exemplar_char_string(input);
        assert!(actual.contains("\""));
        assert!(actual.contains("“"));
        assert!(actual.contains("”"));
        assert!(!actual.contains("\"“”"));
    }

    // End-to-end punctuation parse with single-backslash escapes.
    #[test]
    fn test_parse_all_punct_chars() {
        let en_punct = "[\\- ‐‑ – — , ; \\: ! ? . … '‘’ \"“” ( ) \\[ \\] § @ * / \\& # † ‡ ′ ″]";

        let actual = parse_exemplar_char_string(en_punct);

        assert!(actual.contains("‐"));
        assert!(actual.contains("‑"));
        assert!(actual.contains("'"));
        assert!(actual.contains("‘"));
        assert!(actual.contains("’"));
        assert!(!actual.contains("'‘’"));
    }

    // Full provider round-trip against the testing CLDR data for en-001.
    #[test]
    fn test_basic() {
        let provider = SourceDataProvider::new_testing();

        let exp_chars = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q",
            "r", "s", "t", "u", "v", "w", "x", "y", "z",
        ];
        let exp_chars_cpilsl = CodePointInversionListAndStringList::from_iter(exp_chars);

        let actual = icu::locale::exemplar_chars::ExemplarCharacters::try_new_main_unstable(
            &provider,
            &langid!("en-001").into(),
        )
        .unwrap();

        assert_eq!(&*actual.as_borrowed(), &exp_chars_cpilsl);
    }
}