icu_provider_source/locale/
aliases.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::cldr_serde;
6use crate::SourceDataProvider;
7use icu::locale::provider::*;
8use icu::locale::{
9    subtags::{self, language},
10    LanguageIdentifier,
11};
12use icu_provider::prelude::*;
13use std::collections::{BTreeMap, HashSet};
14use tinystr::TinyAsciiStr;
15use zerovec::ZeroSlice;
16
17impl DataProvider<LocaleAliasesV1> for SourceDataProvider {
18    fn load(&self, req: DataRequest) -> Result<DataResponse<LocaleAliasesV1>, DataError> {
19        self.check_req::<LocaleAliasesV1>(req)?;
20        let data: &cldr_serde::aliases::Resource = self
21            .cldr()?
22            .core()
23            .read_and_parse("supplemental/aliases.json")?;
24        Ok(DataResponse {
25            metadata: Default::default(),
26            payload: DataPayload::from_owned(Aliases::from(data)),
27        })
28    }
29}
30
31impl crate::IterableDataProviderCached<LocaleAliasesV1> for SourceDataProvider {
32    fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError> {
33        Ok(HashSet::from_iter([Default::default()]))
34    }
35}
36
37// Sort rules following algorithm in Preprocessing, step 5 of Appendix C:
38//   - the size of the union of all field value sets, with largest size first
39//   - alphabetically by each field
40fn appendix_c_cmp(langid: &LanguageIdentifier) -> impl Ord {
41    let mut union_size = langid.variants.len() as i8;
42    if !langid.language.is_default() {
43        union_size += 1;
44    }
45    if langid.script.is_some() {
46        union_size += 1;
47    }
48    if langid.region.is_some() {
49        union_size += 1;
50    }
51    (
52        -union_size,
53        langid.language,
54        langid.script,
55        langid.region,
56        langid.variants.clone(),
57    )
58}
59
60impl From<&cldr_serde::aliases::Resource> for Aliases<'_> {
61    // Step 1. Load the rules from aliases.json
62    fn from(other: &cldr_serde::aliases::Resource) -> Self {
63        // These all correspond to language aliases in the CLDR data. By storing known
64        // special cases in the CLDR data, we can minimize the number of comparisons done
65        // for commonly used languages. With the current CLDR data, all aliases end up in
66        // a special case, but we retain the catchall language category in case new or
67        // customized CLDR data is used.
68        let mut language_variants = Vec::new();
69        let mut sgn_region = BTreeMap::new();
70        let mut language_len2 = BTreeMap::new();
71        let mut language_len3 = BTreeMap::new();
72        let mut language = Vec::new();
73
74        let mut script = BTreeMap::new();
75
76        // There are many more aliases for numeric region codes than for alphabetic,
77        // so by storing them separately, we can minimize comparisons for alphabetic codes.
78        let mut region_alpha = BTreeMap::new();
79        let mut region_num = BTreeMap::new();
80
81        // Complex regions are cases similar to the Soviet Union, where an old region
82        // is replaced by multiple new regions. Determining the new region requires using
83        // likely subtags. Many implementations preprocess the complex regions into simple
84        // regions as part of data import, but that would introduce a dependency between
85        // CDLR providers that we're not currently set up to handle.
86        let mut complex_region = BTreeMap::new();
87
88        let mut variant = BTreeMap::new();
89
90        let mut subdivision = BTreeMap::new();
91
92        // Step 2. Capture all languageAlias rules where the type is an invalid languageId
93        // into a set of BCP47 LegacyRules. This implementation discards these.
94        // Step 3. Discard all rules where the type is an invalid languageId
95        for (from, to) in other.supplemental.metadata.alias.language_aliases.iter() {
96            if let Ok(langid) = from.parse::<LanguageIdentifier>() {
97                if let Ok(replacement) = to.replacement.parse::<LanguageIdentifier>() {
98                    match (
99                        langid.language,
100                        langid.script,
101                        langid.region,
102                        !langid.variants.is_empty(),
103                    ) {
104                        // Anything that has a variant needs to be parsed at runtime, so we isolate
105                        // these in their own map.
106                        (_, None, None, true) => language_variants.push((langid, replacement)),
107                        // <language> -> <language identifier>
108                        (lang, None, None, false) if !lang.is_default() => {
109                            // Relatively few aliases exist for two character language identifiers,
110                            // so we store them separately to not slow down canonicalization of
111                            // common identifiers.
112                            let lang = langid.language.to_tinystr();
113                            if lang.len() == 2 {
114                                language_len2.insert(lang.resize(), to.replacement.as_str());
115                            } else {
116                                language_len3.insert(lang, to.replacement.as_str());
117                            }
118                        }
119                        // sgn-<region> -> <language>
120                        (language, None, Some(region), false)
121                            if language == language!("sgn")
122                                && !replacement.language.is_default()
123                                && replacement.script.is_none()
124                                && replacement.region.is_none()
125                                && replacement.variants.is_empty() =>
126                        {
127                            sgn_region.insert(region.to_tinystr(), replacement.language);
128                        }
129                        _ => language.push((langid, replacement)),
130                    }
131                }
132            }
133        }
134
135        if !language.is_empty() {
136            panic!("Aliases contain a non-special-cased rule. Remove this check if that is intended behaviour.")
137        }
138
139        for (from, to) in other.supplemental.metadata.alias.script_aliases.iter() {
140            // Don't store data for invalid script codes, we only canonicalize valid locales, so we
141            // would never see these anyways.
142            if from.parse::<subtags::Script>().is_err() {
143                continue;
144            }
145
146            if let Ok(to) = to.replacement.parse::<subtags::Script>() {
147                script.insert(from, to);
148            }
149        }
150
151        for (from, to) in other.supplemental.metadata.alias.region_aliases.iter() {
152            // Don't store data for invalid region codes, we only canonicalize valid locales, so we
153            // would never see these anyways.
154            if from.parse::<subtags::Region>().is_err() {
155                continue;
156            }
157
158            if let Ok(replacement) = to.replacement.parse::<subtags::Region>() {
159                if from.is_ascii_alphabetic() {
160                    region_alpha.insert(from.resize(), replacement);
161                } else {
162                    region_num.insert(from, replacement);
163                }
164            } else {
165                complex_region.insert(
166                    from,
167                    to.replacement
168                        .split(' ')
169                        .filter_map(|r| r.parse::<subtags::Region>().ok())
170                        .collect::<Box<[_]>>(),
171                );
172            }
173        }
174
175        for (from, to) in other.supplemental.metadata.alias.variant_aliases.iter() {
176            if let Ok(to) = to.replacement.parse::<subtags::Variant>() {
177                variant.insert(from, to);
178            }
179        }
180
181        for (from, to) in other.supplemental.metadata.alias.subdivision_aliases.iter() {
182            if let Some(replacement) = to.replacement.split(' ').find_map(|r| {
183                if r.len() == 2 {
184                    // Following http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers,
185                    // append "zzzz" to make this syntactically correct.
186                    let replacement = r.to_string().to_ascii_lowercase() + "zzzz";
187                    TinyAsciiStr::<7>::try_from_str(&replacement).ok()
188                } else {
189                    TinyAsciiStr::<7>::try_from_str(r).ok()
190                }
191            }) {
192                subdivision.insert(from, replacement);
193            }
194        }
195
196        // 5. Sort the non-special-cased rules
197        language_variants.sort_unstable_by_key(|(langid, _)| appendix_c_cmp(langid));
198        language.sort_unstable_by_key(|(langid, _)| appendix_c_cmp(langid));
199
200        let language_variants = language_variants
201            .iter()
202            .map(|(from, to)| {
203                LanguageStrStrPair(
204                    from.language,
205                    from.variants.to_string().into(),
206                    to.to_string().into(),
207                )
208            })
209            .collect::<Vec<_>>();
210        let language = language
211            .iter()
212            .map(|(from, to)| StrStrPair(from.to_string().into(), to.to_string().into()))
213            .collect::<Vec<_>>();
214
215        Self {
216            language_variants: language_variants.as_slice().into(),
217            sgn_region: sgn_region
218                .into_iter()
219                .map(|(k, v)| (k.to_unvalidated(), v))
220                .collect(),
221            language_len2: language_len2
222                .into_iter()
223                .map(|(k, v)| (k.to_unvalidated(), v))
224                .collect(),
225            language_len3: language_len3
226                .into_iter()
227                .map(|(k, v)| (k.to_unvalidated(), v))
228                .collect(),
229            language: language.as_slice().into(),
230
231            script: script
232                .into_iter()
233                .map(|(k, v)| (k.to_unvalidated(), v))
234                .collect(),
235
236            region_alpha: region_alpha
237                .into_iter()
238                .map(|(k, v)| (k.to_unvalidated(), v))
239                .collect(),
240            region_num: region_num
241                .into_iter()
242                .map(|(k, v)| (k.to_unvalidated(), v))
243                .collect(),
244            complex_region: complex_region
245                .into_iter()
246                .map(|(k, v)| (k.to_unvalidated(), ZeroSlice::from_boxed_slice(v)))
247                .collect(),
248
249            variant: variant
250                .into_iter()
251                .map(|(k, v)| (k.to_unvalidated(), v))
252                .collect(),
253
254            subdivision: subdivision
255                .into_iter()
256                .map(|(k, v)| (k.to_unvalidated(), v))
257                .collect(),
258        }
259    }
260}
261
#[test]
fn test_appendix_c_cmp() {
    use icu::locale::langid;

    let en_gb = langid!("en-GB");
    let ca = langid!("ca");
    let und_variants: LanguageIdentifier = "und-hepburn-heploc".parse().unwrap();
    let fr_ca = langid!("fr-CA");

    let mut ordered = vec![&en_gb, &ca, &und_variants, &fr_ca];
    ordered.sort_unstable_by_key(|&langid| appendix_c_cmp(langid));

    // Identifiers with the larger union size come first; ties are broken by
    // the fields themselves, so "ca" (a single field) ends up last.
    assert_eq!(ordered, &[&en_gb, &fr_ca, &und_variants, &ca]);
}
274
#[test]
fn test_basic() {
    use icu::locale::subtags::{language, region, script};

    let provider = SourceDataProvider::new_testing();
    let data: DataResponse<LocaleAliasesV1> = provider.load(Default::default()).unwrap();
    let aliases = data.payload.get();

    // We should handle all language rules as special cases, leaving the generic category empty.
    assert!(aliases.language.is_empty());

    // We should have data in all other categories
    assert!(!aliases.language_variants.is_empty());
    assert!(!aliases.sgn_region.is_empty());
    assert!(!aliases.language_len2.is_empty());
    assert!(!aliases.language_len3.is_empty());
    assert!(!aliases.script.is_empty());
    assert!(!aliases.region_alpha.is_empty());
    assert!(!aliases.region_num.is_empty());
    assert!(!aliases.complex_region.is_empty());
    assert!(!aliases.variant.is_empty());
    assert!(!aliases.subdivision.is_empty());

    // Spot check a few expected results. There are more extensive tests in the
    // locale canonicalizer itself.
    let iw = language!("iw").to_tinystr();

    // The two-letter alias must be in the two-letter table...
    assert_eq!(
        aliases
            .language_len2
            .get(&iw.resize().to_unvalidated())
            .unwrap(),
        "he"
    );

    // ...and must not be in the three-letter table.
    assert!(aliases
        .language_len3
        .get(&iw.to_unvalidated())
        .is_none());

    let first_script_alias = aliases.script.iter().next().unwrap();
    assert_eq!(
        first_script_alias,
        (
            &script!("Qaai").to_tinystr().to_unvalidated(),
            &script!("Zinh")
        )
    );

    let togo = aliases
        .region_num
        .get(&region!("768").to_tinystr().to_unvalidated())
        .unwrap();
    assert_eq!(togo, &region!("TG"));
}