// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::cldr_serde;
use crate::SourceDataProvider;
use icu::locale::provider::*;
use icu::locale::{
    subtags::{self, language},
    LanguageIdentifier,
};
use icu_provider::prelude::*;
use std::collections::{BTreeMap, HashSet};
use tinystr::TinyAsciiStr;
use zerovec::ZeroSlice;

impl DataProvider<LocaleAliasesV1> for SourceDataProvider {
    fn load(&self, req: DataRequest) -> Result<DataResponse<LocaleAliasesV1>, DataError> {
        self.check_req::<LocaleAliasesV1>(req)?;
        let data: &cldr_serde::aliases::Resource = self
            .cldr()?
            .core()
            .read_and_parse("supplemental/aliases.json")?;
        Ok(DataResponse {
            metadata: Default::default(),
            payload: DataPayload::from_owned(Aliases::from(data)),
        })
    }
}

impl crate::IterableDataProviderCached<LocaleAliasesV1> for SourceDataProvider {
    fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError> {
        Ok(HashSet::from_iter([Default::default()]))
    }
}

// Sort rules following the algorithm in Preprocessing, step 5 of Appendix C:
// - the size of the union of all field value sets, with largest size first
// - alphabetically by each field
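// For example, "en-GB", "fr-CA", and "und-hepburn-heploc" each have a union size of 2 and
// are ordered alphabetically by language, while "ca" (union size 1) sorts after them; see
// test_appendix_c_cmp below.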
fn appendix_c_cmp(langid: &LanguageIdentifier) -> impl Ord {
    let mut union_size = langid.variants.len() as i8;
    if !langid.language.is_default() {
        union_size += 1;
    }
    if langid.script.is_some() {
        union_size += 1;
    }
    if langid.region.is_some() {
        union_size += 1;
    }
    (
        -union_size,
        langid.language,
        langid.script,
        langid.region,
        langid.variants.clone(),
    )
}

impl From<&cldr_serde::aliases::Resource> for Aliases<'_> {
    // Step 1. Load the rules from aliases.json
    fn from(other: &cldr_serde::aliases::Resource) -> Self {
        // These all correspond to language aliases in the CLDR data. By storing known
        // special cases in the CLDR data, we can minimize the number of comparisons done
        // for commonly used languages. With the current CLDR data, all aliases end up in
        // a special case, but we retain the catchall language category in case new or
        // customized CLDR data is used.
        let mut language_variants = Vec::new();
        let mut sgn_region = BTreeMap::new();
        let mut language_len2 = BTreeMap::new();
        let mut language_len3 = BTreeMap::new();
        let mut language = Vec::new();

        let mut script = BTreeMap::new();

        // There are many more aliases for numeric region codes than for alphabetic,
        // so by storing them separately, we can minimize comparisons for alphabetic codes.
        let mut region_alpha = BTreeMap::new();
        let mut region_num = BTreeMap::new();
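        // For example, the numeric alias "768" -> "TG" ends up in region_num (see
        // test_basic below), while an alphabetic alias such as "DD" -> "DE" (assuming
        // that rule is present in the CLDR data) would end up in region_alpha.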

        // Complex regions are cases similar to the Soviet Union, where an old region
        // is replaced by multiple new regions. Determining the new region requires using
        // likely subtags. Many implementations preprocess the complex regions into simple
        // regions as part of data import, but that would introduce a dependency between
        // CLDR providers that we're not currently set up to handle.
        let mut complex_region = BTreeMap::new();

        let mut variant = BTreeMap::new();

        let mut subdivision = BTreeMap::new();

        // Step 2. Capture all languageAlias rules where the type is an invalid languageId
        // into a set of BCP47 LegacyRules. This implementation discards these.
        // Step 3. Discard all rules where the type is an invalid languageId
        for (from, to) in other.supplemental.metadata.alias.language_aliases.iter() {
            if let Ok(langid) = from.parse::<LanguageIdentifier>() {
                if let Ok(replacement) = to.replacement.parse::<LanguageIdentifier>() {
                    match (
                        langid.language,
                        langid.script,
                        langid.region,
                        !langid.variants.is_empty(),
                    ) {
                        // Anything that has a variant needs to be parsed at runtime, so we isolate
                        // these in their own map.
                        (_, None, None, true) => language_variants.push((langid, replacement)),
                        // <language> -> <language identifier>
                        (lang, None, None, false) if !lang.is_default() => {
                            // Relatively few aliases exist for two character language identifiers,
                            // so we store them separately to not slow down canonicalization of
                            // common identifiers.
                            let lang = langid.language.to_tinystr();
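                            // For example, "iw" -> "he" is stored in language_len2
                            // (see test_basic below).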
                            if lang.len() == 2 {
                                language_len2.insert(lang.resize(), to.replacement.as_str());
                            } else {
                                language_len3.insert(lang, to.replacement.as_str());
                            }
                        }
                        // sgn-<region> -> <language>
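                        // For example, "sgn-DE" is expected to map to a plain language
                        // ("gsg" in current CLDR data, assuming that rule is present).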
                        (language, None, Some(region), false)
                            if language == language!("sgn")
                                && !replacement.language.is_default()
                                && replacement.script.is_none()
                                && replacement.region.is_none()
                                && replacement.variants.is_empty() =>
                        {
                            sgn_region.insert(region.to_tinystr(), replacement.language);
                        }
                        _ => language.push((langid, replacement)),
                    }
                }
            }
        }

        if !language.is_empty() {
            panic!("Aliases contain a non-special-cased rule. Remove this check if that is intended behaviour.")
        }

        for (from, to) in other.supplemental.metadata.alias.script_aliases.iter() {
            // Don't store data for invalid script codes; we only canonicalize valid locales, so we
            // would never see these anyway.
            if from.parse::<subtags::Script>().is_err() {
                continue;
            }

            if let Ok(to) = to.replacement.parse::<subtags::Script>() {
                script.insert(from, to);
            }
        }

        for (from, to) in other.supplemental.metadata.alias.region_aliases.iter() {
            // Don't store data for invalid region codes; we only canonicalize valid locales, so we
            // would never see these anyway.
            if from.parse::<subtags::Region>().is_err() {
                continue;
            }

            if let Ok(replacement) = to.replacement.parse::<subtags::Region>() {
                if from.is_ascii_alphabetic() {
                    region_alpha.insert(from.resize(), replacement);
                } else {
                    region_num.insert(from, replacement);
                }
            } else {
                complex_region.insert(
                    from,
                    to.replacement
                        .split(' ')
                        .filter_map(|r| r.parse::<subtags::Region>().ok())
                        .collect::<Box<[_]>>(),
                );
            }
        }

        for (from, to) in other.supplemental.metadata.alias.variant_aliases.iter() {
            if let Ok(to) = to.replacement.parse::<subtags::Variant>() {
                variant.insert(from, to);
            }
        }

        for (from, to) in other.supplemental.metadata.alias.subdivision_aliases.iter() {
            if let Some(replacement) = to.replacement.split(' ').find_map(|r| {
                if r.len() == 2 {
                    // Following http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers,
                    // append "zzzz" to make this syntactically correct.
                    let replacement = r.to_string().to_ascii_lowercase() + "zzzz";
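                    // For example, a two-letter replacement such as "AX" would be
                    // stored as "axzzzz".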
                    TinyAsciiStr::<7>::try_from_str(&replacement).ok()
                } else {
                    TinyAsciiStr::<7>::try_from_str(r).ok()
                }
            }) {
                subdivision.insert(from, replacement);
            }
        }

        // 5. Sort the non-special-cased rules
        language_variants.sort_unstable_by_key(|(langid, _)| appendix_c_cmp(langid));
        language.sort_unstable_by_key(|(langid, _)| appendix_c_cmp(langid));

        let language_variants = language_variants
            .iter()
            .map(|(from, to)| {
                LanguageStrStrPair(
                    from.language,
                    from.variants.to_string().into(),
                    to.to_string().into(),
                )
            })
            .collect::<Vec<_>>();
        let language = language
            .iter()
            .map(|(from, to)| StrStrPair(from.to_string().into(), to.to_string().into()))
            .collect::<Vec<_>>();

        Self {
            language_variants: language_variants.as_slice().into(),
            sgn_region: sgn_region
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),
            language_len2: language_len2
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),
            language_len3: language_len3
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),
            language: language.as_slice().into(),

            script: script
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),

            region_alpha: region_alpha
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),
            region_num: region_num
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),
            complex_region: complex_region
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), ZeroSlice::from_boxed_slice(v)))
                .collect(),

            variant: variant
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),

            subdivision: subdivision
                .into_iter()
                .map(|(k, v)| (k.to_unvalidated(), v))
                .collect(),
        }
    }
}

#[test]
fn test_appendix_c_cmp() {
    let en = icu::locale::langid!("en-GB");
    let ca = icu::locale::langid!("ca");
    let und = "und-hepburn-heploc".parse::<LanguageIdentifier>().unwrap();
    let fr = icu::locale::langid!("fr-CA");

    let mut rules = vec![&en, &ca, &und, &fr];
    rules.sort_unstable_by_key(|&l| appendix_c_cmp(l));

    assert_eq!(rules, &[&en, &fr, &und, &ca]);
}

#[test]
fn test_basic() {
    use icu::locale::subtags::{language, region, script};

    let provider = SourceDataProvider::new_testing();
    let data: DataResponse<LocaleAliasesV1> = provider.load(Default::default()).unwrap();

    // We should handle all language rules as special cases, leaving the generic category empty.
    assert!(data.payload.get().language.is_empty());

    // We should have data in all other categories
    assert!(!data.payload.get().language_variants.is_empty());
    assert!(!data.payload.get().sgn_region.is_empty());
    assert!(!data.payload.get().language_len2.is_empty());
    assert!(!data.payload.get().language_len3.is_empty());
    assert!(!data.payload.get().script.is_empty());
    assert!(!data.payload.get().region_alpha.is_empty());
    assert!(!data.payload.get().region_num.is_empty());
    assert!(!data.payload.get().complex_region.is_empty());
    assert!(!data.payload.get().variant.is_empty());
    assert!(!data.payload.get().subdivision.is_empty());

    // Spot check a few expected results. There are more extensive tests in the
    // locale canonicalizer itself.
    assert_eq!(
        data.payload
            .get()
            .language_len2
            .get(&language!("iw").to_tinystr().resize().to_unvalidated())
            .unwrap(),
        "he"
    );

    assert!(data
        .payload
        .get()
        .language_len3
        .get(&language!("iw").to_tinystr().to_unvalidated())
        .is_none());

    assert_eq!(
        data.payload.get().script.iter().next().unwrap(),
        (
            &script!("Qaai").to_tinystr().to_unvalidated(),
            &script!("Zinh")
        )
    );

    assert_eq!(
        data.payload
            .get()
            .region_num
            .get(&region!("768").to_tinystr().to_unvalidated())
            .unwrap(),
        &region!("TG")
    );
}
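
// A hedged extra spot check for complex regions, beyond what test_basic covers. It assumes
// that the CLDR testing data maps the deprecated region "SU" to a list of successor regions
// that includes "RU"; if the testing data changes, adjust or remove this check.
#[test]
fn test_complex_region() {
    use icu::locale::subtags::region;

    let provider = SourceDataProvider::new_testing();
    let data: DataResponse<LocaleAliasesV1> = provider.load(Default::default()).unwrap();

    let successors = data
        .payload
        .get()
        .complex_region
        .get(&region!("SU").to_tinystr().to_unvalidated())
        .expect("expected a complex-region rule for SU in the testing data");
    assert!(successors.iter().any(|r| r == region!("RU")));
}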