icu_casemap/casemapper.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::internals::{CaseMapLocale, FoldOptions, FullCaseWriteable, StringAndWriteable};
6use crate::provider::data::MappingKind;
7use crate::provider::CaseMap;
8use crate::provider::CaseMapV1;
9use crate::set::ClosureSink;
10use crate::titlecase::{LeadingAdjustment, TitlecaseOptions, TrailingCase};
11use alloc::borrow::Cow;
12use icu_locale_core::LanguageIdentifier;
13use icu_provider::prelude::*;
14use writeable::Writeable;
15
16/// A struct with the ability to convert characters and strings to uppercase or lowercase,
17/// or fold them to a normalized form for case-insensitive comparison.
18///
19/// Most methods for this type live on [`CaseMapperBorrowed`], which you can obtain via
20/// [`CaseMapper::new()`] or [`CaseMapper::as_borrowed()`].
21///
22/// # Examples
23///
24/// ```rust
25/// use icu::casemap::CaseMapper;
26/// use icu::locale::langid;
27///
28/// let cm = CaseMapper::new();
29///
30/// assert_eq!(
31/// cm.uppercase_to_string("hello world", &langid!("und")),
32/// "HELLO WORLD"
33/// );
34/// assert_eq!(
35/// cm.lowercase_to_string("Γειά σου Κόσμε", &langid!("und")),
36/// "γειά σου κόσμε"
37/// );
38/// ```
39#[derive(Clone, Debug)]
40pub struct CaseMapper {
41 pub(crate) data: DataPayload<CaseMapV1>,
42}
43
44impl AsRef<CaseMapper> for CaseMapper {
45 fn as_ref(&self) -> &CaseMapper {
46 self
47 }
48}
49
50/// A struct with the ability to convert characters and strings to uppercase or lowercase,
51/// or fold them to a normalized form for case-insensitive comparison, borrowed version.
52///
53/// See methods or [`CaseMapper`] for examples.
54#[derive(Clone, Debug, Copy)]
55pub struct CaseMapperBorrowed<'a> {
56 pub(crate) data: &'a CaseMap<'a>,
57}
58
59impl CaseMapperBorrowed<'static> {
60 /// Cheaply converts a [`CaseMapperBorrowed<'static>`] into a [`CaseMapper`].
61 ///
62 /// Note: Due to branching and indirection, using [`CaseMapper`] might inhibit some
63 /// compile-time optimizations that are possible with [`CaseMapperBorrowed`].
64 pub const fn static_to_owned(self) -> CaseMapper {
65 CaseMapper {
66 data: DataPayload::from_static_ref(self.data),
67 }
68 }
69 /// Creates a [`CaseMapperBorrowed`] using compiled data.
70 ///
71 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
72 ///
73 /// [📚 Help choosing a constructor](icu_provider::constructors)
74 ///
75 /// # Examples
76 ///
77 /// ```rust
78 /// use icu::casemap::CaseMapper;
79 /// use icu::locale::langid;
80 ///
81 /// let cm = CaseMapper::new();
82 ///
83 /// assert_eq!(
84 /// cm.uppercase_to_string("hello world", &langid!("und")),
85 /// "HELLO WORLD"
86 /// );
87 /// ```
88 #[cfg(feature = "compiled_data")]
89 pub const fn new() -> Self {
90 Self {
91 data: crate::provider::Baked::SINGLETON_CASE_MAP_V1,
92 }
93 }
94}
95
#[cfg(feature = "compiled_data")]
impl Default for CaseMapperBorrowed<'static> {
    /// Equivalent to [`CaseMapperBorrowed::new()`].
    fn default() -> Self {
        CaseMapperBorrowed::new()
    }
}
102
103impl<'a> CaseMapperBorrowed<'a> {
104 /// Returns the full lowercase mapping of the given string as a [`Writeable`].
105 /// This function is context and language sensitive. Callers should pass the text's language
106 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
107 /// `Default::default()` for the root locale.
108 ///
109 /// See [`Self::lowercase_to_string()`] for the equivalent convenience function that returns a string,
110 /// as well as for an example.
111 pub fn lowercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a {
112 self.data.full_helper_writeable::<false>(
113 src,
114 CaseMapLocale::from_langid(langid),
115 MappingKind::Lower,
116 TrailingCase::default(),
117 )
118 }
119
120 /// Returns the full uppercase mapping of the given string as a [`Writeable`].
121 /// This function is context and language sensitive. Callers should pass the text's language
122 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
123 /// `Default::default()` for the root locale.
124 ///
125 /// See [`Self::uppercase_to_string()`] for the equivalent convenience function that returns a string,
126 /// as well as for an example.
127 pub fn uppercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a {
128 self.data.full_helper_writeable::<false>(
129 src,
130 CaseMapLocale::from_langid(langid),
131 MappingKind::Upper,
132 TrailingCase::default(),
133 )
134 }
135
136 /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
137 /// the string as a single segment (and thus only titlecasing the beginning of it). Performs
138 /// the specified leading adjustment behavior from the options without loading additional data.
139 ///
140 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
141 /// by the application, for example one can titlecase on a per-word basis by mixing this with
142 /// a `WordSegmenter`.
143 ///
144 /// This function is context and language sensitive. Callers should pass the text's language
145 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
146 /// `Default::default()` for the root locale.
147 ///
148 /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`]
149 /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load
150 /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See
151 /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between
    /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the leading adjustment mode
153 /// is [`LeadingAdjustment::None`].
154 ///
155 /// See [`Self::titlecase_segment_with_only_case_data_to_string()`] for the equivalent convenience function that returns a string,
156 /// as well as for an example.
157 ///
158 /// [`TitlecaseMapper`]: crate::TitlecaseMapper
159 pub fn titlecase_segment_with_only_case_data(
160 self,
161 src: &'a str,
162 langid: &LanguageIdentifier,
163 options: TitlecaseOptions,
164 ) -> impl Writeable + 'a {
165 self.titlecase_segment_with_adjustment(src, langid, options, |data, ch| data.is_cased(ch))
166 }
167
168 /// Helper to support different leading adjustment behaviors,
169 /// `char_is_lead` is a function that returns true for a character that is allowed to be the
170 /// first relevant character in a titlecasing string, when `leading_adjustment != None`
171 ///
172 /// We return a concrete type instead of `impl Trait` so the return value can be mixed with that of other calls
173 /// to this function with different closures
174 pub(crate) fn titlecase_segment_with_adjustment(
175 self,
176 src: &'a str,
177 langid: &LanguageIdentifier,
178 options: TitlecaseOptions,
179 char_is_lead: impl Fn(&CaseMap, char) -> bool,
180 ) -> StringAndWriteable<'a, FullCaseWriteable<'a, 'a, true>> {
181 let (head, rest) = match options.leading_adjustment.unwrap_or_default() {
182 LeadingAdjustment::Auto | LeadingAdjustment::ToCased => {
183 let first_cased = src
184 .char_indices()
185 .find(|(_i, ch)| char_is_lead(self.data, *ch));
186 if let Some((first_cased, _ch)) = first_cased {
187 (
188 src.get(..first_cased).unwrap_or(""),
189 src.get(first_cased..).unwrap_or(""),
190 )
191 } else {
192 (src, "")
193 }
194 }
195 LeadingAdjustment::None => ("", src),
196 };
197 let writeable = self.data.full_helper_writeable::<true>(
198 rest,
199 CaseMapLocale::from_langid(langid),
200 MappingKind::Title,
201 options.trailing_case.unwrap_or_default(),
202 );
203 StringAndWriteable {
204 string: head,
205 writeable,
206 }
207 }
208 /// Case-folds the characters in the given string as a [`Writeable`].
209 /// This function is locale-independent and context-insensitive.
210 ///
211 /// Can be used to test if two strings are case-insensitively equivalent.
212 ///
213 /// See [`Self::fold_string()`] for the equivalent convenience function that returns a string,
214 /// as well as for an example.
215 pub fn fold(self, src: &'a str) -> impl Writeable + 'a {
216 self.data.full_helper_writeable::<false>(
217 src,
218 CaseMapLocale::Root,
219 MappingKind::Fold,
220 TrailingCase::default(),
221 )
222 }
223
224 /// Case-folds the characters in the given string as a [`Writeable`],
225 /// using Turkic (T) mappings for dotted/dotless I.
226 /// This function is locale-independent and context-insensitive.
227 ///
228 /// Can be used to test if two strings are case-insensitively equivalent.
229 ///
230 /// See [`Self::fold_turkic_string()`] for the equivalent convenience function that returns a string,
231 /// as well as for an example.
232 pub fn fold_turkic(self, src: &'a str) -> impl Writeable + 'a {
233 self.data.full_helper_writeable::<false>(
234 src,
235 CaseMapLocale::Turkish,
236 MappingKind::Fold,
237 TrailingCase::default(),
238 )
239 }
240
241 /// Returns the full lowercase mapping of the given string as a string.
242 ///
243 /// This function is context and language sensitive. Callers should pass the text's language
244 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
245 /// `Default::default()` for the root locale.
246 ///
247 /// See [`Self::lowercase()`] for the equivalent lower-level function that returns a [`Writeable`]
248 ///
249 /// # Examples
250 ///
251 /// ```rust
252 /// use icu::casemap::CaseMapper;
253 /// use icu::locale::langid;
254 ///
255 /// let cm = CaseMapper::new();
256 /// let root = langid!("und");
257 ///
258 /// assert_eq!(cm.lowercase_to_string("hEllO WorLd", &root), "hello world");
259 /// assert_eq!(cm.lowercase_to_string("Γειά σου Κόσμε", &root), "γειά σου κόσμε");
260 /// assert_eq!(cm.lowercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया");
261 /// assert_eq!(cm.lowercase_to_string("Привет мир", &root), "привет мир");
262 ///
263 /// // Some behavior is language-sensitive
264 /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &root), "constantinople");
265 /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &langid!("tr")), "constantınople");
266 /// ```
267 pub fn lowercase_to_string<'s>(
268 self,
269 src: &'s str,
270 langid: &LanguageIdentifier,
271 ) -> Cow<'s, str> {
272 writeable::to_string_or_borrow(&self.lowercase(src, langid), src.as_bytes())
273 }
274
275 /// Returns the full uppercase mapping of the given string as a string.
276 ///
277 /// This function is context and language sensitive. Callers should pass the text's language
278 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
279 /// `Default::default()` for the root locale.
280 ///
281 /// See [`Self::uppercase()`] for the equivalent lower-level function that returns a [`Writeable`]
282 ///
283 /// # Examples
284 ///
285 /// ```rust
286 /// use icu::casemap::CaseMapper;
287 /// use icu::locale::langid;
288 ///
289 /// let cm = CaseMapper::new();
290 /// let root = langid!("und");
291 ///
292 /// assert_eq!(cm.uppercase_to_string("hEllO WorLd", &root), "HELLO WORLD");
293 /// assert_eq!(cm.uppercase_to_string("Γειά σου Κόσμε", &root), "ΓΕΙΆ ΣΟΥ ΚΌΣΜΕ");
294 /// assert_eq!(cm.uppercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया");
295 /// assert_eq!(cm.uppercase_to_string("Привет мир", &root), "ПРИВЕТ МИР");
296 ///
297 /// // Some behavior is language-sensitive
298 /// assert_eq!(cm.uppercase_to_string("istanbul", &root), "ISTANBUL");
299 /// assert_eq!(cm.uppercase_to_string("istanbul", &langid!("tr")), "İSTANBUL"); // Turkish dotted i
300 ///
301 /// assert_eq!(cm.uppercase_to_string("և Երևանի", &root), "ԵՒ ԵՐԵՒԱՆԻ");
302 /// assert_eq!(cm.uppercase_to_string("և Երևանի", &langid!("hy")), "ԵՎ ԵՐԵՎԱՆԻ"); // Eastern Armenian ech-yiwn ligature
303 /// ```
304 pub fn uppercase_to_string<'s>(
305 self,
306 src: &'s str,
307 langid: &LanguageIdentifier,
308 ) -> Cow<'s, str> {
309 writeable::to_string_or_borrow(&self.uppercase(src, langid), src.as_bytes())
310 }
311
    /// Returns the full titlecase mapping of the given string as a string, treating
313 /// the string as a single segment (and thus only titlecasing the beginning of it). Performs
314 /// the specified leading adjustment behavior from the options without loading additional data.
315 ///
316 /// Note that [`TitlecaseMapper`] has better behavior, most users should consider using
317 /// it instead. This method primarily exists for people who care about the amount of data being loaded.
318 ///
319 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
320 /// by the application, for example one can titlecase on a per-word basis by mixing this with
321 /// a `WordSegmenter`.
322 ///
323 /// This function is context and language sensitive. Callers should pass the text's language
324 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
325 /// `Default::default()` for the root locale.
326 ///
327 /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`]
328 /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load
329 /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See
330 /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between
    /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the leading adjustment mode
332 /// is [`LeadingAdjustment::None`].
333 ///
334 /// See [`Self::titlecase_segment_with_only_case_data()`] for the equivalent lower-level function that returns a [`Writeable`]
335 ///
336 /// # Examples
337 ///
338 /// ```rust
339 /// use icu::casemap::CaseMapper;
340 /// use icu::locale::langid;
341 ///
342 /// let cm = CaseMapper::new();
343 /// let root = langid!("und");
344 ///
345 /// let default_options = Default::default();
346 ///
347 /// // note that the subsequent words are not titlecased, this function assumes
348 /// // that the entire string is a single segment and only titlecases at the beginning.
349 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("hEllO WorLd", &root, default_options), "Hello world");
350 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
351 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
352 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Привет мир", &root, default_options), "Привет мир");
353 ///
354 /// // Some behavior is language-sensitive
355 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &root, default_options), "Istanbul");
356 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
357 ///
358 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
359 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
360 ///
361 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &root, default_options), "Ijkdijk");
362 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
363 /// ```
364 ///
365 /// [`TitlecaseMapper`]: crate::TitlecaseMapper
366 pub fn titlecase_segment_with_only_case_data_to_string<'s>(
367 self,
368 src: &'s str,
369 langid: &LanguageIdentifier,
370 options: TitlecaseOptions,
371 ) -> Cow<'s, str> {
372 writeable::to_string_or_borrow(
373 &self.titlecase_segment_with_only_case_data(src, langid, options),
374 src.as_bytes(),
375 )
376 }
377
378 /// Case-folds the characters in the given string as a String.
379 /// This function is locale-independent and context-insensitive.
380 ///
381 /// Can be used to test if two strings are case-insensitively equivalent.
382 ///
383 /// See [`Self::fold()`] for the equivalent lower-level function that returns a [`Writeable`]
    ///
385 /// # Examples
386 ///
387 /// ```rust
388 /// use icu::casemap::CaseMapper;
389 ///
390 /// let cm = CaseMapper::new();
391 ///
392 /// // Check if two strings are equivalent case insensitively
393 /// assert_eq!(cm.fold_string("hEllO WorLd"), cm.fold_string("HELLO worlD"));
394 ///
395 /// assert_eq!(cm.fold_string("hEllO WorLd"), "hello world");
396 /// assert_eq!(cm.fold_string("Γειά σου Κόσμε"), "γειά σου κόσμε");
397 /// assert_eq!(cm.fold_string("नमस्ते दुनिया"), "नमस्ते दुनिया");
398 /// assert_eq!(cm.fold_string("Привет мир"), "привет мир");
399 /// ```
400 pub fn fold_string(self, src: &str) -> Cow<str> {
401 writeable::to_string_or_borrow(&self.fold(src), src.as_bytes())
402 }
403
404 /// Case-folds the characters in the given string as a String,
405 /// using Turkic (T) mappings for dotted/dotless I.
406 /// This function is locale-independent and context-insensitive.
407 ///
408 /// Can be used to test if two strings are case-insensitively equivalent.
409 ///
410 /// See [`Self::fold_turkic()`] for the equivalent lower-level function that returns a [`Writeable`]
411 ///
412 /// # Examples
413 ///
414 /// ```rust
415 /// use icu::casemap::CaseMapper;
416 ///
417 /// let cm = CaseMapper::new();
418 ///
419 /// // Check if two strings are equivalent case insensitively
420 /// assert_eq!(cm.fold_turkic_string("İstanbul"), cm.fold_turkic_string("iSTANBUL"));
421 ///
422 /// assert_eq!(cm.fold_turkic_string("İstanbul not Constantinople"), "istanbul not constantinople");
423 /// assert_eq!(cm.fold_turkic_string("Istanbul not Constantınople"), "ıstanbul not constantınople");
424 ///
425 /// assert_eq!(cm.fold_turkic_string("hEllO WorLd"), "hello world");
426 /// assert_eq!(cm.fold_turkic_string("Γειά σου Κόσμε"), "γειά σου κόσμε");
427 /// assert_eq!(cm.fold_turkic_string("नमस्ते दुनिया"), "नमस्ते दुनिया");
428 /// assert_eq!(cm.fold_turkic_string("Привет мир"), "привет мир");
429 /// ```
430 pub fn fold_turkic_string(self, src: &str) -> Cow<str> {
431 writeable::to_string_or_borrow(&self.fold_turkic(src), src.as_bytes())
432 }
433
434 /// Adds all simple case mappings and the full case folding for `c` to `set`.
435 /// Also adds special case closure mappings.
436 ///
437 /// Identical to [`CaseMapCloserBorrowed::add_case_closure_to()`], see docs there for more information.
438 /// This method is duplicated so that one does not need to load extra unfold data
439 /// if they only need this and not also [`CaseMapCloserBorrowed::add_string_case_closure_to()`].
440 ///
441 ///
442 /// # Examples
443 ///
444 /// ```rust
445 /// use icu::casemap::CaseMapper;
446 /// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
447 ///
448 /// let cm = CaseMapper::new();
449 /// let mut builder = CodePointInversionListBuilder::new();
450 /// cm.add_case_closure_to('s', &mut builder);
451 ///
452 /// let set = builder.build();
453 ///
454 /// assert!(set.contains('S'));
455 /// assert!(set.contains('ſ'));
456 /// assert!(!set.contains('s')); // does not contain itself
457 /// ```
458 ///
459 /// [`CaseMapCloserBorrowed::add_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_case_closure_to
460 /// [`CaseMapCloserBorrowed::add_string_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_string_case_closure_to
461 pub fn add_case_closure_to<S: ClosureSink>(self, c: char, set: &mut S) {
462 self.data.add_case_closure_to(c, set);
463 }
464
465 /// Returns the lowercase mapping of the given `char`.
466 /// This function only implements simple and common mappings. Full mappings,
467 /// which can map one `char` to a string, are not included.
468 /// For full mappings, use [`CaseMapperBorrowed::lowercase`].
469 ///
470 /// # Examples
471 ///
472 /// ```rust
473 /// use icu::casemap::CaseMapper;
474 ///
475 /// let cm = CaseMapper::new();
476 ///
477 /// assert_eq!(cm.simple_lowercase('C'), 'c');
478 /// assert_eq!(cm.simple_lowercase('c'), 'c');
479 /// assert_eq!(cm.simple_lowercase('Ć'), 'ć');
480 /// assert_eq!(cm.simple_lowercase('Γ'), 'γ');
481 /// ```
482 pub fn simple_lowercase(self, c: char) -> char {
483 self.data.simple_lower(c)
484 }
485
486 /// Returns the uppercase mapping of the given `char`.
487 /// This function only implements simple and common mappings. Full mappings,
488 /// which can map one `char` to a string, are not included.
489 /// For full mappings, use [`CaseMapperBorrowed::uppercase`].
490 ///
491 /// # Examples
492 ///
493 /// ```rust
494 /// use icu::casemap::CaseMapper;
495 ///
496 /// let cm = CaseMapper::new();
497 ///
498 /// assert_eq!(cm.simple_uppercase('c'), 'C');
499 /// assert_eq!(cm.simple_uppercase('C'), 'C');
500 /// assert_eq!(cm.simple_uppercase('ć'), 'Ć');
501 /// assert_eq!(cm.simple_uppercase('γ'), 'Γ');
502 ///
503 /// assert_eq!(cm.simple_uppercase('dz'), 'DZ');
504 /// ```
505 pub fn simple_uppercase(self, c: char) -> char {
506 self.data.simple_upper(c)
507 }
508
509 /// Returns the titlecase mapping of the given `char`.
510 /// This function only implements simple and common mappings. Full mappings,
511 /// which can map one `char` to a string, are not included.
512 ///
513 /// # Examples
514 ///
515 /// ```rust
516 /// use icu::casemap::CaseMapper;
517 ///
518 /// let cm = CaseMapper::new();
519 ///
520 /// assert_eq!(cm.simple_titlecase('dz'), 'Dz');
521 ///
522 /// assert_eq!(cm.simple_titlecase('c'), 'C');
523 /// assert_eq!(cm.simple_titlecase('C'), 'C');
524 /// assert_eq!(cm.simple_titlecase('ć'), 'Ć');
525 /// assert_eq!(cm.simple_titlecase('γ'), 'Γ');
526 /// ```
527 pub fn simple_titlecase(self, c: char) -> char {
528 self.data.simple_title(c)
529 }
530
531 /// Returns the simple case folding of the given char.
532 /// For full mappings, use [`CaseMapperBorrowed::fold`].
533 ///
534 /// This function can be used to perform caseless matches on
535 /// individual characters.
536 /// > *Note:* With Unicode 15.0 data, there are three
537 /// > pairs of characters for which equivalence under this
538 /// > function is inconsistent with equivalence of the
539 /// > one-character strings under [`CaseMapperBorrowed::fold`].
540 /// > This is resolved in Unicode 15.1 and later.
541 ///
542 /// For compatibility applications where simple case folding
543 /// of strings is required, this function can be applied to
544 /// each character of a string. Note that the resulting
545 /// equivalence relation is different from that obtained
546 /// by [`CaseMapperBorrowed::fold`]:
547 /// The strings "Straße" and "STRASSE" are distinct
548 /// under simple case folding, but are equivalent under
549 /// default (full) case folding.
550 ///
551 /// # Examples
552 ///
553 /// ```rust
554 /// use icu::casemap::CaseMapper;
555 ///
556 /// let cm = CaseMapper::new();
557 ///
558 /// // perform case insensitive checks
559 /// assert_eq!(cm.simple_fold('σ'), cm.simple_fold('ς'));
560 /// assert_eq!(cm.simple_fold('Σ'), cm.simple_fold('ς'));
561 ///
562 /// assert_eq!(cm.simple_fold('c'), 'c');
563 /// assert_eq!(cm.simple_fold('Ć'), 'ć');
564 /// assert_eq!(cm.simple_fold('Γ'), 'γ');
565 /// assert_eq!(cm.simple_fold('ς'), 'σ');
566 ///
567 /// assert_eq!(cm.simple_fold('ß'), 'ß');
568 /// assert_eq!(cm.simple_fold('I'), 'i');
569 /// assert_eq!(cm.simple_fold('İ'), 'İ');
570 /// assert_eq!(cm.simple_fold('ı'), 'ı');
571 /// ```
572 pub fn simple_fold(self, c: char) -> char {
573 self.data.simple_fold(c, FoldOptions::default())
574 }
575
576 /// Returns the simple case folding of the given char, using Turkic (T) mappings for
577 /// dotted/dotless i. This function does not fold `i` and `I` to the same character. Instead,
578 /// `I` will fold to `ı`, and `İ` will fold to `i`. Otherwise, this is the same as
    /// [`CaseMapperBorrowed::simple_fold()`].
580 ///
581 /// You can use the case folding to perform Turkic caseless matches on characters
582 /// provided they don't full-casefold to strings. To avoid that situation,
583 /// convert to a string and use [`CaseMapperBorrowed::fold_turkic`].
584 ///
585 ///
586 /// # Examples
587 ///
588 /// ```rust
589 /// use icu::casemap::CaseMapper;
590 ///
591 /// let cm = CaseMapper::new();
592 ///
593 /// assert_eq!(cm.simple_fold_turkic('I'), 'ı');
594 /// assert_eq!(cm.simple_fold_turkic('İ'), 'i');
595 /// ```
596 pub fn simple_fold_turkic(self, c: char) -> char {
597 self.data
598 .simple_fold(c, FoldOptions::with_turkic_mappings())
599 }
600}
601
602impl CaseMapper {
603 /// Creates a [`CaseMapperBorrowed`] using compiled data.
604 ///
605 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
606 ///
607 /// [📚 Help choosing a constructor](icu_provider::constructors)
608 ///
609 /// # Examples
610 ///
611 /// ```rust
612 /// use icu::casemap::CaseMapper;
613 /// use icu::locale::langid;
614 ///
615 /// let cm = CaseMapper::new();
616 ///
617 /// assert_eq!(
618 /// cm.uppercase_to_string("hello world", &langid!("und")),
619 /// "HELLO WORLD"
620 /// );
621 /// ```
622 #[cfg(feature = "compiled_data")]
623 #[allow(clippy::new_ret_no_self)] // Intentional
624 pub const fn new() -> CaseMapperBorrowed<'static> {
625 CaseMapperBorrowed::new()
626 }
627
628 /// Constructs a borrowed version of this type for more efficient querying.
629 pub fn as_borrowed(&self) -> CaseMapperBorrowed<'_> {
630 CaseMapperBorrowed {
631 data: self.data.get(),
632 }
633 }
634
635 icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
636 functions: [
637 new: skip,
638 try_new_with_buffer_provider,
639 try_new_unstable,
640 Self,
641 ]);
642
643 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
644 pub fn try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError>
645 where
646 P: DataProvider<CaseMapV1> + ?Sized,
647 {
648 let data = provider.load(Default::default())?.payload;
649 Ok(Self { data })
650 }
651}
652
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locale_core::langid;

    /// Tests for SpecialCasing.txt. Some of the special cases are data-driven, some are code-driven
    #[test]
    fn test_special_cases() {
        let cm = CaseMapper::new();
        let root = langid!("und");
        let default_options = Default::default();

        // Ligatures
        for (ligature, expected) in [
            // U+FB00 LATIN SMALL LIGATURE FF
            ("ff", "FF"),
            // U+FB05 LATIN SMALL LIGATURE LONG S T
            ("ſt", "ST"),
        ] {
            assert_eq!(cm.uppercase_to_string(ligature, &root), expected);
        }

        // No corresponding uppercased character
        for (input, expected) in [
            // U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
            ("ʼn", "ʼN"),
            // U+1F50 GREEK SMALL LETTER UPSILON WITH PSILI
            ("ὐ", "Υ̓"),
            // U+1FF6 GREEK SMALL LETTER OMEGA WITH PERISPOMENI
            ("ῶ", "Ω͂"),
        ] {
            assert_eq!(cm.uppercase_to_string(input, &root), expected);
        }

        // YPOGEGRAMMENI / PROSGEGRAMMENI special cases

        // E.g. <alpha><psili><iota_subscript> is uppercased to <ALPHA><psili><IOTA>
        assert_eq!(
            cm.uppercase_to_string("α\u{0313}\u{0345}", &root),
            "Α\u{0313}Ι"
        );
        // but the YPOGEGRAMMENI should not titlecase
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string(
                "α\u{0313}\u{0345}",
                &root,
                default_options
            ),
            "Α\u{0313}\u{0345}"
        );

        // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("ᾀ", &root, default_options),
            "ᾈ"
        );
        assert_eq!(cm.uppercase_to_string("ᾀ", &root), "ἈΙ");

        // Each row: (input, expected lowercase, expected titlecase, expected uppercase)
        for (input, lower, title, upper) in [
            // U+1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
            ("ῼ", "ῳ", "ῼ", "ΩΙ"),
            // U+1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
            ("ᾘ", "ᾐ", "ᾘ", "ἨΙ"),
            // U+1FB2 GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
            ("ᾲ", "ᾲ", "Ὰ\u{345}", "ᾺΙ"),
        ] {
            assert_eq!(cm.lowercase_to_string(input, &root), lower);
            assert_eq!(
                cm.titlecase_segment_with_only_case_data_to_string(input, &root, default_options),
                title
            );
            assert_eq!(cm.uppercase_to_string(input, &root), upper);
        }

        // Final sigma test
        // U+03A3 GREEK CAPITAL LETTER SIGMA in Final_Sigma context
        assert_eq!(cm.lowercase_to_string("ΙΙΙΣ", &root), "ιιις");

        // Turkish / Azeri: both locales must agree on every row.
        let tr = langid!("tr");
        let az = langid!("az");
        for locale in [&tr, &az] {
            // Each row: (input, expected lowercase, expected titlecase, expected uppercase)
            for (input, lower, title, upper) in [
                // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
                ("İ", "i", "İ", "İ"),
                // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE
                ("I\u{0307}", "i", "I\u{0307}", "I\u{0307}"),
                // U+0049 LATIN CAPITAL LETTER I
                ("I", "ı", "I", "I"),
                // U+0069 LATIN SMALL LETTER I
                ("i", "i", "İ", "İ"),
            ] {
                assert_eq!(cm.lowercase_to_string(input, locale), lower);
                assert_eq!(
                    cm.titlecase_segment_with_only_case_data_to_string(
                        input,
                        locale,
                        default_options
                    ),
                    title
                );
                assert_eq!(cm.uppercase_to_string(input, locale), upper);
            }
        }
    }

    /// Both Cherokee case forms are expected to fold to 'Ꭰ', with and without Turkic mappings.
    #[test]
    fn test_cherokee_case_folding() {
        let case_mapping = CaseMapper::new();
        for ch in ['Ꭰ', 'ꭰ'] {
            assert_eq!(case_mapping.simple_fold(ch), 'Ꭰ');
            assert_eq!(case_mapping.simple_fold_turkic(ch), 'Ꭰ');
        }
        for s in ["Ꭰ", "ꭰ"] {
            assert_eq!(case_mapping.fold_string(s), "Ꭰ");
            assert_eq!(case_mapping.fold_turkic_string(s), "Ꭰ");
        }
    }
}
806}