// icu_casemap/internals.rs
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains most of the actual algorithms for case mapping.
//!
//! Primarily, it implements methods on `CaseMapV1`, which contains the data model.
use crate::greek_to_me::{
self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData,
GreekVowel,
};
use crate::provider::data::{DotType, MappingKind};
use crate::provider::exception_helpers::ExceptionSlot;
use crate::provider::{CaseMapUnfoldV1, CaseMapV1};
use crate::set::ClosureSink;
use crate::titlecase::TrailingCase;
use core::fmt;
use icu_locale_core::LanguageIdentifier;
use writeable::Writeable;
// U+0301 COMBINING ACUTE ACCENT; compared against when checking the Dutch IJ pair.
const ACUTE: char = '\u{301}';
/// Options that control the behavior of `CaseMapper::fold`.
///
/// At the moment the only thing this decides is whether the Turkic (T)
/// mappings are used for dotted/dotless i.
#[derive(Copy, Clone, Default)]
pub struct FoldOptions {
    /// `true` selects the Turkic foldings in `simple_fold_special_case`;
    /// the derived default is `false`.
    exclude_special_i: bool,
}

impl FoldOptions {
    /// Builds options that request the Turkic (T) mappings for `I` and `İ`.
    pub fn with_turkic_mappings() -> Self {
        FoldOptions {
            exclude_special_i: true,
        }
    }
}
/// Helper type that wraps a writeable in a prefix string
///
/// When written, `string` is emitted first and `writeable` directly after it.
pub(crate) struct StringAndWriteable<'a, W> {
/// Prefix emitted before the wrapped writeable.
pub string: &'a str,
/// The wrapped value, written after `string`.
pub writeable: W,
}
impl<Wr: Writeable> Writeable for StringAndWriteable<'_, Wr> {
    /// Writes the prefix string, then the wrapped writeable, stopping at the first error.
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        let Self {
            string: prefix,
            writeable: inner,
        } = self;
        sink.write_str(prefix)?;
        inner.write_to(sink)
    }

    /// The hint is the exact prefix length plus whatever the wrapped writeable reports.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let prefix_hint = writeable::LengthHint::exact(self.string.len());
        let inner_hint = self.writeable.writeable_length_hint();
        prefix_hint + inner_hint
    }
}
/// Lazily applies a full case mapping to `src` when written.
///
/// `IS_TITLE_CONTEXT` marks a titlecasing operation; in that mode only the first
/// character uses `mapping`, and `titlecase_tail_casing` decides what happens to the rest.
pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> {
// Case mapping data used for every lookup.
data: &'a CaseMapV1<'a>,
// The text to map.
src: &'a str,
// Locale selecting the hard-coded language-specific behavior.
locale: CaseMapLocale,
// Mapping applied to the first character (and to all of them outside a title context).
mapping: MappingKind,
// Only consulted when IS_TITLE_CONTEXT is true.
titlecase_tail_casing: TrailingCase,
}
impl<const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'_, IS_TITLE_CONTEXT> {
    /// Maps `src` character by character into `sink`.
    ///
    /// Outside a title context every character is mapped with `self.mapping`.
    /// In a title context the first character is mapped with `self.mapping`; the
    /// tail is then either lowercased (`TrailingCase::Lower`) or copied verbatim
    /// (`TrailingCase::Unchanged`).
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        let text = self.src;
        let mut kind = self.mapping;
        let mut remaining = text.char_indices();
        for (offset, ch) in remaining.by_ref() {
            // `offset` is produced by `char_indices`, so it is always a char boundary.
            let (preceding, from_current) = text.split_at(offset);
            let context = ContextIterator::new(preceding, from_current);
            self.data
                .full_helper::<IS_TITLE_CONTEXT, W>(ch, context, self.locale, kind, sink)?;
            if !IS_TITLE_CONTEXT {
                continue;
            }
            if self.titlecase_tail_casing != TrailingCase::Lower {
                // The tail is not lowercased, so stop mapping after the first character.
                break;
            }
            kind = MappingKind::Lower;
        }
        // Copy whatever the loop did not consume when the tail must stay untouched.
        if IS_TITLE_CONTEXT && self.titlecase_tail_casing == TrailingCase::Unchanged {
            sink.write_str(remaining.as_str())?;
        }
        Ok(())
    }

    /// Reports the source length (in bytes) as a lower bound for the output length.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let source_len = self.src.len();
        writeable::LengthHint::at_least(source_len)
    }
}
impl<'data> CaseMapV1<'data> {
/// Shared implementation of the simple (one code point to one code point) mappings.
///
/// Returns `c` itself when no mapping of the requested `kind` applies.
fn simple_helper(&self, c: char, kind: MappingKind) -> char {
    let data = self.lookup_data(c);

    if data.has_exception() {
        let exception = self.exceptions.get(data.exception_index());
        if data.is_relevant_to(kind) {
            if let Some(simple) = exception.get_simple_case_slot_for(c) {
                return simple;
            }
        }
        return exception.slot_char_for_kind(kind).unwrap_or(c);
    }

    if !data.is_relevant_to(kind) {
        return c;
    }
    let shifted = c as i32 + data.delta() as i32;
    // GIGO: delta should be valid
    char::from_u32(shifted as u32).unwrap_or(c)
}
// Returns the lowercase mapping of the given `char`.
// Delegates to `simple_helper` with `MappingKind::Lower`.
#[inline]
pub(crate) fn simple_lower(&self, c: char) -> char {
self.simple_helper(c, MappingKind::Lower)
}
// Returns the uppercase mapping of the given `char`.
// Delegates to `simple_helper` with `MappingKind::Upper`.
#[inline]
pub(crate) fn simple_upper(&self, c: char) -> char {
self.simple_helper(c, MappingKind::Upper)
}
// Returns the titlecase mapping of the given `char`.
// Delegates to `simple_helper` with `MappingKind::Title`.
#[inline]
pub(crate) fn simple_title(&self, c: char) -> char {
self.simple_helper(c, MappingKind::Title)
}
// Returns the simple case folding of the given char.
//
// `options` is only consulted for characters whose exception entry is flagged
// as having a conditional fold (see `simple_fold_special_case`).
#[inline]
pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char {
    let data = self.lookup_data(c);

    if !data.has_exception() {
        if !data.is_upper_or_title() {
            return c;
        }
        let shifted = c as i32 + data.delta() as i32;
        // GIGO: delta should be valid
        return char::from_u32(shifted as u32).unwrap_or(c);
    }

    // TODO: if we move conditional fold and no_simple_case_folding into
    // simple_helper, this function can just call simple_helper.
    let exception = self.exceptions.get(data.exception_index());
    if exception.bits.has_conditional_fold() {
        return self.simple_fold_special_case(c, options);
    }
    if exception.bits.no_simple_case_folding() {
        return c;
    }
    if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) {
        // The fallback should never be hit, but it avoids a panic. Note that it
        // yields NUL rather than `c`, unlike the other fallbacks in this file.
        return exception.get_simple_case_slot_for(c).unwrap_or('\0');
    }
    exception
        .slot_char_for_kind(MappingKind::Fold)
        .unwrap_or(c)
}
/// Looks up the dot type of `c`, reading it from the exception entry when `c` has one.
fn dot_type(&self, c: char) -> DotType {
    let data = self.lookup_data(c);
    if data.has_exception() {
        let exception = self.exceptions.get(data.exception_index());
        exception.bits.dot_type()
    } else {
        data.dot_type()
    }
}
// Returns true if this code point is case-sensitive.
// The flag is read from the exception entry when the code point has one.
// This is not currently exposed.
#[allow(dead_code)]
fn is_case_sensitive(&self, c: char) -> bool {
    let data = self.lookup_data(c);
    if data.has_exception() {
        let exception = self.exceptions.get(data.exception_index());
        exception.bits.is_sensitive()
    } else {
        data.is_sensitive()
    }
}
/// Returns whether the character is cased
///
/// A character counts as cased when its looked-up data reports a case type.
pub(crate) fn is_cased(&self, c: char) -> bool {
self.lookup_data(c).case_type().is_some()
}
/// Writes the full case mapping of `c` for `kind` into `sink`.
///
/// `context` holds the text before and after `c` and is consulted by the
/// context-sensitive rules. Processing order:
/// 1. the Dutch IJ titlecasing extension,
/// 2. the Greek uppercasing extension,
/// 3. the regular data lookup (delta, conditional special cases, full mapping
///    slots, simple slots), falling back to writing `c` unchanged.
#[inline(always)]
// IS_TITLE_CONTEXT must be true if kind is MappingKind::Title
// The kind may be a different kind with IS_TITLE_CONTEXT still true because
// titlecasing a segment involves switching to lowercase later
fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>(
&self,
c: char,
context: ContextIterator,
locale: CaseMapLocale,
kind: MappingKind,
sink: &mut W,
) -> fmt::Result {
// If using a title mapping IS_TITLE_CONTEXT must be true
debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT);
// In a title context, kind MUST be Title or Lower
debug_assert!(
!IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
);
// ICU4C's non-standard extension for Dutch IJ titlecasing
// handled here instead of in full_lower_special_case because J does not have conditional
// special casemapping.
if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
// When titlecasing, a J found immediately after an I at the beginning of the segment
// should also uppercase. They are both allowed to have an acute accent but it must
// be present on both letters or neither. They may not have any other combining marks.
if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
return sink.write_char('J');
}
}
// ICU4C's non-standard extension for Greek uppercasing:
// https://icu.unicode.org/design/case/greek-upper.
// Effectively removes Greek accents from Greek vowels during uppercasing,
// whilst attempting to preserve additional marks like the dialytika (diæresis)
// and ypogegrammeni (combining small iota).
if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper {
// Remove all combining diacritics on a Greek letter.
// Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into
// a capital iota).
// The dialytika is removed here, but it may be added again when the base letter is being processed.
if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c)
&& context.preceded_by_greek_letter()
{
return Ok(());
}
let data = greek_to_me::get_data(c);
// Check if the character is a Greek vowel
match data {
Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
// Get the diacritics on the character itself, and add any further combining diacritics
// from the context.
let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
// If the previous vowel had an accent (which would be removed) but no dialytika,
// and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
// the now-unaccented adjacent vowels from a digraph/diphthong.
// Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
// if the accent was combining, so as to map NFD to NFD and NFC to NFC.
if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
{
if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
if !preceding_vowel.combining.dialytika
&& !preceding_vowel.precomposed.dialytika
{
if preceding_vowel.combining.accented {
diacritics.dialytika = true;
} else {
precomposed_diacritics.dialytika =
preceding_vowel.precomposed.accented;
}
}
}
}
// Write the base of the uppercased combining character sequence.
// In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
// In some branches the base has a precomposed diacritic.
// In the case of the Greek disjunctive "or", a combining tonos may also be written.
match vowel {
GreekVowel::Η => {
// The letter η (eta) is allowed to retain a tonos when it forms a single-letter word to distinguish
// the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
//
// A lone η with an accent other than the oxia/tonos is not expected,
// so there is no need to special-case the oxia/tonos.
// The ancient ᾖ (exist.PRS.SUBJ.3s) has an iota subscript as well as the circumflex,
// so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
// (e.g. να είναι) since Byzantine times anyway.
if diacritics.accented
&& !context.followed_by_cased_letter(self)
&& !context.preceded_by_cased_letter(self)
&& !diacritics.ypogegrammeni
{
if precomposed_diacritics.accented {
sink.write_char('Ή')?;
} else {
sink.write_char('Η')?;
sink.write_char(greek_to_me::TONOS)?;
}
} else {
sink.write_char('Η')?;
}
}
// For iota and upsilon a precomposed dialytika replaces the combining one,
// so the combining flag is cleared to avoid writing it twice below.
GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϊ'
} else {
vowel.into()
})?,
GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϋ'
} else {
vowel.into()
})?,
_ => sink.write_char(vowel.into())?,
};
if diacritics.dialytika {
sink.write_char(greek_to_me::DIALYTIKA)?;
}
// A precomposed ypogegrammeni becomes a capital iota written after the base.
if precomposed_diacritics.ypogegrammeni {
sink.write_char('Ι')?;
}
return Ok(());
}
// Rho might have breathing marks, we handle it specially
// to remove them
Some(GreekPrecomposedLetterData::Consonant(true)) => {
sink.write_char(greek_to_me::CAPITAL_RHO)?;
return Ok(());
}
_ => (),
}
}
// Regular, data-driven mapping.
let data = self.lookup_data(c);
if !data.has_exception() {
if data.is_relevant_to(kind) {
let mapped = c as i32 + data.delta() as i32;
// GIGO: delta should be valid
let mapped = char::from_u32(mapped as u32).unwrap_or(c);
sink.write_char(mapped)
} else {
sink.write_char(c)
}
} else {
let idx = data.exception_index();
let exception = self.exceptions.get(idx);
// Conditional (context- or locale-dependent) special cases take priority.
if exception.bits.has_conditional_special() {
if let Some(special) = match kind {
MappingKind::Lower => {
self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale)
}
MappingKind::Fold => self.full_fold_special_case(c, context, locale),
MappingKind::Upper | MappingKind::Title => self
.full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale),
} {
return special.write_to(sink);
}
}
// Next, a non-empty full (possibly multi-character) mapping for this kind.
if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) {
if !mapped_string.is_empty() {
return sink.write_str(mapped_string);
}
}
if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() {
return sink.write_char(c);
}
if data.is_relevant_to(kind) {
if let Some(simple) = exception.get_simple_case_slot_for(c) {
return sink.write_char(simple);
}
}
// Finally the single-character slot for this kind, or the character unchanged.
if let Some(slot_char) = exception.slot_char_for_kind(kind) {
sink.write_char(slot_char)
} else {
sink.write_char(c)
}
}
}
// These constants are used for hardcoded locale-specific foldings.
// Each is a base letter followed by U+0307 COMBINING DOT ABOVE,
// optionally followed by one more accent.
// i + dot above
const I_DOT: &'static str = "\u{69}\u{307}";
// j + dot above
const J_DOT: &'static str = "\u{6a}\u{307}";
// i with ogonek (U+012F) + dot above
const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}";
// i + dot above + grave (U+0300)
const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}";
// i + dot above + acute (U+0301)
const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}";
// i + dot above + tilde (U+0303)
const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}";
// Hardcoded special case folding mappings.
// Covers the special Turkic mappings for uppercase I (U+0049) and dotted
// uppercase I (U+0130). Non-Turkic languages normally do not use the Turkic
// mapping; Turkic languages (tr, az) can use it in place of the normal one.
fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char {
    debug_assert!(c == '\u{49}' || c == '\u{130}');
    if options.exclude_special_i {
        // Turkic mappings
        match c {
            // 0049; T; 0131; # LATIN CAPITAL LETTER I
            '\u{49}' => '\u{131}',
            // 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
            '\u{130}' => '\u{69}',
            other => other,
        }
    } else if c == '\u{49}' {
        // Default mapping: 0049; C; 0069; # LATIN CAPITAL LETTER I
        '\u{69}'
    } else {
        // There is no simple case folding for U+130.
        c
    }
}
/// Conditional lowercase mappings that depend on locale and/or context.
///
/// Returns `None` when no special mapping applies, in which case the caller
/// falls back to the regular mapping.
fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    match locale {
        CaseMapLocale::Lithuanian => {
            // Lithuanian retains the dot in a lowercase i when followed by accents.
            // An explicit dot above is introduced when lowercasing capital I, J and
            // I-with-ogonek if more accents above follow (grave, acute, tilde above).
            // The precomposed letters already carry their accent, so they need no
            // look at the context.
            let with_explicit_dot = match c {
                'I' if context.followed_by_more_above(self) => Some(Self::I_DOT),
                'J' if context.followed_by_more_above(self) => Some(Self::J_DOT),
                '\u{12e}' if context.followed_by_more_above(self) => Some(Self::I_OGONEK_DOT),
                '\u{cc}' => Some(Self::I_DOT_GRAVE),
                '\u{cd}' => Some(Self::I_DOT_ACUTE),
                '\u{128}' => Some(Self::I_DOT_TILDE),
                _ => None,
            };
            if let Some(replacement) = with_explicit_dot {
                return Some(FullMappingResult::String(replacement));
            }
        }
        CaseMapLocale::Turkish => match c {
            // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
            '\u{130}' => return Some(FullMappingResult::CodePoint('i')),
            // When lowercasing, remove dot_above in the sequence I + dot_above,
            // which will turn into i. This matches the behaviour of the
            // canonically equivalent I-dot_above.
            //
            // In a titlecase context this must not apply when the I started the
            // string: that I and its marks are handled by the uppercasing rules.
            '\u{307}' if context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) => {
                return Some(FullMappingResult::Remove);
            }
            // Unless an I is before a dot_above, it lowercases to a dotless i.
            'I' if !context.followed_by_dot_above(self) => {
                return Some(FullMappingResult::CodePoint('\u{131}'));
            }
            _ => {}
        },
        _ => {}
    }

    match c {
        // Preserve canonical equivalence for I with dot. Turkic is handled above.
        '\u{130}' => Some(FullMappingResult::String(Self::I_DOT)),
        // Greek capital sigma maps depending on surrounding cased letters.
        '\u{3a3}'
            if context.preceded_by_cased_letter(self)
                && !context.followed_by_cased_letter(self) =>
        {
            Some(FullMappingResult::CodePoint('\u{3c2}'))
        }
        // No relevant special case mapping. Use a normal mapping.
        _ => None,
    }
}
/// Conditional uppercase/titlecase mappings that depend on locale and/or context.
///
/// Returns `None` when no special mapping applies.
fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    match (locale, c) {
        // In Turkic languages, i turns into a dotted capital I.
        (CaseMapLocale::Turkish, 'i') => {
            return Some(FullMappingResult::CodePoint('\u{130}'));
        }
        // Lithuanian retains the dot in a lowercase i when followed by accents.
        // Remove dot_above after i with upper or titlecase.
        (CaseMapLocale::Lithuanian, '\u{307}') if context.preceded_by_soft_dotted(self) => {
            return Some(FullMappingResult::Remove);
        }
        _ => {}
    }

    // ICU4C's non-standard extension for Armenian ligature ech-yiwn.
    if c != '\u{587}' {
        return None;
    }
    let is_armenian = locale == CaseMapLocale::Armenian;
    let expansion = match (is_armenian, IS_TITLE_CONTEXT) {
        (true, false) => "ԵՎ",
        (true, true) => "Եվ",
        (false, false) => "ԵՒ",
        (false, true) => "Եւ",
    };
    Some(FullMappingResult::String(expansion))
}
/// Conditional full case foldings for `I` (U+0049) and `İ` (U+0130).
///
/// The Turkic mappings are chosen when `locale` is `CaseMapLocale::Turkish`;
/// any other character yields `None`. The context is not consulted.
fn full_fold_special_case(
    &self,
    c: char,
    _context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    let turkic = locale == CaseMapLocale::Turkish;
    let folded = match c {
        // Turkic mappings
        '\u{49}' if turkic => FullMappingResult::CodePoint('\u{131}'),
        '\u{130}' if turkic => FullMappingResult::CodePoint('\u{69}'),
        // Default mappings
        '\u{49}' => FullMappingResult::CodePoint('\u{69}'),
        '\u{130}' => FullMappingResult::String(Self::I_DOT),
        _ => return None,
    };
    Some(folded)
}
/// Wraps `src` in a [`FullCaseWriteable`] that applies `mapping` when written.
///
/// `IS_TITLE_CONTEXT` must be true exactly when `mapping` is `MappingKind::Title`
/// (checked in debug builds); it exists mainly to avoid perf impacts on the
/// other, more common modes of operation.
///
/// `titlecase_tail_casing` is only read in a title context.
pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
    &'a self,
    src: &'a str,
    locale: CaseMapLocale,
    mapping: MappingKind,
    titlecase_tail_casing: TrailingCase,
) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
    // Both sides must agree: a title context if and only if a title mapping.
    debug_assert!(IS_TITLE_CONTEXT == (mapping == MappingKind::Title));

    FullCaseWriteable {
        data: self,
        src,
        locale,
        mapping,
        titlecase_tail_casing,
    }
}
/// Adds all simple case mappings and the full case folding for `c` to `set`.
/// Also adds special case closure mappings.
/// The character itself is not added.
/// For example, the mappings
/// - for s include long s
/// - for sharp s include ss
/// - for k include the Kelvin sign
pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) {
    // The closure of i and its relatives is hardcoded and the data file is
    // ignored for them: the Turkic dotless i and dotted I, with their case
    // mapping conditions and case folding option, make the related characters
    // behave specially. This matches their closure to their folding behavior.
    match c {
        // Regular i and I are in one equivalence class.
        '\u{49}' => return set.add_char('\u{69}'),
        '\u{69}' => return set.add_char('\u{49}'),
        // Dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>)
        '\u{130}' => return set.add_string(Self::I_DOT),
        // Dotless i is in a class by itself
        '\u{131}' => return,
        _ => {}
    }

    let data = self.lookup_data(c);
    if !data.has_exception() {
        // Uncased characters contribute nothing; treat them like a zero delta.
        let delta = if data.case_type().is_some() {
            data.delta() as i32
        } else {
            0
        };
        if delta != 0 {
            // Add the one simple case mapping, no matter what type it is.
            // GIGO: delta should be valid
            let mapped = char::from_u32((c as i32 + delta) as u32).unwrap_or(c);
            set.add_char(mapped);
        }
        return;
    }

    // c has exceptions, so there may be multiple simple and/or full case mappings.
    let exception = self.exceptions.get(data.exception_index());

    // Add all simple case mappings.
    let simple_slots = [
        ExceptionSlot::Lower,
        ExceptionSlot::Fold,
        ExceptionSlot::Upper,
        ExceptionSlot::Title,
    ];
    for slot in simple_slots {
        if let Some(simple) = exception.get_char_slot(slot) {
            set.add_char(simple);
        }
    }
    if let Some(simple) = exception.get_simple_case_slot_for(c) {
        set.add_char(simple);
    }

    exception.add_full_and_closure_mappings(set);
}
/// Maps the string to single code points and adds the associated case closure
/// mappings.
///
/// Returns `true` if `s` was found in `unfold_data`; in that case every
/// character of the associated closure string, plus that character's own case
/// closure, has been added to `set`. Returns `false` for strings with fewer
/// than two characters and for strings without an entry.
///
/// (see docs on CaseMapper::add_string_case_closure_to)
pub(crate) fn add_string_case_closure_to<S: ClosureSink>(
    &self,
    s: &str,
    set: &mut S,
    unfold_data: &CaseMapUnfoldV1,
) -> bool {
    // Only the existence of a second character matters, so stop decoding
    // there instead of counting every character of the string.
    if s.chars().nth(1).is_none() {
        // The string is too short to find any match.
        return false;
    }
    match unfold_data.get(s) {
        Some(closure_string) => {
            for c in closure_string.chars() {
                set.add_char(c);
                self.add_case_closure_to(c, set);
            }
            true
        }
        None => false,
    }
}
}
// An internal representation of locale. Non-Root values of this
// enumeration imply that hard-coded special cases exist for this
// language.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
// Any language without hard-coded special cases.
Root,
// Selected for both `tr` and `az` by `from_langid`.
Turkish,
// `lt`
Lithuanian,
// `el`
Greek,
// `nl`
Dutch,
// `hy`
Armenian,
}
impl CaseMapLocale {
/// Picks the internal locale for a language identifier.
///
/// Only the language subtag is inspected; languages without hard-coded
/// special cases map to `Root`.
pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
use icu_locale_core::subtags::{language, Language};
// Constants so that they can be used as match patterns below.
const TR: Language = language!("tr");
const AZ: Language = language!("az");
const LT: Language = language!("lt");
const EL: Language = language!("el");
const NL: Language = language!("nl");
const HY: Language = language!("hy");
match langid.language {
// Azerbaijani shares the Turkish rules.
TR | AZ => Self::Turkish,
LT => Self::Lithuanian,
EL => Self::Greek,
NL => Self::Dutch,
HY => Self::Armenian,
_ => Self::Root,
}
}
}
/// The outcome of a conditional full case mapping.
pub enum FullMappingResult<'a> {
/// The character maps to nothing; writing this variant emits no output.
Remove,
/// The character maps to a single code point.
CodePoint(char),
/// The character maps to a string.
String(&'a str),
}
impl FullMappingResult<'_> {
    /// Adds the mapped code point or string to `set`; `Remove` adds nothing.
    #[allow(dead_code)]
    fn add_to_set<S: ClosureSink>(&self, set: &mut S) {
        match self {
            Self::Remove => {}
            Self::CodePoint(ch) => set.add_char(*ch),
            Self::String(text) => set.add_string(text),
        }
    }
}
impl Writeable for FullMappingResult<'_> {
    /// Writes the mapped code point or string to `sink`; `Remove` writes nothing.
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        match self {
            Self::Remove => Ok(()),
            Self::CodePoint(ch) => sink.write_char(*ch),
            Self::String(text) => sink.write_str(text),
        }
    }
}
/// The text surrounding the character currently being mapped.
pub(crate) struct ContextIterator<'a> {
// Everything before the current character.
before: &'a str,
// Everything after the current character (the character itself is excluded).
after: &'a str,
}
impl<'a> ContextIterator<'a> {
// Returns a context iterator with the characters before
// and after the character at a given index, given the preceding
// string and the succeeding string including the character itself.
// An empty `char_and_after` yields an empty `after`.
pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
    // Drop the current character (if any) from the front of the second half.
    let after = char_and_after
        .chars()
        .next()
        .and_then(|current| char_and_after.strip_prefix(current))
        .unwrap_or(char_and_after);
    Self { before, after }
}
/// Returns `diacritics` after it has consumed the Greek diacritics found in the following text.
fn add_greek_diacritics(&self, diacritics: GreekDiacritics) -> GreekDiacritics {
    let mut combined = diacritics;
    combined.consume_greek_diacritics(self.after);
    combined
}
/// Forwards the preceding text to `greek_to_me::preceded_by_greek_letter`.
fn preceded_by_greek_letter(&self) -> bool {
greek_to_me::preceded_by_greek_letter(self.before)
}
/// Forwards the preceding text to `greek_to_me::preceding_greek_vowel_diacritics`.
fn preceding_greek_vowel_diacritics(
&self,
) -> Option<GreekCombiningCharacterSequenceDiacritics> {
greek_to_me::preceding_greek_vowel_diacritics(self.before)
}
/// Checks whether the nearest preceding character that is not an `OtherAccent`
/// is soft-dotted. Returns false if there is no such character.
fn preceded_by_soft_dotted(&self, mapping: &CaseMapV1) -> bool {
    let nearest = self
        .before
        .chars()
        .rev()
        .map(|c| mapping.dot_type(c))
        .find(|dot| !matches!(dot, DotType::OtherAccent));
    matches!(nearest, Some(DotType::SoftDotted))
}
/// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between.
///
/// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string
fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>(
    &self,
    mapping: &CaseMapV1,
) -> bool {
    let mut preceding = self.before.chars().rev();
    loop {
        match preceding.next() {
            // Found the I; when required, also make sure something comes before it.
            Some('I') => return !I_MUST_NOT_START_STRING || preceding.next().is_some(),
            // Skip over accents of type OtherAccent.
            Some(other) if mapping.dot_type(other) == DotType::OtherAccent => {}
            // Anything else, or running out of text, means no match.
            _ => return false,
        }
    }
}
/// Checks whether the nearest preceding non-ignorable character is cased.
/// Returns false if every preceding character is ignorable.
fn preceded_by_cased_letter(&self, mapping: &CaseMapV1) -> bool {
    self.before
        .chars()
        .rev()
        .map(|c| mapping.lookup_data(c))
        .find(|data| !data.is_ignorable())
        .map_or(false, |data| data.case_type().is_some())
}
/// Checks whether the nearest following non-ignorable character is cased.
/// Returns false if every following character is ignorable.
fn followed_by_cased_letter(&self, mapping: &CaseMapV1) -> bool {
    self.after
        .chars()
        .map(|c| mapping.lookup_data(c))
        .find(|data| !data.is_ignorable())
        .map_or(false, |data| data.case_type().is_some())
}
/// Checks whether the nearest following character that is not an `OtherAccent`
/// has dot type `Above`. Returns false if there is no such character.
fn followed_by_more_above(&self, mapping: &CaseMapV1) -> bool {
    let nearest = self
        .after
        .chars()
        .map(|c| mapping.dot_type(c))
        .find(|dot| !matches!(dot, DotType::OtherAccent));
    matches!(nearest, Some(DotType::Above))
}
/// Checks whether a combining dot above (U+0307) follows, with only
/// `OtherAccent` characters allowed in between.
fn followed_by_dot_above(&self, mapping: &CaseMapV1) -> bool {
    // Stop at the first character that is either the dot above or not an
    // accent we may skip; the answer is whether that stop was the dot above.
    let stopper = self
        .after
        .chars()
        .find(|&c| c == '\u{307}' || mapping.dot_type(c) != DotType::OtherAccent);
    stopper == Some('\u{307}')
}
/// Checks the preceding and surrounding context of a j or J
/// and returns true if it is preceded by an i or I at the start of the string.
/// If one has an acute accent,
/// both must have the accent for this to return true. No other accents are handled.
fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMapV1) -> bool {
    // Walk backwards over any combining acutes until the i is reached.
    let mut preceding = self.before.chars().rev();
    let mut i_has_acute = false;
    loop {
        match preceding.next() {
            Some('i' | 'I') => break,
            Some('í' | 'Í') => {
                i_has_acute = true;
                break;
            }
            Some(ACUTE) => i_has_acute = true,
            _ => return false,
        }
    }
    if preceding.next().is_some() {
        // not at the beginning of a string, doesn't matter
        return false;
    }

    // We are supposed to check that `j` has no other combining marks aside
    // from potentially an acute accent. Once we hit the first non-combining mark
    // we are done.
    //
    // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
    // however this requires extra data (and is the *only* point in the casemapping algorithm
    // where there is a direct dependency on properties data not mediated by the casemapping data trie).
    //
    // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
    //
    // See https://unicode-org.atlassian.net/browse/ICU-22429
    let mut j_has_acute = false;
    for c in self.after.chars() {
        if c == ACUTE {
            j_has_acute = true;
        } else if matches!(
            mapping.dot_type(c),
            DotType::NoDot | DotType::SoftDotted
        ) {
            // Not a combining character; ccc = 0
            break;
        } else {
            // found combining character, bail
            return false;
        }
    }

    // either both should have an acute accent, or none
    j_has_acute == i_has_acute
}
}