icu_casemap/provider/
data.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! The primary per-codepoint casefolding data
6
7#[cfg(feature = "datagen")]
8use alloc::collections::BTreeMap;
9use core::num::TryFromIntError;
10use icu_collections::codepointtrie::TrieValue;
11use zerovec::ule::{AsULE, RawBytesULE, UleError, ULE};
12
/// Case category of a cased Unicode character.
///
/// The explicit discriminants (`1`, `2`, `3`) are the same values that
/// `CaseType::from_masked_bits` decodes; the value `0` ("uncased") has no
/// variant and is represented as `None` by callers.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
pub enum CaseType {
    /// A lowercase letter (encoded as `1`)
    Lower = 1,
    /// An uppercase letter (encoded as `2`)
    Upper = 2,
    /// A titlecase letter (encoded as `3`)
    Title = 3,
}
32
33impl CaseType {
34    pub(crate) const CASE_MASK: u16 = 0x3;
35
36    // The casetype is stored in the codepoint trie as two bits.
37    // After masking them to get a value between 0 and 3, this
38    // function converts to `CaseType`.
39    //
40    // Returns `None` for uncased
41    #[inline]
42    pub(crate) fn from_masked_bits(b: u16) -> Option<Self> {
43        debug_assert!(b & Self::CASE_MASK == b);
44        match b {
45            0 => None,
46            1 => Some(CaseType::Lower),
47            2 => Some(CaseType::Upper),
48            _ => Some(CaseType::Title),
49        }
50    }
51}
52
/// How a character interacts with accents placed above it.
///
/// This matters for dotted letters such as `i` and `j`, whose dot combines
/// with accents above the letter. The default value is [`DotType::NoDot`].
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
pub enum DotType {
    /// A normal character with combining class 0
    #[default]
    NoDot = 0,
    /// A soft-dotted character with combining class 0
    SoftDotted = 1,
    /// An "above" accent with combining class 230
    Above = 2,
    /// Any other accent character
    OtherAccent = 3,
}
78
79impl DotType {
80    pub(crate) const DOT_MASK: u16 = 0x3;
81
82    // The dot type is stored in either the codepoint trie or the
83    // exception table as two bits.  After shifting and masking them
84    // to get a value between 0 and 3, this function converts to
85    // DotType.
86    #[inline]
87    pub(crate) fn from_masked_bits(b: u16) -> Self {
88        debug_assert!(b & Self::DOT_MASK == b);
89        match b {
90            0 => DotType::NoDot,
91            1 => DotType::SoftDotted,
92            2 => DotType::Above,
93            _ => DotType::OtherAccent,
94        }
95    }
96}
97
/// Identifies a case-mapping operation.
///
/// `CaseMapData::is_relevant_to` treats `Lower` and `Fold` alike, and
/// `Upper` and `Title` alike.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum MappingKind {
    /// Lowercase mapping
    Lower = 0,
    /// Case folding
    Fold = 1,
    /// Uppercase mapping
    Upper = 2,
    /// Titlecase mapping
    Title = 3,
}
105
106/// Case mapping data associated with a single code point
107///
108/// <div class="stab unstable">
109/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
110/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
111/// to be stable, their Rust representation might not be. Use with caution.
112/// </div>
113#[derive(Copy, Clone, Debug, Eq, PartialEq)]
114#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
115#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
116#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
117pub struct CaseMapData {
118    /// Whether this is default-ignoreable
119    pub ignoreable: bool,
120    /// The rest of the case mapping data
121    pub kind: CaseMapDataKind,
122}
123
124/// A subset of case mapping data associated with a single code point
125///
126/// <div class="stab unstable">
127/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
128/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
129/// to be stable, their Rust representation might not be. Use with caution.
130/// </div>
131#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
132#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
133#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
134#[derive(Copy, Clone, Debug, Eq, PartialEq)]
135pub enum CaseMapDataKind {
136    /// This code point is an exception. Provides the case type of its own case
137    /// and the exception index stored in [`CaseMapExceptions`]
138    ///
139    /// [`CaseMapExceptions`]: crate::provider::exceptions::CaseMapExceptions
140    Exception(Option<CaseType>, u16),
141    /// This code point is uncased, and has the following extra data
142    Uncased(NonExceptionData),
143    /// This code point is cased. We store the extra data, its case type, and a *delta*
144    /// that can be used to get its casemapped codepoint.
145    Delta(NonExceptionData, CaseType, i16),
146}
147
148/// Data that is stored in CaseMapData when it is *not* an exception
149///
150/// <div class="stab unstable">
151/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
152/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
153/// to be stable, their Rust representation might not be. Use with caution.
154/// </div>
155#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
156#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
157#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
158#[derive(Copy, Clone, Debug, Eq, PartialEq)]
159pub struct NonExceptionData {
160    /// Whether or not the type is case-sensitive
161    pub sensitive: bool,
162    /// The "dot type"
163    pub dot_type: DotType,
164}
165
166impl CaseMapData {
167    #[inline]
168    pub(crate) fn case_type(self) -> Option<CaseType> {
169        match self.kind {
170            CaseMapDataKind::Exception(case_type, ..) => case_type,
171            CaseMapDataKind::Delta(_, case_type, _) => Some(case_type),
172            CaseMapDataKind::Uncased(..) => None,
173        }
174    }
175
176    #[inline]
177    pub(crate) fn is_upper_or_title(self) -> bool {
178        match self.case_type() {
179            None | Some(CaseType::Lower) => false,
180            Some(CaseType::Upper) | Some(CaseType::Title) => true,
181        }
182    }
183
184    #[inline]
185    pub(crate) fn is_relevant_to(self, kind: MappingKind) -> bool {
186        match kind {
187            MappingKind::Lower | MappingKind::Fold => self.is_upper_or_title(),
188            MappingKind::Upper | MappingKind::Title => self.case_type() == Some(CaseType::Lower),
189        }
190    }
191
192    #[inline]
193    pub(crate) fn is_ignorable(self) -> bool {
194        self.ignoreable
195    }
196
197    #[inline]
198    pub(crate) fn has_exception(self) -> bool {
199        matches!(self.kind, CaseMapDataKind::Exception(..))
200    }
201
202    // Returns true if this code point is case-sensitive.
203    // only in the non-exception case
204    // This is not currently exposed.
205    #[inline]
206    pub(crate) fn is_sensitive(self) -> bool {
207        match self.kind {
208            CaseMapDataKind::Exception(..) => false,
209            CaseMapDataKind::Delta(ned, ..) => ned.sensitive,
210            CaseMapDataKind::Uncased(ned) => ned.sensitive,
211        }
212    }
213
214    #[inline]
215    pub(crate) fn dot_type(self) -> DotType {
216        match self.kind {
217            CaseMapDataKind::Exception(..) => DotType::NoDot,
218            CaseMapDataKind::Delta(ned, ..) => ned.dot_type,
219            CaseMapDataKind::Uncased(ned) => ned.dot_type,
220        }
221    }
222
223    // The delta between this code point and its upper/lowercase equivalent.
224    // This should only be called for codepoints without exception data.
225    //
226    // Returns 0 for uncased types
227    #[inline]
228    pub(crate) fn delta(self) -> i16 {
229        debug_assert!(!self.has_exception());
230        match self.kind {
231            CaseMapDataKind::Exception(..) => 0,
232            CaseMapDataKind::Delta(.., delta) => delta,
233            CaseMapDataKind::Uncased(..) => 0,
234        }
235    }
236
237    // The index of the exception data for this codepoint in the exception
238    // table. This should only be called for codepoints with exception data.
239    #[inline]
240    pub(crate) fn exception_index(self) -> u16 {
241        debug_assert!(self.has_exception());
242        if let CaseMapDataKind::Exception(_, i) = self.kind {
243            i
244        } else {
245            0
246        }
247    }
248
249    // CaseMapExceptionsBuilder moves the full mapping and closure
250    // strings out of the exception table itself. This means that the
251    // exception index for a code point in ICU4X will be different
252    // from the exception index for the same codepoint in ICU4C. Given
253    // a mapping from old to new, this function updates the exception
254    // index if necessary.
255    #[cfg(feature = "datagen")]
256    pub(crate) fn with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self {
257        let kind = if let CaseMapDataKind::Exception(ty, index) = self.kind {
258            if let Some(updated_exception) = updates.get(&index) {
259                CaseMapDataKind::Exception(ty, *updated_exception)
260            } else {
261                self.kind
262            }
263        } else {
264            self.kind
265        };
266
267        Self { kind, ..self }
268    }
269
270    /// Attempt to construct from ICU-format integer
271    #[cfg(any(feature = "datagen", test))]
272    pub(crate) fn try_from_icu_integer(int: u16) -> Result<Self, UleError> {
273        let raw = int.to_unaligned();
274        CaseMapDataULE::validate_bytes(raw.as_bytes())?;
275
276        let this = Self::from_unaligned(CaseMapDataULE(raw));
277        Ok(this)
278    }
279}
280
281impl TrieValue for CaseMapData {
282    type TryFromU32Error = TryFromIntError;
283
284    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
285        u16::try_from(i).map(|u| AsULE::from_unaligned(CaseMapDataULE(u.to_unaligned())))
286    }
287
288    fn to_u32(self) -> u32 {
289        u32::from(self.to_unaligned().0.as_unsigned_int())
290    }
291}
292
293/// Packed casemappingdata type
294///
295/// Data format, copied from ICU4C casepropsbuilder.cpp:
296///
297/// ```text
298/// Trie data word:
299/// Bits
300/// if(exception) {
301///     15..4   unsigned exception index
302/// } else {
303///     if(not uncased) {
304///         15..7   signed delta to simple case mapping code point
305///                 (add delta to input code point)
306///     } else {
307///         15..7   reserved, 0
308///     }
309///      6..5   0 normal character with cc=0
310///             1 soft-dotted character
311///             2 cc=230
312///             3 other cc
313///             The runtime code relies on these two bits to be adjacent with this encoding.
314/// }
315///     4   case-sensitive
316///     3   exception
317///     2   case-ignorable
318///  1..0   0 uncased
319///         1 lowercase
320///         2 uppercase
321///         3 titlecase
322///         The runtime code relies on the case-ignorable and case type bits 2..0
323///         to be the lowest bits with this encoding.
324/// ```
325///
326/// <div class="stab unstable">
327/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
328/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
329/// to be stable, their Rust representation might not be. Use with caution.
330/// </div>
331#[derive(Copy, Clone, Debug, Eq, PartialEq)]
332#[repr(transparent)]
333pub struct CaseMapDataULE(RawBytesULE<2>);
334
335impl CaseMapDataULE {
336    // 1..0 case type
337    const CASE_TYPE_BITS: u16 = 0x3;
338    // 2 case-ignorable
339    const CASE_IGNOREABLE_BIT: u16 = 0x4;
340    // 3 exception
341    const EXCEPTION_BIT: u16 = 0x8;
342    // 4 case-sensitive
343    const CASE_SENSITIVE_BIT: u16 = 0x10;
344    // 15..4 unsigned exception index
345    const EXCEPTION_SHIFT: u16 = 4;
346    // 15..7 signed-delta to simple case mapping code point (or reserved)
347    const DELTA_SHIFT: u16 = 7;
348    // 6..5 dot type
349    const DOT_TYPE_BITS: u16 = 0x60;
350    const DOT_SHIFT: u16 = 5;
351}
352
353/// # Safety
354///
355/// Safety checklist for `ULE`:
356///
357/// 1. The type *must not* include any uninitialized or padding bytes: repr(transparent)
358///    wrapper around ULE type
359/// 2. The type must have an alignment of 1 byte: repr(transparent) wrapper around ULE type
360/// 3. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice
361///    would not represent a valid slice of this type: It does
362/// 4. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice
363///    cannot be used in its entirety (if its length is not a multiple of `size_of::<Self>()`):
364///    it does, due to the RawBytesULE parse call
365/// 5. All other methods *must* be left with their default impl, or else implemented according to
366///    their respective safety guidelines: They have been
367/// 6. The equality invariant is satisfied
368unsafe impl ULE for CaseMapDataULE {
369    fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
370        let sixteens = RawBytesULE::<2>::parse_bytes_to_slice(bytes)?;
371
372        for sixteen in sixteens {
373            let sixteen = sixteen.as_unsigned_int();
374            // The type has reserved bits in the
375            // uncased + not exception case
376            if sixteen & Self::EXCEPTION_BIT == 0 {
377                // not an exception
378                if sixteen & Self::CASE_TYPE_BITS == 0 {
379                    // uncased
380                    if sixteen >> Self::DELTA_SHIFT != 0 {
381                        // We have some used bits in the reserved zone!
382                        return Err(UleError::parse::<Self>());
383                    }
384                }
385            }
386        }
387        Ok(())
388    }
389}
390
391impl AsULE for CaseMapData {
392    type ULE = CaseMapDataULE;
393
394    fn from_unaligned(ule: Self::ULE) -> Self {
395        let sixteen = ule.0.as_unsigned_int();
396
397        let ignoreable = (sixteen & CaseMapDataULE::CASE_IGNOREABLE_BIT) != 0;
398        let exception = (sixteen & CaseMapDataULE::EXCEPTION_BIT) != 0;
399
400        let case_type = sixteen & CaseMapDataULE::CASE_TYPE_BITS;
401        let case_type = CaseType::from_masked_bits(case_type);
402        let kind = if exception {
403            // No need to mask first since the exception bits start at 15
404            let exception = sixteen >> CaseMapDataULE::EXCEPTION_SHIFT;
405            CaseMapDataKind::Exception(case_type, exception)
406        } else {
407            let dot_type = (sixteen & CaseMapDataULE::DOT_TYPE_BITS) >> CaseMapDataULE::DOT_SHIFT;
408            let dot_type = DotType::from_masked_bits(dot_type);
409            let sensitive = (sixteen & CaseMapDataULE::CASE_SENSITIVE_BIT) != 0;
410            let ned = NonExceptionData {
411                dot_type,
412                sensitive,
413            };
414            if let Some(case_type) = case_type {
415                // no need to mask first since the delta bits start at 15
416                // We can also cast as i16 first so we do not have to
417                // sign-extend later
418                let delta = (sixteen as i16) >> CaseMapDataULE::DELTA_SHIFT;
419                CaseMapDataKind::Delta(ned, case_type, delta)
420            } else {
421                CaseMapDataKind::Uncased(ned)
422            }
423        };
424        CaseMapData { ignoreable, kind }
425    }
426
427    fn to_unaligned(self) -> Self::ULE {
428        let mut sixteen = 0;
429        if self.ignoreable {
430            sixteen |= CaseMapDataULE::CASE_IGNOREABLE_BIT;
431        }
432        match self.kind {
433            CaseMapDataKind::Exception(case_type, e) => {
434                sixteen |= CaseMapDataULE::EXCEPTION_BIT;
435                sixteen |= e << CaseMapDataULE::EXCEPTION_SHIFT;
436                sixteen |= case_type.map(|c| c as u16).unwrap_or(0);
437            }
438            CaseMapDataKind::Uncased(ned) => {
439                sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT;
440                if ned.sensitive {
441                    sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT;
442                }
443                // Remaining bytes are left at zero
444                // case_type is Uncased (0)
445            }
446            CaseMapDataKind::Delta(ned, case_type, delta) => {
447                // First shift (which keeps the signedness), then cast to the
448                // right type
449                sixteen |= (delta << CaseMapDataULE::DELTA_SHIFT) as u16;
450                sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT;
451                if ned.sensitive {
452                    sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT;
453                }
454                sixteen |= case_type as u16;
455            }
456        }
457        CaseMapDataULE(sixteen.to_unaligned())
458    }
459}
460
#[cfg(test)]
mod tests {
    use super::*;

    // Non-exception payload shared by both `Delta` fixtures below.
    const SENSITIVE_SOFT_DOTTED: NonExceptionData = NonExceptionData {
        sensitive: true,
        dot_type: DotType::SoftDotted,
    };

    // Each fixture must survive struct -> ULE -> struct, and also
    // struct -> integer -> validated struct.
    #[test]
    fn test_roundtrip() {
        const TESTCASES: &[CaseMapData] = &[
            CaseMapData {
                ignoreable: true,
                kind: CaseMapDataKind::Exception(Some(CaseType::Title), 923),
            },
            CaseMapData {
                ignoreable: false,
                kind: CaseMapDataKind::Exception(None, 923),
            },
            CaseMapData {
                ignoreable: true,
                kind: CaseMapDataKind::Delta(SENSITIVE_SOFT_DOTTED, CaseType::Upper, 50),
            },
            CaseMapData {
                ignoreable: false,
                kind: CaseMapDataKind::Delta(SENSITIVE_SOFT_DOTTED, CaseType::Upper, -50),
            },
            CaseMapData {
                ignoreable: false,
                kind: CaseMapDataKind::Uncased(NonExceptionData {
                    sensitive: false,
                    dot_type: DotType::SoftDotted,
                }),
            },
        ];

        for &expected in TESTCASES {
            let packed = expected.to_unaligned();
            let unpacked = CaseMapData::from_unaligned(packed);
            assert_eq!(expected, unpacked);

            let word = packed.0.as_unsigned_int();
            let reparsed = CaseMapData::try_from_icu_integer(word).unwrap();
            assert_eq!(expected, reparsed);
        }
    }

    // Each integer must survive integer -> struct -> integer.
    #[test]
    fn test_integer_roundtrip() {
        // Buggy roundtrip cases go here
        const INTEGERS: &[u16] = &[84, 2503];

        for &int in INTEGERS {
            let cmd = CaseMapData::try_from_icu_integer(int).unwrap();
            assert_eq!(int, cmd.to_unaligned().0.as_unsigned_int())
        }
    }
}