icu_casemap/provider/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14
15// Provider structs must be stable
16#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18use icu_provider::prelude::*;
19
20use crate::provider::data::CaseMapData;
21use crate::provider::exceptions::CaseMapExceptions;
22use icu_collections::codepointtrie::CodePointTrie;
23#[cfg(feature = "datagen")]
24use icu_collections::codepointtrie::CodePointTrieHeader;
25
26pub mod data;
27pub mod exception_helpers;
28pub mod exceptions;
29#[cfg(feature = "datagen")]
30mod exceptions_builder;
31mod unfold;
32
/// Baked data provider type, available when the `compiled_data` feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
43
// Attaches the baked-data implementations to `Baked`. Everything lives inside
// an anonymous `const` so that the glob import and the helper `icu` module do
// not leak into the surrounding module.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
    use icu_casemap_data::*;
    // Local aliases for this crate and `icu_collections`.
    // NOTE(review): presumably the macros below expand to `icu::casemap` /
    // `icu::collections` paths — confirm against `icu_casemap_data`.
    pub mod icu {
        pub use icu_collections as collections;
        pub use crate as casemap;
    }
    make_provider!(Baked);
    impl_case_map_v1!(Baked);
    impl_case_map_unfold_v1!(Baked);
};
56
57icu_provider::data_marker!(
58    /// Marker for casemapping data.
59    CaseMapV1,
60    "case/map/v1",
61    CaseMap<'static>,
62    is_singleton = true
63);
64
65icu_provider::data_marker!(
66    /// Reverse case mapping data.
67    CaseMapUnfoldV1,
68    "case/map/unfold/v1",
69    CaseMapUnfold<'static>,
70    is_singleton = true
71);
72
/// The latest minimum set of markers required by this component:
/// [`CaseMapUnfoldV1`] and [`CaseMapV1`].
#[cfg(feature = "datagen")]
pub const MARKERS: &[DataMarkerInfo] = &[CaseMapUnfoldV1::INFO, CaseMapV1::INFO];
76
77pub use self::unfold::CaseMapUnfold;
78
79/// This type contains all of the casemapping data
80///
81/// The methods in the provider module are primarily about accessing its data,
82/// however the full algorithms are also implemented as methods on this type in
83/// the `internals` module of this crate.
84///
85/// <div class="stab unstable">
86/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
87/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
88/// to be stable, their Rust representation might not be. Use with caution.
89/// </div>
90#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
91#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
92#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider))]
93#[yoke(prove_covariance_manually)]
94/// CaseMapper provides low-level access to the data necessary to
95/// convert characters and strings to upper, lower, or title case.
96pub struct CaseMap<'data> {
97    /// Case mapping data
98    pub trie: CodePointTrie<'data, CaseMapData>,
99    /// Exceptions to the case mapping data
100    pub exceptions: CaseMapExceptions<'data>,
101}
102
103icu_provider::data_struct!(
104    CaseMap<'_>,
105    #[cfg(feature = "datagen")]
106);
107
/// Deserializes a [`CaseMap`] field-by-field and, in debug builds only,
/// asserts that the resulting data passes `CaseMap::validate`.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for CaseMap<'de> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Helper mirroring the fields of `CaseMap`, so the derived
        // implementation performs the actual field-by-field deserialization.
        #[derive(serde::Deserialize)]
        struct Raw<'a> {
            #[serde(borrow)]
            trie: CodePointTrie<'a, CaseMapData>,
            #[serde(borrow)]
            exceptions: CaseMapExceptions<'a>,
        }

        let raw = <Raw as serde::Deserialize>::deserialize(deserializer)?;
        let case_map = CaseMap {
            trie: raw.trie,
            exceptions: raw.exceptions,
        };
        // Consistency check is compiled out of release builds.
        debug_assert!(case_map.validate().is_ok());
        Ok(case_map)
    }
}
125
126impl CaseMap<'_> {
127    /// Creates a new CaseMap using data exported by the
128    // `icuexportdata` tool in ICU4C. Validates that the data is
129    // consistent.
130    #[cfg(feature = "datagen")]
131    pub fn try_from_icu(
132        trie_header: CodePointTrieHeader,
133        trie_index: &[u16],
134        trie_data: &[u16],
135        exceptions: &[u16],
136    ) -> Result<Self, DataError> {
137        use self::exceptions_builder::CaseMapExceptionsBuilder;
138        use zerovec::ZeroVec;
139        let exceptions_builder = CaseMapExceptionsBuilder::new(exceptions);
140        let (exceptions, idx_map) = exceptions_builder.build()?;
141
142        let trie_index = ZeroVec::alloc_from_slice(trie_index);
143
144        #[allow(clippy::unwrap_used)] // datagen only
145        let trie_data = trie_data
146            .iter()
147            .map(|&i| {
148                CaseMapData::try_from_icu_integer(i)
149                    .unwrap()
150                    .with_updated_exception(&idx_map)
151            })
152            .collect::<ZeroVec<_>>();
153
154        let trie = CodePointTrie::try_new(trie_header, trie_index, trie_data)
155            .map_err(|_| DataError::custom("Casemapping data does not form valid trie"))?;
156
157        let result = Self { trie, exceptions };
158        result.validate().map_err(DataError::custom)?;
159        Ok(result)
160    }
161
162    /// Given an existing CaseMapper, validates that the data is
163    /// consistent. A CaseMapper created by the ICU transformer has
164    /// already been validated. Calling this function is only
165    /// necessary if you are concerned about data corruption after
166    /// deserializing.
167    #[cfg(any(feature = "serde", feature = "datagen"))]
168    #[allow(unused)] // is only used in debug mode for serde
169    pub(crate) fn validate(&self) -> Result<(), &'static str> {
170        // First, validate that exception data is well-formed.
171        let valid_exception_indices = self.exceptions.validate()?;
172
173        let validate_delta = |c: char, delta: i32| -> Result<(), &'static str> {
174            let new_c =
175                u32::try_from(c as i32 + delta).map_err(|_| "Delta larger than character")?;
176            char::from_u32(new_c).ok_or("Invalid delta")?;
177            Ok(())
178        };
179
180        for i in 0..char::MAX as u32 {
181            if let Some(c) = char::from_u32(i) {
182                let data = self.lookup_data(c);
183                if data.has_exception() {
184                    let idx = data.exception_index();
185                    let exception = self.exceptions.get(idx);
186                    // Verify that the exception index points to a valid exception header.
187                    if !valid_exception_indices.contains(&idx) {
188                        return Err("Invalid exception index in trie data");
189                    }
190                    exception.validate()?;
191                } else {
192                    validate_delta(c, data.delta() as i32)?;
193                }
194            }
195        }
196        Ok(())
197    }
198
199    pub(crate) fn lookup_data(&self, c: char) -> CaseMapData {
200        self.trie.get32(c as u32)
201    }
202}