// icu_casemap/provider/exceptions_builder.rs
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::provider::exception_helpers::{
    ExceptionBits, ExceptionBitsULE, ExceptionSlot, SlotPresence,
};
use crate::provider::exceptions::{CaseMapExceptions, DecodedException};
use alloc::borrow::Cow;
use alloc::collections::BTreeMap;
use alloc::string::String;
use alloc::vec::Vec;
use icu_provider::DataError;
use zerovec::ule::{AsULE, ULE};

/// The header for exception types as found in ICU4C data. See [`ExceptionHeaderULE`]
/// for the wire format.
///
/// This is the decoded (aligned) counterpart of [`ExceptionHeaderULE`]: the slot flags are
/// kept as a bitmask, while the remaining header flags are expanded into [`ExceptionBits`].
#[derive(Copy, Clone, PartialEq, Eq)]
pub struct ExceptionHeader {
    /// The various slots that are present, masked by ExceptionSlot
    ///
    /// We still store this as a bitmask since it's more convenient to access as one
    pub slot_presence: SlotPresence,
    /// The remaining header flags, decoded from the bits above the slot flags
    /// (bits 8..15 of the ICU4C-format `u16`).
    pub bits: ExceptionBits,
}

impl ExceptionHeader {
    /// Decodes a header from its ICU4C representation, a single `u16`.
    ///
    /// The bits selected by `ExceptionHeaderULE::SLOTS_MASK` become the slot presence
    /// bitmask; the bits above `ExceptionHeaderULE::BITS_SHIFT` are decoded into
    /// [`ExceptionBits`].
    pub(crate) fn from_integer(int: u16) -> Self {
        let raw_slots = int & ExceptionHeaderULE::SLOTS_MASK;
        let raw_bits = int >> ExceptionHeaderULE::BITS_SHIFT;
        Self {
            // Both halves fit in a byte; the zero fallback only exists to avoid a panic path.
            slot_presence: SlotPresence(u8::try_from(raw_slots).unwrap_or_default()),
            bits: ExceptionBits::from_integer(u8::try_from(raw_bits).unwrap_or_default()),
        }
    }

    /// Returns true if the given slot exists for this exception.
    pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool {
        self.slot_presence.has_slot(slot)
    }
}

/// Packed exception header (format from icu4c, documented in casepropsbuilder.cpp)
///
/// ```text
///       Bits:
///         0..7  Flag bits indicating which optional slots are present (if any):
///               0: Lowercase mapping (code point)
///               1: Case folding (code point)
///               2: Uppercase mapping (code point)
///               3: Titlecase mapping (code point)
///               4: Delta to simple case mapping (code point) (sign stored separately)
///               5: RESERVED
///               6: Closure mappings (string; see below)
///               7: Full mappings (strings; see below)
///            8  Double-width slots. If set, then each optional slot is stored as two
///               elements of the array (high and low halves of 32-bit values) instead of
///               a single element.
///            9  Has no simple case folding, even if there is a simple lowercase mapping
///           10  The value in the delta slot is negative
///           11  Is case-sensitive (not exposed)
///       12..13  Dot type
///           14  Has conditional special casing
///           15  Has conditional case folding
/// ```
///
/// In this struct the RESERVED bit is still allowed to be set, and it will produce a different
/// exception header, but it will not have any other effects.
#[derive(Copy, Clone, PartialEq, Eq, ULE)]
#[repr(C, packed)]
pub struct ExceptionHeaderULE {
    /// Flags for which optional slots are present (bits 0..7 in the table above).
    slot_presence: SlotPresence,
    /// The remaining header flags (bits 8..15 in the table above), in their packed form.
    bits: ExceptionBitsULE,
}

impl ExceptionHeaderULE {
    /// Mask selecting the slot presence flags (the low byte) of an ICU4C-format `u16` header.
    const SLOTS_MASK: u16 = 0xff;
    /// Right shift that moves the remaining header flags (the high byte) of an ICU4C-format
    /// `u16` header down into the low byte.
    const BITS_SHIFT: u16 = 8;
}

impl AsULE for ExceptionHeader {
    type ULE = ExceptionHeaderULE;

    /// Expands the packed header: the slot bitmask is carried over unchanged and the packed
    /// flag byte is decoded into [`ExceptionBits`].
    fn from_unaligned(u: ExceptionHeaderULE) -> Self {
        let ExceptionHeaderULE {
            slot_presence,
            bits: packed_bits,
        } = u;
        let bits = ExceptionBits::from_integer(packed_bits.0);
        Self {
            slot_presence,
            bits,
        }
    }

    /// Packs the header: the slot bitmask is carried over unchanged and the flags are
    /// re-encoded into a single byte.
    fn to_unaligned(self) -> ExceptionHeaderULE {
        let Self {
            slot_presence,
            bits,
        } = self;
        let packed_bits = ExceptionBitsULE(bits.to_integer());
        ExceptionHeaderULE {
            slot_presence,
            bits: packed_bits,
        }
    }
}
/// CaseMapExceptionsBuilder consumes the exceptions data produced by
/// casepropsbuilder.cpp in ICU4C. It generates an instance of CaseMapExceptions. The
/// primary difference is that the ICU4C representation stores full mapping and closure
/// strings inline in the data, while CaseMapExceptions uses a side table. As a result,
/// the starting index of each exception in the resulting CaseMapExceptions may have
/// changed, so we also produce a map from old indices to new indices that will be used to
/// update the data stored in the code point trie.
pub struct CaseMapExceptionsBuilder<'a> {
    /// The ICU4C exception data being consumed, as a sequence of 16-bit elements.
    raw_data: &'a [u16],
    /// Index into `raw_data` of the next element to read.
    raw_data_idx: usize,
    /// Whether the slots of the exception currently being read are double width, i.e.
    /// each slot occupies two elements of `raw_data` instead of one.
    double_slots: bool,
}

impl<'a> CaseMapExceptionsBuilder<'a> {
    /// Mask selecting all four mapping lengths (four nibbles) in the full mappings slot.
    const MAPPINGS_ALL_LENGTHS_MASK: u32 = 0xffff;
    /// Mask selecting a single mapping length (one nibble).
    const FULL_MAPPINGS_LENGTH_MASK: u32 = 0xf;
    /// Shift that moves the next mapping length into the lowest nibble.
    const FULL_MAPPINGS_LENGTH_SHIFT: u32 = 4;

    /// Mask selecting the closure length (bits 0..3) in the closure slot.
    const CLOSURE_MAX_LENGTH: u32 = 0xf;

    /// Creates a builder that will read ICU4C exception data from `raw_data`, starting at
    /// its first element.
    pub fn new(raw_data: &'a [u16]) -> Self {
        Self {
            raw_data,
            raw_data_idx: 0,
            double_slots: false,
        }
    }

    /// Returns true once every element of the raw data has been consumed.
    fn done(&self) -> bool {
        self.raw_data_idx >= self.raw_data.len()
    }

    /// Reads the next 16-bit element and advances past it.
    ///
    /// Returns an error if the raw data has been exhausted.
    fn read_raw(&mut self) -> Result<u16, DataError> {
        let value = *self
            .raw_data
            .get(self.raw_data_idx)
            .ok_or(DataError::custom("Incomplete exception data"))?;
        self.raw_data_idx += 1;
        Ok(value)
    }

    /// Reads one slot value, honoring the width of the current exception's slots.
    ///
    /// Double-width slots are stored as two elements: the high half followed by the low half.
    fn read_slot(&mut self) -> Result<u32, DataError> {
        if self.double_slots {
            let hi = u32::from(self.read_raw()?);
            let lo = u32::from(self.read_raw()?);
            Ok(hi << 16 | lo)
        } else {
            Ok(u32::from(self.read_raw()?))
        }
    }

    /// After reading a string out of the raw data, advance raw_data_idx by the number of
    /// UTF-16 code units that string occupies.
    fn skip_string(&mut self, s: &str) {
        self.raw_data_idx += s.chars().map(char::len_utf16).sum::<usize>();
    }

    /// Consumes the builder, converting the ICU4C exception data into `CaseMapExceptions`.
    ///
    /// Returns the converted exceptions together with a map from the index of each exception
    /// in the ICU4C data to its index in the returned exceptions.
    ///
    /// # Errors
    ///
    /// Returns a [`DataError`] if the data ends in the middle of an exception or one of its
    /// strings, if a code point slot does not hold a valid `char`, if a string is not valid
    /// UTF-16, or if an old or new exception index does not fit in a `u16`.
    pub(crate) fn build(
        mut self,
    ) -> Result<(CaseMapExceptions<'static>, BTreeMap<u16, u16>), DataError> {
        let mut exceptions = Vec::new();
        let mut idx_map = BTreeMap::new();
        // The format of the raw data from ICU4C is the same as the format described in
        // exceptions.rs, with the exception of full mapping and closure strings. The
        // header and non-string slots can be copied over without modification. For string
        // slots, we read the length information from the ICU4C slot (described below),
        // read the strings, add the strings to the CaseMapExceptions string table,
        // and write an updated slot value containing the index of the string in the
        // table. In the case of full mappings, we store the index of the lowercase
        // mapping; the remaining mappings are stored at sequential indices.
        //
        // Full mappings: If there is at least one full (string) case mapping, then the
        // lengths of the mappings are encoded as nibbles in the full mappings slot:
        //     Bits:
        //        0..3   Length of lowercase string
        //        4..7   Length of case folding string
        //        8..11  Length of uppercase string
        //        12..15 Length of titlecase string
        // Mappings that do not exist have length 0. The strings themselves are stored in
        // the above order immediately following the last optional slot, encoded as UTF16.
        //
        // Case closure: If the case closure for a code point includes code points that
        // are not included in the simple or full mappings, then bits 0..3 of the closure
        // mappings slot will contain the number of codepoints in the closure string.
        // (Other bits are reserved.) The closure string itself is encoded as UTF16 and
        // stored following the full mappings data (if it exists) or the final optional
        // slot.
        while !self.done() {
            // The index map is keyed by u16, so an old index that does not fit must be
            // reported rather than silently truncated onto an unrelated key.
            let old_idx = u16::try_from(self.raw_data_idx)
                .map_err(|_| DataError::custom("Exception data index does not fit in u16"))?;

            let mut exception = DecodedException::default();

            // Copy header.
            let header = ExceptionHeader::from_integer(self.read_raw()?);
            self.double_slots = header.bits.double_width_slots;

            // Copy unmodified slots.
            for (slot, output) in [
                (ExceptionSlot::Lower, &mut exception.lowercase),
                (ExceptionSlot::Fold, &mut exception.casefold),
                (ExceptionSlot::Upper, &mut exception.uppercase),
                (ExceptionSlot::Title, &mut exception.titlecase),
            ] {
                if header.has_slot(slot) {
                    let value = self.read_slot()?;
                    let ch = char::try_from(value).map_err(|_| {
                        DataError::custom("Found non-char value in casemapping exceptions data")
                    })?;
                    *output = Some(ch);
                }
            }
            if header.has_slot(ExceptionSlot::Delta) {
                let delta = self.read_slot()?;

                exception.simple_case_delta = Some(delta);
            }

            // Read the closure and full mappings slots, if they exist.
            let closure_length = if header.has_slot(ExceptionSlot::Closure) {
                Some((self.read_slot()? & Self::CLOSURE_MAX_LENGTH) as usize)
            } else {
                None
            };
            let mappings_lengths = if header.has_slot(ExceptionSlot::FullMappings) {
                Some(self.read_slot()? & Self::MAPPINGS_ALL_LENGTHS_MASK)
            } else {
                None
            };

            // Copy the full mappings strings into the strings table, if they exist.
            if let Some(mut lengths) = mappings_lengths {
                let mut arr: [Cow<_>; 4] = Default::default();
                for mapping in &mut arr {
                    // Each length is a count of UTF-16 code units, stored in one nibble.
                    let len = (lengths & Self::FULL_MAPPINGS_LENGTH_MASK) as usize;
                    lengths >>= Self::FULL_MAPPINGS_LENGTH_SHIFT;

                    let start = self.raw_data_idx;
                    let end = start + len;
                    let slice = self
                        .raw_data
                        .get(start..end)
                        .ok_or(DataError::custom("Incomplete string data"))?;
                    let string = char::decode_utf16(slice.iter().copied())
                        .collect::<Result<String, _>>()
                        .map_err(|_| DataError::custom("Found non-utf16 exceptions data"))?;
                    self.skip_string(&string);
                    *mapping = string.into();
                }
                exception.full = Some(arr);
            }

            // Copy the closure string into the strings table, if it exists.
            if let Some(len) = closure_length {
                let start = self.raw_data_idx;
                let slice = self
                    .raw_data
                    .get(start..)
                    .ok_or(DataError::custom("Incomplete string data"))?;
                // The closure length counts code points, not code units, so decode lazily
                // and stop after `len` characters.
                let string = char::decode_utf16(slice.iter().copied())
                    .take(len)
                    .collect::<Result<String, _>>()
                    .map_err(|_| DataError::custom("Found non-utf16 exceptions data"))?;
                // `take` stops early without complaint when the data runs out; a short
                // string therefore means the closure data is truncated.
                if string.chars().count() != len {
                    return Err(DataError::custom("Incomplete string data"));
                }
                self.skip_string(&string);
                exception.closure = Some(string.into());
            }

            exception.bits = header.bits;
            // unused bits in ICU4X
            exception.bits.double_width_slots = false;

            let new_exception_index = u16::try_from(exceptions.len())
                .map_err(|_| DataError::custom("More than u16 exceptions"))?;
            idx_map.insert(old_idx, new_exception_index);
            exceptions.push(exception.encode());
        }

        Ok((
            CaseMapExceptions {
                exceptions: (&exceptions).into(),
            },
            idx_map,
        ))
    }
}