icu_codepointtrie_builder/
lib.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! `icu_codepointtrie_builder` is a utility crate of the [`ICU4X`] project.
6//!
7//! This crate exposes functionality to build a [`CodePointTrie`] from values provided at runtime.
8//! Because it is normally expected for [`CodePointTrie`] data to be pre-compiled, this crate is not
9//! optimized for speed; it should be used during a build phase.
10//!
11//! Under the hood, this crate uses the CodePointTrie builder code from ICU4C, [`UMutableCPTrie`].
12//! For more context, see <https://github.com/unicode-org/icu4x/issues/1837>.
13//!
14//! Unlike most of ICU4X, due in large part to the native dependency, this crate is not guaranteed
15//! to be panic-free.
16//!
17//! # Build configuration
18//!
19//! This crate has two primary modes it can be used in: `"wasm"` and `"icu4c"`, exposed as
20//! Cargo features. If both are enabled, the code will internally use the wasm codepath.
21//!
22//! The `"wasm"` mode uses a Wasm module packaged into this Rust crate that contains
23//! pre-compiled ICU4C CodePointTrie builder code. It evaluates the Wasm module using
//! the Wasmi runtime, which "just works", but it requires a large number of
25//! Rust/Cargo dependencies.
26//!
27//! The `"icu4c"` mode reduces the number of Rust dependencies, but it requires having a local copy
28//! of ICU4C available. To configure `"icu4c"` mode in Cargo, set the following environment variables:
29//!
30//! - Set `ICU4C_LIB_PATH` to a directory full of ICU4C static or shared libraries.
31//! - Set `ICU4C_LINK_STATICALLY` to any value to use the static libraries.
32//! - Set `ICU4C_RENAME_VERSION` to the integer ICU4C version if ICU4C has renaming
33//!   enabled. By default, we attempt to link non-renamed symbols.
34//!
35//! If using dynamic linking, at runtime, you may need to set `[DY]LD_LIBRARY_PATH`
36//! to the `ICU4C_LIB_PATH`.
37//!
38//! If _not_ using Cargo, make sure to pass `ICU4C_LIB_PATH` to the linker via `-L`, link against
39//! `icuuc`, `icui18n` and `icudata` via `-l` flags, and set `--cfg icu4c_enable_renaming` if you need
40//! renamed ICU4C symbols.
41//!
42//! # Examples
43//!
44//! ```
45//! use icu::collections::codepointtrie::CodePointTrie;
46//! use icu::collections::codepointtrie::TrieType;
47//! use icu_codepointtrie_builder::CodePointTrieBuilder;
48//! use icu_codepointtrie_builder::CodePointTrieBuilderData;
49//!
50//! let default_value = 1;
51//! let error_value = 2;
52//! let values_by_code_point = &[3, 4, 5, 6];
53//!
54//! let cpt: CodePointTrie<u8> = CodePointTrieBuilder {
55//!     data: CodePointTrieBuilderData::ValuesByCodePoint(values_by_code_point),
56//!     default_value,
57//!     error_value,
58//!     trie_type: TrieType::Small,
59//! }
60//! .build();
61//!
62//! assert_eq!(cpt.get32(0), 3);
63//! assert_eq!(cpt.get32(1), 4);
64//! assert_eq!(cpt.get32(2), 5);
65//! assert_eq!(cpt.get32(3), 6);
66//! assert_eq!(cpt.get32(4), 1); // default value
67//! assert_eq!(cpt.get32(u32::MAX), 2); // error value
68//! ```
69//!
70//! [`ICU4X`]: ../icu/index.html
71//! [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
//! [`UMutableCPTrie`]: https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/umutablecptrie_8h.html#ad8945cf34ca9d40596a66a1395baa19b
73
74#![cfg_attr(
75    not(test),
76    deny(
77        // The crate is documented to allow panics.
78        // clippy::indexing_slicing,
79        // clippy::unwrap_used,
80        // clippy::expect_used,
81        // clippy::panic,
82        clippy::exhaustive_structs,
83        clippy::exhaustive_enums, clippy::trivially_copy_pass_by_ref,
84        missing_debug_implementations,
85    )
86)]
87
88use icu_collections::codepointtrie::TrieType;
89use icu_collections::codepointtrie::TrieValue;
90
91#[cfg(any(feature = "wasm", feature = "icu4c"))]
92mod common;
93
94#[cfg(feature = "wasm")]
95mod wasm;
96
97#[cfg(feature = "icu4c")]
98mod native;
99
/// Input data to be encoded into a [`CodePointTrie`].
///
/// Only one representation exists today; the enum is `#[non_exhaustive]` so that
/// further variants may be added in the future.
///
/// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
#[derive(Debug)]
#[non_exhaustive]
pub enum CodePointTrieBuilderData<'a, T> {
    /// Values indexed by code point, starting from code point 0: the element at
    /// index `i` is the value for code point `i`.
    ///
    /// Index 0 therefore sets the value for U+0000 (NUL), and the value for
    /// U+0020 (space) belongs at index 32.
    ValuesByCodePoint(&'a [T]),
}
114
115/// Settings for building a [`CodePointTrie`].
116///
117/// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
118#[allow(clippy::exhaustive_structs)]
119#[derive(Debug)]
120pub struct CodePointTrieBuilder<'a, T> {
121    /// The data to be encoded.
122    pub data: CodePointTrieBuilderData<'a, T>,
123    /// The default value for code points not specified in the data.
124    pub default_value: T,
125    /// The error value for invalid code points.
126    pub error_value: T,
127    /// The [`TrieType`]: fast or small.
128    pub trie_type: TrieType,
129}
130
131impl<T> CodePointTrieBuilder<'_, T>
132where
133    T: TrieValue,
134{
135    /// Build the [`CodePointTrie`].
136    ///
137    /// Under the hood, this function runs ICU4C code compiled into WASM,
138    /// or links natively to ICU4C as specified by the `ICU4C_LIB_PATH` env var
139    ///
140    /// ✨ *Enabled with either the `wasm` or the `icu4c` Cargo feature.*
141    ///
142    /// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
143    #[cfg(any(feature = "wasm", feature = "icu4c"))]
144    pub fn build(self) -> icu_collections::codepointtrie::CodePointTrie<'static, T> {
145        #[cfg(feature = "wasm")]
146        {
147            wasm::run_wasmi_ucptrie_wrap(&self)
148        }
149
150        #[cfg(all(feature = "icu4c", not(feature = "wasm")))]
151        {
152            native::run_native(&self)
153        }
154    }
155}
156
/// End-to-end check that a trie built from per-code-point values returns those
/// values, the default value for unspecified code points, and the error value
/// for out-of-range input.
#[test]
#[cfg(any(feature = "wasm", feature = "icu4c"))]
fn test_cpt_builder() {
    // Input layout, by code point:
    //   0..100       -> buckets of ten (value = code point / 10)
    //   100..0x100   -> explicit copies of the default value (100)
    //   0x100..0x200 -> heterogeneous values (the last hex digit of the code point)
    let values: Vec<u32> = (0..100)
        .map(|x| x / 10)
        .chain((100..0x100).map(|_| 100))
        .chain((0x100..0x200).map(|x| x % 16))
        .collect();

    let builder = CodePointTrieBuilder {
        data: CodePointTrieBuilderData::ValuesByCodePoint(&values),
        default_value: 100,
        error_value: 0xFFFF,
        trie_type: TrieType::Fast,
    };

    let cpt = builder.build();

    // Buckets of ten.
    assert_eq!(cpt.get32(0), 0);
    assert_eq!(cpt.get32(10), 1);
    assert_eq!(cpt.get32(20), 2);
    assert_eq!(cpt.get32(21), 2);
    assert_eq!(cpt.get32(99), 9);

    // Explicitly supplied run equal to the default value, checked at both ends.
    assert_eq!(cpt.get32(100), 100);
    assert_eq!(cpt.get32(0xFF), 100);

    // "Last hex digit" range, including its first and last code points.
    assert_eq!(cpt.get32(0x100), 0x0);
    assert_eq!(cpt.get32(0x101), 0x1);
    assert_eq!(cpt.get32(0x102), 0x2);
    assert_eq!(cpt.get32(0x105), 0x5);
    assert_eq!(cpt.get32(0x125), 0x5);
    assert_eq!(cpt.get32(0x135), 0x5);
    assert_eq!(cpt.get32(0x13F), 0xF);
    assert_eq!(cpt.get32(0x1FF), 0xF);

    // Default value: the first code point past the supplied data, and one further out.
    assert_eq!(cpt.get32(0x200), 100);
    assert_eq!(cpt.get32(0x300), 100);

    // Error value: anything that is not a valid code point.
    assert_eq!(cpt.get32(u32::MAX), 0xFFFF);
}