icu_codepointtrie_builder/lib.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! `icu_codepointtrie_builder` is a utility crate of the [`ICU4X`] project.
6//!
7//! This crate exposes functionality to build a [`CodePointTrie`] from values provided at runtime.
8//! Because it is normally expected for [`CodePointTrie`] data to be pre-compiled, this crate is not
9//! optimized for speed; it should be used during a build phase.
10//!
11//! Under the hood, this crate uses the CodePointTrie builder code from ICU4C, [`UMutableCPTrie`].
12//! For more context, see <https://github.com/unicode-org/icu4x/issues/1837>.
13//!
14//! Unlike most of ICU4X, due in large part to the native dependency, this crate is not guaranteed
15//! to be panic-free.
16//!
17//! # Build configuration
18//!
19//! This crate has two primary modes it can be used in: `"wasm"` and `"icu4c"`, exposed as
20//! Cargo features. If both are enabled, the code will internally use the wasm codepath.
21//!
22//! The `"wasm"` mode uses a Wasm module packaged into this Rust crate that contains
23//! pre-compiled ICU4C CodePointTrie builder code. It evaluates the Wasm module using
//! the Wasmi runtime, which "just works", but it requires a large number of
25//! Rust/Cargo dependencies.
26//!
27//! The `"icu4c"` mode reduces the number of Rust dependencies, but it requires having a local copy
28//! of ICU4C available. To configure `"icu4c"` mode in Cargo, set the following environment variables:
29//!
30//! - Set `ICU4C_LIB_PATH` to a directory full of ICU4C static or shared libraries.
31//! - Set `ICU4C_LINK_STATICALLY` to any value to use the static libraries.
32//! - Set `ICU4C_RENAME_VERSION` to the integer ICU4C version if ICU4C has renaming
33//! enabled. By default, we attempt to link non-renamed symbols.
34//!
35//! If using dynamic linking, at runtime, you may need to set `[DY]LD_LIBRARY_PATH`
36//! to the `ICU4C_LIB_PATH`.
37//!
38//! If _not_ using Cargo, make sure to pass `ICU4C_LIB_PATH` to the linker via `-L`, link against
39//! `icuuc`, `icui18n` and `icudata` via `-l` flags, and set `--cfg icu4c_enable_renaming` if you need
40//! renamed ICU4C symbols.
41//!
42//! # Examples
43//!
44//! ```
45//! use icu::collections::codepointtrie::CodePointTrie;
46//! use icu::collections::codepointtrie::TrieType;
47//! use icu_codepointtrie_builder::CodePointTrieBuilder;
48//! use icu_codepointtrie_builder::CodePointTrieBuilderData;
49//!
50//! let default_value = 1;
51//! let error_value = 2;
52//! let values_by_code_point = &[3, 4, 5, 6];
53//!
54//! let cpt: CodePointTrie<u8> = CodePointTrieBuilder {
55//! data: CodePointTrieBuilderData::ValuesByCodePoint(values_by_code_point),
56//! default_value,
57//! error_value,
58//! trie_type: TrieType::Small,
59//! }
60//! .build();
61//!
62//! assert_eq!(cpt.get32(0), 3);
63//! assert_eq!(cpt.get32(1), 4);
64//! assert_eq!(cpt.get32(2), 5);
65//! assert_eq!(cpt.get32(3), 6);
66//! assert_eq!(cpt.get32(4), 1); // default value
67//! assert_eq!(cpt.get32(u32::MAX), 2); // error value
68//! ```
69//!
70//! [`ICU4X`]: ../icu/index.html
71//! [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
//! [`UMutableCPTrie`]: https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/umutablecptrie_8h.html#ad8945cf34ca9d40596a66a1395baa19b
73
// Crate-wide lint policy. The `not(test)` condition means these lints are
// denied only in non-test builds.
#![cfg_attr(
    not(test),
    deny(
        // The crate is documented to allow panics.
        // clippy::indexing_slicing,
        // clippy::unwrap_used,
        // clippy::expect_used,
        // clippy::panic,
        clippy::exhaustive_structs,
        clippy::exhaustive_enums,
        clippy::trivially_copy_pass_by_ref,
        missing_debug_implementations,
    )
)]
87
88use icu_collections::codepointtrie::TrieType;
89use icu_collections::codepointtrie::TrieValue;
90
91#[cfg(any(feature = "wasm", feature = "icu4c"))]
92mod common;
93
94#[cfg(feature = "wasm")]
95mod wasm;
96
97#[cfg(feature = "icu4c")]
98mod native;
99
/// Wrapper over the data to be encoded into a [`CodePointTrie`].
///
/// There is currently only one variant, but more may be added in the future;
/// the enum is marked `#[non_exhaustive]` so that adding a variant is not a
/// breaking change for downstream crates.
///
/// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
#[non_exhaustive]
#[derive(Debug)]
pub enum CodePointTrieBuilderData<'a, T> {
    /// A list of values for each code point, starting from code point 0.
    ///
    /// For example, the value for U+0020 (space) should be at index 32 in the slice.
    /// Index 0 sets the value for U+0000 (NUL).
    ValuesByCodePoint(&'a [T]),
}
114
115/// Settings for building a [`CodePointTrie`].
116///
117/// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
118#[allow(clippy::exhaustive_structs)]
119#[derive(Debug)]
120pub struct CodePointTrieBuilder<'a, T> {
121 /// The data to be encoded.
122 pub data: CodePointTrieBuilderData<'a, T>,
123 /// The default value for code points not specified in the data.
124 pub default_value: T,
125 /// The error value for invalid code points.
126 pub error_value: T,
127 /// The [`TrieType`]: fast or small.
128 pub trie_type: TrieType,
129}
130
131impl<T> CodePointTrieBuilder<'_, T>
132where
133 T: TrieValue,
134{
135 /// Build the [`CodePointTrie`].
136 ///
137 /// Under the hood, this function runs ICU4C code compiled into WASM,
138 /// or links natively to ICU4C as specified by the `ICU4C_LIB_PATH` env var
139 ///
140 /// ✨ *Enabled with either the `wasm` or the `icu4c` Cargo feature.*
141 ///
142 /// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
143 #[cfg(any(feature = "wasm", feature = "icu4c"))]
144 pub fn build(self) -> icu_collections::codepointtrie::CodePointTrie<'static, T> {
145 #[cfg(feature = "wasm")]
146 {
147 wasm::run_wasmi_ucptrie_wrap(&self)
148 }
149
150 #[cfg(all(feature = "icu4c", not(feature = "wasm")))]
151 {
152 native::run_native(&self)
153 }
154 }
155}
156
/// End-to-end check of the builder: explicitly set values, the default value
/// for unspecified code points, and the error value for an out-of-range input.
#[test]
#[cfg(any(feature = "wasm", feature = "icu4c"))]
fn test_cpt_builder() {
    // Buckets of ten characters for 0 to 100, and then some default values, and then
    // heterogeneous "last hex digit" for 0x100 to 0x200
    let values: Vec<u32> = (0..100)
        .map(|x| x / 10)
        .chain((100..0x100).map(|_| 100))
        .chain((0x100..0x200).map(|x| x % 16))
        .collect();

    let builder = CodePointTrieBuilder {
        data: CodePointTrieBuilderData::ValuesByCodePoint(&values),
        default_value: 100,
        error_value: 0xFFFF,
        trie_type: TrieType::Fast,
    };

    let cpt = builder.build();

    // buckets of ten
    assert_eq!(cpt.get32(0), 0);
    assert_eq!(cpt.get32(10), 1);
    assert_eq!(cpt.get32(20), 2);
    assert_eq!(cpt.get32(21), 2);
    assert_eq!(cpt.get32(99), 9);
    // explicitly set values that happen to equal the default
    assert_eq!(cpt.get32(100), 100);
    assert_eq!(cpt.get32(0xFF), 100);
    // last hex digit
    assert_eq!(cpt.get32(0x100), 0x0);
    assert_eq!(cpt.get32(0x101), 0x1);
    assert_eq!(cpt.get32(0x102), 0x2);
    assert_eq!(cpt.get32(0x105), 0x5);
    assert_eq!(cpt.get32(0x125), 0x5);
    assert_eq!(cpt.get32(0x135), 0x5);
    assert_eq!(cpt.get32(0x13F), 0xF);
    // last code point covered by the input slice
    assert_eq!(cpt.get32(0x1FF), 0xF);
    // default value (code points past the end of the input slice)
    assert_eq!(cpt.get32(0x200), 100);
    assert_eq!(cpt.get32(0x300), 100);
    // error value (not a valid code point)
    assert_eq!(cpt.get32(u32::MAX), 0xFFFF);
}