icu_codepointtrie_builder/lib.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! `icu_codepointtrie_builder` is a utility crate of the [`ICU4X`] project.
//!
//! This crate exposes functionality to build a [`CodePointTrie`] from values provided at runtime.
//! Because it is normally expected for [`CodePointTrie`] data to be pre-compiled, this crate is not
//! optimized for speed; it should be used during a build phase.
//!
//! Under the hood, this crate uses the CodePointTrie builder code from ICU4C, [`UMutableCPTrie`].
//! For more context, see <https://github.com/unicode-org/icu4x/issues/1837>.
//!
//! Unlike most of ICU4X, due in large part to the native dependency, this crate is not guaranteed
//! to be panic-free.
//!
//! # Build configuration
//!
//! This crate has two primary modes it can be used in: `"wasm"` and `"icu4c"`, exposed as
//! Cargo features. If both are enabled, the code will internally use the wasm codepath.
//!
//! The `"wasm"` mode uses a Wasm module packaged into this Rust crate that contains
//! pre-compiled ICU4C CodePointTrie builder code. It evaluates the Wasm module using
//! the Wasmer runtime, which "just works", but it requires a large number of
//! Rust/Cargo dependencies.
//!
//! The `"icu4c"` mode reduces the number of Rust dependencies, but it requires having a local copy
//! of ICU4C available. To configure `"icu4c"` mode in Cargo, set the following environment variables:
//!
//! - Set `ICU4C_LIB_PATH` to a directory full of ICU4C static or shared libraries.
//! - Set `ICU4C_LINK_STATICALLY` to any value to use the static libraries.
//! - Set `ICU4C_RENAME_VERSION` to the integer ICU4C version if ICU4C has renaming
//! enabled. By default, we attempt to link non-renamed symbols.
//!
//! If using dynamic linking, at runtime, you may need to set `[DY]LD_LIBRARY_PATH`
//! to the `ICU4C_LIB_PATH`.
//!
//! If _not_ using Cargo, make sure to pass `ICU4C_LIB_PATH` to the linker via `-L`, link against
//! `icuuc`, `icui18n` and `icudata` via `-l` flags, and set `--cfg icu4c_enable_renaming` if you need
//! renamed ICU4C symbols.
//!
//! # Examples
//!
//! ```
//! use icu::collections::codepointtrie::CodePointTrie;
//! use icu::collections::codepointtrie::TrieType;
//! use icu_codepointtrie_builder::CodePointTrieBuilder;
//! use icu_codepointtrie_builder::CodePointTrieBuilderData;
//!
//! let default_value = 1;
//! let error_value = 2;
//! let values_by_code_point = &[3, 4, 5, 6];
//!
//! let cpt: CodePointTrie<u8> = CodePointTrieBuilder {
//! data: CodePointTrieBuilderData::ValuesByCodePoint(values_by_code_point),
//! default_value,
//! error_value,
//! trie_type: TrieType::Small,
//! }
//! .build();
//!
//! assert_eq!(cpt.get32(0), 3);
//! assert_eq!(cpt.get32(1), 4);
//! assert_eq!(cpt.get32(2), 5);
//! assert_eq!(cpt.get32(3), 6);
//! assert_eq!(cpt.get32(4), 1); // default value
//! assert_eq!(cpt.get32(u32::MAX), 2); // error value
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
//! [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
//! [`UMutableCPTrie`]: https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/umutablecptrie_8h.html#ad8945cf34ca9d40596a66a1395baa19b
#![cfg_attr(
not(test),
deny(
// The crate is documented to allow panics.
// clippy::indexing_slicing,
// clippy::unwrap_used,
// clippy::expect_used,
// clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
missing_debug_implementations,
)
)]
use icu_collections::codepointtrie::TrieType;
use icu_collections::codepointtrie::TrieValue;
#[cfg(any(feature = "wasm", feature = "icu4c"))]
mod common;
#[cfg(feature = "wasm")]
mod wasm;
#[cfg(feature = "icu4c")]
mod native;
/// The input data from which a [`CodePointTrie`] is built.
///
/// Only a single variant exists today, but more may be added in the future,
/// which is why the enum is marked `#[non_exhaustive]`.
///
/// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
#[derive(Debug)]
#[non_exhaustive]
pub enum CodePointTrieBuilderData<'a, T> {
    /// One value per code point, in code point order, beginning at U+0000.
    ///
    /// The slice index is the code point: index 0 sets the value for
    /// U+0000 (NUL), and index 32 sets the value for U+0020 (space).
    ValuesByCodePoint(&'a [T]),
}
/// The complete set of inputs for building a [`CodePointTrie`].
///
/// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
#[derive(Debug)]
// Deliberately exhaustive: the crate-level example constructs this type with a
// struct literal, which `#[non_exhaustive]` would not permit from other crates.
#[allow(clippy::exhaustive_structs)]
pub struct CodePointTrieBuilder<'a, T> {
    /// The values to encode into the trie.
    pub data: CodePointTrieBuilderData<'a, T>,
    /// The default value, used for code points that `data` does not specify.
    pub default_value: T,
    /// The error value, used for invalid code points.
    pub error_value: T,
    /// Which [`TrieType`] to build: fast or small.
    pub trie_type: TrieType,
}
impl<T: TrieValue> CodePointTrieBuilder<'_, T> {
    /// Consumes the builder settings and produces the [`CodePointTrie`].
    ///
    /// Under the hood, the work is delegated to ICU4C: either ICU4C code
    /// compiled into Wasm, or a native ICU4C linked as specified by the
    /// `ICU4C_LIB_PATH` environment variable. When both Cargo features are
    /// enabled, the Wasm code path is the one that gets compiled in.
    ///
    /// ✨ *Enabled with either the `wasm` or the `icu4c` Cargo feature.*
    ///
    /// [`CodePointTrie`]: icu_collections::codepointtrie::CodePointTrie
    #[cfg(any(feature = "wasm", feature = "icu4c"))]
    pub fn build(self) -> icu_collections::codepointtrie::CodePointTrie<'static, T> {
        // The two `cfg` predicates below are mutually exclusive, so exactly one
        // block survives and becomes the tail expression of this function.
        #[cfg(all(not(feature = "wasm"), feature = "icu4c"))]
        {
            native::run_native(&self)
        }
        #[cfg(feature = "wasm")]
        {
            wasm::run_wasmi_ucptrie_wrap(&self)
        }
    }
}
#[test]
#[cfg(any(feature = "wasm", feature = "icu4c"))]
fn test_cpt_builder() {
    // Input layout, indexed by code point:
    //   0..100        buckets of ten code points (value = code point / 10)
    //   100..0x100    the constant 100, identical to the default value
    //   0x100..0x200  heterogeneous "last hex digit" values (code point % 16)
    let values: Vec<u32> = (0u32..0x200)
        .map(|code_point| match code_point {
            0..=99 => code_point / 10,
            100..=0xFF => 100,
            _ => code_point % 16,
        })
        .collect();

    let settings = CodePointTrieBuilder {
        data: CodePointTrieBuilderData::ValuesByCodePoint(&values),
        default_value: 100,
        error_value: 0xFFFF,
        trie_type: TrieType::Fast,
    };
    let cpt = settings.build();

    // (code point, expected trie value)
    let expectations: &[(u32, u32)] = &[
        // Buckets of ten.
        (0, 0),
        (10, 1),
        (20, 2),
        (21, 2),
        (99, 9),
        // Last hex digit.
        (0x101, 0x1),
        (0x102, 0x2),
        (0x105, 0x5),
        (0x125, 0x5),
        (0x135, 0x5),
        (0x13F, 0xF),
        // Past the end of the input data: default value.
        (0x300, 100),
    ];
    for &(code_point, expected) in expectations {
        assert_eq!(
            cpt.get32(code_point),
            expected,
            "unexpected value for U+{:04X}",
            code_point
        );
    }
}