icu_properties/code_point_set.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::provider::*;
6use core::ops::RangeInclusive;
7use icu_collections::codepointinvlist::CodePointInversionList;
8use icu_provider::marker::ErasedMarker;
9use icu_provider::prelude::*;
10
/// A set of Unicode code points. Access its data via the borrowed version,
/// [`CodePointSetDataBorrowed`].
///
/// This is the owned form: it holds its data payload and hands out a queryable
/// view through [`CodePointSetData::as_borrowed()`].
///
/// # Example
/// ```rust
/// use icu::properties::CodePointSetData;
/// use icu::properties::props::Alphabetic;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
///
/// assert!(!alphabetic.contains('3'));
/// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
/// assert!(alphabetic.contains('A'));
/// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
/// ```
#[derive(Debug)]
pub struct CodePointSetData {
    // Payload stored under an erased marker so that sets loaded for any
    // property marker share this one concrete type (see `from_data`).
    data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
}
30
31impl CodePointSetData {
32 /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
33 ///
34 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
35 ///
36 /// [📚 Help choosing a constructor](icu_provider::constructors)
37 #[allow(clippy::new_ret_no_self)]
38 #[cfg(feature = "compiled_data")]
39 pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
40 CodePointSetDataBorrowed::new::<P>()
41 }
42
43 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44 pub fn try_new_unstable<P: BinaryProperty>(
45 provider: &(impl DataProvider<P::DataMarker> + ?Sized),
46 ) -> Result<CodePointSetData, DataError> {
47 Ok(CodePointSetData::from_data(
48 provider.load(Default::default())?.payload,
49 ))
50 }
51
52 /// Construct a borrowed version of this type that can be queried.
53 ///
54 /// This owned version if returned by functions that use a runtime data provider.
55 #[inline]
56 pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
57 CodePointSetDataBorrowed {
58 set: self.data.get(),
59 }
60 }
61
62 /// Construct a new one from loaded data
63 ///
64 /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead
65 pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
66 where
67 M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
68 {
69 Self { data: data.cast() }
70 }
71
72 /// Construct a new owned [`CodePointInversionList`]
73 pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
74 let set = PropertyCodePointSet::from_code_point_inversion_list(set);
75 CodePointSetData::from_data(
76 DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
77 )
78 }
79
80 /// Convert this type to a [`CodePointInversionList`] as a borrowed value.
81 ///
82 /// The data backing this is extensible and supports multiple implementations.
83 /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
84 /// added, and users may select which at data generation time.
85 ///
86 /// This method returns an `Option` in order to return `None` when the backing data provider
87 /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
88 /// constraint.
89 pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
90 self.data.get().as_code_point_inversion_list()
91 }
92
93 /// Convert this type to a [`CodePointInversionList`], borrowing if possible,
94 /// otherwise allocating a new [`CodePointInversionList`].
95 ///
96 /// The data backing this is extensible and supports multiple implementations.
97 /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
98 /// added, and users may select which at data generation time.
99 ///
100 /// The performance of the conversion to this specific return type will vary
101 /// depending on the data structure that is backing `self`.
102 pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
103 self.data.get().to_code_point_inversion_list()
104 }
105}
106
/// A borrowed wrapper around code point set data, returned by
/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
///
/// The type is `Copy`, so its query methods take `self` by value.
#[derive(Clone, Copy, Debug)]
pub struct CodePointSetDataBorrowed<'a> {
    // Reference to the underlying set; every query method delegates to it.
    set: &'a PropertyCodePointSet<'a>,
}
113
114impl CodePointSetDataBorrowed<'static> {
115 /// Creates a new [`CodePointSetData`] for a [`BinaryProperty`].
116 ///
117 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
118 ///
119 /// [📚 Help choosing a constructor](icu_provider::constructors)
120 #[inline]
121 #[cfg(feature = "compiled_data")]
122 pub const fn new<P: BinaryProperty>() -> Self {
123 CodePointSetDataBorrowed { set: P::SINGLETON }
124 }
125 /// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
126 ///
127 /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
128 /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
129 pub const fn static_to_owned(self) -> CodePointSetData {
130 CodePointSetData {
131 data: DataPayload::from_static_ref(self.set),
132 }
133 }
134}
135
136impl<'a> CodePointSetDataBorrowed<'a> {
137 /// Check if the set contains a character
138 ///
139 /// ```rust
140 /// use icu::properties::CodePointSetData;
141 /// use icu::properties::props::Alphabetic;
142 ///
143 /// let alphabetic = CodePointSetData::new::<Alphabetic>();
144 ///
145 /// assert!(!alphabetic.contains('3'));
146 /// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
147 /// assert!(alphabetic.contains('A'));
148 /// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
149 /// ```
150 #[inline]
151 pub fn contains(self, ch: char) -> bool {
152 self.set.contains(ch)
153 }
154
155 /// See [`Self::contains`].
156 #[inline]
157 pub fn contains32(self, ch: u32) -> bool {
158 self.set.contains32(ch)
159 }
160
161 // Yields an [`Iterator`] returning the ranges of the code points that are
162 /// included in the [`CodePointSetData`]
163 ///
164 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
165 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
166 /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
167 ///
168 /// # Example
169 ///
170 /// ```
171 /// use icu::properties::props::Alphabetic;
172 /// use icu::properties::CodePointSetData;
173 ///
174 /// let alphabetic = CodePointSetData::new::<Alphabetic>();
175 /// let mut ranges = alphabetic.iter_ranges();
176 ///
177 /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
178 /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
179 /// ```
180 #[inline]
181 pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
182 self.set.iter_ranges()
183 }
184
185 // Yields an [`Iterator`] returning the ranges of the code points that are
186 /// *not* included in the [`CodePointSetData`]
187 ///
188 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
189 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
190 /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
191 ///
192 /// # Example
193 ///
194 /// ```
195 /// use icu::properties::props::Alphabetic;
196 /// use icu::properties::CodePointSetData;
197 ///
198 /// let alphabetic = CodePointSetData::new::<Alphabetic>();
199 /// let mut ranges = alphabetic.iter_ranges();
200 ///
201 /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
202 /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
203 /// ```
204 #[inline]
205 pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
206 self.set.iter_ranges_complemented()
207 }
208}
209
210/// A binary Unicode character property.
211///
212/// The descriptions of most properties are taken from [`TR44`], the documentation for the
213/// Unicode Character Database. Some properties are instead defined in [`TR18`], the
214/// documentation for Unicode regular expressions. In particular, Annex C of this document
215/// defines properties for POSIX compatibility.
216///
217/// <div class="stab unstable">
218/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
219/// trait, please consider using a type from the implementors listed below.
220/// </div>
221///
222/// [`TR44`]: https://www.unicode.org/reports/tr44
223/// [`TR18`]: https://www.unicode.org/reports/tr18
224pub trait BinaryProperty: crate::private::Sealed + Sized {
225 #[doc(hidden)]
226 type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
227 #[doc(hidden)]
228 #[cfg(feature = "compiled_data")]
229 const SINGLETON: &'static PropertyCodePointSet<'static>;
230 /// The name of this property
231 const NAME: &'static [u8];
232 /// The abbreviated name of this property, if it exists, otherwise the name
233 const SHORT_NAME: &'static [u8];
234
235 /// Convenience method for `CodePointSetData::new().contains(ch)`
236 ///
237 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
238 #[cfg(feature = "compiled_data")]
239 fn for_char(ch: char) -> bool {
240 CodePointSetData::new::<Self>().contains(ch)
241 }
242}
243
#[cfg(test)]
mod tests {
    /// The `Number` general-category group contains digits from several scripts
    /// and does not contain a Latin letter.
    #[test]
    fn test_general_category() {
        use icu::properties::props::GeneralCategory;
        use icu::properties::props::GeneralCategoryGroup;
        use icu::properties::CodePointMapData;

        let digits_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value_group(GeneralCategoryGroup::Number);
        let digits = digits_data.as_borrowed();

        assert!(digits.contains('5'));
        assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
        assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE

        assert!(!digits.contains('A'));
    }

    /// The set for `Script::Thai` contains Thai letters and digits, but not a Latin
    /// letter or the baht sign.
    #[test]
    fn test_script() {
        use icu::properties::props::Script;
        use icu::properties::CodePointMapData;

        let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
        let thai = thai_data.as_borrowed();

        assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
        assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO

        assert!(!thai.contains('A'));
        assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
    }

    /// Each general-category group must equal the union of the sets of its listed
    /// subcategories.
    #[test]
    fn test_gc_groupings() {
        use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
        use icu::properties::CodePointMapData;
        use icu_collections::codepointinvlist::CodePointInversionListBuilder;

        let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
            let category_set =
                CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
            let category_set = category_set
                .as_code_point_inversion_list()
                .expect("The data should be valid");

            // Rebuild the group by unioning the ranges of every subcategory.
            let mut builder = CodePointInversionListBuilder::new();
            for &subcategory in subcategories {
                let gc_set_data =
                    CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
                let gc_set = gc_set_data.as_borrowed();
                for range in gc_set.iter_ranges() {
                    builder.add_range32(range);
                }
            }
            let combined_set = builder.build();
            // Report the group under test in the failure message itself.
            assert_eq!(
                category_set.get_inversion_list_vec(),
                combined_set.get_inversion_list_vec(),
                "{category:?} {subcategories:?}"
            );
        };

        test_group(
            GeneralCategoryGroup::Letter,
            &[
                GeneralCategory::UppercaseLetter,
                GeneralCategory::LowercaseLetter,
                GeneralCategory::TitlecaseLetter,
                GeneralCategory::ModifierLetter,
                GeneralCategory::OtherLetter,
            ],
        );
        test_group(
            GeneralCategoryGroup::Other,
            &[
                GeneralCategory::Control,
                GeneralCategory::Format,
                GeneralCategory::Unassigned,
                GeneralCategory::PrivateUse,
                GeneralCategory::Surrogate,
            ],
        );
        test_group(
            GeneralCategoryGroup::Mark,
            &[
                GeneralCategory::SpacingMark,
                GeneralCategory::EnclosingMark,
                GeneralCategory::NonspacingMark,
            ],
        );
        test_group(
            GeneralCategoryGroup::Number,
            &[
                GeneralCategory::DecimalNumber,
                GeneralCategory::LetterNumber,
                GeneralCategory::OtherNumber,
            ],
        );
        test_group(
            GeneralCategoryGroup::Punctuation,
            &[
                GeneralCategory::ConnectorPunctuation,
                GeneralCategory::DashPunctuation,
                GeneralCategory::ClosePunctuation,
                GeneralCategory::FinalPunctuation,
                GeneralCategory::InitialPunctuation,
                GeneralCategory::OtherPunctuation,
                GeneralCategory::OpenPunctuation,
            ],
        );
        test_group(
            GeneralCategoryGroup::Symbol,
            &[
                GeneralCategory::CurrencySymbol,
                GeneralCategory::ModifierSymbol,
                GeneralCategory::MathSymbol,
                GeneralCategory::OtherSymbol,
            ],
        );
        test_group(
            GeneralCategoryGroup::Separator,
            &[
                GeneralCategory::LineSeparator,
                GeneralCategory::ParagraphSeparator,
                GeneralCategory::SpaceSeparator,
            ],
        );
    }

    /// Surrogate code points are not valid `char`s, so they are queried through
    /// `contains32`.
    #[test]
    fn test_gc_surrogate() {
        use icu::properties::props::GeneralCategory;
        use icu::properties::CodePointMapData;

        let surrogates_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value(GeneralCategory::Surrogate);
        let surrogates = surrogates_data.as_borrowed();

        assert!(surrogates.contains32(0xd800));
        assert!(surrogates.contains32(0xd900));
        assert!(surrogates.contains32(0xdfff));

        assert!(!surrogates.contains('A'));
    }
}