icu_properties/
code_point_set.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::provider::*;
6use core::ops::RangeInclusive;
7use icu_collections::codepointinvlist::CodePointInversionList;
8use icu_provider::marker::ErasedMarker;
9use icu_provider::prelude::*;
10
/// A set of Unicode code points. Access its data via the borrowed version,
/// [`CodePointSetDataBorrowed`].
///
/// This is the owned form: it holds a data payload and is queried through
/// [`CodePointSetData::as_borrowed()`]. The compiled-data constructor
/// `CodePointSetData::new` returns the borrowed type directly, which is why
/// `contains` can be called on its result in the example below.
///
/// # Example
/// ```rust
/// use icu::properties::CodePointSetData;
/// use icu::properties::props::Alphabetic;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
///
/// assert!(!alphabetic.contains('3'));
/// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
/// assert!(alphabetic.contains('A'));
/// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
/// ```
#[derive(Debug)]
pub struct CodePointSetData {
    // Marker-erased payload: `from_data` casts any payload whose data struct is
    // `PropertyCodePointSet<'static>` into this single representation.
    data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
}
30
31impl CodePointSetData {
32    /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
33    ///
34    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
35    ///
36    /// [📚 Help choosing a constructor](icu_provider::constructors)
37    #[allow(clippy::new_ret_no_self)]
38    #[cfg(feature = "compiled_data")]
39    pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
40        CodePointSetDataBorrowed::new::<P>()
41    }
42
43    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44    pub fn try_new_unstable<P: BinaryProperty>(
45        provider: &(impl DataProvider<P::DataMarker> + ?Sized),
46    ) -> Result<CodePointSetData, DataError> {
47        Ok(CodePointSetData::from_data(
48            provider.load(Default::default())?.payload,
49        ))
50    }
51
52    /// Construct a borrowed version of this type that can be queried.
53    ///
54    /// This owned version if returned by functions that use a runtime data provider.
55    #[inline]
56    pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
57        CodePointSetDataBorrowed {
58            set: self.data.get(),
59        }
60    }
61
62    /// Construct a new one from loaded data
63    ///
64    /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead
65    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
66    where
67        M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
68    {
69        Self { data: data.cast() }
70    }
71
72    /// Construct a new owned [`CodePointInversionList`]
73    pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
74        let set = PropertyCodePointSet::from_code_point_inversion_list(set);
75        CodePointSetData::from_data(
76            DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
77        )
78    }
79
80    /// Convert this type to a [`CodePointInversionList`] as a borrowed value.
81    ///
82    /// The data backing this is extensible and supports multiple implementations.
83    /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
84    /// added, and users may select which at data generation time.
85    ///
86    /// This method returns an `Option` in order to return `None` when the backing data provider
87    /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
88    /// constraint.
89    pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
90        self.data.get().as_code_point_inversion_list()
91    }
92
93    /// Convert this type to a [`CodePointInversionList`], borrowing if possible,
94    /// otherwise allocating a new [`CodePointInversionList`].
95    ///
96    /// The data backing this is extensible and supports multiple implementations.
97    /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
98    /// added, and users may select which at data generation time.
99    ///
100    /// The performance of the conversion to this specific return type will vary
101    /// depending on the data structure that is backing `self`.
102    pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
103        self.data.get().to_code_point_inversion_list()
104    }
105}
106
/// A borrowed wrapper around code point set data, returned by
/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
///
/// The wrapper is `Copy`, and its query methods take `self` by value.
#[derive(Clone, Copy, Debug)]
pub struct CodePointSetDataBorrowed<'a> {
    // The underlying set; every query on this type is forwarded to it.
    set: &'a PropertyCodePointSet<'a>,
}
113
114impl CodePointSetDataBorrowed<'static> {
115    /// Creates a new [`CodePointSetData`] for a [`BinaryProperty`].
116    ///
117    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
118    ///
119    /// [📚 Help choosing a constructor](icu_provider::constructors)
120    #[inline]
121    #[cfg(feature = "compiled_data")]
122    pub const fn new<P: BinaryProperty>() -> Self {
123        CodePointSetDataBorrowed { set: P::SINGLETON }
124    }
125    /// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
126    ///
127    /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
128    /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
129    pub const fn static_to_owned(self) -> CodePointSetData {
130        CodePointSetData {
131            data: DataPayload::from_static_ref(self.set),
132        }
133    }
134}
135
136impl<'a> CodePointSetDataBorrowed<'a> {
137    /// Check if the set contains a character
138    ///
139    /// ```rust
140    /// use icu::properties::CodePointSetData;
141    /// use icu::properties::props::Alphabetic;
142    ///
143    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
144    ///
145    /// assert!(!alphabetic.contains('3'));
146    /// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
147    /// assert!(alphabetic.contains('A'));
148    /// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
149    /// ```
150    #[inline]
151    pub fn contains(self, ch: char) -> bool {
152        self.set.contains(ch)
153    }
154
155    /// See [`Self::contains`].
156    #[inline]
157    pub fn contains32(self, ch: u32) -> bool {
158        self.set.contains32(ch)
159    }
160
161    // Yields an [`Iterator`] returning the ranges of the code points that are
162    /// included in the [`CodePointSetData`]
163    ///
164    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
165    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
166    /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
167    ///
168    /// # Example
169    ///
170    /// ```
171    /// use icu::properties::props::Alphabetic;
172    /// use icu::properties::CodePointSetData;
173    ///
174    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
175    /// let mut ranges = alphabetic.iter_ranges();
176    ///
177    /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
178    /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
179    /// ```
180    #[inline]
181    pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
182        self.set.iter_ranges()
183    }
184
185    // Yields an [`Iterator`] returning the ranges of the code points that are
186    /// *not* included in the [`CodePointSetData`]
187    ///
188    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
189    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
190    /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
191    ///
192    /// # Example
193    ///
194    /// ```
195    /// use icu::properties::props::Alphabetic;
196    /// use icu::properties::CodePointSetData;
197    ///
198    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
199    /// let mut ranges = alphabetic.iter_ranges();
200    ///
201    /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
202    /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
203    /// ```
204    #[inline]
205    pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
206        self.set.iter_ranges_complemented()
207    }
208}
209
210/// A binary Unicode character property.
211///
212/// The descriptions of most properties are taken from [`TR44`], the documentation for the
213/// Unicode Character Database.  Some properties are instead defined in [`TR18`], the
214/// documentation for Unicode regular expressions. In particular, Annex C of this document
215/// defines properties for POSIX compatibility.
216///
217/// <div class="stab unstable">
218/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
219/// trait, please consider using a type from the implementors listed below.
220/// </div>
221///
222/// [`TR44`]: https://www.unicode.org/reports/tr44
223/// [`TR18`]: https://www.unicode.org/reports/tr18
224pub trait BinaryProperty: crate::private::Sealed + Sized {
225    #[doc(hidden)]
226    type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
227    #[doc(hidden)]
228    #[cfg(feature = "compiled_data")]
229    const SINGLETON: &'static PropertyCodePointSet<'static>;
230    /// The name of this property
231    const NAME: &'static [u8];
232    /// The abbreviated name of this property, if it exists, otherwise the name
233    const SHORT_NAME: &'static [u8];
234
235    /// Convenience method for `CodePointSetData::new().contains(ch)`
236    ///
237    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
238    #[cfg(feature = "compiled_data")]
239    fn for_char(ch: char) -> bool {
240        CodePointSetData::new::<Self>().contains(ch)
241    }
242}
243
#[cfg(test)]
mod tests {
    /// A set built from the `Number` general-category group contains digits
    /// from several scripts and excludes letters.
    #[test]
    fn test_general_category() {
        use icu::properties::props::GeneralCategory;
        use icu::properties::props::GeneralCategoryGroup;
        use icu::properties::CodePointMapData;

        let numbers_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value_group(GeneralCategoryGroup::Number);
        let numbers = numbers_data.as_borrowed();

        assert!(numbers.contains('5'));
        assert!(numbers.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
        assert!(numbers.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE

        assert!(!numbers.contains('A'));
    }

    /// A set built from the `Thai` script value contains Thai letters and digits
    /// but not Latin letters or the baht sign.
    #[test]
    fn test_script() {
        use icu::properties::props::Script;
        use icu::properties::CodePointMapData;

        let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
        let thai = thai_data.as_borrowed();

        assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
        assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO

        assert!(!thai.contains('A'));
        assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
    }

    /// Each general-category group must equal the union of the sets of its
    /// individual subcategories.
    #[test]
    fn test_gc_groupings() {
        use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
        use icu::properties::CodePointMapData;
        use icu_collections::codepointinvlist::CodePointInversionListBuilder;

        let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
            let category_set =
                CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
            let category_set = category_set
                .as_code_point_inversion_list()
                .expect("The data should be valid");

            // Union the ranges of every subcategory into a single inversion list.
            let mut builder = CodePointInversionListBuilder::new();
            for &subcategory in subcategories {
                let gc_set_data =
                    CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
                let gc_set = gc_set_data.as_borrowed();
                for range in gc_set.iter_ranges() {
                    builder.add_range32(range);
                }
            }
            let combined_set = builder.build();
            // Printed so that a failing assertion below can be attributed to a group.
            println!("{category:?} {subcategories:?}");
            assert_eq!(
                category_set.get_inversion_list_vec(),
                combined_set.get_inversion_list_vec()
            );
        };

        test_group(
            GeneralCategoryGroup::Letter,
            &[
                GeneralCategory::UppercaseLetter,
                GeneralCategory::LowercaseLetter,
                GeneralCategory::TitlecaseLetter,
                GeneralCategory::ModifierLetter,
                GeneralCategory::OtherLetter,
            ],
        );
        test_group(
            GeneralCategoryGroup::Other,
            &[
                GeneralCategory::Control,
                GeneralCategory::Format,
                GeneralCategory::Unassigned,
                GeneralCategory::PrivateUse,
                GeneralCategory::Surrogate,
            ],
        );
        test_group(
            GeneralCategoryGroup::Mark,
            &[
                GeneralCategory::SpacingMark,
                GeneralCategory::EnclosingMark,
                GeneralCategory::NonspacingMark,
            ],
        );
        test_group(
            GeneralCategoryGroup::Number,
            &[
                GeneralCategory::DecimalNumber,
                GeneralCategory::LetterNumber,
                GeneralCategory::OtherNumber,
            ],
        );
        test_group(
            GeneralCategoryGroup::Punctuation,
            &[
                GeneralCategory::ConnectorPunctuation,
                GeneralCategory::DashPunctuation,
                GeneralCategory::ClosePunctuation,
                GeneralCategory::FinalPunctuation,
                GeneralCategory::InitialPunctuation,
                GeneralCategory::OtherPunctuation,
                GeneralCategory::OpenPunctuation,
            ],
        );
        test_group(
            GeneralCategoryGroup::Symbol,
            &[
                GeneralCategory::CurrencySymbol,
                GeneralCategory::ModifierSymbol,
                GeneralCategory::MathSymbol,
                GeneralCategory::OtherSymbol,
            ],
        );
        test_group(
            GeneralCategoryGroup::Separator,
            &[
                GeneralCategory::LineSeparator,
                GeneralCategory::ParagraphSeparator,
                GeneralCategory::SpaceSeparator,
            ],
        );
    }

    /// Surrogate code points are not valid `char`s, so they are queried through
    /// `contains32`.
    #[test]
    fn test_gc_surrogate() {
        use icu::properties::props::GeneralCategory;
        use icu::properties::CodePointMapData;

        let surrogates_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value(GeneralCategory::Surrogate);
        let surrogates = surrogates_data.as_borrowed();

        assert!(surrogates.contains32(0xd800));
        assert!(surrogates.contains32(0xd900));
        assert!(surrogates.contains32(0xdfff));

        assert!(!surrogates.contains('A'));
    }
}