icu_locale_core/zerovec.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Documentation on zero-copy deserialization of locale types.
//!
//! [`Locale`] and [`LanguageIdentifier`] are highly structured types that cannot be directly
//! stored in a zero-copy data structure, such as those provided by the [`zerovec`](crate::zerovec) module.
//! This page explains how to indirectly store these types in a [`zerovec`](crate::zerovec).
//!
//! There are two main use cases, which have different solutions:
//!
//! 1. **Lookup:** You need to locate a locale in a zero-copy vector, such as when querying a map.
//! 2. **Obtain:** You have a locale stored in a zero-copy vector, and you need to obtain a proper
//! [`Locale`] or [`LanguageIdentifier`] for use elsewhere in your program.
//!
//! # Lookup
//!
//! To perform lookup, store the stringified locale in a canonical BCP-47 form as a byte array,
//! and then use [`Locale::strict_cmp()`] to perform an efficient, zero-allocation lookup.
//!
//! To produce more human-readable serialized output, you can use `PotentialUtf8`.
//!
//! ```
//! use icu::locale::Locale;
//! use potential_utf::PotentialUtf8;
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from locales to integers
//! let data: &[(&PotentialUtf8, u32)] = &[
//! ("de-DE-u-hc-h12".into(), 5),
//! ("en-US-u-ca-buddhist".into(), 10),
//! ("my-MM".into(), 15),
//! ("sr-Cyrl-ME".into(), 20),
//! ("zh-TW".into(), 25),
//! ];
//! let zm: ZeroMap<PotentialUtf8, u32> = data.iter().copied().collect();
//!
//! // Get the value associated with a locale
//! let loc: Locale = "en-US-u-ca-buddhist".parse().unwrap();
//! let value = zm.get_copied_by(|uvstr| loc.strict_cmp(uvstr).reverse());
//! assert_eq!(value, Some(10));
//! ```
//!
//! # Obtain
//!
//! Obtaining a [`Locale`] or [`LanguageIdentifier`] is not generally a zero-copy operation, since
//! both of these types may require memory allocation. If possible, architect your code such that
//! you do not need to obtain a structured type.
//!
//! If you need the structured type, such as if you need to manipulate it in some way, there are two
//! options: storing subtags, and storing a string for parsing.
//!
//! ## Storing Subtags
//!
//! If the data being stored only contains a limited number of subtags, you can store them as a
//! tuple, and then construct the [`LanguageIdentifier`] externally.
//!
//! ```
//! use icu::locale::subtags::{Language, Region, Script};
//! use icu::locale::LanguageIdentifier;
//! use icu::locale::{
//! langid,
//! subtags::{language, region, script},
//! };
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from integer to LSR (language-script-region)
//! let zm: ZeroMap<u32, (Language, Option<Script>, Option<Region>)> = [
//! (5, (language!("de"), None, Some(region!("DE")))),
//! (10, (language!("en"), None, Some(region!("US")))),
//! (15, (language!("my"), None, Some(region!("MM")))),
//! (
//! 20,
//! (language!("sr"), Some(script!("Cyrl")), Some(region!("ME"))),
//! ),
//! (25, (language!("zh"), None, Some(region!("TW")))),
//! ]
//! .into_iter()
//! .collect();
//!
//! // Construct a LanguageIdentifier from a tuple entry
//! let lid: LanguageIdentifier =
//! zm.get_copied(&25).expect("element is present").into();
//!
//! assert_eq!(lid, langid!("zh-TW"));
//! ```
//!
//! ## Storing Strings
//!
//! If it is necessary to store and obtain an arbitrary locale, it is currently recommended to
//! store a BCP-47 string and parse it when needed.
//!
//! Since the string is stored in an unparsed state, it is not safe to `unwrap` the result from
//! `Locale::try_from_utf8()`. See [icu4x#831](https://github.com/unicode-org/icu4x/issues/831)
//! for a discussion on potential data models that could ensure that the locale is valid during
//! deserialization.
//!
//! As above, to produce more human-readable serialized output, you can use `PotentialUtf8`.
//!
//! ```
//! use icu::locale::langid;
//! use icu::locale::Locale;
//! use potential_utf::PotentialUtf8;
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from integer to locale string
//! let data: &[(u32, &PotentialUtf8)] = &[
//! (5, "de-DE-u-hc-h12".into()),
//! (10, "en-US-u-ca-buddhist".into()),
//! (15, "my-MM".into()),
//! (20, "sr-Cyrl-ME".into()),
//! (25, "zh-TW".into()),
//! (30, "INVALID".into()),
//! ];
//! let zm: ZeroMap<u32, PotentialUtf8> = data.iter().copied().collect();
//!
//! // Construct a Locale by parsing the string.
//! let value = zm.get(&25).expect("element is present");
//! let loc = Locale::try_from_utf8(value);
//! assert_eq!(loc, Ok(langid!("zh-TW").into()));
//!
//! // Invalid entries are fallible
//! let err_value = zm.get(&30).expect("element is present");
//! let err_loc = Locale::try_from_utf8(err_value);
//! assert!(matches!(err_loc, Err(_)));
//! ```
//!
//! [`Locale`]: crate::Locale
//! [`Locale::strict_cmp()`]: crate::Locale::strict_cmp()
//! [`LanguageIdentifier`]: crate::LanguageIdentifier