icu_experimental/transliterate/transliterator/hardcoded.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module defines implementations for code-based transliterators that are part of
//! transform rules.
use crate::transliterate::transliterator::replaceable::{Forward, Replaceable, Utf8Matcher};
/// A transliterator that replaces every character with its `case`-case hexadecimal representation,
/// 0-padded to `min_length`, and surrounded by `prefix` and `suffix`.
#[derive(Debug)]
pub(super) struct HexTransliterator {
/// Text written immediately before the hexadecimal digits of each character.
prefix: &'static str,
/// Text written immediately after the hexadecimal digits of each character.
suffix: &'static str,
/// Minimum number of hexadecimal digits to write; shorter values are left-padded with `0`.
min_length: u8,
/// Whether the hexadecimal digits above 9 are written in upper or lower case.
case: Case,
}
/// The letter case used for the hexadecimal digits above 9 when a character is written as hex.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum Case {
/// Write the digits ten to fifteen as `A`-`F`.
Upper,
/// Write the digits ten to fifteen as `a`-`f`.
Lower,
}
impl HexTransliterator {
    /// Creates a transliterator that writes each character as `prefix`, followed by the
    /// character's code point in hexadecimal (using `case` for the digits above 9 and
    /// left-padded with `0` to at least `min_length` digits), followed by `suffix`.
    pub(super) fn new(
        prefix: &'static str,
        suffix: &'static str,
        min_length: u8,
        case: Case,
    ) -> Self {
        Self {
            prefix,
            suffix,
            min_length,
            case,
        }
    }

    /// Replaces every remaining character of `rep`, one at a time, with its escaped
    /// hexadecimal form: `prefix`, zero padding, hex digits, `suffix`.
    ///
    /// At least one digit is always written, so U+0000 becomes a single `0` (plus padding)
    /// even when `min_length` is 0.
    ///
    /// # Panics
    ///
    /// Panics if `rep` reports that it is not finished but yields no next character.
    pub(super) fn transliterate(&self, mut rep: Replaceable) {
        // The largest `char` is U+10FFFF, which needs 21 bits, i.e. 6 hexadecimal digits.
        const MAX_HEX_DIGITS: usize = 6;

        while !rep.is_finished() {
            let mut matcher = rep.start_match();
            // Thought: the fully qualified `Utf8Matcher::<Forward>` path is verbose; a separate
            // API surface for Forward vs Reverse matching might be better.
            let c = Utf8Matcher::<Forward>::next_char(&matcher)
                .expect("`rep` is not finished, so there must be a next char");
            Utf8Matcher::<Forward>::match_and_consume_char(&mut matcher, c);
            let mut dest = matcher.finish_match();

            let code_point = c as u32;
            // Number of significant hex digits: rounding-up division of the bit length by 4.
            // U+0000 has a bit length of 0 but must still be written as one digit, hence the
            // lower bound of 1. The value is at most `MAX_HEX_DIGITS`, so the cast is lossless.
            let length = ((u32::BITS - code_point.leading_zeros() + 3) / 4).max(1) as usize;
            let padding = usize::from(self.min_length).saturating_sub(length);

            dest.apply_size_hint(self.prefix.len() + padding + length + self.suffix.len());

            dest.push_str(self.prefix);
            for _ in 0..padding {
                dest.push('0');
            }

            // The digits are produced least-significant first, but must be written
            // most-significant first, so they are staged in a small buffer and then reversed.
            let mut buf = [0u8; MAX_HEX_DIGITS];
            let mut remaining = code_point;
            for slot in buf[..length].iter_mut() {
                let nibble = (remaining & 0xF) as u8;
                *slot = match (nibble, self.case) {
                    (0x0..=0x9, _) => b'0' + nibble,
                    (_, Case::Lower) => b'a' + (nibble - 0xA),
                    (_, Case::Upper) => b'A' + (nibble - 0xA),
                };
                remaining >>= 4;
            }
            for digit in buf[..length].iter().rev() {
                dest.push(char::from(*digit));
            }

            dest.push_str(self.suffix);
        }
    }
}