use super::*;
use crate::unicodeset_parse::{self as icu_unicodeset_parse, VariableMap, VariableValue};
use alloc::borrow::Cow;
use alloc::boxed::Box;
use alloc::fmt::{Display, Formatter};
use alloc::string::ToString;
use alloc::vec;
use core::{iter::Peekable, str::CharIndices};
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
type Result<T> = core::result::Result<T, CompileError>;
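/// The kind of an [`Element`], used for diagnostics and for deciding which
/// elements are skipped on a given side of a rule.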
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ElementKind {
Literal,
VariableReference,
BackReference,
Quantifier,
Segment,
UnicodeSet,
FunctionCall,
Cursor,
AnchorStart,
AnchorEnd,
}
impl ElementKind {
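/// Returns whether an element of this kind is ignored in the given location,
/// i.e., cursors on the source side and anchors on the target side.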
pub(crate) fn skipped_in(self, location: ElementLocation) -> bool {
#[allow(clippy::match_like_matches_macro)]
match (location, self) {
(ElementLocation::Source, Self::Cursor) => true,
(ElementLocation::Target, Self::AnchorStart | Self::AnchorEnd) => true,
_ => false,
}
}
pub(crate) fn debug_str(&self) -> &'static str {
match self {
ElementKind::Literal => "literal",
ElementKind::VariableReference => "variable reference",
ElementKind::BackReference => "back reference",
ElementKind::Quantifier => "quantifier",
ElementKind::Segment => "segment",
ElementKind::UnicodeSet => "unicodeset",
ElementKind::FunctionCall => "function call",
ElementKind::Cursor => "cursor",
ElementKind::AnchorStart => "start anchor",
ElementKind::AnchorEnd => "end anchor",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ElementLocation {
Source,
Target,
VariableDefinition,
}
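// `UnicodeSet`s (used for set elements) may contain strings; `FilterSet`s (used for
// global filters and ID filters) are plain code point sets.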
pub(crate) type UnicodeSet = CodePointInversionListAndStringList<'static>;
pub(crate) type FilterSet = CodePointInversionList<'static>;
#[derive(Debug, Clone, Copy)]
pub(crate) enum QuantifierKind {
ZeroOrOne,
ZeroOrMore,
OneOrMore,
}
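/// A transliterator ID of the form `source-target/variant`, e.g., `Latin-ASCII/BGN`.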
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub(crate) struct BasicId {
pub(crate) source: String,
pub(crate) target: String,
pub(crate) variant: Option<String>,
}
impl BasicId {
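/// Reverses this ID. Special forms such as `Any-NFC`/`Any-NFD`, `Any-Lower`/`Any-Upper`,
/// `Any-Remove`, and `Any-Null` map to their counterparts (or themselves); all other IDs
/// swap source and target.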
pub(crate) fn reverse(self) -> Self {
let source = self.source.to_lowercase();
let target = self.target.to_lowercase();
let (new_source, new_target) = match (source.as_str(), target.as_str()) {
("any", "lower") => (self.source, "Upper".to_string()),
("any", "upper") => (self.source, "Lower".to_string()),
("any", "nfc") => (self.source, "NFD".to_string()),
("any", "nfd") => (self.source, "NFC".to_string()),
("any", "nfkc") => (self.source, "NFKD".to_string()),
("any", "nfkd") => (self.source, "NFKC".to_string()),
("any", "remove" | "null") => (self.source, self.target),
_ => (self.target, self.source),
};
Self {
source: new_source,
target: new_target,
variant: self.variant,
}
}
}
impl Default for BasicId {
fn default() -> Self {
Self {
source: "Any".to_string(),
target: "Null".to_string(),
variant: None,
}
}
}
impl Display for BasicId {
fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
write!(
f,
"{}-{}",
self.source.to_ascii_lowercase(),
self.target.to_ascii_lowercase()
)?;
if let Some(variant) = &self.variant {
write!(f, "/{}", variant.to_ascii_lowercase())?;
}
Ok(())
}
}
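/// A [`BasicId`] with an optional filter, e.g., `[b-z] Latin/BGN`.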
#[derive(Debug, Clone)]
pub(crate) struct SingleId {
pub(crate) filter: Option<FilterSet>,
pub(crate) basic_id: BasicId,
}
impl SingleId {
pub(crate) fn reverse(self) -> Self {
Self {
basic_id: self.basic_id.reverse(),
..self
}
}
}
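/// A single element of a rule section, e.g., a literal, a variable reference,
/// or a UnicodeSet.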
#[derive(Debug, Clone)]
pub(crate) enum Element {
Literal(String),
VariableRef(String),
BackRef(u32),
Quantifier(QuantifierKind, Box<Element>),
Segment(Section),
UnicodeSet(UnicodeSet),
FunctionCall(SingleId, Section),
Cursor(u32, u32),
AnchorStart,
AnchorEnd,
}
impl Element {
pub(crate) fn kind(&self) -> ElementKind {
match self {
Element::Literal(..) => ElementKind::Literal,
Element::VariableRef(..) => ElementKind::VariableReference,
Element::BackRef(..) => ElementKind::BackReference,
Element::Quantifier(..) => ElementKind::Quantifier,
Element::Segment(..) => ElementKind::Segment,
Element::UnicodeSet(..) => ElementKind::UnicodeSet,
Element::FunctionCall(..) => ElementKind::FunctionCall,
Element::Cursor(..) => ElementKind::Cursor,
Element::AnchorStart => ElementKind::AnchorStart,
Element::AnchorEnd => ElementKind::AnchorEnd,
}
}
}
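/// A sequence of elements, e.g., one side of a conversion rule or the value of a
/// variable definition.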
pub(crate) type Section = Vec<Element>;
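/// One half of a conversion rule: `ante { key } post`, where both contexts are optional.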
#[derive(Debug, Clone)]
pub(crate) struct HalfRule {
pub(crate) ante: Section,
pub(crate) key: Section,
pub(crate) post: Section,
}
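/// A fully parsed rule: a global (inverse) filter, a transform rule, a conversion rule,
/// or a variable definition.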
#[derive(Debug, Clone)]
#[allow(clippy::large_enum_variant)]
pub(crate) enum Rule {
GlobalFilter(FilterSet),
GlobalInverseFilter(FilterSet),
Transform(SingleId, Option<SingleId>),
Conversion(HalfRule, Direction, HalfRule),
VariableDefinition(String, Section),
}
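/// The transliterator rule parser. Flattenable variable definitions are collected into
/// `variable_map` so that later UnicodeSets can reference them, and `dot_set` caches the
/// set matched by `.`.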
pub(crate) struct Parser<'a, P: ?Sized> {
iter: Peekable<CharIndices<'a>>,
source: &'a str,
variable_map: VariableMap<'static>,
dot_set: Option<UnicodeSet>,
xid_start: &'a CodePointInversionList<'a>,
xid_continue: &'a CodePointInversionList<'a>,
pat_ws: &'a CodePointInversionList<'a>,
property_provider: &'a P,
}
impl<'a, P> Parser<'a, P>
where
P: ?Sized
+ DataProvider<AsciiHexDigitV1Marker>
+ DataProvider<AlphabeticV1Marker>
+ DataProvider<BidiControlV1Marker>
+ DataProvider<BidiMirroredV1Marker>
+ DataProvider<CanonicalCombiningClassV1Marker>
+ DataProvider<CanonicalCombiningClassNameToValueV2Marker>
+ DataProvider<CaseIgnorableV1Marker>
+ DataProvider<CasedV1Marker>
+ DataProvider<ChangesWhenCasefoldedV1Marker>
+ DataProvider<ChangesWhenCasemappedV1Marker>
+ DataProvider<ChangesWhenLowercasedV1Marker>
+ DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
+ DataProvider<ChangesWhenTitlecasedV1Marker>
+ DataProvider<ChangesWhenUppercasedV1Marker>
+ DataProvider<DashV1Marker>
+ DataProvider<DefaultIgnorableCodePointV1Marker>
+ DataProvider<DeprecatedV1Marker>
+ DataProvider<DiacriticV1Marker>
+ DataProvider<EmojiV1Marker>
+ DataProvider<EmojiComponentV1Marker>
+ DataProvider<EmojiModifierV1Marker>
+ DataProvider<EmojiModifierBaseV1Marker>
+ DataProvider<EmojiPresentationV1Marker>
+ DataProvider<ExtendedPictographicV1Marker>
+ DataProvider<ExtenderV1Marker>
+ DataProvider<GraphemeBaseV1Marker>
+ DataProvider<GraphemeClusterBreakV1Marker>
+ DataProvider<GraphemeClusterBreakNameToValueV2Marker>
+ DataProvider<GraphemeExtendV1Marker>
+ DataProvider<HexDigitV1Marker>
+ DataProvider<IdsBinaryOperatorV1Marker>
+ DataProvider<IdsTrinaryOperatorV1Marker>
+ DataProvider<IdContinueV1Marker>
+ DataProvider<IdStartV1Marker>
+ DataProvider<IdeographicV1Marker>
+ DataProvider<JoinControlV1Marker>
+ DataProvider<LogicalOrderExceptionV1Marker>
+ DataProvider<LowercaseV1Marker>
+ DataProvider<MathV1Marker>
+ DataProvider<NoncharacterCodePointV1Marker>
+ DataProvider<PatternSyntaxV1Marker>
+ DataProvider<PatternWhiteSpaceV1Marker>
+ DataProvider<QuotationMarkV1Marker>
+ DataProvider<RadicalV1Marker>
+ DataProvider<RegionalIndicatorV1Marker>
+ DataProvider<SentenceBreakV1Marker>
+ DataProvider<SentenceBreakNameToValueV2Marker>
+ DataProvider<SentenceTerminalV1Marker>
+ DataProvider<SoftDottedV1Marker>
+ DataProvider<TerminalPunctuationV1Marker>
+ DataProvider<UnifiedIdeographV1Marker>
+ DataProvider<UppercaseV1Marker>
+ DataProvider<VariationSelectorV1Marker>
+ DataProvider<WhiteSpaceV1Marker>
+ DataProvider<WordBreakV1Marker>
+ DataProvider<WordBreakNameToValueV2Marker>
+ DataProvider<XidContinueV1Marker>
+ DataProvider<GeneralCategoryMaskNameToValueV2Marker>
+ DataProvider<GeneralCategoryV1Marker>
+ DataProvider<ScriptNameToValueV2Marker>
+ DataProvider<ScriptV1Marker>
+ DataProvider<ScriptWithExtensionsPropertyV1Marker>
+ DataProvider<XidStartV1Marker>,
{
const COMMENT: char = '#';
const COMMENT_END: char = '\n';
const RULE_END: char = ';';
const SPECIAL_START: char = ':';
const SET_START: char = '[';
const DOT: char = '.';
const DOT_SET: &'static str = r"[^[:Zp:][:Zl:]\r\n$]";
const ANCHOR_START: char = '^';
const OPEN_PAREN: char = '(';
const CLOSE_PAREN: char = ')';
const ID_SEP: char = '-';
const VARIANT_SEP: char = '/';
const VAR_PREFIX: char = '$';
const VAR_DEF_OP: char = '=';
const LEFT_CONTEXT: char = '{';
const RIGHT_CONTEXT: char = '}';
const OPTIONAL: char = '?';
const ZERO_OR_MORE: char = '*';
const ONE_OR_MORE: char = '+';
const FUNCTION_PREFIX: char = '&';
const QUOTE: char = '\'';
const ESCAPE: char = '\\';
const CURSOR: char = '|';
const CURSOR_PLACEHOLDER: char = '@';
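/// Parses the complete source into a list of rules. For example, the source `"a > b ;"`
/// yields a single `Rule::Conversion`.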
pub(super) fn run(
source: &'a str,
xid_start: &'a CodePointInversionList<'a>,
xid_continue: &'a CodePointInversionList<'a>,
pat_ws: &'a CodePointInversionList<'a>,
provider: &'a P,
) -> Result<Vec<Rule>> {
let mut s = Self {
iter: source.char_indices().peekable(),
source,
variable_map: Default::default(),
dot_set: None,
xid_start,
xid_continue,
pat_ws,
property_provider: provider,
};
let mut rules = Vec::new();
s.skip_icu_pragma();
loop {
s.skip_whitespace();
if s.iter.peek().is_none() {
break;
}
rules.push(s.parse_rule()?);
}
Ok(rules)
}
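/// Skips any `use variable range 0x.. 0x.. ;` pragmas at the start of the source,
/// stopping early if a conversion operator appears before the terminating `;`, in which
/// case the text is parsed as an ordinary rule instead.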
fn skip_icu_pragma(&mut self) {
loop {
self.skip_whitespace();
if let Some(start) = self.peek_index() {
#[allow(clippy::indexing_slicing)]
let start_source = &self.source[start..];
if start_source.starts_with("use variable range 0x") {
let conv_idx = start_source.find(['>', '<', '→', '←', '↔']);
let end_idx = start_source.find(Self::RULE_END);
match (end_idx, conv_idx) {
(Some(end_idx), Some(conv_idx)) if conv_idx < end_idx => break,
(None, Some(_)) => break,
_ => {}
}
self.skip_until(Self::RULE_END);
continue;
}
}
break;
}
}
fn parse_rule(&mut self) -> Result<Rule> {
match self.must_peek_char()? {
Self::SPECIAL_START => self.parse_filter_or_transform_rule(),
_ => self.parse_conversion_or_variable_rule(),
}
}
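/// Parses a rule that starts with `::`: either a global (inverse) filter or a transform
/// rule, whose optional reverse part is given in parentheses.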
fn parse_filter_or_transform_rule(&mut self) -> Result<Rule> {
self.consume(Self::SPECIAL_START)?;
self.consume(Self::SPECIAL_START)?;
let (forward_filter, forward_basic_id, reverse_filter, reverse_basic_id, has_reverse) =
self.parse_filter_or_transform_rule_parts()?;
self.skip_whitespace();
let meta_err_offset = self.must_peek_index()?;
self.consume(Self::RULE_END)?;
match (
forward_filter.is_some(),
forward_basic_id.is_some(),
reverse_filter.is_some(),
reverse_basic_id.is_some(),
) {
(true, false, false, false) => {
#[allow(clippy::unwrap_used)]
return Ok(Rule::GlobalFilter(forward_filter.unwrap()));
}
(false, false, true, false) => {
#[allow(clippy::unwrap_used)]
return Ok(Rule::GlobalInverseFilter(reverse_filter.unwrap()));
}
_ => {}
}
if forward_basic_id.is_none() && reverse_basic_id.is_none() {
return Err(CompileErrorKind::InvalidId.with_offset(meta_err_offset));
}
if !has_reverse {
let forward_basic_id =
forward_basic_id.ok_or(CompileErrorKind::Internal("transform rule logic error"))?;
return Ok(Rule::Transform(
SingleId {
basic_id: forward_basic_id,
filter: forward_filter,
},
None,
));
}
if forward_filter.is_some() && forward_basic_id.is_none()
|| reverse_filter.is_some() && reverse_basic_id.is_none()
{
return Err(CompileErrorKind::InvalidId.with_offset(meta_err_offset));
}
let forward_basic_id = forward_basic_id.unwrap_or_default();
let reverse_basic_id = reverse_basic_id.unwrap_or_default();
let forward_single_id = SingleId {
basic_id: forward_basic_id,
filter: forward_filter,
};
let reverse_single_id = SingleId {
basic_id: reverse_basic_id,
filter: reverse_filter,
};
Ok(Rule::Transform(forward_single_id, Some(reverse_single_id)))
}
#[allow(clippy::type_complexity)]
fn parse_filter_or_transform_rule_parts(
&mut self,
) -> Result<(
Option<FilterSet>,
Option<BasicId>,
Option<FilterSet>,
Option<BasicId>,
bool,
)> {
self.skip_whitespace();
let forward_filter = self.try_parse_filter_set()?;
self.skip_whitespace();
let forward_basic_id = self.try_parse_basic_id()?;
self.skip_whitespace();
let has_reverse = match self.must_peek_char()? {
Self::OPEN_PAREN => true,
Self::RULE_END => false,
_ => return self.unexpected_char_here(),
};
let reverse_filter;
let reverse_basic_id;
if has_reverse {
self.consume(Self::OPEN_PAREN)?;
self.skip_whitespace();
reverse_filter = self.try_parse_filter_set()?;
self.skip_whitespace();
reverse_basic_id = self.try_parse_basic_id()?;
self.skip_whitespace();
self.consume(Self::CLOSE_PAREN)?;
} else {
reverse_filter = None;
reverse_basic_id = None;
}
Ok((
forward_filter,
forward_basic_id,
reverse_filter,
reverse_basic_id,
has_reverse,
))
}
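/// Parses a conversion rule, or a variable definition if the rule has the form
/// `$var = ... ;`.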
fn parse_conversion_or_variable_rule(&mut self) -> Result<Rule> {
let first_elt = if Self::VAR_PREFIX == self.must_peek_char()? {
let elt = self.parse_variable_or_backref_or_anchor_end()?;
self.skip_whitespace();
if Self::VAR_DEF_OP == self.must_peek_char()? {
let var_name = match elt {
Element::VariableRef(var_name) => var_name,
_ => return self.unexpected_char_here(),
};
self.iter.next();
let section = self.parse_section(None)?;
let err_offset = self.must_peek_index()?;
self.consume(Self::RULE_END)?;
self.add_variable(var_name.clone(), section.clone(), err_offset)?;
return Ok(Rule::VariableDefinition(var_name, section));
}
Some(elt)
} else {
None
};
let first_half = self.parse_half_rule(first_elt)?;
let dir = self.parse_direction()?;
let second_half = self.parse_half_rule(None)?;
self.consume(Self::RULE_END)?;
Ok(Rule::Conversion(first_half, dir, second_half))
}
fn parse_single_id(&mut self) -> Result<SingleId> {
self.skip_whitespace();
let filter = self.try_parse_filter_set()?;
self.skip_whitespace();
let basic_id = self.parse_basic_id()?;
Ok(SingleId { filter, basic_id })
}
fn try_parse_basic_id(&mut self) -> Result<Option<BasicId>> {
if let Some(c) = self.peek_char() {
if self.xid_start.contains(c) {
return Ok(Some(self.parse_basic_id()?));
}
}
Ok(None)
}
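/// Parses a basic ID: one or two identifiers separated by `-`, with an optional
/// `/ variant`. A single identifier is the target, with `Any` as the implied source.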
fn parse_basic_id(&mut self) -> Result<BasicId> {
let first_id = self.parse_unicode_identifier()?;
self.skip_whitespace();
let second_id = self.try_parse_sep_and_unicode_identifier(Self::ID_SEP)?;
self.skip_whitespace();
let variant_id = self.try_parse_sep_and_unicode_identifier(Self::VARIANT_SEP)?;
let (source, target) = match second_id {
None => ("Any".to_string(), first_id),
Some(second_id) => (first_id, second_id),
};
Ok(BasicId {
source,
target,
variant: variant_id,
})
}
fn try_parse_sep_and_unicode_identifier(&mut self, sep: char) -> Result<Option<String>> {
if Some(sep) == self.peek_char() {
self.iter.next();
self.skip_whitespace();
return Ok(Some(self.parse_unicode_identifier()?));
}
Ok(None)
}
fn parse_unicode_identifier(&mut self) -> Result<String> {
let mut id = String::new();
let (first_offset, first_c) = self.must_peek()?;
if !self.xid_start.contains(first_c) {
return Err(CompileErrorKind::UnexpectedChar(first_c).with_offset(first_offset));
}
self.iter.next();
id.push(first_c);
loop {
let c = self.must_peek_char()?;
if !self.xid_continue.contains(c) {
break;
}
id.push(c);
self.iter.next();
}
Ok(id)
}
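/// Parses one half of a conversion rule. `{` and `}` separate the key from the optional
/// ante and post contexts.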
fn parse_half_rule(&mut self, prev_elt: Option<Element>) -> Result<HalfRule> {
let ante;
let key;
let post;
let first = self.parse_section(prev_elt)?;
if Self::LEFT_CONTEXT == self.must_peek_char()? {
self.iter.next();
ante = first;
key = self.parse_section(None)?;
} else {
ante = vec![];
key = first;
}
if Self::RIGHT_CONTEXT == self.must_peek_char()? {
self.iter.next();
post = self.parse_section(None)?;
} else {
post = vec![];
}
Ok(HalfRule { ante, key, post })
}
fn parse_direction(&mut self) -> Result<Direction> {
match self.must_peek_char()? {
'>' | '→' => {
self.iter.next();
Ok(Direction::Forward)
}
'↔' => {
self.iter.next();
Ok(Direction::Both)
}
'←' => {
self.iter.next();
Ok(Direction::Reverse)
}
'<' => {
self.iter.next();
match self.must_peek_char()? {
'>' => {
self.iter.next();
Ok(Direction::Both)
}
_ => Ok(Direction::Reverse),
}
}
_ => self.unexpected_char_here(),
}
}
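/// Parses elements until a section-end character is reached. `prev_elt` is kept one
/// element behind so that a quantifier can wrap the element it follows.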
fn parse_section(&mut self, prev_elt: Option<Element>) -> Result<Section> {
let mut section = Section::new();
let mut prev_elt = prev_elt;
loop {
self.skip_whitespace();
let c = self.must_peek_char()?;
if self.is_section_end(c) {
if let Some(elt) = prev_elt.take() {
section.push(elt);
}
break;
}
let next_elt = self.parse_element(&mut prev_elt)?;
if let Some(elt) = prev_elt {
section.push(elt);
}
prev_elt = Some(next_elt);
}
Ok(section)
}
fn parse_quantifier_kind(&mut self) -> Result<QuantifierKind> {
match self.must_peek_char()? {
Self::OPTIONAL => {
self.iter.next();
Ok(QuantifierKind::ZeroOrOne)
}
Self::ZERO_OR_MORE => {
self.iter.next();
Ok(QuantifierKind::ZeroOrMore)
}
Self::ONE_OR_MORE => {
self.iter.next();
Ok(QuantifierKind::OneOrMore)
}
_ => self.unexpected_char_here(),
}
}
fn parse_element(&mut self, prev_elt: &mut Option<Element>) -> Result<Element> {
match self.must_peek_char()? {
Self::VAR_PREFIX => self.parse_variable_or_backref_or_anchor_end(),
Self::ANCHOR_START => {
self.iter.next();
Ok(Element::AnchorStart)
}
Self::OPEN_PAREN => self.parse_segment(),
Self::DOT => {
self.iter.next();
Ok(Element::UnicodeSet(self.get_dot_set()?))
}
Self::OPTIONAL | Self::ZERO_OR_MORE | Self::ONE_OR_MORE => {
let quantifier = self.parse_quantifier_kind()?;
if let Some(elt) = prev_elt.take() {
Ok(Element::Quantifier(quantifier, Box::new(elt)))
} else {
self.unexpected_char_here()
}
}
Self::FUNCTION_PREFIX => self.parse_function_call(),
Self::CURSOR_PLACEHOLDER | Self::CURSOR => self.parse_cursor(),
Self::QUOTE => Ok(Element::Literal(self.parse_quoted_literal()?)),
_ if self.peek_is_unicode_set_start() => {
let (_, set) = self.parse_unicode_set()?;
Ok(Element::UnicodeSet(set))
}
c if self.is_valid_unquoted_literal(c) => Ok(Element::Literal(self.parse_literal()?)),
_ => self.unexpected_char_here(),
}
}
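/// Parses the element following a `$`: a back reference (`$1`), a variable reference
/// (`$var`), or an end anchor (a bare `$`).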
fn parse_variable_or_backref_or_anchor_end(&mut self) -> Result<Element> {
self.consume(Self::VAR_PREFIX)?;
match self.must_peek_char()? {
c if c.is_ascii_digit() => {
let num = self.parse_number()?;
Ok(Element::BackRef(num))
}
c if self.xid_start.contains(c) => {
let variable_id = self.parse_unicode_identifier()?;
Ok(Element::VariableRef(variable_id))
}
_ => {
Ok(Element::AnchorEnd)
}
}
}
fn parse_number(&mut self) -> Result<u32> {
let (first_offset, first_c) = self.must_next()?;
if !matches!(first_c, '1'..='9') {
return Err(CompileErrorKind::UnexpectedChar(first_c).with_offset(first_offset));
}
let mut end_offset = first_offset;
loop {
let (offset, c) = self.must_peek()?;
if !c.is_ascii_digit() {
break;
}
self.iter.next();
end_offset = offset;
}
#[allow(clippy::indexing_slicing)]
self.source[first_offset..=end_offset]
.parse()
.map_err(|_| CompileErrorKind::InvalidNumber.with_offset(end_offset))
}
fn parse_literal(&mut self) -> Result<String> {
let mut buf = String::new();
loop {
self.skip_whitespace();
let c = self.must_peek_char()?;
if c == Self::ESCAPE {
self.parse_escaped_char_into_buf(&mut buf)?;
continue;
}
if !self.is_valid_unquoted_literal(c) {
break;
}
self.iter.next();
buf.push(c);
}
Ok(buf)
}
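/// Parses a quoted literal; an empty pair of quotes (`''`) stands for a literal `'`.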
fn parse_quoted_literal(&mut self) -> Result<String> {
let mut buf = String::new();
self.consume(Self::QUOTE)?;
loop {
let c = self.must_next_char()?;
if c == Self::QUOTE {
break;
}
buf.push(c);
}
if buf.is_empty() {
buf.push(Self::QUOTE);
}
Ok(buf)
}
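/// Parses one escape sequence into `buf`: bracketed hex (which may contain several
/// whitespace-separated code points), fixed-width hex (`\u`, `\x`, `\U`), a C-style
/// escape, or the escaped character itself. `\N` is unimplemented.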
fn parse_escaped_char_into_buf(&mut self, buf: &mut String) -> Result<()> {
self.consume(Self::ESCAPE)?;
let (offset, next_char) = self.must_next()?;
match next_char {
'u' | 'x' if self.peek_char() == Some('{') => {
self.iter.next();
self.skip_whitespace();
let c = self.parse_hex_digits_into_char(1, 6)?;
buf.push(c);
loop {
let skipped = self.skip_whitespace();
let next_char = self.must_peek_char()?;
if next_char == '}' {
self.iter.next();
break;
}
if skipped == 0 {
return self.unexpected_char_here();
}
let c = self.parse_hex_digits_into_char(1, 6)?;
buf.push(c);
}
}
'u' => {
let c = self.parse_hex_digits_into_char(4, 4)?;
buf.push(c);
}
'x' => {
let c = self.parse_hex_digits_into_char(2, 2)?;
buf.push(c);
}
'U' => {
let c = self.parse_hex_digits_into_char(6, 6)?;
buf.push(c);
}
'N' => {
return Err(CompileErrorKind::Unimplemented.with_offset(offset));
}
'a' => buf.push('\u{0007}'),
'b' => buf.push('\u{0008}'),
't' => buf.push('\u{0009}'),
'n' => buf.push('\u{000A}'),
'v' => buf.push('\u{000B}'),
'f' => buf.push('\u{000C}'),
'r' => buf.push('\u{000D}'),
_ => buf.push(next_char),
}
Ok(())
}
fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<char> {
let first_offset = self.must_peek_index()?;
let end_offset = self.validate_hex_digits(min, max)?;
#[allow(clippy::indexing_slicing)]
let hex_source = &self.source[first_offset..=end_offset];
let num = u32::from_str_radix(hex_source, 16)
.map_err(|_| CompileErrorKind::Internal("expected valid hex escape"))?;
char::try_from(num).map_err(|_| CompileErrorKind::InvalidEscape.with_offset(end_offset))
}
fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
let mut last_offset = 0;
for count in 0..max {
let (offset, c) = self.must_peek()?;
if !c.is_ascii_hexdigit() {
if count < min {
return self.unexpected_char_here();
} else {
break;
}
}
self.iter.next();
last_offset = offset;
}
Ok(last_offset)
}
fn parse_segment(&mut self) -> Result<Element> {
self.consume(Self::OPEN_PAREN)?;
let elt = Element::Segment(self.parse_section(None)?);
self.consume(Self::CLOSE_PAREN)?;
Ok(elt)
}
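/// Tries to parse a UnicodeSet to be used as a filter. Filters must not contain strings.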
fn try_parse_filter_set(&mut self) -> Result<Option<FilterSet>> {
if self.peek_is_unicode_set_start() {
let (offset, set) = self.parse_unicode_set()?;
if set.has_strings() {
return Err(CompileErrorKind::GlobalFilterWithStrings.with_offset(offset));
}
return Ok(Some(set.code_points().clone()));
}
Ok(None)
}
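/// Parses a UnicodeSet by handing the remaining source to the UnicodeSet parser, then
/// advances this parser's own iterator past the consumed bytes.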
fn parse_unicode_set(&mut self) -> Result<(usize, UnicodeSet)> {
let pre_offset = self.must_peek_index()?;
#[allow(clippy::indexing_slicing)]
let set_source = &self.source[pre_offset..];
let (set, consumed_bytes) = icu_unicodeset_parse::parse_unstable_with_variables(
set_source,
&self.variable_map,
self.property_provider,
)
.map_err(|e| CompileErrorKind::UnicodeSetError(e).with_offset(pre_offset))?;
let mut last_offset = pre_offset;
while let Some(offset) = self.peek_index() {
if offset == pre_offset + consumed_bytes {
break;
}
last_offset = offset;
self.iter.next();
}
Ok((last_offset, set))
}
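/// Returns the set matched by `.`, parsing `DOT_SET` on first use and caching the result.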
fn get_dot_set(&mut self) -> Result<UnicodeSet> {
match &self.dot_set {
Some(set) => Ok(set.clone()),
None => {
let (set, _) =
icu_unicodeset_parse::parse_unstable(Self::DOT_SET, self.property_provider)
.map_err(|_| CompileErrorKind::Internal("dot set syntax not valid"))?;
self.dot_set = Some(set.clone());
Ok(set)
}
}
}
fn parse_function_call(&mut self) -> Result<Element> {
self.consume(Self::FUNCTION_PREFIX)?;
let single_id = self.parse_single_id()?;
self.skip_whitespace();
self.consume(Self::OPEN_PAREN)?;
let section = self.parse_section(None)?;
self.consume(Self::CLOSE_PAREN)?;
Ok(Element::FunctionCall(single_id, section))
}
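/// Parses a cursor: `|` with optional `@` placeholders before and/or after it.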
fn parse_cursor(&mut self) -> Result<Element> {
let mut num_pre = 0;
let mut num_post = 0;
loop {
self.skip_whitespace();
match self.must_peek_char()? {
Self::CURSOR_PLACEHOLDER => {
self.iter.next();
num_pre += 1;
}
Self::CURSOR => {
self.iter.next();
break;
}
_ => return self.unexpected_char_here(),
}
}
loop {
self.skip_whitespace();
match self.must_peek_char()? {
Self::CURSOR_PLACEHOLDER => {
self.iter.next();
num_post += 1;
}
_ => break,
}
}
Ok(Element::Cursor(num_pre, num_post))
}
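/// Makes a variable definition available to the UnicodeSet parser if its value can be
/// flattened to a single set, an already-known variable, or a string of literals; other
/// definitions are only recorded as `Rule::VariableDefinition`s.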
fn add_variable(&mut self, name: String, value: Section, offset: usize) -> Result<()> {
if let Some(uset_value) = self.try_uset_flatten_section(&value) {
self.variable_map
.insert(name.to_string(), uset_value)
.map_err(|_| CompileErrorKind::DuplicateVariable.with_offset(offset))?;
}
Ok(())
}
fn try_uset_flatten_section(&self, section: &Section) -> Option<VariableValue<'static>> {
if let [Element::UnicodeSet(set)] = &section[..] {
return Some(VariableValue::UnicodeSet(set.clone()));
}
if let [Element::VariableRef(name)] = &section[..] {
if let Some(value) = self.variable_map.get(name) {
return Some(value.clone());
}
return None;
}
let mut combined_literal = String::new();
for elt in section {
match elt {
Element::Literal(s) => combined_literal.push_str(s),
Element::VariableRef(name) => match self.variable_map.get(name) {
Some(VariableValue::String(s)) => combined_literal.push_str(s),
Some(VariableValue::Char(c)) => combined_literal.push(*c),
_ => return None,
},
_ => return None,
}
}
Some(VariableValue::String(Cow::Owned(combined_literal)))
}
fn consume(&mut self, expected: char) -> Result<()> {
match self.must_next()? {
(offset, c) if c != expected => {
Err(CompileErrorKind::UnexpectedChar(c).with_offset(offset))
}
_ => Ok(()),
}
}
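/// Skips pattern whitespace and `#`-comments, returning the number of chars skipped.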
fn skip_whitespace(&mut self) -> usize {
let mut count = 0;
while let Some(c) = self.peek_char() {
if c == Self::COMMENT {
count += self.skip_until(Self::COMMENT_END);
continue;
}
if !self.pat_ws.contains(c) {
break;
}
self.iter.next();
count += 1;
}
count
}
fn skip_until(&mut self, end: char) -> usize {
let mut count = 0;
for (_, c) in self.iter.by_ref() {
count += 1;
if c == end {
break;
}
}
count
}
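/// Returns whether the upcoming input starts a UnicodeSet, i.e., `[` or a `\p`/`\P`
/// property escape.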
fn peek_is_unicode_set_start(&mut self) -> bool {
match self.peek_char() {
Some(Self::SET_START) => true,
Some(Self::ESCAPE) => {
let mut it = self.iter.clone();
it.next();
matches!(it.next(), Some((_, 'p' | 'P')))
}
_ => false,
}
}
fn peek_char(&mut self) -> Option<char> {
self.iter.peek().map(|(_, c)| *c)
}
fn peek_index(&mut self) -> Option<usize> {
self.iter.peek().map(|(idx, _)| *idx)
}
fn must_next(&mut self) -> Result<(usize, char)> {
self.iter.next().ok_or(CompileErrorKind::Eof.into())
}
fn must_next_char(&mut self) -> Result<char> {
self.must_next().map(|(_, c)| c)
}
fn must_peek(&mut self) -> Result<(usize, char)> {
self.iter
.peek()
.copied()
.ok_or(CompileErrorKind::Eof.into())
}
fn must_peek_char(&mut self) -> Result<char> {
self.must_peek().map(|(_, c)| c)
}
fn must_peek_index(&mut self) -> Result<usize> {
self.must_peek().map(|(idx, _)| idx)
}
fn unexpected_char_here<T>(&mut self) -> Result<T> {
let (offset, c) = self.must_peek()?;
Err(CompileErrorKind::UnexpectedChar(c).with_offset(offset))
}
fn is_section_end(&self, c: char) -> bool {
matches!(
c,
Self::RULE_END
| Self::CLOSE_PAREN
| Self::RIGHT_CONTEXT
| Self::LEFT_CONTEXT
| Self::VAR_DEF_OP
| '<'
| '>'
| '→'
| '←'
| '↔'
)
}
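/// ASCII alphanumerics and `\`, as well as any non-ASCII char other than the conversion
/// arrows, may appear unquoted in a literal.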
fn is_valid_unquoted_literal(&self, c: char) -> bool {
c.is_ascii() && (c.is_ascii_alphanumeric() || c == '\\')
|| (!c.is_ascii() && c != '→' && c != '←' && c != '↔')
}
}
#[cfg(test)]
pub(super) fn parse(source: &str) -> Result<Vec<Rule>> {
use icu::properties::CodePointSetData;
Parser::run(
source,
&CodePointSetData::new::<XidStart>()
.static_to_owned()
.to_code_point_inversion_list(),
&CodePointSetData::new::<XidContinue>()
.static_to_owned()
.to_code_point_inversion_list(),
&CodePointSetData::new::<PatternWhiteSpace>()
.static_to_owned()
.to_code_point_inversion_list(),
&icu_properties::provider::Baked,
)
}
#[test]
fn test_full() {
let source = r"
# these are skipped:
use variable range 0x70 0x72 ;
use variable range 0x1 0x2 ;
:: [a-z\]] ; :: [b-z] Latin/BGN ;
:: Source-Target/Variant () ;::([b-z]Target-Source/Variant) ;
:: [a-z] Any ([b-z] Target-Source/Variant);
$my_var = an arbitrary section ',' some quantifiers *+? 'and other variables: $var' $var ;
$innerMinus = '-' ;
$minus = $innerMinus ;
$good_set = [a $minus z] ;
^ (start) { key ' key '+ $good_set } > $102 } post\-context$;
# contexts are optional
target < source [{set\ with\ string}];
# contexts can be empty
{ 'source-or-target' } <> { 'target-or-source' } ;
(nested (sections)+ are () so fun) > ;
. > ;
:: ([inverse-filter]) ;
";
parse(source).map_err(|e| e.explain(source)).unwrap();
}
#[test]
fn test_conversion_rules_ok() {
let sources = [
r"a > b ;",
r"a < b ;",
r"a <> b ;",
r"a → b ;",
r"a ← b ;",
r"a ↔ b ;",
r"a \> > b ;",
r"a \→ > b ;",
r"{ a > b ;",
r"a { > b ;",
r"{ a } > b ;",
r"{ a } > { b ;",
r"{ a } > { b } ;",
r"^ pre [a-z] { a } post [$] $ > ^ [$] pre { b [b-z] } post $ ;",
r"[äöü] > ;",
r"([äöü]) > &Remove($1) ;",
r"[äöü] { ([äöü]+) > &Remove($1) ;",
r"|@@@ a <> b @@@@ @ | ;",
r"|a <> b ;",
];
for source in sources {
parse(source).map_err(|e| e.explain(source)).unwrap();
}
}
#[test]
fn test_conversion_rules_err() {
let sources = [
r"a > > b ;",
r"a >< b ;",
r"(a > b) > b ;",
r"a \← b ;",
r"a ↔ { b > } ;",
r"a ↔ { b > } ;",
r"a > b",
r"@ a > b ;",
r"a ( { > b ;",
r"a ( { ) > b ;",
r"a } + > b ;",
r"a (+?*) > b ;",
r"+?* > b ;",
r"+ > b ;",
r"* > b ;",
r"? > b ;",
r"use variable range 0x71 > 2 ; use variable range 0x71 ;",
];
for source in sources {
parse(source).unwrap_err();
}
}
#[test]
fn test_variable_rules_ok() {
let sources = [
r" $my_var = [a-z] ;",
r"$my_var = äüöÜ ;",
r"$my_var = [a-z] literal ; $other_var = [A-Z] [b-z];",
r"$my_var = [a-z] ; $other_var = [A-Z] [b-z];",
r"$my_var = [a-z] ; $other_var = $my_var + $2222;",
r"$my_var = [a-z] ; $other_var = $my_var \+\ \$2222 \\ 'hello\';",
r"
$innerMinus = '-' ;
$minus = $innerMinus ;
$good_set = [a $minus z] ;
",
];
for source in sources {
parse(source).map_err(|e| e.explain(source)).unwrap();
}
}
#[test]
fn test_variable_rules_err() {
let sources = [
r" $ my_var = a ;",
r" $my_var = a_2 ;",
r"$my_var 2 = [a-z] literal ;",
r"$my_var = [$doesnt_exist] ;",
];
for source in sources {
if let Ok(rules) = parse(source) {
panic!("Parsed invalid source {:?}: {:?}", source, rules);
}
}
}
#[test]
fn test_global_filters_ok() {
let sources = [
r":: [^\[$] ;",
r":: \p{L} ;",
r":: [^\[{[}$] ;",
r":: [^\[{]}$] ;",
r":: [^\[{]\}]}$] ;",
r":: ([^\[$]) ;",
r":: ( [^\[$] ) ;",
r":: [^[a-z[]][]] ;",
r":: [^[a-z\[\]]\]] ;",
r":: [^\]] ;",
];
for source in sources {
parse(source).map_err(|e| e.explain(source)).unwrap();
}
}
#[test]
fn test_global_filters_err() {
let sources = [
r":: [^\[$ ;",
r":: \p{L ;",
r":: [^[$] ;",
r":: [^\[$]) ;",
r":: ( [^\[$] ;",
r":: [^[a-z[]][]] [] ;",
r":: [^[a-z\[\]]\]] ([a-z]);",
r":: [a$-^\]] ;",
r":: ( [] [] ) ;",
r":: () [] ;",
r":: [{string}];",
r":: ([{string}]);",
];
for source in sources {
if let Ok(rules) = parse(source) {
panic!("Parsed invalid source {:?}: {:?}", source, rules);
}
}
}
#[test]
fn test_function_calls_ok() {
let sources = [
r"$fn = & Any-Any/Variant ($var literal 'quoted literal' $1) ;",
r"$fn = &[a-z] Any-Any/Variant ($var literal 'quoted literal' $1) ;",
r"$fn = &[a-z]Any-Any/Variant ($var literal 'quoted literal' $1) ;",
r"$fn = &[a-z]Any/Variant ($var literal 'quoted literal' $1) ;",
r"$fn = &Any/Variant ($var literal 'quoted literal' $1) ;",
r"$fn = &[a-z]Any ($var literal 'quoted literal' $1) ;",
r"$fn = &Any($var literal 'quoted literal' $1) ;",
];
for source in sources {
parse(source).map_err(|e| e.explain(source)).unwrap();
}
}
#[test]
fn test_function_calls_err() {
let sources = [
r"$fn = &[a-z]($var literal 'quoted literal' $1) ;",
r"$fn = &[a-z] ($var literal 'quoted literal' $1) ;",
r"$fn = &($var literal 'quoted literal' $1) ;",
];
for source in sources {
if let Ok(rules) = parse(source) {
panic!("Parsed invalid source {:?}: {:?}", source, rules);
}
}
}
#[test]
fn test_transform_rules_ok() {
let sources = [
":: NFD; :: NFKC;",
":: Latin ;",
":: any - Latin;",
":: any - Latin/bgn;",
":: any - Latin/bgn ();",
":: any - Latin/bgn ([a-z] a-z);",
":: ([a-z] a-z);",
":: (a-z);",
":: (a-z / variant);",
":: [a-z] latin/variant (a-z / variant);",
":: [a-z] latin/variant (a-z / variant) ;",
":: [a-z] latin ( );",
":: [a-z] latin ;",
"::[];",
];
for source in sources {
parse(source).map_err(|e| e.explain(source)).unwrap();
}
}
#[test]
fn test_transform_rules_err() {
let sources = [
r":: a a ;",
r":: (a a) ;",
r":: a - z - b ;",
r":: ( a - z - b) ;",
r":: [] ( a - z) ;",
r":: a-z ( [] ) ;",
r":: a-z / ( [] a-z ) ;",
r":: Latin-ASCII/BGN Arab-Greek/UNGEGN ;",
r":: (Latin-ASCII/BGN Arab-Greek/UNGEGN) ;",
r":: [a-z{string}] Remove ;",
];
for source in sources {
if let Ok(rules) = parse(source) {
panic!("Parsed invalid source {:?}: {:?}", source, rules);
}
}
}