icu_experimental/transliterate/compile/
rule_group_agg.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This module contains the logic for aggregating rules into groups.
6//! Due to incompatible orders (see comment on `ReverseRuleGroupAggregator`), we need separate
7//! aggregators for forward and reverse directions.
8//!
9//! These aggregators both accept source-order (!) `parse::Rule`s and aggregate them into the rule
10//! groups that the data struct [`RuleBasedTransliterator`](icu::experimental::transliterate::provider::RuleBasedTransliterator)
11//! semantically expects. A later step converts these into the actual zero-copy format.
12//! They also transform the bidirectional `parse::Rule`s into a unidirectional version.
13
14// note: the aggregators currently work in a one-`parse::Rule`-at-a-time fashion, but really, they
15// could also receive the full `&[parse::Rule]` slice. with some careful index handling, some
16// allocations could be avoided.
17
18use super::*;
19use alloc::borrow::Cow;
20use alloc::collections::VecDeque;
21use alloc::vec;
22
23// parse::Rule::Conversion but unidirectional
24#[derive(Debug, Clone)]
25pub(crate) struct UniConversionRule<'p> {
26    pub(crate) ante: &'p [parse::Element],
27    pub(crate) key: &'p [parse::Element],
28    pub(crate) post: &'p [parse::Element],
29    pub(crate) replacement: &'p [parse::Element],
30}
31
32// transform + conversion rule groups for a single direction
33pub(crate) type RuleGroups<'p> = Vec<(Vec<Cow<'p, parse::SingleId>>, Vec<UniConversionRule<'p>>)>;
34
35// an intermediate enum for use in the aggregators
36enum UniRule<'p> {
37    Conversion(UniConversionRule<'p>),
38    Transform(Cow<'p, parse::SingleId>),
39}
40
41#[derive(Debug, Clone)]
42pub(crate) struct ForwardRuleGroupAggregator<'p> {
43    current: ForwardRuleGroup<'p>,
44    // the forward aggregator can use the final type directly, as source-order is equivalent to
45    // forward-order.
46    groups: RuleGroups<'p>,
47    // the transform_group of a group pair appears first
48    preceding_transform_group: Option<Vec<Cow<'p, parse::SingleId>>>,
49}
50
51impl<'p> ForwardRuleGroupAggregator<'p> {
52    pub(crate) fn new() -> Self {
53        Self {
54            // this is a somewhat important first group.
55            // we want &[(transform_group), (conversion_group)] in the end, and because we iterate
56            // in source-order, the first element of that is a transform_group.
57            current: ForwardRuleGroup::Transform(Vec::new()),
58            groups: Vec::new(),
59            preceding_transform_group: None,
60        }
61    }
62
63    pub(crate) fn push(&mut self, rule: &'p parse::Rule) {
64        match rule {
65            parse::Rule::Conversion(source_half, dir, target_half) => {
66                if !dir.permits(Direction::Forward) {
67                    return;
68                }
69
70                let ante = &source_half.ante;
71                let key = &source_half.key;
72                let post = &source_half.post;
73                let replacement = &target_half.key;
74
75                let rule = UniConversionRule {
76                    ante,
77                    key,
78                    post,
79                    replacement,
80                };
81
82                let finished_group = self.current.push(UniRule::Conversion(rule));
83                if let Some(finished_group) = finished_group {
84                    self.push_rule_group(finished_group);
85                }
86            }
87            parse::Rule::Transform(fwd, _) => {
88                let finished_group = self.current.push(UniRule::Transform(Cow::Borrowed(fwd)));
89                if let Some(finished_group) = finished_group {
90                    self.push_rule_group(finished_group);
91                }
92            }
93            parse::Rule::VariableDefinition(..) => {
94                // variable definitions are handled in a previous step
95            }
96            parse::Rule::GlobalFilter(..) | parse::Rule::GlobalInverseFilter(..) => {
97                // global filters are handled in a previous step
98            }
99        }
100    }
101
102    fn push_rule_group(&mut self, group: ForwardRuleGroup<'p>) {
103        match group {
104            ForwardRuleGroup::Transform(transform_group) => {
105                // because ForwardRuleGroup returns a different kind of group every time,
106                // the previous group must have been a conversion group which pushed the
107                // finished group pair into self.groups.
108                debug_assert!(self.preceding_transform_group.is_none());
109                self.preceding_transform_group = Some(transform_group);
110            }
111            ForwardRuleGroup::Conversion(conversion_group) => {
112                // unwrap is necessary if the first source-order rule group is a conversion group
113                let associated_transform_group =
114                    self.preceding_transform_group.take().unwrap_or_default();
115                self.groups
116                    .push((associated_transform_group, conversion_group));
117            }
118        }
119    }
120
121    pub(crate) fn finalize(mut self) -> RuleGroups<'p> {
122        // push the current group
123        // note: refactoring could get rid of clone
124        self.push_rule_group(self.current.clone());
125        // push any remaining group pairs
126        if let Some(transform_group) = self.preceding_transform_group.take() {
127            self.groups.push((transform_group, Vec::new()));
128        }
129
130        self.groups
131    }
132}
133
134// Represents a non-empty rule group for the forward direction.
135#[derive(Debug, Clone)]
136enum ForwardRuleGroup<'p> {
137    Conversion(Vec<UniConversionRule<'p>>),
138    Transform(Vec<Cow<'p, parse::SingleId>>),
139}
140
141impl<'p> ForwardRuleGroup<'p> {
142    fn new_conversion(rule: UniConversionRule<'p>) -> Self {
143        Self::Conversion(vec![rule])
144    }
145
146    fn new_transform(rule: Cow<'p, parse::SingleId>) -> Self {
147        Self::Transform(vec![rule])
148    }
149
150    // if the group is full return self, and push the rule into a new group
151    fn push(&mut self, rule: UniRule<'p>) -> Option<Self> {
152        // necessary reborrow for mem::replace
153        match (&mut *self, rule) {
154            (Self::Conversion(group), UniRule::Conversion(rule)) => {
155                group.push(rule);
156                None
157            }
158            (Self::Transform(group), UniRule::Transform(rule)) => {
159                group.push(rule);
160                None
161            }
162            (Self::Conversion(_), UniRule::Transform(new_rule)) => {
163                Some(core::mem::replace(self, Self::new_transform(new_rule)))
164            }
165            (Self::Transform(_), UniRule::Conversion(new_rule)) => {
166                Some(core::mem::replace(self, Self::new_conversion(new_rule)))
167            }
168        }
169    }
170}
171
172// Rules will be pushed in source-order (i.e., forward order), which means we have to be careful
173// in which order we aggregate them. Example: (T = transform rule, C = conversion rule)
174// T1 T2 C1 C2 T3 C3 C4 T4 T5
175// should be aggregated as
176// (T5, T4), (C3, C4), (T3), (C1, C2), (T2, T1) (assuming all rules apply to the reverse direction)
177// note in particular the discrepancy between the order of contiguous T's and contiguous C's:
178// contiguous C's keep the source order, but contiguous T's are reversed. Also the overall order
179// is reversed, of course.
180//
181// We do this by using VecDeque, push_front, and make_contiguous in the end.
182#[derive(Debug, Clone)]
183pub(crate) struct ReverseRuleGroupAggregator<'p> {
184    current: ReverseRuleGroup<'p>,
185    // VecDeque because we encounter groups in source-order, but we want to aggregate them in
186    // reverse-order.
187    groups: VecDeque<(Vec<Cow<'p, parse::SingleId>>, Vec<UniConversionRule<'p>>)>,
188    // the conversion_group of a group pair appears first due to the reverse order
189    preceding_conversion_group: Option<Vec<UniConversionRule<'p>>>,
190}
191
192impl<'p> ReverseRuleGroupAggregator<'p> {
193    pub(crate) fn new() -> Self {
194        Self {
195            // this is a somewhat important first group.
196            // we want &[(transform_group), (conversion_group)] in the end, and because we iterate
197            // in opposite order, the last element of that slice is a conversion_group.
198            current: ReverseRuleGroup::Conversion(Vec::new()),
199            groups: VecDeque::new(),
200            preceding_conversion_group: None,
201        }
202    }
203
204    pub(crate) fn push(&mut self, rule: &'p parse::Rule) {
205        match rule {
206            parse::Rule::Conversion(target_half, dir, source_half) => {
207                if !dir.permits(Direction::Reverse) {
208                    return;
209                }
210
211                let ante = &source_half.ante;
212                let key = &source_half.key;
213                let post = &source_half.post;
214                let replacement = &target_half.key;
215
216                let rule = UniConversionRule {
217                    ante,
218                    key,
219                    post,
220                    replacement,
221                };
222
223                let finished_group = self.current.push(UniRule::Conversion(rule));
224                if let Some(finished_group) = finished_group {
225                    self.push_rule_group(finished_group);
226                }
227            }
228            parse::Rule::Transform(fwd, rev) => {
229                let rev = rev.clone().unwrap_or_else(|| fwd.clone().reverse());
230
231                let finished_group = self.current.push(UniRule::Transform(Cow::Owned(rev)));
232                if let Some(finished_group) = finished_group {
233                    self.push_rule_group(finished_group);
234                }
235            }
236            parse::Rule::VariableDefinition(..) => {
237                // variable definitions are handled in a previous step
238            }
239            parse::Rule::GlobalFilter(..) | parse::Rule::GlobalInverseFilter(..) => {
240                // global filters are handled in a previous step
241            }
242        }
243    }
244
245    fn push_rule_group(&mut self, group: ReverseRuleGroup<'p>) {
246        match group {
247            ReverseRuleGroup::Conversion(conv_group) => {
248                // because ReverseRuleGroup returns a different kind of group every time,
249                // the previous group must have been a transform group which pushed the
250                // finished group pair into self.groups.
251                debug_assert!(self.preceding_conversion_group.is_none());
252                self.preceding_conversion_group = Some(conv_group);
253            }
254            ReverseRuleGroup::Transform(transform_group) => {
255                // unwrap is necessary if the first source-order rule group is a transform group
256                let associated_conv_group =
257                    self.preceding_conversion_group.take().unwrap_or_default();
258                let vec_transform_group = transform_group.into(); // non-allocating conversion
259                self.groups
260                    .push_front((vec_transform_group, associated_conv_group));
261            }
262        }
263    }
264
265    pub(crate) fn finalize(mut self) -> RuleGroups<'p> {
266        // push the current group
267        // note: refactoring could get rid of clone
268        self.push_rule_group(self.current.clone());
269        // push any remaining group pairs
270        if let Some(conv_group) = self.preceding_conversion_group.take() {
271            // a trailing conversion group in source order is the same as having a conversion
272            // group as the first in-order group. we can just prepend an empty transform group.
273            self.groups.push_front((Vec::new(), conv_group));
274        }
275
276        self.groups.into() // non-allocating conversion
277    }
278}
279
280// Represents a non-empty rule group for the reverse direction.
281#[derive(Debug, Clone)]
282enum ReverseRuleGroup<'p> {
283    // because contiguous C's are aggregated in source-order, we can just use a Vec
284    Conversion(Vec<UniConversionRule<'p>>),
285    // but contiguous T's are aggregated in reverse-order, so we need to use a VecDeque and push_front
286    Transform(VecDeque<Cow<'p, parse::SingleId>>),
287}
288
289impl Default for ReverseRuleGroup<'_> {
290    fn default() -> Self {
291        Self::Conversion(Vec::new())
292    }
293}
294
295impl<'p> ReverseRuleGroup<'p> {
296    fn new_conversion(rule: UniConversionRule<'p>) -> Self {
297        Self::Conversion(vec![rule])
298    }
299
300    fn new_transform(rule: Cow<'p, parse::SingleId>) -> Self {
301        let mut group = VecDeque::new();
302        group.push_front(rule);
303        Self::Transform(group)
304    }
305
306    fn push(&mut self, rule: UniRule<'p>) -> Option<Self> {
307        // necessary reborrow for mem::replace
308        match (&mut *self, rule) {
309            (Self::Conversion(group), UniRule::Conversion(rule)) => {
310                group.push(rule);
311                None
312            }
313            (Self::Transform(group), UniRule::Transform(rule)) => {
314                // we receive rules via `push` in source-order, which is the opposite order we want,
315                // so we push_front.
316                group.push_front(rule);
317                None
318            }
319            (Self::Conversion(_), UniRule::Transform(new_rule)) => {
320                Some(core::mem::replace(self, Self::new_transform(new_rule)))
321            }
322            (Self::Transform(_), UniRule::Conversion(new_rule)) => {
323                Some(core::mem::replace(self, Self::new_conversion(new_rule)))
324            }
325        }
326    }
327}