icu_experimental/transliterate/transliterator/replaceable.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! APIs to support in-place transliteration.
//!
//! The process will usually look as follows:
//!
//! 1. Create a [`Replaceable`].
//! 2. If there is a filter to be applied, call [`Replaceable::for_each_run`] with the filter.
//! 3. Next, if we are black-box transliterating, call [`Replaceable::replace_modifiable_with_str`],
//! after which we are done. If we are transliterating UTS #35 Transform Rules, continue:
//! 4. Create a [`RepMatcher`] with [`Replaceable::start_match`].
//! 5. Match a rule with the `RepMatcher`.
//! 6. Assuming the rule matches, call [`RepMatcher::finish_match`] to obtain an [`Insertable`].
//! 7. Walk through the rule's replacement string, and use `Insertable`'s methods to insert
//! valid UTF-8 content. If there are recursive function calls, use the entry point
//! [`Insertable::start_replaceable_adapter`] to obtain a child `Replaceable` that can be
//! used by going back to step 2.
//! 8. Once the replacement string has been fully processed, the `Insertable`'s `Drop`
//! implementation will adjust the `Replaceable`'s cursor to reflect the applied replacement.
//! 9. The rule is done, continue with the next rule and step 4 until [`Replaceable::is_finished`]
//! returns true.
//!
//! This module uses a lot of unsafe code to work with UTF-8 in an allocation-efficient manner and to
//! avoid a lot of UTF-8 validity checks.
// note: this mod could be moved into a utility crate, the only thing
// quite closely coupled to Transform Rules is `RepMatcher` and its explicit `ante`, `post`, `key`
// handling.
// QUESTION: for this whole module, I don't know how panics work together with safety invariants. I'm fairly sure that unexpected panics
// could break some invariants.
use super::Filter;
use alloc::string::String;
use alloc::vec::Vec;
use core::fmt::{Debug, Formatter};
use core::mem::ManuallyDrop;
use core::ops::Range;
use core::ops::{Deref, DerefMut};
/// The backing buffer for the API provided by this module.
///
/// # Safety
/// Exclusive access to a buffer implies it contains valid UTF-8.
pub(crate) struct TransliteratorBuffer(Vec<u8>);
impl TransliteratorBuffer {
pub(crate) fn from_string(s: String) -> Self {
Self(s.into_bytes())
}
pub(crate) fn into_string(self) -> String {
debug_assert!(core::str::from_utf8(&self.0).is_ok());
// SAFETY: We have exclusive access, so the vec must contain valid UTF-8
unsafe { String::from_utf8_unchecked(self.0) }
}
}
/// A wrapper over `Vec<u8>` that only allows access (and modification) to a certain range.
///
/// This is useful for other types that might only need a part of a `Vec<u8>` to be of a certain
/// structure, such as UTF-8. With this wrapper, they can assert their invariants only for the
/// accessible portion.
///
/// All methods that take indices as arguments expect them to be relative to the *visible* part.
struct Hide<'a> {
raw: &'a mut Vec<u8>,
hide_pre_len: usize,
hide_post_len: usize,
}
impl<'a> Hide<'a> {
fn new(raw: &'a mut Vec<u8>) -> Self {
Self {
raw,
hide_pre_len: 0,
hide_post_len: 0,
}
}
fn splice(&mut self, range: Range<usize>, replace_with: impl IntoIterator<Item = u8>) {
let adjusted_range = range.start + self.hide_pre_len..range.end + self.hide_pre_len;
self.raw.splice(adjusted_range, replace_with);
}
fn child(&mut self) -> Hide {
Hide {
raw: self.raw,
hide_pre_len: self.hide_pre_len,
hide_post_len: self.hide_post_len,
}
}
/// Borrows into a child `Hide` with its visible part restricted to the given range.
fn tighten(&mut self, visible_range: Range<usize>) -> Hide {
debug_assert!(visible_range.start <= self.len());
debug_assert!(visible_range.end <= self.len());
let hide_pre_len = self.hide_pre_len + visible_range.start;
let hide_post_len = self.hide_post_len + (self.len() - visible_range.end);
Hide {
raw: self.raw,
hide_pre_len,
hide_post_len,
}
}
fn hidden_prefix(&self) -> &[u8] {
&self.raw[..self.hide_pre_len]
}
fn hidden_suffix(&self) -> &[u8] {
&self.raw[self.raw.len() - self.hide_post_len..]
}
}
impl Deref for Hide<'_> {
type Target = [u8];
fn deref(&self) -> &Self::Target {
&self.raw[self.hide_pre_len..self.raw.len() - self.hide_post_len]
}
}
impl DerefMut for Hide<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
let len = self.raw.len();
&mut self.raw[self.hide_pre_len..len - self.hide_post_len]
}
}
// Thought: if we loosen the invariant of Replaceable UTF-8 content to only need UTF-8 content in
// the visible range, we would not need to make_contiguous in Insertable, which would avoid
// moving around the tail after a non-final function call.
// => invariants have been loosened, but to avoid make_contiguous our on_drop needs to be smarter
// than right now, because it assumes self.curr == self.end().
// Thought: A transliteration run does not necessarily need cursors. In fact, cursor only exist
// in the context of Transform Rules. We could have a wrapping CursoredReplaceable type
// that is specifically for transform rules.
// Thought for above: Renames? Replaceable => Run, and CursoredRun/RunWithCursor.
/// Represents a transliteration run. It is aware of the range of the input that is allowed
/// to be transliterated, according to the filter.
///
/// `Replaceable`s are made to be stacked. This means that while a given `Replaceable` represents
/// a single run, it can be used to iterate over all sub-runs with a given filter of itself using
/// [`for_each_run`](Replaceable::for_each_run).
///
/// Typical usage of a `Replaceable` depends on the client:
/// - When transliterating transform rules, the `cursor`-related methods, as well as [`start_match`](Replaceable::start_match)
/// will be of interest.
/// - When transliterating with a black box, most likely [`replace_modifiable_with_str`](Replaceable::replace_modifiable_with_str)
/// is the only method that will be used.
///
/// # Safety
/// Note: `content` is a `Hide`. Whenever `content` is mentioned, only the *visible* part of it is
/// meant.
///
/// The invariants of this struct are as follows:
/// - (The visible part of) `content` must be valid UTF-8.
/// - `cursor` must be a valid UTF-8 index into the visible part of `content`.
/// - `run_range()` (as defined by `freeze_pre_len` and `freeze_post_len`), must always be a valid
/// UTF-8 range into `content`.
pub(crate) struct Replaceable<'a> {
content: Hide<'a>,
freeze_pre_len: usize,
freeze_post_len: usize,
cursor: usize,
}
impl<'a> Replaceable<'a> {
pub(crate) fn new(buf: &'a mut TransliteratorBuffer) -> Self {
// SAFETY: we have exclusive access to the buffer, so it must contain valid UTF-8
unsafe { Replaceable::from_hide(Hide::new(&mut buf.0)) }
}
/// # Safety
/// The caller must ensure the visible portion of `content` is valid UTF-8.
unsafe fn from_hide(content: Hide<'a>) -> Self {
debug_assert!(core::str::from_utf8(&content).is_ok());
Self {
content,
// SAFETY: these uphold the invariants
freeze_pre_len: 0,
freeze_post_len: 0,
cursor: 0,
}
}
pub(crate) fn replace_modifiable_with_str(&mut self, s: &str) {
// SAFETY: allowed_range() returns a valid UTF-8 range, and `s.bytes()` contains only valid UTF-8
self.content.splice(self.allowed_range(), s.bytes());
}
/// Returns the full internal text as a `&str`.
pub(crate) fn as_str(&self) -> &str {
debug_assert!(core::str::from_utf8(&self.content).is_ok());
// SAFETY: Replaceable's invariant states that content is always valid UTF-8
unsafe { core::str::from_utf8_unchecked(&self.content) }
}
/// Returns the current modifiable text as a `&str`.
pub(crate) fn as_str_modifiable(&self) -> &str {
&self.as_str()[self.allowed_range()]
}
// Thought: rename to run_range()? also any associated mentions of this, the upper bound,
// "modifiable range" etc..
/// Returns the range of bytes that are currently allowed to be modified.
///
/// This is guaranteed to be a range compatible with the internal text.
pub(crate) fn allowed_range(&self) -> Range<usize> {
self.freeze_pre_len..self.allowed_upper_bound()
}
/// Returns the cursor.
///
/// This is guaranteed to be a valid UTF-8 index into the internal text.
pub(crate) fn cursor(&self) -> usize {
// // eprintln!("cursor called with raw_cursor: {}, ignore_pre_len: {}", self.raw_cursor, self.ignore_pre_len);
self.cursor
}
/// Advances the cursor by one char.
pub(crate) fn step_cursor(&mut self) {
let step_len = self.as_str()[self.cursor..]
.chars()
.next()
.map(char::len_utf8)
.unwrap_or(0);
// // eprintln!("step_cursor: {}", step_len);
// SAFETY: step_len is the UTF-8 length of the char after `self.cursor`
self.cursor += step_len;
}
/// Sets the cursor. The cursor must remain in the modifiable window.
///
/// # Safety
/// The caller must ensure that `cursor` is a valid UTF-8 index into the internal text.
unsafe fn set_cursor(&mut self, cursor: usize) {
debug_assert!(cursor <= self.allowed_upper_bound());
debug_assert!(cursor >= self.freeze_pre_len);
self.cursor = cursor;
}
/// Returns true if the cursor is at the end of the modifiable range.
pub(crate) fn is_finished(&self) -> bool {
// the cursor should never be > the upper bound
debug_assert!(self.cursor <= self.allowed_upper_bound());
self.cursor >= self.allowed_upper_bound()
}
/// Returns a `Replaceable` with the same content as the current one.
///
/// This is useful for repeated transliterations of the same modifiable range.
pub(crate) fn child(&mut self) -> Replaceable {
Replaceable {
content: self.content.child(),
freeze_pre_len: self.freeze_pre_len,
freeze_post_len: self.freeze_post_len,
cursor: self.cursor,
}
}
// Thought: could replace the F generic with a InternalTransliteratorTrait generic, but this is fine?
/// Applies `f` to each sub-run as defined by `filter` of the current `Replaceable`'s run.
pub(crate) fn for_each_run<F>(&mut self, filter: &Filter, mut f: F)
where
F: FnMut(&mut Replaceable),
{
// sub-runs must be part of *self*'s run, so we can only start in our modifiable range.
let mut start = self.freeze_pre_len;
// SAFETY: start is always the result of a function returning valid UTF-8 indices
while let Some(mut run) = unsafe { self.next_filtered_run(start, filter) } {
f(&mut run);
start = run.allowed_upper_bound();
}
}
/// Initiate the matching process for a single rule, starting at the current cursor.
pub(super) fn start_match(&mut self) -> RepMatcher<'a, '_, false> {
let cursor = self.cursor;
RepMatcher {
rep: self,
key_match_len: 0,
forward_cursor: cursor,
ante_match_len: 0,
post_match_len: 0,
}
}
/// Returns the next run (as a Replaceable with the corresponding frozen range)
/// that occurs on or after `start`, if one exists.
///
/// # Safety
/// The caller must ensure that `start` is a valid UTF-8 index into the internal text.
unsafe fn next_filtered_run(&mut self, start: usize, filter: &Filter) -> Option<Replaceable> {
if start == self.allowed_upper_bound() {
// we have reached the end, there are no more runs
return None;
}
debug_assert!(
start < self.allowed_upper_bound(),
"start `{start}` must be within the content length `{}`",
self.allowed_upper_bound()
);
debug_assert!(self.as_str().is_char_boundary(start));
let run_start;
let run_end;
if filter == &Filter::all() {
// special case for the noop filter
run_start = start;
run_end = self.allowed_upper_bound();
} else {
run_start = self.find_first_char_in_modifiable_range(start, |c| filter.contains(c))?;
run_end = self
.find_first_char_in_modifiable_range(run_start, |c| !filter.contains(c))
.unwrap_or(self.allowed_upper_bound());
}
// eprintln!("computing filtered run for rep: {self:?}, start: {start}, run_start: {run_start}, run_end: {run_end}");
let freeze_post_len = self.content.len() - run_end;
Some(Replaceable {
content: self.content.child(),
// safety: these uphold the invariants
freeze_pre_len: run_start,
freeze_post_len,
cursor: run_start,
})
}
/// Returns the index of the first char in the modifiable text that satisfies `f`,
/// starting at index `start`. The returned index is guaranteed to be a valid UTF-8 index
/// into the internal text.
///
/// `start` must be a valid UTF-8 index into into the internal text.
fn find_first_char_in_modifiable_range<F>(&self, start: usize, f: F) -> Option<usize>
where
F: Fn(char) -> bool,
{
let tail = &self.as_str()[start..self.allowed_upper_bound()];
let (idx, _) = tail.char_indices().find(|&(_, c)| f(c))?;
Some(start + idx)
}
/// Returns the current (exclusive) upper bound of the modifiable range.
///
/// This is guaranteed to be a valid UTF-8 index into the internal text.
pub(crate) fn allowed_upper_bound(&self) -> usize {
// // eprintln!("allowed_upper_bound called with len: {}, freeze_post_len: {}, ignore_pre_len: {}", self.content.len(), self.freeze_post_len, self.ignore_pre_len);
self.content.len() - self.freeze_post_len
}
}
impl Debug for Replaceable<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
write!(f, "{:?}", self.content.hidden_prefix())?;
write!(f, "[[[")?;
write!(f, "{}", &self.as_str()[..self.freeze_pre_len])?;
write!(f, "{{{{{{")?;
write!(f, "{}", &self.as_str()[self.freeze_pre_len..self.cursor()])?;
write!(f, "|||")?;
write!(
f,
"{}",
&self.as_str()[self.cursor()..self.allowed_upper_bound()]
)?;
write!(f, "}}}}}}")?;
write!(f, "{}", &self.as_str()[self.allowed_upper_bound()..])?;
write!(f, "]]]")?;
write!(f, "{:?}", self.content.hidden_suffix())?;
Ok(())
}
}
// Thought: Maybe this should be renamed as Matchable (also the trait), as we have "SpecialMatchers"
/// Supports safe conversion rule matching over a [`Replaceable`].
///
/// Conversion rule matching consists of three parts:
/// 1. Matching the ante. This is a right-aligned (reverse) match at the `Replaceable`'s cursor.
/// 2. Matching the key. This is a left-aligned (forward) match at the `Replaceable`'s cursor.
/// 3. Matching the post. This is a left-aligned (forward) match at the end of _the matched key_.
/// Note that this means we _cannot_ match the key after the post, as the post match is aligned
/// to the end of the key match. The `KEY_FINISHED` parameter enforces this at compile-time.
///
/// After a successful match, the replacement can be applied using the [`Insertable`] returned by
/// [`RepMatcher::finish_match`].
///
/// # Safety
/// The matched portions of the string (as defined by `rep.cursor`, `key_match_len`,
/// `ante_match_len` and `post_match_len`) are all guaranteed to be valid UTF-8 subslices of
/// the `Replaceable`'s internal text.
///
/// The `RepMatcher` does not modify the contained `Replaceable`.
#[derive(Debug)]
pub(super) struct RepMatcher<'a, 'b, const KEY_FINISHED: bool> {
rep: &'b mut Replaceable<'a>,
key_match_len: usize, // relative to rep.cursor
ante_match_len: usize, // relative to rep.cursor
post_match_len: usize, // relative to rep.cursor + key_match_len
forward_cursor: usize, // absolute
}
// we can only finish a KEY_FINISHED = true matcher
impl<'a, 'b> RepMatcher<'a, 'b, true> {
pub(super) fn finish_match(self) -> Insertable<'a, 'b> {
Insertable::from_matcher(self)
}
fn match_lens(&self) -> MatchLengths {
MatchLengths {
key: self.key_match_len,
ante: self.ante_match_len,
post: self.post_match_len,
}
}
}
// we can only finish matching the key once
impl<'a, 'b> RepMatcher<'a, 'b, false> {
pub(super) fn finish_match(self) -> Insertable<'a, 'b> {
Insertable::from_matcher(self.finish_key())
}
pub(super) fn finish_key(self) -> RepMatcher<'a, 'b, true> {
RepMatcher {
rep: self.rep,
key_match_len: self.key_match_len,
ante_match_len: self.ante_match_len,
post_match_len: self.post_match_len,
forward_cursor: self.forward_cursor,
}
}
}
impl<const KEY_FINISHED: bool> RepMatcher<'_, '_, KEY_FINISHED> {
fn remaining(&self) -> usize {
if KEY_FINISHED {
self.rep.content.len() - self.forward_cursor
} else {
self.rep.allowed_upper_bound() - self.forward_cursor
}
}
fn remaining_forward_slice(&self) -> &str {
if KEY_FINISHED {
&self.rep.as_str()[self.forward_cursor..]
} else {
&self.rep.as_str()[self.forward_cursor..self.rep.allowed_upper_bound()]
}
}
/// Returns the index (which is a valid UTF-8 index) of the leftmost matched char in the ante context.
#[inline]
fn ante_cursor(&self) -> usize {
self.rep.cursor - self.ante_match_len
}
fn remaining_ante_slice(&self) -> &str {
&self.rep.as_str()[..self.ante_cursor()]
}
}
impl<const KEY_FINISHED: bool> Utf8Matcher<Forward> for RepMatcher<'_, '_, KEY_FINISHED> {
fn cursor(&self) -> usize {
self.forward_cursor
}
fn str_range(&self, range: Range<usize>) -> Option<&str> {
self.rep.as_str().get(range)
}
fn is_empty(&self) -> bool {
self.remaining() == 0
}
fn match_str(&self, s: &str) -> bool {
self.remaining_forward_slice().starts_with(s)
}
fn match_start_anchor(&self) -> bool {
self.forward_cursor == 0
}
fn match_end_anchor(&self) -> bool {
// no matter if we're matching key or post, we must be completely at the end of the string
self.forward_cursor == self.rep.content.len()
}
fn consume(&mut self, len: usize) -> bool {
if len <= self.remaining() {
assert!(self.remaining_forward_slice().is_char_boundary(len));
// SAFETY: `len` is guaranteed to be a valid UTF-8 length starting at `forward_cursor`.
if KEY_FINISHED {
self.post_match_len += len;
} else {
self.key_match_len += len;
}
self.forward_cursor += len;
true
} else {
false
}
}
fn next_char(&self) -> Option<char> {
self.remaining_forward_slice().chars().next()
}
}
// we can always reverse match, no matter if the key has finished matching or not
impl<const KEY_FINISHED: bool> Utf8Matcher<Reverse> for RepMatcher<'_, '_, KEY_FINISHED> {
fn cursor(&self) -> usize {
self.ante_cursor()
}
fn str_range(&self, range: Range<usize>) -> Option<&str> {
self.rep.as_str().get(range)
}
fn is_empty(&self) -> bool {
self.ante_cursor() == 0
}
fn match_str(&self, s: &str) -> bool {
self.remaining_ante_slice().ends_with(s)
}
fn match_start_anchor(&self) -> bool {
self.ante_cursor() == 0
}
fn match_end_anchor(&self) -> bool {
self.ante_cursor() == self.rep.content.len()
}
fn consume(&mut self, len: usize) -> bool {
if len <= self.ante_cursor() {
assert!(self
.remaining_ante_slice()
.is_char_boundary(self.ante_cursor() - len));
// SAFETY: `len` is guaranteed to be a valid UTF-8 length reverse-starting at `ante_cursor()`.
self.ante_match_len += len;
true
} else {
false
}
}
fn next_char(&self) -> Option<char> {
self.remaining_ante_slice().chars().next_back()
}
}
mod sealed {
pub(crate) trait Sealed {}
impl Sealed for super::Forward {}
impl Sealed for super::Reverse {}
}
/// A forward match.
///
/// For example, forward-matching `"aft"` on the input `"previous|after"`, with the cursor at the `'|'`,
/// would return a match, forward-matching `"fter"` would not.
pub(super) struct Forward;
/// A reverse match.
///
/// For example, reverse-matching `"ious"` on the input `"previous|after"`, with the cursor at the `'|'`,
/// would return a match, reverse-matching `"prev"` would not.
pub(super) struct Reverse;
/// The direction a match can be applied. Used in [`Utf8Matcher`].
///
/// See [`Forward`] and [`Reverse`] for implementors.
pub(super) trait MatchDirection: sealed::Sealed {}
impl MatchDirection for Forward {}
impl MatchDirection for Reverse {}
/// Matching functionality on strings. Matching can be done in forward or reverse directions, see [`MatchDirection`].
///
/// The used indices in method parameters are all compatible with each other.
// Thought: I don't think this needs to be called *Utf8* matcher, maybe just Matcheable
pub(super) trait Utf8Matcher<D: MatchDirection>: Debug {
fn cursor(&self) -> usize;
fn str_range(&self, range: Range<usize>) -> Option<&str>;
fn is_empty(&self) -> bool;
fn match_str(&self, s: &str) -> bool;
fn match_and_consume_str(&mut self, s: &str) -> bool {
if self.match_str(s) {
self.consume(s.len())
} else {
false
}
}
fn match_and_consume_char(&mut self, c: char) -> bool {
self.match_and_consume_str(c.encode_utf8(&mut [0; 4]))
}
fn match_start_anchor(&self) -> bool;
fn match_end_anchor(&self) -> bool;
fn consume(&mut self, len: usize) -> bool;
fn next_char(&self) -> Option<char>;
}
/// The match length of the three match portions of a conversion rule.
#[derive(Debug, Clone, Copy)]
struct MatchLengths {
ante: usize,
key: usize,
post: usize,
}
// TODO(#3957) about complexity: in the worst case, we need to move the full `end_len` tail for every
// push. A good size_hint avoids this.
// one fix could be to preemptively move the tail to the end of the buffer whenever the capacity
// changes. Should give us the same benefits as an exponential allocation strategy.
// TODO(#3957): implementing this with a Rope/Cord data structure might be interesting. Decide if necessary
/// This provides (append-only) replacement APIs for a [`Replaceable`].
///
/// It can only be constructed with a complete match using a [`RepMatcher`].
///
/// A main goal of this type is to avoid unnecessary allocations when applying a rule replacement.
/// For this, it keeps track of the range of the `Replaceable`'s content that we are replacing
/// using `start` and `end_len`,
/// and lazily starts replacing this range from the left, in a grow-only fashion. That is,
/// there might be a suffix in this replace range that is invalid UTF-8.
///
/// A noteworthy feature of `Insertable`'s are getting back a *valid* sub-`Replaceable` for
/// full-blown transliteration of the replacement. This means that despite the grow-only
/// nature of this Insertable, we can do arbitrary replacements.
///
/// Furthermore, an `Insertable` takes care of updating its parent's cursor.
///
/// # Safety
/// While we hold onto the associated `Replaceable`, we may temporarily break its invariants.
/// As such, we cannot use any methods on `Replaceable` itself that rely on its invariants.
///
/// Specifically, we temporarily change the validity of `_rep.content`. During a replacement
/// with this `Insertable`, the following invariants hold:
/// - start is a valid UTF-8 index into `_rep.content` that signifies the start of the range we are replacing.
/// It may not be changed.
/// - `end_len` is a valid UTF-8 suffix length into `_rep.content` that signifies the suffix of
/// the `Replaceable`'s internal text that we are *not* replacing.
/// - `curr` is the growing cursor into the range we are replacing. It is relative to `_rep.content`,
/// i.e., `_rep.content[start..curr]` is the replaced text already. That range must be valid
/// UTF-8. We guarantee this by only exposing UTF-8 APIs (pushing `&str`s and `char`s, and
/// creating a sub-`Replaceable` that guarantees UTF-8 through its invariants).
/// - `_rep.content` has valid UTF-8 in the non-replaced portions as defined by `start` and
/// `end_len`.
///
/// In particular, these mean that the range of the `Replaceable`'s internal text that we have
/// *not yet replaced*, is allowed to be invalid UTF-8.
///
/// We fix the `Replaceable`'s invariants in the `Drop` impl using [`Insertable::cleanup`].
pub(crate) struct Insertable<'a, 'b> {
// Replaceable's invariants may temporarily be broken while this Insertable is alive
_rep: &'b mut Replaceable<'a>,
start: usize,
end_len: usize,
curr: usize,
match_lens: MatchLengths,
cursor_offset: CursorOffset,
}
impl<'a, 'b> Insertable<'a, 'b> {
fn from_matcher(matcher: RepMatcher<'a, 'b, true>) -> Insertable<'a, 'b> {
let match_lens = matcher.match_lens();
// SAFETY: RepMatcher does not modify the `Replaceable`.
// This is the start of the range we want to replace.
let start_idx = matcher.rep.cursor;
// whatever is not matched *by the key* is unaffected by this Insertable
// SAFETY: match_lens is guaranteed to have been created for
// *this specific state of the Replaceable*, so these are valid lengths
// according to RepMatcher's guarantees.
let end_idx = start_idx + match_lens.key;
let end_len = matcher.rep.content.len() - end_idx;
Insertable {
_rep: matcher.rep,
start: start_idx,
end_len,
curr: start_idx,
match_lens,
cursor_offset: CursorOffset::Default,
}
}
/// If we know the size of the replacement, we can pre-emptively grow the replacement range.
/// This avoids many moves of the tail (defined by `end_len`) in case of repeated pushes on
/// an empty replacement range.
pub(crate) fn apply_size_hint(&mut self, size: usize) {
let free_bytes = self.free_range().len();
if free_bytes < size {
self._rep.content.splice(
self.end()..self.end(),
core::iter::repeat(0).take(size - free_bytes),
);
}
}
/// Push a `char` on the replacement.
pub(crate) fn push(&mut self, c: char) {
let mut buf = [0; 4];
let c_utf8 = c.encode_utf8(&mut buf);
self.push_str(c_utf8);
debug_assert!(self.curr <= self.end());
}
/// Push a `&str` on the replacement.
pub(crate) fn push_str(&mut self, s: &str) {
// SAFETY: s is valid UTF-8 by type
unsafe { self.push_utf8(s.as_bytes()) };
debug_assert!(self.curr <= self.end());
}
/// # Safety
/// The caller must ensure that `code_units` is valid UTF-8.
unsafe fn push_utf8(&mut self, code_units: &[u8]) {
if self.free_range().len() >= code_units.len() {
// SAFETY: The caller guarantees these are valid UTF-8
self._rep.content[self.curr..self.curr + code_units.len()].copy_from_slice(code_units);
self.curr += code_units.len();
return;
}
// eprintln!("WARNING: free space not sufficient for Insertable::push_bytes");
// SAFETY: The caller guarantees these are valid UTF-8
self._rep
.content
.splice(self.free_range(), code_units.iter().copied());
// SAFETY: The free range did not have enough space. The above splice replaces the completey
// remaining free range with the replacement, so there are no more un-replaced bytes left
// in the replacement range.
self.curr = self.end();
}
pub(crate) fn curr_replacement_len(&self) -> usize {
self.curr - self.start
}
pub(crate) fn curr_replacement(&self) -> &str {
// SAFETY: the invariant states that this part of the content is valid UTF-8
unsafe { core::str::from_utf8_unchecked(&self._rep.content[self.start..self.curr]) }
}
/// Will set the cursor to the current position of the replacement.
pub(super) fn set_offset_to_here(&mut self) {
self.cursor_offset = CursorOffset::Byte(self.curr_replacement_len());
}
/// Will set the cursor to `count` bytes after the end of the replacement, assuming
/// the post context match is long enough.
pub(super) fn set_offset_to_chars_off_end(&mut self, count: u16) {
self.cursor_offset = CursorOffset::CharsOffEnd(count);
}
/// Will set the cursor to `count` bytes before the start of the replacement, assuming
/// the ante context match is long enough.
pub(super) fn set_offset_to_chars_off_start(&mut self, count: u16) {
self.cursor_offset = CursorOffset::CharsOffStart(count);
}
/// Finishes the current replacement by applying the stored offset and calling
/// `make_contiguous`. This is called automatically by the `Drop` impl.
fn cleanup(&mut self) {
self.make_contiguous();
// SAFETY: make_contiguous ensures that `content` is contiguous, valid UTF-8 again, thus
// we can call methods of Replaceable again.
let rep = &self._rep;
// SAFETY: this guaranteed to be the index of the first byte of the replacement
let base_cursor = self.start;
let replacement_len = self.curr_replacement_len();
let cursor = match self.cursor_offset {
CursorOffset::Default => {
// SAFETY: replacement_len is a valid UTF-8 length after cursor.
base_cursor + replacement_len
}
CursorOffset::Byte(offset) => {
// SAFETY: CursorOffset::Byte is only constructed by passing a UTF-8 prefix-len of
// the replacement, which is anchored at the cursor.
base_cursor + offset
}
CursorOffset::CharsOffEnd(count) => {
// they key has already been replaced, so the post match starts `replacement_len`
// bytes after the cursor
let post_start = base_cursor + replacement_len;
// the replacement d oes not affect the post match length
let post_end = post_start + self.match_lens.post;
let matched_post = &rep.as_str()[post_start..post_end];
// compute byte-length of `count` chars in post
let post_offset_len = matched_post
.chars()
.take(count as usize)
.map(char::len_utf8)
.sum::<usize>();
// SAFETY: this is a sum of valid UTF-8 lengths, so it is a valid UTF-8 index
let computed_cursor = base_cursor + replacement_len + post_offset_len;
// SAFETY: the returned range is guaranteed to be valid
let max_cursor = rep.allowed_range().end;
// we are not allowed to move the cursor into unmodifiable territory
max_cursor.min(computed_cursor)
}
CursorOffset::CharsOffStart(count) => {
let ante = &rep.as_str()[..base_cursor];
// restricting to the matched substr ensures no overflow of the cursor past the
// contexts, which is good
let matched_ante = &ante[(ante.len() - self.match_lens.ante)..];
// compute byte-length of `count` chars in ante (from the right)
let ante_len = matched_ante
.chars()
.rev()
.take(count as usize)
.map(char::len_utf8)
.sum::<usize>();
// not underflowing because ante_len is at most the cursor, because the cursor is
// right after where the ante context ends.
// SAFETY: ante_len is a valid UTF-8 substr length preceding the cursor, and the
// cursor is also a valid index.
let computed_cursor = base_cursor - ante_len;
// SAFETY: the returned range is guaranteed to be valid
let min_cursor = rep.allowed_range().start;
// we are not allowed to move the cursor into unmodifiable territory
min_cursor.max(computed_cursor)
}
};
// SAFETY: all cases guarantee a valid UTF-8 index into the visible slice
unsafe { self._rep.set_cursor(cursor) };
}
/// Gets rid of the unreplaced range remaining in this `Insertable`.
///
/// A consequence of this is that `_rep.content` is now completely UTF-8 valid again.
fn make_contiguous(&mut self) {
// need to move the tail of the Vec to fill the remainder of the free range
self._rep
.content
.splice(self.free_range(), core::iter::empty());
}
fn free_range(&self) -> Range<usize> {
// eprintln!("free_range: curr: {}, end: {}", self.curr, self.end());
debug_assert!(self.curr <= self.end());
self.curr..self.end()
}
fn end(&self) -> usize {
self._rep.content.len() - self.end_len
}
/// Use this if you want to get a sub-`Replaceable` for applying a full-blown transliteration to
/// a range of `self`'s already replaced text.
///
/// # Example
/// ```ignore
/// let mut ins = my_insertable();
/// ins.push_str("hello ");
/// let mut adpt = ins.start_replaceable_adapter();
/// adpt.child_for_range().push_str("world");
/// let mut rep = adpt.as_replaceable();
/// // transliterates "world" and inserts it in the original `Insertable`.
/// transliterate_with_rep(rep);
/// ```
pub(super) fn start_replaceable_adapter(
&mut self,
) -> InsertableToReplaceableAdapter<'a, '_, impl FnMut(usize) + '_> {
let range_start = self.curr;
let child_insertable = Insertable {
_rep: self._rep,
start: self.curr,
end_len: self.end_len,
curr: self.curr,
match_lens: self.match_lens,
cursor_offset: CursorOffset::Default,
};
// only `curr` and `cursor_offset` may be modified by the child, so we need to get them back
// once the child is dropped. however, replaceable_adapter is only used for function call
// arguments, which may not contain cursors (and if they do, it's GIGO), so we don't need
// to worry about the cursor_offset.
let on_drop = |new_curr: usize| {
// SAFETY: new_curr will be a later `curr` by `child_insertable`, and due to the grow-only
// nature of curr and the invariants of `Insertable`, this new_curr must be valid for `self` too.
self.curr = new_curr;
};
InsertableToReplaceableAdapter {
child: ManuallyDrop::new(child_insertable),
range_start,
on_drop,
}
}
}
impl Drop for Insertable<'_, '_> {
fn drop(&mut self) {
self.cleanup();
}
}
impl Debug for Insertable<'_, '_> {
fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
write!(f, "{}|{}", self.curr_replacement(), self.free_range().len())
}
}
/// See [`Insertable::start_replaceable_adapter`].
pub(super) struct InsertableToReplaceableAdapter<'a, 'b, F>
where
F: FnMut(usize),
{
// this is ManuallyDrop because we don't want the child's cleanup function to run.
child: ManuallyDrop<Insertable<'a, 'b>>,
range_start: usize,
on_drop: F,
}
impl<F> InsertableToReplaceableAdapter<'_, '_, F>
where
F: FnMut(usize),
{
/// Returns a type that allows getting a `Replaceable` from it. The replaceable will
/// transliterate everything since `self` was created with [`Insertable::start_replaceable_adapter`].
pub(super) fn as_replaceable(&mut self) -> InsertableGuard<impl FnMut(&[u8]) + '_> {
// Thought: we don't need to make the Insertable contiguous because the visible length hides
// the invalid UTF-8 tail. However, we do not gain anything from that empty buffer at the
// moment, because the child Replaceable's Insertable will not know about it. can we
// somehow give it that information?
// Until that happens, calling make_contiguous has the benefit of not re-allocating the
// backing vec when there was still free space available in the parent Insertable.
self.child.make_contiguous();
let range_end = self.child.curr;
let visible_range = self.range_start..range_end;
let child = self.child.deref_mut();
let hidden_len = child._rep.content.len() - visible_range.len();
let content = &mut child._rep.content;
let modifiable_content = content.tighten(visible_range);
// SAFETY: visible_range is always the range from one valid content-based UTF-8 index of the
// replacement to a subsequent valid UTF-8 content-index of the replacement.
let rep = unsafe { Replaceable::from_hide(modifiable_content) };
// `move` hack - don't move these into closure
let child_curr = &mut child.curr;
let child_end_len = &child.end_len;
// will be called when the `Replaceable` is done.
let on_drop = move |new_content: &[u8]| {
// 1. child's content is contiguous, so we are inserting at the very end.
// 2. we need hidden_len as well because `child.curr` is relative to `child.content`,
// not `new_content`, which is only the visible portion of the `rep`'s content.
// We are updating the child's curr to point to the very end of the replaceable range.
*child_curr = new_content.len() + hidden_len - child_end_len;
};
InsertableGuard::new(rep, on_drop)
}
}
impl<F> Drop for InsertableToReplaceableAdapter<'_, '_, F>
where
F: FnMut(usize),
{
fn drop(&mut self) {
(self.on_drop)(self.child.curr);
}
}
impl<'a, 'b, F> Deref for InsertableToReplaceableAdapter<'a, 'b, F>
where
F: FnMut(usize),
{
type Target = Insertable<'a, 'b>;
fn deref(&self) -> &Self::Target {
self.child.deref()
}
}
impl<F> DerefMut for InsertableToReplaceableAdapter<'_, '_, F>
where
F: FnMut(usize),
{
fn deref_mut(&mut self) -> &mut Self::Target {
self.child.deref_mut()
}
}
pub(super) struct InsertableGuard<'a, F>
where
F: FnMut(&[u8]),
{
rep: Replaceable<'a>,
on_drop: F,
}
impl<'a, F> InsertableGuard<'a, F>
where
F: FnMut(&[u8]),
{
fn new(rep: Replaceable<'a>, on_drop: F) -> Self {
Self { rep, on_drop }
}
pub(crate) fn child(&mut self) -> Replaceable {
self.rep.child()
}
}
impl<F> Drop for InsertableGuard<'_, F>
where
F: FnMut(&[u8]),
{
fn drop(&mut self) {
(self.on_drop)(&self.rep.content);
}
}
/// Stores the kinds of cursor offsets that a replacement can produce.
#[derive(Debug, Clone, Copy, Default)]
enum CursorOffset {
/// The default offset, which just puts the cursor at the end of the replacement.
#[default]
Default,
/// A byte offset into the replacement string that is ready to use.
Byte(usize),
/// A `char`-based offset for after the replacement string.
CharsOffEnd(u16),
/// A `char`-based offset for before the replacement string.
CharsOffStart(u16),
}