zerotrie/
cursor.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Types for walking stepwise through a trie.
//!
//! For examples, see the `.cursor()` functions
//! and the `Cursor` types in this module.

use crate::reader;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrieSimpleAscii;

use core::fmt;

impl<Store> ZeroTrieSimpleAscii<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets a cursor into the current trie.
    ///
    /// Useful to query a trie with data that is not a slice.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Get out the value for "abc"
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// Find the longest prefix match:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Find the longest prefix of the string "abcdxy":
    /// let query = b"abcdxy";
    /// let mut longest_prefix = 0;
    /// let mut cursor = trie.cursor();
    /// for (i, b) in query.iter().enumerate() {
    ///     // Checking is_empty() is not required, but it is
    ///     // good for efficiency
    ///     if cursor.is_empty() {
    ///         break;
    ///     }
    ///     if cursor.take_value().is_some() {
    ///         longest_prefix = i;
    ///     }
    ///     cursor.step(*b);
    /// }
    ///
    /// // The longest prefix is "abc" which is length 3:
    /// assert_eq!(longest_prefix, 3);
    /// ```
    #[inline]
    pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor {
        ZeroTrieSimpleAsciiCursor {
            trie: self.as_borrowed_slice(),
        }
    }
}

impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets a cursor into the current trie.
    ///
    /// Useful to query a trie with data that is not a slice.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" (case-insensitive!)
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAscii::cursor`].
    #[inline]
    pub fn cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor {
        ZeroAsciiIgnoreCaseTrieCursor {
            trie: self.as_borrowed_slice(),
        }
    }
}

impl<'a> ZeroTrieSimpleAscii<&'a [u8]> {
    /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    #[inline]
    pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> {
        ZeroTrieSimpleAsciiCursor { trie: self }
    }
}

impl<'a> ZeroAsciiIgnoreCaseTrie<&'a [u8]> {
    /// Same as [`ZeroAsciiIgnoreCaseTrie::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    #[inline]
    pub fn into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a> {
        ZeroAsciiIgnoreCaseTrieCursor { trie: self }
    }
}

/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup.
///
/// For examples, see [`ZeroTrieSimpleAscii::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroTrieSimpleAsciiCursor<'a> {
    trie: ZeroTrieSimpleAscii<&'a [u8]>,
}

/// A cursor into a [`ZeroAsciiIgnoreCaseTrie`], useful for stepwise lookup.
///
/// For examples, see [`ZeroAsciiIgnoreCaseTrie::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> {
    trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>,
}

/// Information about a probed edge.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive] // no need to destructure or construct this in userland
pub struct AsciiProbeResult {
    /// The character's byte value between this node and its parent.
    pub byte: u8,
    /// The number of siblings of this node, _including itself_.
    pub total_siblings: u8,
}

impl ZeroTrieSimpleAsciiCursor<'_> {
    /// Steps the cursor one character into the trie based on the character's byte value.
    ///
    /// # Examples
    ///
    /// Unrolled loop checking for string presence at every step:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Search the trie for the string "abcdxy"
    /// let mut cursor = trie.cursor();
    /// assert_eq!(cursor.take_value(), None); // ""
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'c');
    /// assert_eq!(cursor.take_value(), Some(0)); // "abc"
    /// cursor.step(b'd');
    /// assert_eq!(cursor.take_value(), None); // "abcd"
    /// assert!(!cursor.is_empty());
    /// cursor.step(b'x'); // no strings have the prefix "abcdx"
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None); // "abcdx"
    /// cursor.step(b'y');
    /// assert_eq!(cursor.take_value(), None); // "abcdxy"
    /// ```
    ///
    /// If the byte is not ASCII, the cursor will become empty:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// assert_eq!(cursor.take_value(), None); // ""
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'\xFF');
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None);
    /// ```
    #[inline]
    pub fn step(&mut self, byte: u8) {
        reader::step_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, byte);
    }

    /// Takes the value at the current position.
    ///
    /// Calling this function on a new cursor is equivalent to calling `.get()`
    /// with the empty string (except that it can only be called once).
    ///
    /// # Examples
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "" and "abc"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81");
    ///
    /// assert_eq!(Some(0), trie.get(""));
    /// let mut cursor = trie.cursor();
    /// assert_eq!(Some(0), cursor.take_value());
    /// assert_eq!(None, cursor.take_value());
    /// ```
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }

    /// Steps the cursor one character into the trie based on an edge index,
    /// returning the corresponding character as a byte.
    ///
    /// This function is similar to [`Self::step()`], but it takes an index instead of a char.
    /// This enables stepwise iteration over the contents of the trie.
    ///
    /// If there are multiple possibilities for the next byte, the `index` argument allows
    /// visiting them in order. Since this function steps the cursor, the cursor must be
    /// cloned (a cheap operation) in order to visit multiple children.
    ///
    /// # Examples
    ///
    /// Continually query index 0 to extract the first item from a trie:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// let data: &[(String, usize)] = &[
    ///     ("ab".to_string(), 111),
    ///     ("abcxyz".to_string(), 22),
    ///     ("abde".to_string(), 333),
    ///     ("afg".to_string(), 44),
    /// ];
    ///
    /// let trie: ZeroTrieSimpleAscii<Vec<u8>> =
    ///     data.iter().map(|(s, v)| (s.as_str(), *v)).collect();
    ///
    /// let mut cursor = trie.cursor();
    /// let mut key = String::new();
    /// let value = loop {
    ///     if let Some(value) = cursor.take_value() {
    ///         break value;
    ///     }
    ///     let probe_result = cursor.probe(0).unwrap();
    ///     key.push(char::from(probe_result.byte));
    /// };
    ///
    /// assert_eq!(key, "ab");
    /// assert_eq!(value, 111);
    /// ```
    ///
    /// Stepwise iterate over all entries in the trie:
    ///
    /// ```
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// # let data: &[(String, usize)] = &[
    /// #     ("ab".to_string(), 111),
    /// #     ("abcxyz".to_string(), 22),
    /// #     ("abde".to_string(), 333),
    /// #     ("afg".to_string(), 44)
    /// # ];
    /// # let trie: ZeroTrieSimpleAscii<Vec<u8>> = data
    /// #     .iter()
    /// #     .map(|(s, v)| (s.as_str(), *v))
    /// #     .collect();
    /// // (trie built as in previous example)
    ///
    /// // Initialize the iteration at the first child of the trie.
    /// let mut stack = Vec::from([(trie.cursor(), 0, 0)]);
    /// let mut key = Vec::new();
    /// let mut results = Vec::new();
    /// loop {
    ///     let Some((mut cursor, index, suffix_len)) = stack.pop() else {
    ///         // Nothing left in the trie.
    ///         break;
    ///     };
    ///     // Check to see if there is a value at the current node.
    ///     if let Some(value) = cursor.take_value() {
    ///         results.push((String::from_utf8(key.clone()).unwrap(), value));
    ///     }
    ///     // Now check for children of the current node.
    ///     let mut sub_cursor = cursor.clone();
    ///     if let Some(probe_result) = sub_cursor.probe(index) {
    ///         // Found a child. Add the current byte edge to the key.
    ///         key.push(probe_result.byte);
    ///         // Add the child to the stack, and also add back the current
    ///         // node if there are more siblings to visit.
    ///         if index + 1 < probe_result.total_siblings as usize {
    ///             stack.push((cursor, index + 1, suffix_len));
    ///             stack.push((sub_cursor, 0, 1));
    ///         } else {
    ///             stack.push((sub_cursor, 0, suffix_len + 1));
    ///         }
    ///     } else {
    ///         // No more children. Pop this node's bytes from the key.
    ///         for _ in 0..suffix_len {
    ///             key.pop();
    ///         }
    ///     }
    /// }
    ///
    /// assert_eq!(&results, data);
    /// ```
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
    }

    /// Checks whether the cursor points to an empty trie.
    ///
    /// Use this to determine when to stop iterating.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}

impl ZeroAsciiIgnoreCaseTrieCursor<'_> {
    /// Steps the cursor one byte into the trie.
    ///
    /// Returns the byte if matched, which may be a different case than the input byte.
    /// If this function returns `None`, any lookup loops can be terminated.
    ///
    /// # Examples
    ///
    /// Normalize the case of a value by stepping through an ignore-case trie:
    ///
    /// ```
    /// use std::borrow::Cow;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" and normalize the key string
    /// let mut cursor = trie.cursor();
    /// let mut key_str = Cow::Borrowed("abc".as_bytes());
    /// let mut i = 0;
    /// let value = loop {
    ///     let Some(&input_byte) = key_str.get(i) else {
    ///         break cursor.take_value();
    ///     };
    ///     let Some(matched_byte) = cursor.step(input_byte) else {
    ///         break None;
    ///     };
    ///     if matched_byte != input_byte {
    ///         key_str.to_mut()[i] = matched_byte;
    ///     }
    ///     i += 1;
    /// };
    ///
    /// assert_eq!(value, Some(0));
    /// assert_eq!(&*key_str, "aBc".as_bytes());
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`].
    #[inline]
    pub fn step(&mut self, byte: u8) -> Option<u8> {
        reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte)
    }

    /// Takes the value at the current position.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`].
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }

    /// Probes the next byte in the cursor.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
    }

    /// Checks whether the cursor points to an empty trie.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`].
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}

impl fmt::Write for ZeroTrieSimpleAsciiCursor<'_> {
    /// Steps the cursor through each ASCII byte of the string.
    ///
    /// If the string contains non-ASCII chars, an error is returned.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.write_str("abcdxy").expect("all ASCII");
    /// cursor.write_str("🚂").expect_err("non-ASCII");
    /// ```
    fn write_str(&mut self, s: &str) -> fmt::Result {
        for b in s.bytes() {
            if !b.is_ascii() {
                return Err(fmt::Error);
            }
            self.step(b);
        }
        Ok(())
    }

    /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns
    /// an error if the char is non-ASCII.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.write_char('a').expect("ASCII");
    /// cursor.write_char('x').expect("ASCII");
    /// cursor.write_char('🚂').expect_err("non-ASCII");
    /// ```
    fn write_char(&mut self, c: char) -> fmt::Result {
        if !c.is_ascii() {
            return Err(fmt::Error);
        }
        self.step(c as u8);
        Ok(())
    }
}

impl fmt::Write for ZeroAsciiIgnoreCaseTrieCursor<'_> {
    /// Steps the cursor through each ASCII byte of the string.
    ///
    /// If the string contains non-ASCII chars, an error is returned.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        for b in s.bytes() {
            if !b.is_ascii() {
                return Err(fmt::Error);
            }
            self.step(b);
        }
        Ok(())
    }

    /// Equivalent to [`ZeroAsciiIgnoreCaseTrieCursor::step()`], except returns
    /// an error if the char is non-ASCII.
    fn write_char(&mut self, c: char) -> fmt::Result {
        if !c.is_ascii() {
            return Err(fmt::Error);
        }
        self.step(c as u8);
        Ok(())
    }
}