ICU 76.1 76.1
Loading...
Searching...
No Matches
ucharstrie.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2012, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* file name: ucharstrie.h
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2010nov14
14* created by: Markus W. Scherer
15*/
16
17#ifndef __UCHARSTRIE_H__
18#define __UCHARSTRIE_H__
19
26#include "unicode/utypes.h"
27
28#if U_SHOW_CPLUSPLUS_API
29
30#include "unicode/unistr.h"
31#include "unicode/uobject.h"
32#include "unicode/ustringtrie.h"
33
34U_NAMESPACE_BEGIN
35
36class Appendable;
37class UCharsTrieBuilder;
38class UVector32;
39
54public:
70 : ownedArray_(nullptr), uchars_(trieUChars),
71 pos_(uchars_), remainingMatchLength_(-1) {}
72
78
86 : ownedArray_(nullptr), uchars_(other.uchars_),
87 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
88
95 pos_=uchars_;
96 remainingMatchLength_=-1;
97 return *this;
98 }
99
109 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
110 static_cast<uint64_t>(pos_ - uchars_);
111 }
112
128 remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
129 pos_ = uchars_ + (state & kState64PosMask);
130 return *this;
131 }
132
138 class State : public UMemory {
139 public:
144 State() { uchars=nullptr; }
145 private:
146 friend class UCharsTrie;
147
148 const char16_t *uchars;
149 const char16_t *pos;
150 int32_t remainingMatchLength;
151 };
152
161 state.uchars=uchars_;
162 state.pos=pos_;
163 state.remainingMatchLength=remainingMatchLength_;
164 return *this;
165 }
166
178 if(uchars_==state.uchars && uchars_!=nullptr) {
179 pos_=state.pos;
180 remainingMatchLength_=state.remainingMatchLength;
181 }
182 return *this;
183 }
184
192
201 remainingMatchLength_=-1;
202 return nextImpl(uchars_, uchar);
203 }
204
214
222
231
248
258 inline int32_t getValue() const {
259 const char16_t *pos=pos_;
260 int32_t leadUnit=*pos++;
261 // U_ASSERT(leadUnit>=kMinValueLead);
262 return leadUnit&kValueIsFinal ?
263 readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
264 }
265
276 const char16_t *pos=pos_;
277 // Skip the rest of a pending linear-match node.
278 return pos!=nullptr && findUniqueValue(pos+remainingMatchLength_+1, false, uniqueValue);
279 }
280
289
295 public:
308
321
327
334
339 UBool hasNext() const;
340
355 UBool next(UErrorCode &errorCode);
356
361 const UnicodeString &getString() const { return str_; }
366 int32_t getValue() const { return value_; }
367
368 private:
369 UBool truncateAndStop() {
370 pos_=nullptr;
371 value_=-1; // no real value for str
372 return true;
373 }
374
375 const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
376
377 const char16_t *uchars_;
378 const char16_t *pos_;
379 const char16_t *initialPos_;
380 int32_t remainingMatchLength_;
381 int32_t initialRemainingMatchLength_;
382 UBool skipValue_; // Skip intermediate value which was already delivered.
383
384 UnicodeString str_;
385 int32_t maxLength_;
386 int32_t value_;
387
388 // The stack stores pairs of integers for backtracking to another
389 // outbound edge of a branch node.
390 // The first integer is an offset from uchars_.
391 // The second integer has the str_.length() from before the node in bits 15..0,
392 // and the remaining branch length in bits 31..16.
393 // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
394 // but the code looks more confusing that way.)
395 UVector32 *stack_;
396 };
397
398private:
399 friend class UCharsTrieBuilder;
400
407 UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
408 : ownedArray_(adoptUChars), uchars_(trieUChars),
409 pos_(uchars_), remainingMatchLength_(-1) {}
410
411 // No assignment operator.
412 UCharsTrie &operator=(const UCharsTrie &other) = delete;
413
414 inline void stop() {
415 pos_=nullptr;
416 }
417
418 // Reads a compact 32-bit integer.
419 // pos is already after the leadUnit, and the lead unit has bit 15 reset.
420 static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
421 int32_t value;
422 if(leadUnit<kMinTwoUnitValueLead) {
423 value=leadUnit;
424 } else if(leadUnit<kThreeUnitValueLead) {
425 value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
426 } else {
427 value=(pos[0]<<16)|pos[1];
428 }
429 return value;
430 }
431 static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
432 if(leadUnit>=kMinTwoUnitValueLead) {
433 if(leadUnit<kThreeUnitValueLead) {
434 ++pos;
435 } else {
436 pos+=2;
437 }
438 }
439 return pos;
440 }
441 static inline const char16_t *skipValue(const char16_t *pos) {
442 int32_t leadUnit=*pos++;
443 return skipValue(pos, leadUnit&0x7fff);
444 }
445
446 static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
447 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
448 int32_t value;
449 if(leadUnit<kMinTwoUnitNodeValueLead) {
450 value=(leadUnit>>6)-1;
451 } else if(leadUnit<kThreeUnitNodeValueLead) {
452 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
453 } else {
454 value=(pos[0]<<16)|pos[1];
455 }
456 return value;
457 }
458 static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
459 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
460 if(leadUnit>=kMinTwoUnitNodeValueLead) {
461 if(leadUnit<kThreeUnitNodeValueLead) {
462 ++pos;
463 } else {
464 pos+=2;
465 }
466 }
467 return pos;
468 }
469
470 static inline const char16_t *jumpByDelta(const char16_t *pos) {
471 int32_t delta=*pos++;
472 if(delta>=kMinTwoUnitDeltaLead) {
473 if(delta==kThreeUnitDeltaLead) {
474 delta=(pos[0]<<16)|pos[1];
475 pos+=2;
476 } else {
477 delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
478 }
479 }
480 return pos+delta;
481 }
482
483 static const char16_t *skipDelta(const char16_t *pos) {
484 int32_t delta=*pos++;
485 if(delta>=kMinTwoUnitDeltaLead) {
486 if(delta==kThreeUnitDeltaLead) {
487 pos+=2;
488 } else {
489 ++pos;
490 }
491 }
492 return pos;
493 }
494
495 static inline UStringTrieResult valueResult(int32_t node) {
496 return static_cast<UStringTrieResult>(USTRINGTRIE_INTERMEDIATE_VALUE - (node >> 15));
497 }
498
499 // Handles a branch node for both next(uchar) and next(string).
500 UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
501
502 // Requires remainingLength_<0.
503 UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
504
505 // Helper functions for hasUniqueValue().
506 // Recursively finds a unique value (or whether there is not a unique one)
507 // from a branch.
508 static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
509 UBool haveUniqueValue, int32_t &uniqueValue);
510 // Recursively finds a unique value (or whether there is not a unique one)
511 // starting from a position on a node lead unit.
512 static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
513
514 // Helper functions for getNextUChars().
515 // getNextUChars() when pos is on a branch node.
516 static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
517
518 // UCharsTrie data structure
519 //
520 // The trie consists of a series of char16_t-serialized nodes for incremental
521 // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
522 // The root node is at the beginning of the trie data.
523 //
524 // Types of nodes are distinguished by their node lead unit ranges.
525 // After each node, except a final-value node, another node follows to
526 // encode match values or continue matching further units.
527 //
528 // Node types:
529 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
530 // The value is for the string/char16_t sequence so far.
531 // - Match node, optionally with an intermediate value in a different compact format.
532 // The value, if present, is for the string/char16_t sequence so far.
533 //
534 // Aside from the value, which uses the node lead unit's high bits:
535 //
536 // - Linear-match node: Matches a number of units.
537 // - Branch node: Branches to other nodes according to the current input unit.
538 // The node unit is the length of the branch (number of units to select from)
539 // minus 1. It is followed by a sub-node:
540 // - If the length is at most kMaxBranchLinearSubNodeLength, then
541 // there are length-1 (key, value) pairs and then one more comparison unit.
542 // If one of the key units matches, then the value is either a final value for
543 // the string so far, or a "jump" delta to the next node.
544 // If the last unit matches, then matching continues with the next node.
545 // (Values have the same encoding as final-value nodes.)
546 // - If the length is greater than kMaxBranchLinearSubNodeLength, then
547 // there is one unit and one "jump" delta.
548 // If the input unit is less than the sub-node unit, then "jump" by delta to
549 // the next sub-node which will have a length of length/2.
550 // (The delta has its own compact encoding.)
551 // Otherwise, skip the "jump" delta to the next sub-node
552 // which will have a length of length-length/2.
553
554 // Match-node lead unit values, after masking off intermediate-value bits:
555
556 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
557 // the length is one more than the next unit.
558
559 // For a branch sub-node with at most this many entries, we drop down
560 // to a linear search.
561 static const int32_t kMaxBranchLinearSubNodeLength=5;
562
563 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
564 static const int32_t kMinLinearMatch=0x30;
565 static const int32_t kMaxLinearMatchLength=0x10;
566
567 // Match-node lead unit bits 14..6 for the optional intermediate value.
568 // If these bits are 0, then there is no intermediate value.
569 // Otherwise, see the *NodeValue* constants below.
570 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
571 static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
572
573 // A final-value node has bit 15 set.
574 static const int32_t kValueIsFinal=0x8000;
575
576 // Compact value: After testing and masking off bit 15, use the following thresholds.
577 static const int32_t kMaxOneUnitValue=0x3fff;
578
579 static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
580 static const int32_t kThreeUnitValueLead=0x7fff;
581
582 static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
583
584 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
585 static const int32_t kMaxOneUnitNodeValue=0xff;
586 static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
587 static const int32_t kThreeUnitNodeValueLead=0x7fc0;
588
589 static const int32_t kMaxTwoUnitNodeValue=
590 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
591
592 // Compact delta integers.
593 static const int32_t kMaxOneUnitDelta=0xfbff;
594 static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
595 static const int32_t kThreeUnitDeltaLead=0xffff;
596
597 static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
598
599 // For getState64():
600 // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
601 // so we need at least 5 bits for that.
602 // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
603 static constexpr int32_t kState64RemainingShift = 59;
604 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
605
606 char16_t *ownedArray_;
607
608 // Fixed value referencing the UCharsTrie words.
609 const char16_t *uchars_;
610
611 // Iterator variables.
612
613 // Pointer to next trie unit to read. nullptr if no more matches.
614 const char16_t *pos_;
615 // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
616 int32_t remainingMatchLength_;
617};
618
619U_NAMESPACE_END
620
621#endif /* U_SHOW_CPLUSPLUS_API */
622
623#endif // __UCHARSTRIE_H__
Base class for objects to which Unicode characters and strings can be appended.
Definition appendable.h:54
const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types.
Definition char16ptr.h:150
"Smart pointer" base class; do not use directly: use LocalPointer etc.
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition ucharstrie.h:294
Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode)
Iterates from the root of a char16_t-serialized UCharsTrie.
Iterator & reset()
Resets this iterator to its initial state.
const UnicodeString & getString() const
Definition ucharstrie.h:361
int32_t getValue() const
Definition ucharstrie.h:366
UBool next(UErrorCode &errorCode)
Finds the next (string, value) pair if there is one.
Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode)
Iterates from the current state of the specified UCharsTrie.
UCharsTrie state object, for saving a trie's current state and resetting the trie back to this state ...
Definition ucharstrie.h:138
State()
Constructs an empty State.
Definition ucharstrie.h:144
Light-weight, non-const reader class for a UCharsTrie.
Definition ucharstrie.h:53
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition ucharstrie.h:160
UCharsTrie(ConstChar16Ptr trieUChars)
Constructs a UCharsTrie reader instance.
Definition ucharstrie.h:69
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the char16_t array which...
Definition ucharstrie.h:85
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition ucharstrie.h:275
uint64_t getState64() const
Returns the state of this trie as a 64-bit integer.
Definition ucharstrie.h:108
UCharsTrie & reset()
Resets this trie to its initial state.
Definition ucharstrie.h:94
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition ucharstrie.h:177
UStringTrieResult next(int32_t uchar)
Traverses the trie from the current state for this input char16_t.
UStringTrieResult nextForCodePoint(UChar32 cp)
Traverses the trie from the current state for the one or two UTF-16 code units for this input code po...
int32_t getValue() const
Returns a matching string's value if called immediately after current()/first()/next() returned USTRI...
Definition ucharstrie.h:258
UStringTrieResult firstForCodePoint(UChar32 cp)
Traverses the trie from the initial state for the one or two UTF-16 code units for this input code po...
UStringTrieResult current() const
Determines whether the string so far matches, whether it has a value, and whether another input char1...
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input char16_t.
Definition ucharstrie.h:200
UCharsTrie & resetToState64(uint64_t state)
Resets this trie to the saved state.
Definition ucharstrie.h:127
~UCharsTrie()
Destructor.
UStringTrieResult next(ConstChar16Ptr s, int32_t length)
Traverses the trie from the current state for this string.
int32_t getNextUChars(Appendable &out) const
Finds each char16_t which continues the string from the current state.
UMemory is the common ICU base class.
Definition uobject.h:115
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition unistr.h:296
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:427
#define UINT64_C(c)
Provides a platform independent way to specify an unsigned 64-bit integer constant.
Definition umachine.h:219
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition umachine.h:247
C++ API: Unicode String.
C++ API: Common ICU base class UObject.
C API: Helper definitions for dictionary trie APIs.
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition ustringtrie.h:35
@ USTRINGTRIE_INTERMEDIATE_VALUE
The input unit(s) continued a matching string and there is a value for the string so far.
Definition ustringtrie.h:66
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition utypes.h:430
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:315