ICU 77.1  77.1
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ucharstrie.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstrie.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __UCHARSTRIE_H__
18 #define __UCHARSTRIE_H__
19 
26 #include "unicode/utypes.h"
27 
28 #if U_SHOW_CPLUSPLUS_API
29 
30 #include "unicode/unistr.h"
31 #include "unicode/uobject.h"
32 #include "unicode/ustringtrie.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 class Appendable;
37 class UCharsTrieBuilder;
38 class UVector32;
39 
54 public:
70  : ownedArray_(nullptr), uchars_(trieUChars),
71  pos_(uchars_), remainingMatchLength_(-1) {}
72 
78 
85  UCharsTrie(const UCharsTrie &other)
86  : ownedArray_(nullptr), uchars_(other.uchars_),
87  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
88 
95  pos_=uchars_;
96  remainingMatchLength_=-1;
97  return *this;
98  }
99 
108  uint64_t getState64() const {
109  return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
110  static_cast<uint64_t>(pos_ - uchars_);
111  }
112 
127  UCharsTrie &resetToState64(uint64_t state) {
128  remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
129  pos_ = uchars_ + (state & kState64PosMask);
130  return *this;
131  }
132 
138  class State : public UMemory {
139  public:
144  State() { uchars=nullptr; }
145  private:
146  friend class UCharsTrie;
147 
148  const char16_t *uchars;
149  const char16_t *pos;
150  int32_t remainingMatchLength;
151  };
152 
160  const UCharsTrie &saveState(State &state) const {
161  state.uchars=uchars_;
162  state.pos=pos_;
163  state.remainingMatchLength=remainingMatchLength_;
164  return *this;
165  }
166 
177  UCharsTrie &resetToState(const State &state) {
178  if(uchars_==state.uchars && uchars_!=nullptr) {
179  pos_=state.pos;
180  remainingMatchLength_=state.remainingMatchLength;
181  }
182  return *this;
183  }
184 
192 
200  inline UStringTrieResult first(int32_t uchar) {
201  remainingMatchLength_=-1;
202  return nextImpl(uchars_, uchar);
203  }
204 
214 
221  UStringTrieResult next(int32_t uchar);
222 
231 
248 
258  inline int32_t getValue() const {
259  const char16_t *pos=pos_;
260  int32_t leadUnit=*pos++;
261  // U_ASSERT(leadUnit>=kMinValueLead);
262  return leadUnit&kValueIsFinal ?
263  readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
264  }
265 
275  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
276  const char16_t *pos=pos_;
277  // Skip the rest of a pending linear-match node.
278  return pos!=nullptr && findUniqueValue(pos+remainingMatchLength_+1, false, uniqueValue);
279  }
280 
288  int32_t getNextUChars(Appendable &out) const;
289 
294  class U_COMMON_API Iterator : public UMemory {
295  public:
307  Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
308 
320  Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
321 
327 
334 
339  UBool hasNext() const;
340 
355  UBool next(UErrorCode &errorCode);
356 
361  const UnicodeString &getString() const { return str_; }
366  int32_t getValue() const { return value_; }
367 
368  private:
369  UBool truncateAndStop() {
370  pos_=nullptr;
371  value_=-1; // no real value for str
372  return true;
373  }
374 
375  const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
376 
377  const char16_t *uchars_;
378  const char16_t *pos_;
379  const char16_t *initialPos_;
380  int32_t remainingMatchLength_;
381  int32_t initialRemainingMatchLength_;
382  UBool skipValue_; // Skip intermediate value which was already delivered.
383 
384  UnicodeString str_;
385  int32_t maxLength_;
386  int32_t value_;
387 
388  // The stack stores pairs of integers for backtracking to another
389  // outbound edge of a branch node.
390  // The first integer is an offset from uchars_.
391  // The second integer has the str_.length() from before the node in bits 15..0,
392  // and the remaining branch length in bits 31..16.
393  // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
394  // but the code looks more confusing that way.)
395  UVector32 *stack_;
396  };
397 
398 private:
399  friend class UCharsTrieBuilder;
400 
407  UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
408  : ownedArray_(adoptUChars), uchars_(trieUChars),
409  pos_(uchars_), remainingMatchLength_(-1) {}
410 
411  // No assignment operator.
412  UCharsTrie &operator=(const UCharsTrie &other) = delete;
413 
414  inline void stop() {
415  pos_=nullptr;
416  }
417 
418  // Reads a compact 32-bit integer.
419  // pos is already after the leadUnit, and the lead unit has bit 15 reset.
420  static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
421  int32_t value;
422  if(leadUnit<kMinTwoUnitValueLead) {
423  value=leadUnit;
424  } else if(leadUnit<kThreeUnitValueLead) {
425  value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
426  } else {
427  value=(pos[0]<<16)|pos[1];
428  }
429  return value;
430  }
431  static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
432  if(leadUnit>=kMinTwoUnitValueLead) {
433  if(leadUnit<kThreeUnitValueLead) {
434  ++pos;
435  } else {
436  pos+=2;
437  }
438  }
439  return pos;
440  }
441  static inline const char16_t *skipValue(const char16_t *pos) {
442  int32_t leadUnit=*pos++;
443  return skipValue(pos, leadUnit&0x7fff);
444  }
445 
446  static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
447  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
448  int32_t value;
449  if(leadUnit<kMinTwoUnitNodeValueLead) {
450  value=(leadUnit>>6)-1;
451  } else if(leadUnit<kThreeUnitNodeValueLead) {
452  value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
453  } else {
454  value=(pos[0]<<16)|pos[1];
455  }
456  return value;
457  }
458  static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
459  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
460  if(leadUnit>=kMinTwoUnitNodeValueLead) {
461  if(leadUnit<kThreeUnitNodeValueLead) {
462  ++pos;
463  } else {
464  pos+=2;
465  }
466  }
467  return pos;
468  }
469 
470  static inline const char16_t *jumpByDelta(const char16_t *pos) {
471  int32_t delta=*pos++;
472  if(delta>=kMinTwoUnitDeltaLead) {
473  if(delta==kThreeUnitDeltaLead) {
474  delta=(pos[0]<<16)|pos[1];
475  pos+=2;
476  } else {
477  delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
478  }
479  }
480  return pos+delta;
481  }
482 
483  static const char16_t *skipDelta(const char16_t *pos) {
484  int32_t delta=*pos++;
485  if(delta>=kMinTwoUnitDeltaLead) {
486  if(delta==kThreeUnitDeltaLead) {
487  pos+=2;
488  } else {
489  ++pos;
490  }
491  }
492  return pos;
493  }
494 
495  static inline UStringTrieResult valueResult(int32_t node) {
496  return static_cast<UStringTrieResult>(USTRINGTRIE_INTERMEDIATE_VALUE - (node >> 15));
497  }
498 
499  // Handles a branch node for both next(uchar) and next(string).
500  UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
501 
502  // Requires remainingLength_<0.
503  UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
504 
505  // Helper functions for hasUniqueValue().
506  // Recursively finds a unique value (or whether there is not a unique one)
507  // from a branch.
508  static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
509  UBool haveUniqueValue, int32_t &uniqueValue);
510  // Recursively finds a unique value (or whether there is not a unique one)
511  // starting from a position on a node lead unit.
512  static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
513 
514  // Helper functions for getNextUChars().
515  // getNextUChars() when pos is on a branch node.
516  static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
517 
518  // UCharsTrie data structure
519  //
520  // The trie consists of a series of char16_t-serialized nodes for incremental
521  // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
522  // The root node is at the beginning of the trie data.
523  //
524  // Types of nodes are distinguished by their node lead unit ranges.
525  // After each node, except a final-value node, another node follows to
526  // encode match values or continue matching further units.
527  //
528  // Node types:
529  // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
530  // The value is for the string/char16_t sequence so far.
531  // - Match node, optionally with an intermediate value in a different compact format.
532  // The value, if present, is for the string/char16_t sequence so far.
533  //
534  // Aside from the value, which uses the node lead unit's high bits:
535  //
536  // - Linear-match node: Matches a number of units.
537  // - Branch node: Branches to other nodes according to the current input unit.
538  // The node unit is the length of the branch (number of units to select from)
539  // minus 1. It is followed by a sub-node:
540  // - If the length is at most kMaxBranchLinearSubNodeLength, then
541  // there are length-1 (key, value) pairs and then one more comparison unit.
542  // If one of the key units matches, then the value is either a final value for
543  // the string so far, or a "jump" delta to the next node.
544  // If the last unit matches, then matching continues with the next node.
545  // (Values have the same encoding as final-value nodes.)
546  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
547  // there is one unit and one "jump" delta.
548  // If the input unit is less than the sub-node unit, then "jump" by delta to
549  // the next sub-node which will have a length of length/2.
550  // (The delta has its own compact encoding.)
551  // Otherwise, skip the "jump" delta to the next sub-node
552  // which will have a length of length-length/2.
553 
554  // Match-node lead unit values, after masking off intermediate-value bits:
555 
556  // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
557  // the length is one more than the next unit.
558 
559  // For a branch sub-node with at most this many entries, we drop down
560  // to a linear search.
561  static const int32_t kMaxBranchLinearSubNodeLength=5;
562 
563  // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
564  static const int32_t kMinLinearMatch=0x30;
565  static const int32_t kMaxLinearMatchLength=0x10;
566 
567  // Match-node lead unit bits 14..6 for the optional intermediate value.
568  // If these bits are 0, then there is no intermediate value.
569  // Otherwise, see the *NodeValue* constants below.
570  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
571  static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
572 
573  // A final-value node has bit 15 set.
574  static const int32_t kValueIsFinal=0x8000;
575 
576  // Compact value: After testing and masking off bit 15, use the following thresholds.
577  static const int32_t kMaxOneUnitValue=0x3fff;
578 
579  static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
580  static const int32_t kThreeUnitValueLead=0x7fff;
581 
582  static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
583 
584  // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
585  static const int32_t kMaxOneUnitNodeValue=0xff;
586  static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
587  static const int32_t kThreeUnitNodeValueLead=0x7fc0;
588 
589  static const int32_t kMaxTwoUnitNodeValue=
590  ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
591 
592  // Compact delta integers.
593  static const int32_t kMaxOneUnitDelta=0xfbff;
594  static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
595  static const int32_t kThreeUnitDeltaLead=0xffff;
596 
597  static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
598 
599  // For getState64():
600  // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
601  // so we need at least 5 bits for that.
602  // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
603  static constexpr int32_t kState64RemainingShift = 59;
604  static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
605 
606  char16_t *ownedArray_;
607 
608  // Fixed value referencing the UCharsTrie words.
609  const char16_t *uchars_;
610 
611  // Iterator variables.
612 
613  // Pointer to next trie unit to read. nullptr if no more matches.
614  const char16_t *pos_;
615  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
616  int32_t remainingMatchLength_;
617 };
618 
619 U_NAMESPACE_END
620 
621 #endif /* U_SHOW_CPLUSPLUS_API */
622 
623 #endif // __UCHARSTRIE_H__
Base class for objects to which Unicode characters and strings can be appended.
Definition: appendable.h:54
const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types.
Definition: char16ptr.h:156
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition: ucharstrie.h:294
Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode)
Iterates from the root of a char16_t-serialized UCharsTrie.
Iterator & reset()
Resets this iterator to its initial state.
int32_t getValue() const
Definition: ucharstrie.h:366
UBool next(UErrorCode &errorCode)
Finds the next (string, value) pair if there is one.
const UnicodeString & getString() const
Definition: ucharstrie.h:361
Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode)
Iterates from the current state of the specified UCharsTrie.
UCharsTrie state object, for saving a trie's current state and resetting the trie back to this state ...
Definition: ucharstrie.h:138
State()
Constructs an empty State.
Definition: ucharstrie.h:144
Light-weight, non-const reader class for a UCharsTrie.
Definition: ucharstrie.h:53
UCharsTrie(ConstChar16Ptr trieUChars)
Constructs a UCharsTrie reader instance.
Definition: ucharstrie.h:69
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the char16_t array which...
Definition: ucharstrie.h:85
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition: ucharstrie.h:275
uint64_t getState64() const
Returns the state of this trie as a 64-bit integer.
Definition: ucharstrie.h:108
UCharsTrie & reset()
Resets this trie to its initial state.
Definition: ucharstrie.h:94
UCharsTrie & resetToState64(uint64_t state)
Resets this trie to the saved state.
Definition: ucharstrie.h:127
UStringTrieResult next(int32_t uchar)
Traverses the trie from the current state for this input char16_t.
UStringTrieResult nextForCodePoint(UChar32 cp)
Traverses the trie from the current state for the one or two UTF-16 code units for this input code po...
int32_t getValue() const
Returns a matching string's value if called immediately after current()/first()/next() returned USTRI...
Definition: ucharstrie.h:258
UStringTrieResult firstForCodePoint(UChar32 cp)
Traverses the trie from the initial state for the one or two UTF-16 code units for this input code po...
UStringTrieResult current() const
Determines whether the string so far matches, whether it has a value, and whether another input char1...
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: ucharstrie.h:177
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input char16_t.
Definition: ucharstrie.h:200
~UCharsTrie()
Destructor.
UStringTrieResult next(ConstChar16Ptr s, int32_t length)
Traverses the trie from the current state for this string.
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition: ucharstrie.h:160
int32_t getNextUChars(Appendable &out) const
Finds each char16_t which continues the string from the current state.
UMemory is the common ICU base class.
Definition: uobject.h:115
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:296
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:427
#define UINT64_C(c)
Provides a platform independent way to specify an unsigned 64-bit integer constant.
Definition: umachine.h:219
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
C++ API: Unicode String.
C++ API: Common ICU base class UObject.
C API: Helper definitions for dictionary trie APIs.
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:35
@ USTRINGTRIE_INTERMEDIATE_VALUE
The input unit(s) continued a matching string and there is a value for the string so far.
Definition: ustringtrie.h:66
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:430
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:315