ICU 77.1  77.1
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
rbbi.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation *
6 * and others. All rights reserved. *
7 ***************************************************************************
8 
9 **********************************************************************
10 * Date Name Description
11 * 10/22/99 alan Creation.
12 * 11/11/99 rgillam Complete port from Java.
13 **********************************************************************
14 */
15 
16 #ifndef RBBI_H
17 #define RBBI_H
18 
19 #include "unicode/utypes.h"
20 
21 #if U_SHOW_CPLUSPLUS_API
22 
28 #if !UCONFIG_NO_BREAK_ITERATION
29 
30 #include "unicode/brkiter.h"
31 #include "unicode/udata.h"
32 #include "unicode/parseerr.h"
33 #include "unicode/schriter.h"
34 
35 struct UCPTrie;
36 
37 U_NAMESPACE_BEGIN
38 
40 class LanguageBreakEngine;
41 struct RBBIDataHeader;
42 class RBBIDataWrapper;
43 class UnhandledEngine;
44 class UStack;
45 
46 
47 #ifndef U_HIDE_INTERNAL_API
/**
 * Abstract interface that the host environment implements in order to
 * provide its own break engine. Every operation except the destructor is
 * pure virtual, so the class cannot be instantiated directly.
 *
 * Internal API: the declaration is compiled only when
 * U_HIDE_INTERNAL_API is not defined.
 */
60 class ExternalBreakEngine : public UObject {
61  public:
    /** Destructor. Virtual, with an empty inline body. */
66  virtual ~ExternalBreakEngine() {}
67 
    /**
     * Pure virtual query taking a code point and a locale name.
     * NOTE(review): presumably answers whether this engine should be used
     * for `c` in `locale` -- confirm against the ICU API reference.
     */
77  virtual bool isFor(UChar32 c, const char* locale) const = 0;
78 
    /**
     * Pure virtual query taking a single code point.
     * NOTE(review): presumably answers whether this engine can process
     * `c` -- confirm against the ICU API reference.
     */
87  virtual bool handles(UChar32 c) const = 0;
88 
    /**
     * Pure virtual. Receives the text as a UText, a start/end pair, an
     * output array `foundBreaks` together with its capacity, and a
     * UErrorCode in/out parameter.
     * NOTE(review): presumably writes break positions into `foundBreaks`
     * and returns how many were found; whether `end` is inclusive is not
     * visible here -- confirm against the ICU API reference.
     */
102  virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
103  int32_t* foundBreaks, int32_t foundBreaksCapacity,
104  UErrorCode& status) const = 0;
105 };
106 #endif /* U_HIDE_INTERNAL_API */
107 
108 
121 
122 private:
    // Per-instance state. Every member below has an in-class default
    // initializer, so a constructor only needs to set what differs.

    // Starts out as UTEXT_INITIALIZER.
    // NOTE(review): presumably the text being analyzed -- confirm.
127  UText fText = UTEXT_INITIALIZER;
128 
    // The `public:` label below is compiled only when U_HIDE_INTERNAL_API is
    // not defined. In that configuration fData is public; otherwise it stays
    // in the preceding private section.
129 #ifndef U_HIDE_INTERNAL_API
130 public:
131 #endif /* U_HIDE_INTERNAL_API */
137  RBBIDataWrapper *fData = nullptr;
138 
139 private:
    // Defaults to U_ZERO_ERROR ("No error, no warning").
144  UErrorCode fErrorCode = U_ZERO_ERROR;
145 
    // NOTE(review): presumably the current iteration position -- confirm.
150  int32_t fPosition = 0;
151 
    // NOTE(review): presumably an index into the rule status data that
    // backs getRuleStatus()/getRuleStatusVec() -- confirm.
155  int32_t fRuleStatusIndex = 0;
156 
    // BreakCache is a nested class that is only forward-declared here.
160  class BreakCache;
161  BreakCache *fBreakCache = nullptr;
162 
    // DictionaryCache is a nested class that is only forward-declared here.
167  class DictionaryCache;
168  DictionaryCache *fDictionaryCache = nullptr;
169 
    // NOTE(review): presumably a stack of LanguageBreakEngine objects;
    // ownership is not visible in this header -- confirm.
177  UStack *fLanguageBreakEngines = nullptr;
178 
186  UnhandledEngine *fUnhandledBreakEngine = nullptr;
187 
193  uint32_t fDictionaryCharCount = 0;
194 
    // Defaults to the address of fSCharIter, which is declared just below.
200  CharacterIterator *fCharIter = &fSCharIter;
201 
    // Constructed over an empty UTF-16 literal with length 0.
207  UCharCharacterIterator fSCharIter {u"", 0};
208 
212  bool fDone = false;
213 
217  int32_t *fLookAheadMatches = nullptr;
218 
    // NOTE(review): the private UDataMemory constructor takes an
    // isPhraseBreaking flag; presumably it is stored here -- confirm.
222  UBool fIsPhraseBreaking = false;
223 
224  //=======================================================================
225  // constructors
226  //=======================================================================
227 
    /**
     * Private constructor taking an RBBIDataHeader pointer and a UErrorCode
     * in/out parameter.
     * NOTE(review): whether `data` is adopted or copied is not visible in
     * this header -- confirm in the implementation.
     */
238  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
239 
    /**
     * Private constructor taking a UDataMemory image, a phrase-breaking
     * flag and a UErrorCode in/out parameter.
     * NOTE(review): presumably `isPhraseBreaking` initializes
     * fIsPhraseBreaking -- confirm in the implementation.
     */
253  RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
254 
256  friend class RBBIRuleBuilder;
258  friend class BreakIterator;
259 
267 
268 public:
269 
276 
284 
294  UParseError &parseError,
295  UErrorCode &status);
296 
    /**
     * Construct a RuleBasedBreakIterator from a set of precompiled binary
     * rules.
     * @param compiledRules pointer to the binary rule data.
     * @param ruleLength    length of the rule data (uint32_t).
     * @param status        ICU error code in/out parameter.
     */
320  RuleBasedBreakIterator(const uint8_t *compiledRules,
321  uint32_t ruleLength,
322  UErrorCode &status);
323 
337 
343 
352 
    /** Equality operator. Overrides the pure virtual BreakIterator::operator==. */
361  virtual bool operator==(const BreakIterator& that) const override;
362 
    /**
     * Not-equal operator.
     * @return the logical negation of operator==(that).
     */
370  inline bool operator!=(const BreakIterator& that) const {
371  return !operator==(that);
372  }
373 
    /**
     * Returns a newly-constructed RuleBasedBreakIterator with the same
     * behavior as this one. The return type is narrowed to
     * RuleBasedBreakIterator*.
     */
384  virtual RuleBasedBreakIterator* clone() const override;
385 
    /** Compute a hash code for this BreakIterator. */
391  virtual int32_t hashCode() const;
392 
    /** Returns the description (rules) used to create this iterator. */
398  virtual const UnicodeString& getRules() const;
399 
400  //=======================================================================
401  // BreakIterator overrides
402  //=======================================================================
403 
    /**
     * BreakIterator override returning a CharacterIterator reference.
     * NOTE(review): presumably an iterator over the text being analyzed;
     * this listing carries no description for it -- confirm.
     */
428  virtual CharacterIterator& getText() const override;
429 
    /**
     * Get a UText for the text being analyzed.
     * @param fillIn UText pointer supplied by the caller.
     * @param status ICU error code in/out parameter.
     */
444  virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;
445 
    /** Set the iterator to analyze a new piece of text, supplied as a CharacterIterator. */
453  virtual void adoptText(CharacterIterator* newText) override;
454 
    /** Set the iterator to analyze a new piece of text, supplied as a UnicodeString. */
466  virtual void setText(const UnicodeString& newText) override;
467 
    /**
     * Reset the break iterator to operate over the text represented by the
     * UText.
     * @param status ICU error code in/out parameter.
     */
481  virtual void setText(UText *text, UErrorCode &status) override;
482 
    /** Sets the current iteration position to the beginning of the text, position zero. */
488  virtual int32_t first() override;
489 
    /** Sets the current iteration position to the end of the text. */
495  virtual int32_t last() override;
496 
    /**
     * Advances the iterator either forward or backward the specified number
     * of steps.
     * @param n number of steps to move.
     */
507  virtual int32_t next(int32_t n) override;
508 
    /** Advances the iterator to the next boundary position. */
514  virtual int32_t next() override;
515 
    /** Moves the iterator backwards, to the last boundary preceding this one. */
521  virtual int32_t previous() override;
522 
    /** Sets the iterator to refer to the first boundary position following the specified position. */
530  virtual int32_t following(int32_t offset) override;
531 
    /** Sets the iterator to refer to the last boundary position before the specified position. */
539  virtual int32_t preceding(int32_t offset) override;
540 
    /**
     * Returns true if the specified position is a boundary position.
     * Not const, unlike current().
     */
549  virtual UBool isBoundary(int32_t offset) override;
550 
    /** Returns the current iteration position. */
559  virtual int32_t current() const override;
560 
    /**
     * Return the status tag from the break rule that determined the boundary
     * at the current iteration position.
     */
592  virtual int32_t getRuleStatus() const override;
593 
    /**
     * Get the status (tag) values from the break rule(s) that determined the
     * boundary at the current iteration position.
     * @param fillInVec caller-supplied array of int32_t.
     * @param capacity  capacity paired with fillInVec.
     * @param status    ICU error code in/out parameter.
     * NOTE(review): the meaning of the return value and the behavior when
     * `capacity` is too small are not visible here -- confirm.
     */
617  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;
618 
    /** Returns a unique class ID POLYMORPHICALLY. */
630  virtual UClassID getDynamicClassID() const override;
631 
    /** Returns the class ID for this class. */
643  static UClassID U_EXPORT2 getStaticClassID();
644 
645 #ifndef U_FORCE_HIDE_DEPRECATED_API
    /**
     * Deprecated functionality. Declared only when
     * U_FORCE_HIDE_DEPRECATED_API is not defined.
     */
672  virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
673  int32_t &BufferSize,
674  UErrorCode &status) override;
675 #endif // U_FORCE_HIDE_DEPRECATED_API
676 
    /**
     * Return the binary form of compiled break rules, which can then be used
     * to create a new break iterator.
     * @param length reference to a uint32_t.
     *               NOTE(review): presumably receives the byte length of
     *               the returned data -- confirm.
     */
694  virtual const uint8_t *getBinaryRules(uint32_t &length);
695 
    /**
     * Set the subject text string upon which the break iterator is operating
     * without changing any other aspect of the iterator (the summary is
     * truncated in this listing).
     * @param input  the replacement UText.
     * @param status ICU error code in/out parameter.
     * @return a RuleBasedBreakIterator reference.
     *         NOTE(review): presumably *this -- confirm.
     */
721  virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;
722 
723 
724 private:
725  //=======================================================================
726  // implementation
727  //=======================================================================
    /**
     * Private, non-template overload; a template of the same name is
     * declared further down in this section.
     * NOTE(review): presumably moves backwards from `fromPosition` to a
     * safe position and returns it -- confirm in the implementation.
     */
737  int32_t handleSafePrevious(int32_t fromPosition);
738 
    /**
     * Private, non-template overload; a template of the same name is
     * declared further down in this section.
     * NOTE(review): presumably finds the next boundary and returns its
     * position -- confirm in the implementation.
     */
751  int32_t handleNext();
752 
753  /*
754  * Templatized version of handleNext() and handleSafePrevious().
755  *
756  * There will be exactly four instantiations, two each for 8 and 16 bit tables,
757  * two each for 8 and 16 bit trie.
758  * Having separate instantiations for the table types keeps conditional tests of
759  * the table type out of the inner loops, at the expense of replicated code.
760  *
761  * The template parameter for the Trie access function is a value, not a type.
762  * Doing it this way, the compiler will inline the Trie function in the
763  * expanded functions. (Both the 8 and 16 bit access functions have the same type
764  * signature)
765  */
766 
    // Pointer to a trie access function: takes (const UCPTrie *, UChar32)
    // and returns uint16_t. It is used as a non-type template parameter below.
767  typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
768 
    // Template form of handleSafePrevious(): RowType is a type parameter,
    // trieFunc is a function-pointer value of type PTrieFunc.
769  template<typename RowType, PTrieFunc trieFunc>
770  int32_t handleSafePrevious(int32_t fromPosition);
771 
    // Template form of handleNext(), with the same two template parameters.
772  template<typename RowType, PTrieFunc trieFunc>
773  int32_t handleNext();
774 
775 
    /**
     * Private lookup taking a code point and a locale name, returning a
     * pointer to a const LanguageBreakEngine.
     * NOTE(review): presumably selects the engine that handles `c`; whether
     * the result can be null is not visible here -- confirm.
     */
783  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
784 
785  public:
786 #ifndef U_HIDE_INTERNAL_API
    /** Debugging function only. Internal API. */
791  void dumpCache();
792 
    /** Debugging function only. Internal API. */
797  void dumpTables();
798 #endif /* U_HIDE_INTERNAL_API */
799 
800 #ifndef U_HIDE_INTERNAL_API
    /**
     * Register a new external break engine. Internal API.
     * @param toAdopt the engine to register.
     *                NOTE(review): the name suggests ownership passes to
     *                ICU -- confirm.
     * @param status  ICU error code in/out parameter.
     */
810  static void U_EXPORT2 registerExternalBreakEngine(
811  ExternalBreakEngine* toAdopt, UErrorCode& status);
812 #endif /* U_HIDE_INTERNAL_API */
813 
814 };
815 
816 
817 U_NAMESPACE_END
818 
819 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
820 
821 #endif /* U_SHOW_CPLUSPLUS_API */
822 
823 #endif
C++ API: Break Iterator.
The BreakIterator class implements methods for finding the location of boundaries in text.
Definition: brkiter.h:108
virtual bool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:361
The ExternalBreakEngine class defines an abstract interface for the host environment to provide a low ...
Definition: rbbi.h:60
virtual bool isFor(UChar32 c, const char *locale) const =0
virtual int32_t fillBreaks(UText *text, int32_t start, int32_t end, int32_t *foundBreaks, int32_t foundBreaksCapacity, UErrorCode &status) const =0
virtual bool handles(UChar32 c) const =0
virtual ~ExternalBreakEngine()
destructor
Definition: rbbi.h:66
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:120
virtual int32_t first() override
Sets the current iteration position to the beginning of the text, position zero.
virtual int32_t hashCode() const
Compute a hash code for this BreakIterator.
virtual int32_t following(int32_t offset) override
Sets the iterator to refer to the first boundary position following the specified position.
virtual bool operator==(const BreakIterator &that) const override
Equality operator.
virtual int32_t current() const override
Returns the current iteration position.
static UClassID getStaticClassID()
Returns the class ID for this class.
virtual RuleBasedBreakIterator & refreshInputText(UText *input, UErrorCode &status) override
Set the subject text string upon which the break iterator is operating without changing any other asp...
RuleBasedBreakIterator & operator=(const RuleBasedBreakIterator &that)
Assignment operator.
virtual UClassID getDynamicClassID() const override
Returns a unique class ID POLYMORPHICALLY.
void dumpTables()
Debugging function only.
virtual int32_t next() override
Advances the iterator to the next boundary position.
virtual int32_t getRuleStatus() const override
Return the status tag from the break rule that determined the boundary at the current iteration posit...
virtual RuleBasedBreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status) override
Deprecated functionality.
virtual UBool isBoundary(int32_t offset) override
Returns true if the specified position is a boundary position.
bool operator!=(const BreakIterator &that) const
Not-equal operator.
Definition: rbbi.h:370
virtual RuleBasedBreakIterator * clone() const override
Returns a newly-constructed RuleBasedBreakIterator with the same behavior, and iterating over the sam...
void dumpCache()
Debugging function only.
static void registerExternalBreakEngine(ExternalBreakEngine *toAdopt, UErrorCode &status)
Register a new external break engine.
RuleBasedBreakIterator()
Default constructor.
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override
Get the status (tag) values from the break rule(s) that determined the boundary at the current iterat...
virtual const uint8_t * getBinaryRules(uint32_t &length)
Return the binary form of compiled break rules, which can then be used to create a new break iterator...
virtual int32_t last() override
Sets the current iteration position to the end of the text.
virtual int32_t preceding(int32_t offset) override
Sets the iterator to refer to the last boundary position before the specified position.
RuleBasedBreakIterator(const RuleBasedBreakIterator &that)
Copy constructor.
virtual void adoptText(CharacterIterator *newText) override
Set the iterator to analyze a new piece of text.
virtual UText * getUText(UText *fillIn, UErrorCode &status) const override
Get a UText for the text being analyzed.
virtual int32_t previous() override
Moves the iterator backwards, to the last boundary preceding this one.
RuleBasedBreakIterator(UDataMemory *image, UErrorCode &status)
This constructor uses the udata interface to create a BreakIterator whose internal tables live in a m...
virtual int32_t next(int32_t n) override
Advances the iterator either forward or backward the specified number of steps.
virtual CharacterIterator & getText() const override
virtual const UnicodeString & getRules() const
Returns the description used to create this iterator.
virtual void setText(UText *text, UErrorCode &status) override
Reset the break iterator to operate over the text represented by the UText.
virtual ~RuleBasedBreakIterator()
Destructor.
RuleBasedBreakIterator(const UnicodeString &rules, UParseError &parseError, UErrorCode &status)
Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
RuleBasedBreakIterator(const uint8_t *compiledRules, uint32_t ruleLength, UErrorCode &status)
Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
virtual void setText(const UnicodeString &newText) override
Set the iterator to analyze a new piece of text.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: uchriter.h:38
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:296
C API: Parse Error Information.
C++ API: String Character Iterator.
Immutable Unicode code point trie structure.
Definition: ucptrie.h:59
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
UText struct.
Definition: utext.h:1328
C API: Data loading interface.
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:161
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:427
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
#define UTEXT_INITIALIZER
initializer to be used with local (stack) instances of a UText struct.
Definition: utext.h:1558
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:430
@ U_ZERO_ERROR
No error, no warning.
Definition: utypes.h:465
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:315