ICU 74.1
rbbi.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016 International Business Machines Corporation *
6* and others. All rights reserved. *
7***************************************************************************
8
9**********************************************************************
10* Date Name Description
11* 10/22/99 alan Creation.
12* 11/11/99 rgillam Complete port from Java.
13**********************************************************************
14*/
15
16#ifndef RBBI_H
17#define RBBI_H
18
19#include "unicode/utypes.h"
20
21#if U_SHOW_CPLUSPLUS_API
22
28#if !UCONFIG_NO_BREAK_ITERATION
29
30#include "unicode/brkiter.h"
31#include "unicode/udata.h"
32#include "unicode/parseerr.h"
33#include "unicode/schriter.h"
34
35struct UCPTrie;
36
37U_NAMESPACE_BEGIN
38
40class LanguageBreakEngine;
41struct RBBIDataHeader;
42class RBBIDataWrapper;
43class UnhandledEngine;
44class UStack;
45
46
47#ifndef U_HIDE_DRAFT_API
48#if !UCONFIG_NO_SERVICE
62 public:
68
/**
 * Pure virtual predicate taking a code point and a locale identifier.
 * NOTE(review): presumably reports whether this engine should be used for
 * character c in the given locale - confirm against the ICU API docs.
 */
78 virtual bool isFor(UChar32 c, const char* locale) const = 0;
79
/**
 * Pure virtual predicate taking a single code point.
 * NOTE(review): presumably reports whether this engine can process
 * character c - confirm against the ICU API docs.
 */
88 virtual bool handles(UChar32 c) const = 0;
89
/**
 * Pure virtual function that every concrete engine must implement.
 * NOTE(review): the parameter roles below are inferred from the names only;
 * confirm against the ICU API docs.
 * @param text                 the UText to examine
 * @param start                start of the range within text (presumably)
 * @param end                  end of the range within text (inclusive or
 *                             exclusive is not visible here)
 * @param foundBreaks          buffer for break positions (presumably output)
 * @param foundBreaksCapacity  capacity of foundBreaks (presumably in elements)
 * @param status               ICU error code, passed by reference
 * @return an int32_t; presumably the number of breaks found - TODO confirm
 */
103 virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
104 int32_t* foundBreaks, int32_t foundBreaksCapacity,
105 UErrorCode& status) const = 0;
106};
107#endif /* UCONFIG_NO_SERVICE */
108#endif /* U_HIDE_DRAFT_API */
109
110
123
124private:
/** UText member, initialized with UTEXT_INITIALIZER. NOTE(review): presumably wraps the text being analyzed - confirm. */
129 UText fText = UTEXT_INITIALIZER;
130
// fData is public only when internal API is not hidden; if
// U_HIDE_INTERNAL_API is defined, the "public:" below is compiled out and
// fData stays under the preceding "private:".
131#ifndef U_HIDE_INTERNAL_API
132public:
133#endif /* U_HIDE_INTERNAL_API */
/** Pointer to an RBBIDataWrapper, null by default. NOTE(review): presumably the compiled rule data - confirm. */
139 RBBIDataWrapper *fData = nullptr;
140
141private:
/** Error code stored in the object; starts as U_ZERO_ERROR (no error, no warning). */
146 UErrorCode fErrorCode = U_ZERO_ERROR;
147
/** Integer position, initially 0. NOTE(review): presumably the current iteration position - confirm. */
152 int32_t fPosition = 0;
153
/** Integer index, initially 0. NOTE(review): presumably selects the rule status value(s) for the current boundary - confirm. */
157 int32_t fRuleStatusIndex = 0;
158
/** Nested class BreakCache (only forward-declared here) and a pointer to one, null by default. */
162 class BreakCache;
163 BreakCache *fBreakCache = nullptr;
164
/** Nested class DictionaryCache (only forward-declared here) and a pointer to one, null by default. */
169 class DictionaryCache;
170 DictionaryCache *fDictionaryCache = nullptr;
171
/** Pointer to a UStack, null by default. NOTE(review): presumably holds LanguageBreakEngine objects - confirm. */
179 UStack *fLanguageBreakEngines = nullptr;
180
/** Pointer to an UnhandledEngine, null by default. */
188 UnhandledEngine *fUnhandledBreakEngine = nullptr;
189
/** Unsigned counter, initially 0. NOTE(review): presumably counts dictionary characters encountered - confirm. */
195 uint32_t fDictionaryCharCount = 0;
196
/**
 * CharacterIterator pointer that initially points at the fSCharIter member.
 * fSCharIter is declared after this member, so only its address is taken here.
 */
202 CharacterIterator *fCharIter = &fSCharIter;
203
/** UCharCharacterIterator member constructed over an empty string (u"", length 0). */
209 UCharCharacterIterator fSCharIter {u"", 0};
210
/** Boolean flag, initially false. NOTE(review): presumably set when iteration has run off the end - confirm. */
214 bool fDone = false;
215
/** Pointer to an int32_t array, null by default. NOTE(review): presumably look-ahead match positions - confirm. */
219 int32_t *fLookAheadMatches = nullptr;
220
/** UBool flag, initially false. NOTE(review): presumably mirrors the isPhraseBreaking constructor argument - confirm. */
224 UBool fIsPhraseBreaking = false;
225
226 //=======================================================================
227 // constructors
228 //=======================================================================
229
/**
 * Constructor taking a pointer to an RBBIDataHeader and an ICU error code.
 * In this listing it follows a "private:" specifier, so it is not part of
 * the public API.
 * NOTE(review): ownership of data is not visible here - confirm.
 */
240 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
241
/**
 * Constructor taking a UDataMemory image, a phrase-breaking flag and an ICU
 * error code. In this listing it follows a "private:" specifier.
 * NOTE(review): presumably builds the iterator from rule tables held in the
 * UDataMemory image and records isPhraseBreaking - confirm.
 */
255 RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
256
/** Friend classes: RBBIRuleBuilder and BreakIterator may access this class's non-public members. */
258 friend class RBBIRuleBuilder;
260 friend class BreakIterator;
261
269
270public:
271
278
286
296 UParseError &parseError,
297 UErrorCode &status);
298
/**
 * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
 * @param compiledRules  pointer to the binary rules
 * @param ruleLength     length of compiledRules (presumably in bytes - confirm)
 * @param status         ICU error code, passed by reference
 */
322 RuleBasedBreakIterator(const uint8_t *compiledRules,
323 uint32_t ruleLength,
324 UErrorCode &status);
325
339
345
354
/**
 * Equality operator. Overrides the pure virtual BreakIterator::operator==.
 * @param that  the BreakIterator to compare with
 * @return true if the two objects compare equal
 */
363 virtual bool operator==(const BreakIterator& that) const override;
364
/**
 * Not-equal operator, defined inline as the negation of operator==.
 * @param that  the BreakIterator to compare with
 * @return true exactly when operator==(that) returns false
 */
372 inline bool operator!=(const BreakIterator& that) const {
373 return !operator==(that);
374 }
375
/**
 * Returns a newly-constructed RuleBasedBreakIterator with the same behavior
 * as this one.
 * NOTE(review): the caller presumably owns the returned object - confirm.
 */
386 virtual RuleBasedBreakIterator* clone() const override;
387
/** Compute a hash code for this BreakIterator. */
393 virtual int32_t hashCode(void) const;
394
/** Returns the description (rules) used to create this iterator, as a reference to a UnicodeString. */
400 virtual const UnicodeString& getRules(void) const;
401
402 //=======================================================================
403 // BreakIterator overrides
404 //=======================================================================
405
/**
 * Returns a reference to a CharacterIterator.
 * NOTE(review): presumably an iterator over the text being analyzed -
 * confirm against the ICU API docs.
 */
430 virtual CharacterIterator& getText(void) const override;
431
432
/**
 * Get a UText for the text being analyzed.
 * @param fillIn  a UText supplied by the caller (NOTE(review): presumably
 *                optional and reused for the result - confirm)
 * @param status  ICU error code, passed by reference
 * @return a pointer to a UText
 */
447 virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;
448
/**
 * Set the iterator to analyze a new piece of text.
 * @param newText  iterator over the new text (NOTE(review): the name
 *                 "adopt" suggests ownership passes to this object - confirm)
 */
456 virtual void adoptText(CharacterIterator* newText) override;
457
/**
 * Set the iterator to analyze a new piece of text.
 * @param newText  the text to analyze
 */
469 virtual void setText(const UnicodeString& newText) override;
470
/**
 * Reset the break iterator to operate over the text represented by the UText.
 * @param text    the UText describing the text to analyze
 * @param status  ICU error code, passed by reference
 */
484 virtual void setText(UText *text, UErrorCode &status) override;
485
/** Sets the current iteration position to the beginning of the text, position zero. */
491 virtual int32_t first(void) override;
492
/** Sets the current iteration position to the end of the text. */
498 virtual int32_t last(void) override;
499
/**
 * Advances the iterator either forward or backward the specified number of steps.
 * @param n  the number of steps (NOTE(review): presumably negative values
 *           move backward - confirm)
 */
510 virtual int32_t next(int32_t n) override;
511
/** Advances the iterator to the next boundary position. */
517 virtual int32_t next(void) override;
518
/** Moves the iterator backwards, to the last boundary preceding this one. */
524 virtual int32_t previous(void) override;
525
/**
 * Sets the iterator to refer to the first boundary position following the
 * specified position.
 * @param offset  the position from which to begin searching
 */
533 virtual int32_t following(int32_t offset) override;
534
/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 * @param offset  the position from which to begin searching
 */
542 virtual int32_t preceding(int32_t offset) override;
543
/**
 * Returns true if the specified position is a boundary position.
 * @param offset  the position to test
 */
552 virtual UBool isBoundary(int32_t offset) override;
553
/** Returns the current iteration position. */
562 virtual int32_t current(void) const override;
563
564
/**
 * Return the status tag from the break rule that determined the boundary at
 * the current iteration position.
 */
596 virtual int32_t getRuleStatus() const override;
597
/**
 * Get the status (tag) values from the break rule(s) that determined the
 * boundary at the current iteration position.
 * @param fillInVec  array for the status values (presumably output - confirm)
 * @param capacity   capacity of fillInVec (presumably in elements - confirm)
 * @param status     ICU error code, passed by reference
 * @return an int32_t; presumably the number of status values - TODO confirm
 */
621 virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;
622
/**
 * Returns a unique class ID polymorphically. UClassID identifies classes
 * without using the compiler's RTTI.
 */
634 virtual UClassID getDynamicClassID(void) const override;
635
/** Returns the class ID for this class (static counterpart of getDynamicClassID()). */
647 static UClassID U_EXPORT2 getStaticClassID(void);
648
649#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
 * Deprecated functionality; compiled out when U_FORCE_HIDE_DEPRECATED_API is
 * defined.
 * NOTE(review): stackBuffer and BufferSize are presumably a caller-supplied
 * buffer and its size - confirm against the ICU API docs.
 */
676 virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
677 int32_t &BufferSize,
678 UErrorCode &status) override;
679#endif // U_FORCE_HIDE_DEPRECATED_API
680
/**
 * Return the binary form of compiled break rules, which can then be used to
 * create a new break iterator.
 * @param length  reference that receives the length of the returned data
 *                (presumably in bytes - confirm)
 * @return pointer to the binary rules (NOTE(review): lifetime and ownership
 *         are not visible here - confirm)
 */
698 virtual const uint8_t *getBinaryRules(uint32_t &length);
699
/**
 * Set the subject text string upon which the break iterator is operating,
 * without changing any other aspect of the iterator's state.
 * @param input   the new (presumably equivalent) text
 * @param status  ICU error code, passed by reference
 * @return a reference to a RuleBasedBreakIterator (presumably *this - confirm)
 */
725 virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;
726
727
728private:
729 //=======================================================================
730 // implementation
731 //=======================================================================
/**
 * Private helper taking a starting position and returning a position.
 * NOTE(review): presumably backs up from fromPosition to a position that is
 * safe to start forward iteration from - confirm against the implementation.
 */
741 int32_t handleSafePrevious(int32_t fromPosition);
742
/**
 * Private helper returning a position.
 * NOTE(review): presumably runs the forward rules from the current position
 * to find the next boundary - confirm against the implementation.
 */
755 int32_t handleNext();
756
757 /*
758 * Templatized version of handleNext() and handleSafePrevious().
759 *
760 * There will be exactly four instantiations, two each for 8 and 16 bit tables,
761 * two each for 8 and 16 bit trie.
762 * Having separate instantiations for the table types keeps conditional tests of
763 * the table type out of the inner loops, at the expense of replicated code.
764 *
765 * The template parameter for the Trie access function is a value, not a type.
766 * Doing it this way, the compiler will inline the Trie function in the
767 * expanded functions. (Both the 8 and 16 bit access functions have the same type
768 * signature)
769 */
770
/**
 * Pointer-to-function type for a trie access function: takes a UCPTrie
 * pointer and a code point and returns a 16-bit value. Used as a non-type
 * template parameter by the templated handleNext()/handleSafePrevious().
 */
771 typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
772
/**
 * Templated form of handleSafePrevious().
 * RowType is a type parameter (NOTE(review): presumably the state-table row
 * type for 8 or 16 bit tables - confirm); trieFunc is the trie access
 * function, passed as a value so the compiler can inline it.
 */
773 template<typename RowType, PTrieFunc trieFunc>
774 int32_t handleSafePrevious(int32_t fromPosition);
775
/**
 * Templated form of handleNext().
 * RowType is a type parameter (NOTE(review): presumably the state-table row
 * type for 8 or 16 bit tables - confirm); trieFunc is the trie access
 * function, passed as a value so the compiler can inline it.
 */
776 template<typename RowType, PTrieFunc trieFunc>
777 int32_t handleNext();
778
779
/**
 * Private helper taking a code point and a locale identifier and returning a
 * pointer to a const LanguageBreakEngine.
 * NOTE(review): presumably looks up (or creates) the engine that handles
 * character c for that locale; whether null can be returned is not visible
 * here - confirm against the implementation.
 */
787 const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
788
789 public:
790#ifndef U_HIDE_INTERNAL_API
/** Debugging function only. Declared only when U_HIDE_INTERNAL_API is not defined. */
795 void dumpCache();
796
802#endif /* U_HIDE_INTERNAL_API */
803
804#ifndef U_HIDE_DRAFT_API
805#if !UCONFIG_NO_SERVICE
/**
 * Register a new external break engine. Declared only when U_HIDE_DRAFT_API
 * is not defined and UCONFIG_NO_SERVICE is 0.
 * @param toAdopt  the engine to register (NOTE(review): the name suggests
 *                 ownership is transferred to ICU - confirm)
 * @param status   ICU error code, passed by reference
 */
815 static void U_EXPORT2 registerExternalBreakEngine(
816 ExternalBreakEngine* toAdopt, UErrorCode& status);
817#endif /* UCONFIG_NO_SERVICE */
818#endif /* U_HIDE_DRAFT_API */
819
820};
821
822
823U_NAMESPACE_END
824
825#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
826
827#endif /* U_SHOW_CPLUSPLUS_API */
828
829#endif
C++ API: Break Iterator.
The BreakIterator class implements methods for finding the location of boundaries in text.
Definition: brkiter.h:106
virtual bool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:361
The ExternalBreakEngine class defines an abstract interface for the host environment to provide a low ...
Definition: rbbi.h:61
virtual bool isFor(UChar32 c, const char *locale) const =0
virtual int32_t fillBreaks(UText *text, int32_t start, int32_t end, int32_t *foundBreaks, int32_t foundBreaksCapacity, UErrorCode &status) const =0
virtual bool handles(UChar32 c) const =0
virtual ~ExternalBreakEngine()
destructor
Definition: rbbi.h:67
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:122
virtual int32_t current(void) const override
Returns the current iteration position.
virtual int32_t following(int32_t offset) override
Sets the iterator to refer to the first boundary position following the specified position.
virtual bool operator==(const BreakIterator &that) const override
Equality operator.
virtual RuleBasedBreakIterator * clone() const override
Returns a newly-constructed RuleBasedBreakIterator with the same behavior, and iterating over the sam...
virtual RuleBasedBreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status) override
Deprecated functionality.
void dumpTables()
Debugging function only.
virtual UClassID getDynamicClassID(void) const override
Returns a unique class ID POLYMORPHICALLY.
virtual int32_t last(void) override
Sets the current iteration position to the end of the text.
virtual int32_t getRuleStatus() const override
Return the status tag from the break rule that determined the boundary at the current iteration posit...
virtual UText * getUText(UText *fillIn, UErrorCode &status) const override
Get a UText for the text being analyzed.
virtual UBool isBoundary(int32_t offset) override
Returns true if the specified position is a boundary position.
bool operator!=(const BreakIterator &that) const
Not-equal operator.
Definition: rbbi.h:372
void dumpCache()
Debugging function only.
static void registerExternalBreakEngine(ExternalBreakEngine *toAdopt, UErrorCode &status)
Register a new external break engine.
virtual RuleBasedBreakIterator & refreshInputText(UText *input, UErrorCode &status) override
Set the subject text string upon which the break iterator is operating without changing any other asp...
RuleBasedBreakIterator()
Default constructor.
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override
Get the status (tag) values from the break rule(s) that determined the boundary at the current iterat...
virtual int32_t first(void) override
Sets the current iteration position to the beginning of the text, position zero.
RuleBasedBreakIterator & operator=(const RuleBasedBreakIterator &that)
Assignment operator.
static UClassID getStaticClassID(void)
Returns the class ID for this class.
virtual int32_t preceding(int32_t offset) override
Sets the iterator to refer to the last boundary position before the specified position.
RuleBasedBreakIterator(const RuleBasedBreakIterator &that)
Copy constructor.
virtual CharacterIterator & getText(void) const override
virtual int32_t previous(void) override
Moves the iterator backwards, to the last boundary preceding this one.
virtual void adoptText(CharacterIterator *newText) override
Set the iterator to analyze a new piece of text.
RuleBasedBreakIterator(UDataMemory *image, UErrorCode &status)
This constructor uses the udata interface to create a BreakIterator whose internal tables live in a m...
virtual const UnicodeString & getRules(void) const
Returns the description used to create this iterator.
virtual int32_t next(int32_t n) override
Advances the iterator either forward or backward the specified number of steps.
virtual void setText(UText *text, UErrorCode &status) override
Reset the break iterator to operate over the text represented by the UText.
virtual ~RuleBasedBreakIterator()
Destructor.
RuleBasedBreakIterator(const UnicodeString &rules, UParseError &parseError, UErrorCode &status)
Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
virtual const uint8_t * getBinaryRules(uint32_t &length)
Return the binary form of compiled break rules, which can then be used to create a new break iterator...
virtual int32_t hashCode(void) const
Compute a hash code for this BreakIterator.
virtual int32_t next(void) override
Advances the iterator to the next boundary position.
RuleBasedBreakIterator(const uint8_t *compiledRules, uint32_t ruleLength, UErrorCode &status)
Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
virtual void setText(const UnicodeString &newText) override
Set the iterator to analyze a new piece of text.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: uchriter.h:38
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:296
C API: Parse Error Information.
C++ API: String Character Iterator.
Immutable Unicode code point trie structure.
Definition: ucptrie.h:59
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
UText struct.
Definition: utext.h:1328
C API: Data loading interface.
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:161
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:435
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
#define UTEXT_INITIALIZER
Initializer to be used with local (stack) instances of a UText struct.
Definition: utext.h:1558
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
@ U_ZERO_ERROR
No error, no warning.
Definition: utypes.h:449
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:300