ICU 75.1 75.1
Loading...
Searching...
No Matches
uniset.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016, International Business Machines Corporation
6* and others. All Rights Reserved.
7***************************************************************************
8* Date Name Description
9* 10/20/99 alan Creation.
10***************************************************************************
11*/
12
13#ifndef UNICODESET_H
14#define UNICODESET_H
15
16#include "unicode/utypes.h"
17
18#if U_SHOW_CPLUSPLUS_API
19
20#include "unicode/ucpmap.h"
21#include "unicode/unifilt.h"
22#include "unicode/unistr.h"
23#include "unicode/uset.h"
24
30U_NAMESPACE_BEGIN
31
32// Forward Declarations.
33class BMPSet;
34class ParsePosition;
35class RBBIRuleScanner;
36class SymbolTable;
37class UnicodeSetStringSpan;
38class UVector;
39class RuleCharacterIterator;
40
286private:
291 static constexpr int32_t INITIAL_CAPACITY = 25;
292 // fFlags constant
293 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
294
295 UChar32* list = stackList; // MUST be terminated with HIGH
296 int32_t capacity = INITIAL_CAPACITY; // capacity of list
297 int32_t len = 1; // length of list used; 1 <= len <= capacity
298 uint8_t fFlags = 0; // Bit flag (see constants above)
299
300 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr.
301 UChar32* buffer = nullptr; // internal buffer, may be nullptr
302 int32_t bufferCapacity = 0; // capacity of buffer
303
313 char16_t *pat = nullptr;
314 int32_t patLen = 0;
315
316 UVector* strings = nullptr; // maintained in sorted order
317 UnicodeSetStringSpan *stringSpan = nullptr;
318
324 UChar32 stackList[INITIAL_CAPACITY];
325
326public:
336 inline UBool isBogus() const;
337
355
356public:
357
358 enum {
363 MIN_VALUE = 0,
364
369 MAX_VALUE = 0x10ffff
370 };
371
372 //----------------------------------------------------------------
373 // Constructors &c
374 //----------------------------------------------------------------
375
376public:
377
383
393
394#ifndef U_HIDE_INTERNAL_API
399 kSerialized /* result of serialize() */
400 };
401
414#endif /* U_HIDE_INTERNAL_API */
415
424 UnicodeSet(const UnicodeString& pattern,
426
427#ifndef U_HIDE_INTERNAL_API
442 UnicodeSet(const UnicodeString& pattern,
443 uint32_t options,
444 const SymbolTable* symbols,
446#endif /* U_HIDE_INTERNAL_API */
447
464 uint32_t options,
465 const SymbolTable* symbols,
467
473
478 virtual ~UnicodeSet();
479
486
498 virtual bool operator==(const UnicodeSet& o) const;
499
505 inline bool operator!=(const UnicodeSet& o) const;
506
516 virtual UnicodeSet* clone() const override;
517
525 virtual int32_t hashCode() const;
526
535 inline static UnicodeSet *fromUSet(USet *uset);
536
545 inline static const UnicodeSet *fromUSet(const USet *uset);
546
554 inline USet *toUSet();
555
556
564 inline const USet * toUSet() const;
565
566
567 //----------------------------------------------------------------
568 // Freezable API
569 //----------------------------------------------------------------
570
579 inline UBool isFrozen() const;
580
595
605
606 //----------------------------------------------------------------
607 // Public API
608 //----------------------------------------------------------------
609
620
626 static UBool resemblesPattern(const UnicodeString& pattern,
627 int32_t pos);
628
643
644#ifndef U_HIDE_INTERNAL_API
664 uint32_t options,
665 const SymbolTable* symbols,
667#endif /* U_HIDE_INTERNAL_API */
668
703 ParsePosition& pos,
704 uint32_t options,
705 const SymbolTable* symbols,
707
722 UBool escapeUnprintable = false) const override;
723
747 int32_t value,
748 UErrorCode& ec);
749
780 const UnicodeString& value,
781 UErrorCode& ec);
782
795 virtual int32_t size() const;
796
803 virtual UBool isEmpty() const;
804
810
818 virtual UBool contains(UChar32 c) const override;
819
828 virtual UBool contains(UChar32 start, UChar32 end) const;
829
837 UBool contains(const UnicodeString& s) const;
838
846 virtual UBool containsAll(const UnicodeSet& c) const;
847
856
866
875
884
893 inline UBool containsSome(UChar32 start, UChar32 end) const;
894
902 inline UBool containsSome(const UnicodeSet& s) const;
903
911 inline UBool containsSome(const UnicodeString& s) const;
912
931 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
932
945 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
946
964 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
965
979 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
980
1000
1019
1024 virtual UMatchDegree matches(const Replaceable& text,
1025 int32_t& offset,
1026 int32_t limit,
1027 UBool incremental) override;
1028
1029private:
1052 static int32_t matchRest(const Replaceable& text,
1053 int32_t start, int32_t limit,
1054 const UnicodeString& s);
1055
1065 int32_t findCodePoint(UChar32 c) const;
1066
1067public:
1068
1076 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1077
1087
1103 UChar32 charAt(int32_t index) const;
1104
1119 virtual UnicodeSet& add(UChar32 start, UChar32 end);
1120
1132
1145
1146 private:
1152 static int32_t getSingleCP(const UnicodeString& s);
1153
1154 void _add(const UnicodeString& s);
1155
1156 public:
1166
1175
1184
1193
1203
1204
1213
1225 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1226
1227
1237
1249
1263 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1264
1276
1287
1301
1314 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1315
1327
1338
1351 virtual UnicodeSet& addAll(const UnicodeSet& c);
1352
1364 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1365
1377 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1378
1390
1397 virtual UnicodeSet& clear();
1398
1427
1435
1443 virtual int32_t getRangeCount() const;
1444
1452 virtual UChar32 getRangeStart(int32_t index) const;
1453
1461 virtual UChar32 getRangeEnd(int32_t index) const;
1462
1512
1520
1533
1542 virtual UClassID getDynamicClassID() const override;
1543
1544 private:
1545
1546 // Private API for the USet API
1547
1548 friend class USetAccess;
1549
1550 const UnicodeString* getString(int32_t index) const;
1551
1552 //----------------------------------------------------------------
1553 // RuleBasedTransliterator support
1554 //----------------------------------------------------------------
1555
1556private:
1557
1563 virtual UBool matchesIndexValue(uint8_t v) const override;
1564
1565private:
1566 friend class RBBIRuleScanner;
1567
1568 //----------------------------------------------------------------
1569 // Implementation: Clone as thawed (see ICU4J Freezable)
1570 //----------------------------------------------------------------
1571
1572 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1573 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1574
1575 //----------------------------------------------------------------
1576 // Implementation: Pattern parsing
1577 //----------------------------------------------------------------
1578
1579 void applyPatternIgnoreSpace(const UnicodeString& pattern,
1580 ParsePosition& pos,
1581 const SymbolTable* symbols,
1583
1584 void applyPattern(RuleCharacterIterator& chars,
1585 const SymbolTable* symbols,
1587 uint32_t options,
1589 int32_t depth,
1590 UErrorCode& ec);
1591
1592 void closeOverCaseInsensitive(bool simple);
1593 void closeOverAddCaseMappings();
1594
1595 //----------------------------------------------------------------
1596 // Implementation: Utility methods
1597 //----------------------------------------------------------------
1598
1599 static int32_t nextCapacity(int32_t minCapacity);
1600
1601 bool ensureCapacity(int32_t newLen);
1602
1603 bool ensureBufferCapacity(int32_t newLen);
1604
1605 void swapBuffers();
1606
1607 UBool allocateStrings(UErrorCode &status);
1608 int32_t stringsSize() const;
1609 UBool stringsContains(const UnicodeString &s) const;
1610
1611 UnicodeString& _toPattern(UnicodeString& result,
1612 UBool escapeUnprintable) const;
1613
1614 UnicodeString& _generatePattern(UnicodeString& result,
1615 UBool escapeUnprintable) const;
1616
1617 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1618
1619 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1620
1621 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1623
1624 //----------------------------------------------------------------
1625 // Implementation: Fundamental operators
1626 //----------------------------------------------------------------
1627
1628 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1629
1630 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1631
1632 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1633
1639 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1640 int32_t pos);
1641
1642 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1644
1684 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1686 UErrorCode &ec);
1687
1688 void applyPropertyPattern(RuleCharacterIterator& chars,
1690 UErrorCode& ec);
1691
1696 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1697
1707 void applyFilter(Filter filter,
1708 void* context,
1709 const UnicodeSet* inclusions,
1711
1715 void setPattern(const UnicodeString& newPat) {
1716 setPattern(newPat.getBuffer(), newPat.length());
1717 }
1718 void setPattern(const char16_t *newPat, int32_t newPatLen);
1722 void releasePattern();
1723
1724 friend class UnicodeSetIterator;
1725};
1726
1727
1728
1729inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1730 return !operator==(o);
1731}
1732
1733inline UBool UnicodeSet::isFrozen() const {
1734 return (UBool)(bmpSet!=nullptr || stringSpan!=nullptr);
1735}
1736
1737inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1738 return !containsNone(start, end);
1739}
1740
1741inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1742 return !containsNone(s);
1743}
1744
1745inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1746 return !containsNone(s);
1747}
1748
1749inline UBool UnicodeSet::isBogus() const {
1750 return (UBool)(fFlags & kIsBogus);
1751}
1752
1753inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1754 return reinterpret_cast<UnicodeSet *>(uset);
1755}
1756
1757inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1758 return reinterpret_cast<const UnicodeSet *>(uset);
1759}
1760
1761inline USet *UnicodeSet::toUSet() {
1762 return reinterpret_cast<USet *>(this);
1763}
1764
1765inline const USet *UnicodeSet::toUSet() const {
1766 return reinterpret_cast<const USet *>(this);
1767}
1768
1769inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1771 if(start<0) {
1772 start=0;
1773 } else if(start>sLength) {
1774 start=sLength;
1775 }
1776 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1777}
1778
1779inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1781 if(limit<0) {
1782 limit=0;
1783 } else if(limit>sLength) {
1784 limit=sLength;
1785 }
1786 return spanBack(s.getBuffer(), limit, spanCondition);
1787}
1788
1790
1791#endif /* U_SHOW_CPLUSPLUS_API */
1792
1793#endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition RunArrays.h:32
"Smart pointer" base class; do not use directly: use LocalPointer etc.
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition parsepos.h:52
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition rep.h:77
An interface that defines both lookup protocol and parsing of symbolic names.
Definition symtable.h:59
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition unifilt.h:65
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns true if this matcher will match a character c, where c & 0xFF == v, at offset,...
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition usetiter.h:67
A mutable set of Unicode characters and multicharacter strings.
Definition uniset.h:285
UnicodeSet & operator=(const UnicodeSet &o)
Assigns this object to be a copy of another.
UnicodeSet & addAll(const UnicodeString &s)
Adds each of the characters in this string to the set.
virtual UChar32 getRangeEnd(int32_t index) const
Iteration method that returns the last character in the specified range of this set.
UnicodeSet()
Constructs an empty set.
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeSet(UChar32 start, UChar32 end)
Constructs a set containing the given range.
static UnicodeSet * createFromAll(const UnicodeString &s)
Makes a set from each of the characters in the string.
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
UnicodeSet & complementAll(const UnicodeString &s)
Complement EACH of the characters in this string.
virtual UBool isEmpty() const
Returns true if this set contains no elements.
void setToBogus()
Make this UnicodeSet object invalid.
virtual UnicodeSet * clone() const override
Returns a copy of this object.
UnicodeSet & applyPropertyAlias(const UnicodeString &prop, const UnicodeString &value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given property.
virtual UnicodeSet & removeAll(const UnicodeSet &c)
Removes from this set all of its elements that are contained in the specified set.
virtual UnicodeSet & remove(UChar32 start, UChar32 end)
Removes the specified range from this set if it is present.
virtual UClassID getDynamicClassID() const override
Implement UnicodeFunctor API.
virtual int32_t size() const
Returns the number of elements in this set (its cardinality).
UnicodeSet & applyPattern(const UnicodeString &pattern, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, ignoring Unicode Pattern_White...
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const override
Implementation of UnicodeMatcher API.
virtual UChar32 getRangeStart(int32_t index) const
Iteration method that returns the first character in the specified range of this set.
UnicodeSet & add(const UnicodeString &s)
Adds the specified multicharacter string to this set if it is not already present.
UnicodeSet & applyPattern(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, optionally ignoring Unicode Pa...
UnicodeSet * cloneAsThawed() const
Clone the set and make the clone mutable.
int32_t indexOf(UChar32 c) const
Returns the index of the given character within this set, where the set is ordered by ascending code ...
UnicodeSet & closeOver(int32_t attribute)
Close this set over the given attribute.
UnicodeSet(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual UnicodeSet & complementAll(const UnicodeSet &c)
Complements in this set all elements contained in the specified set.
UnicodeSet & complement(const UnicodeString &s)
Complement the specified string in this set.
virtual UBool containsAll(const UnicodeSet &c) const
Returns true if this set contains all the characters and strings of the given set.
UBool containsNone(const UnicodeString &s) const
Returns true if this set contains none of the characters of the given string.
UnicodeSet(const UnicodeSet &o)
Constructs a set that is identical to the given UnicodeSet.
static UBool resemblesPattern(const UnicodeString &pattern, int32_t pos)
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet patt...
virtual ~UnicodeSet()
Destructs the set.
UnicodeSet & retain(UChar32 c)
Retain the specified character from this set if it is present.
virtual UnicodeSet & compact()
Reallocate this object's internal structures to take up the least possible space, without changing thi...
int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
UnicodeSet & retainAll(const UnicodeString &s)
Retains EACH of the characters in this string.
virtual UnicodeSet & removeAllStrings()
Remove all strings from this set.
UChar32 charAt(int32_t index) const
Returns the character at the given index within this set, where the set is ordered by ascending code ...
static UClassID getStaticClassID()
Return the class ID for this class.
UBool containsNone(UChar32 start, UChar32 end) const
Returns true if this set contains none of the characters of the given range.
virtual UBool contains(UChar32 start, UChar32 end) const
Returns true if this set contains every character of the given range.
virtual UnicodeSet & clear()
Removes all of the elements from this set.
virtual int32_t getRangeCount() const
Iteration method that returns the number of ranges contained in this set.
UBool containsAll(const UnicodeString &s) const
Returns true if this set contains all the characters of the given string.
UnicodeSet & removeAll(const UnicodeString &s)
Remove EACH of the characters in this string.
int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode &ec) const
Serializes this set into an array of 16-bit integers.
UnicodeSet & set(UChar32 start, UChar32 end)
Make this object represent the range start - end.
UBool contains(const UnicodeString &s) const
Returns true if this set contains the given multicharacter string.
virtual UnicodeSet & complement(UChar32 start, UChar32 end)
Complements the specified range in this set.
static UnicodeSet * createFrom(const UnicodeString &s)
Makes a set from a multicharacter string.
virtual int32_t hashCode() const
Returns the hash code value for this set.
virtual bool operator==(const UnicodeSet &o) const
Compares the specified object with this set for equality.
UnicodeSet(const UnicodeString &pattern, UErrorCode &status)
Constructs a set from the given pattern.
UBool hasStrings() const
UnicodeSet & remove(UChar32 c)
Removes the specified character from this set if it is present.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=false) const override
Returns a string representation of this set.
virtual UnicodeSet & complement()
This is equivalent to complement(MIN_VALUE, MAX_VALUE).
UnicodeSet & add(UChar32 c)
Adds the specified character to this set if it is not already present.
UnicodeSet & applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given binary or enu...
int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeSet & remove(const UnicodeString &s)
Removes the specified string from this set if it is present.
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental) override
Implement UnicodeMatcher::matches()
virtual UnicodeSet & add(UChar32 start, UChar32 end)
Adds the specified range to this set if it is not already present.
UnicodeSet * freeze()
Freeze the set (make it immutable).
UnicodeSet & applyPattern(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Parses the given pattern, starting at the given position.
virtual UBool contains(UChar32 c) const override
Returns true if this set contains the given character.
UnicodeSet & retain(const UnicodeString &s)
Retains only the specified string from this set if it is present.
UnicodeSet(const uint16_t buffer[], int32_t bufferLen, ESerialization serialization, UErrorCode &status)
Constructs a set from the output of serialize().
virtual UnicodeSet & retain(UChar32 start, UChar32 end)
Retain only the elements in this set that are contained in the specified range.
virtual UnicodeSet & addAll(const UnicodeSet &c)
Adds all of the elements in the specified set to this set if they're not already present.
UnicodeSet(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual UnicodeSet & retainAll(const UnicodeSet &c)
Retains only the elements in this set that are contained in the specified set.
UnicodeSet & complement(UChar32 c)
Complements the specified character in this set.
UBool containsNone(const UnicodeSet &c) const
Returns true if this set contains none of the characters and strings of the given set.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition unistr.h:296
int32_t length() const
Return the length of the UnicodeString object.
Definition unistr.h:3901
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition unimatch.h:33
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
UProperty
Selection constants for Unicode properties.
Definition uchar.h:196
C API: This file defines an abstract map from Unicode code points to integer values.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:427
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition umachine.h:247
C++ API: Unicode Filter.
C++ API: Unicode String.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition uobject.h:96
C API: Unicode Set.
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition uset.h:182
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition uset.h:50
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition utypes.h:415
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:300