ICU 76.1
Loading...
Searching...
No Matches
normalizer2.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2013, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#ifndef __NORMALIZER2_H__
20#define __NORMALIZER2_H__
21
27#include "unicode/utypes.h"
28
29#if U_SHOW_CPLUSPLUS_API
30
31#if !UCONFIG_NO_NORMALIZATION
32
33#include "unicode/stringpiece.h"
34#include "unicode/uniset.h"
35#include "unicode/unistr.h"
36#include "unicode/unorm2.h"
37
38U_NAMESPACE_BEGIN
39
40class ByteSink;
41
86public:
92
104 static const Normalizer2 *
106
118 static const Normalizer2 *
120
132 static const Normalizer2 *
134
146 static const Normalizer2 *
148
163 static const Normalizer2 *
165
180 static const Normalizer2 *
182
204 static const Normalizer2 *
206 const char *name,
208 UErrorCode &errorCode);
209
221 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
223 normalize(src, result, errorCode);
224 return result;
225 }
239 virtual UnicodeString &
242 UErrorCode &errorCode) const = 0;
243
266 virtual void
268 Edits *edits, UErrorCode &errorCode) const;
269
284 virtual UnicodeString &
286 const UnicodeString &second,
287 UErrorCode &errorCode) const = 0;
302 virtual UnicodeString &
304 const UnicodeString &second,
305 UErrorCode &errorCode) const = 0;
306
320 virtual UBool
322
347 virtual UBool
349
365 virtual UChar32
367
376 virtual uint8_t
378
393 virtual UBool
394 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
414 virtual UBool
416
417
434 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
435
458 virtual int32_t
459 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
460
474 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
475
490 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
491
505 virtual UBool isInert(UChar32 c) const = 0;
506};
507
520public:
532 norm2(n2), set(filterSet) {}
533
539
553 virtual UnicodeString &
556 UErrorCode &errorCode) const override;
557
580 virtual void
582 Edits *edits, UErrorCode &errorCode) const override;
583
598 virtual UnicodeString &
600 const UnicodeString &second,
601 UErrorCode &errorCode) const override;
616 virtual UnicodeString &
618 const UnicodeString &second,
619 UErrorCode &errorCode) const override;
620
632 virtual UBool
634
646 virtual UBool
648
659 virtual UChar32
660 composePair(UChar32 a, UChar32 b) const override;
661
670 virtual uint8_t
671 getCombiningClass(UChar32 c) const override;
672
684 virtual UBool
685 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
705 virtual UBool
706 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
719 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
731 virtual int32_t
732 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
733
742 virtual UBool hasBoundaryBefore(UChar32 c) const override;
743
752 virtual UBool hasBoundaryAfter(UChar32 c) const override;
753
761 virtual UBool isInert(UChar32 c) const override;
762private:
764 normalize(const UnicodeString &src,
767 UErrorCode &errorCode) const;
768
769 void
770 normalizeUTF8(uint32_t options, const char *src, int32_t length,
773 UErrorCode &errorCode) const;
774
776 normalizeSecondAndAppend(UnicodeString &first,
777 const UnicodeString &second,
779 UErrorCode &errorCode) const;
780
781 const Normalizer2 &norm2;
782 const UnicodeSet &set;
783};
784
786
787#endif // !UCONFIG_NO_NORMALIZATION
788
789#endif /* U_SHOW_CPLUSPLUS_API */
790
791#endif // __NORMALIZER2_H__
A ByteSink can be filled with bytes.
Definition bytestream.h:53
Records lengths of string edits but not replacement text.
Definition edits.h:80
Normalization filtered by a UnicodeSet.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override
Tests if the UTF-8 string is normalized.
~FilteredNormalizer2()
Destructor.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override
Tests if the string is normalized.
virtual UBool isInert(UChar32 c) const override
Tests if the character is normalization-inert.
virtual UChar32 composePair(UChar32 a, UChar32 b) const override
Performs pairwise composition of a & b and returns the composite if there is one.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const override
Gets the decomposition mapping of c.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const override
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override
Tests if the string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override
Gets the raw decomposition mapping of c.
virtual uint8_t getCombiningClass(UChar32 c) const override
Gets the combining class of c.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const override
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const override
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const override
Appends the second string to the first string (merging them at the boundary) and returns the first string.
virtual UBool hasBoundaryBefore(UChar32 c) const override
Tests if the character always has a normalization boundary before it, regardless of context.
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
virtual UBool hasBoundaryAfter(UChar32 c) const override
Tests if the character always has a normalization boundary after it, regardless of context.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override
Returns the end of the normalized substring of the input string.
"Smart pointer" base class; do not use directly: use LocalPointer etc.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition normalizer2.h:85
static const Normalizer2 * getNFDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFD normalization.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
~Normalizer2()
Destructor.
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
static const Normalizer2 * getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode &errorCode)
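Returns a Normalizer2 instance which uses the specified data file (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) and which composes or decomposes text according to the specified mode.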
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
static const Normalizer2 * getNFKDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKD normalization.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const =0
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
static const Normalizer2 * getNFKCCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization which is equivalent to applying the NFKC_Casefold mappings and then NFC.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
static const Normalizer2 * getNFCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFC normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
static const Normalizer2 * getNFKCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC normalization.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
static const Normalizer2 * getNFKCSimpleCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
A string-like object that points to a sized piece of memory.
Definition stringpiece.h:61
UObject is the common ICU "boilerplate" class.
Definition uobject.h:223
A mutable set of Unicode characters and multicharacter strings.
Definition uniset.h:285
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition unistr.h:296
C++ API: StringPiece: Read-only byte string wrapper class.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:427
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition umachine.h:247
C++ API: Unicode Set.
C++ API: Unicode String.
C API: New API for Unicode Normalization.
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition unorm2.h:97
UNormalization2Mode
Constants for normalization modes.
Definition unorm2.h:48
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition uset.h:185
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition utypes.h:430
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:315