ICU 74.1
normalizer2.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2013, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#ifndef __NORMALIZER2_H__
20#define __NORMALIZER2_H__
21
27#include "unicode/utypes.h"
28
29#if U_SHOW_CPLUSPLUS_API
30
31#if !UCONFIG_NO_NORMALIZATION
32
33#include "unicode/stringpiece.h"
34#include "unicode/uniset.h"
35#include "unicode/unistr.h"
36#include "unicode/unorm2.h"
37
38U_NAMESPACE_BEGIN
39
40class ByteSink;
41
86public:
92
104 static const Normalizer2 *
106
118 static const Normalizer2 *
120
132 static const Normalizer2 *
134
146 static const Normalizer2 *
148
163 static const Normalizer2 *
165
166#ifndef U_HIDE_DRAFT_API
181 static const Normalizer2 *
183#endif // U_HIDE_DRAFT_API
184
206 static const Normalizer2 *
207 getInstance(const char *packageName,
208 const char *name,
210 UErrorCode &errorCode);
211
223 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
224 UnicodeString result;
225 normalize(src, result, errorCode);
226 return result;
227 }
241 virtual UnicodeString &
243 UnicodeString &dest,
244 UErrorCode &errorCode) const = 0;
245
268 virtual void
269 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
270 Edits *edits, UErrorCode &errorCode) const;
271
286 virtual UnicodeString &
288 const UnicodeString &second,
289 UErrorCode &errorCode) const = 0;
304 virtual UnicodeString &
306 const UnicodeString &second,
307 UErrorCode &errorCode) const = 0;
308
322 virtual UBool
323 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
324
349 virtual UBool
350 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
351
367 virtual UChar32
369
378 virtual uint8_t
380
395 virtual UBool
396 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
416 virtual UBool
418
419
436 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
437
460 virtual int32_t
461 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
462
476 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
477
492 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
493
507 virtual UBool isInert(UChar32 c) const = 0;
508};
509
522public:
533 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
534 norm2(n2), set(filterSet) {}
535
541
555 virtual UnicodeString &
557 UnicodeString &dest,
558 UErrorCode &errorCode) const override;
559
582 virtual void
583 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
584 Edits *edits, UErrorCode &errorCode) const override;
585
600 virtual UnicodeString &
602 const UnicodeString &second,
603 UErrorCode &errorCode) const override;
618 virtual UnicodeString &
620 const UnicodeString &second,
621 UErrorCode &errorCode) const override;
622
634 virtual UBool
635 getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
636
648 virtual UBool
649 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
650
661 virtual UChar32
662 composePair(UChar32 a, UChar32 b) const override;
663
672 virtual uint8_t
673 getCombiningClass(UChar32 c) const override;
674
686 virtual UBool
687 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
707 virtual UBool
708 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
721 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
733 virtual int32_t
734 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
735
744 virtual UBool hasBoundaryBefore(UChar32 c) const override;
745
754 virtual UBool hasBoundaryAfter(UChar32 c) const override;
755
763 virtual UBool isInert(UChar32 c) const override;
764private:
766 normalize(const UnicodeString &src,
767 UnicodeString &dest,
768 USetSpanCondition spanCondition,
769 UErrorCode &errorCode) const;
770
771 void
772 normalizeUTF8(uint32_t options, const char *src, int32_t length,
773 ByteSink &sink, Edits *edits,
774 USetSpanCondition spanCondition,
775 UErrorCode &errorCode) const;
776
778 normalizeSecondAndAppend(UnicodeString &first,
779 const UnicodeString &second,
780 UBool doNormalize,
781 UErrorCode &errorCode) const;
782
783 const Normalizer2 &norm2;
784 const UnicodeSet &set;
785};
786
787U_NAMESPACE_END
788
789#endif // !UCONFIG_NO_NORMALIZATION
790
791#endif /* U_SHOW_CPLUSPLUS_API */
792
793#endif // __NORMALIZER2_H__
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
Records lengths of string edits but not replacement text.
Definition: edits.h:80
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:521
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override
Tests if the UTF-8 string is normalized.
~FilteredNormalizer2()
Destructor.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override
Tests if the string is normalized.
virtual UBool isInert(UChar32 c) const override
Tests if the character is normalization-inert.
virtual UChar32 composePair(UChar32 a, UChar32 b) const override
Performs pairwise composition of a & b and returns the composite if there is one.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const override
Gets the decomposition mapping of c.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const override
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override
Tests if the string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override
Gets the raw decomposition mapping of c.
virtual uint8_t getCombiningClass(UChar32 c) const override
Gets the combining class of c.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const override
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const override
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const override
Appends the second string to the first string (merging them at the boundary) and returns the first string.
virtual UBool hasBoundaryBefore(UChar32 c) const override
Tests if the character always has a normalization boundary before it, regardless of context.
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:533
virtual UBool hasBoundaryAfter(UChar32 c) const override
Tests if the character always has a normalization boundary after it, regardless of context.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override
Returns the end of the normalized substring of the input string.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition: normalizer2.h:85
static const Normalizer2 * getNFDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFD normalization.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
~Normalizer2()
Destructor.
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
static const Normalizer2 * getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode &errorCode)
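Returns a Normalizer2 instance which uses the specified data file (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) and which composes or decomposes text according to the specified mode.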
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
static const Normalizer2 * getNFKDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKD normalization.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const =0
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
static const Normalizer2 * getNFKCCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization which is equivalent to applying the NFKC_Casefold mappings and then NFC.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:223
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
static const Normalizer2 * getNFCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFC normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
static const Normalizer2 * getNFKCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC normalization.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
static const Normalizer2 * getNFKCSimpleCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:296
C++ API: StringPiece: Read-only byte string wrapper class.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:435
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
C++ API: Unicode Set.
C++ API: Unicode String.
C API: New API for Unicode Normalization.
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:97
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:48
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition: uset.h:184
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:300