C API: Abstract Unicode Text API. More...

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/localpointer.h"
#include "unicode/rep.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"

Data Structures
struct	UTextFuncs
	(public) Function dispatch table for UText. More...

struct	UText
	UText struct. More...

Namespaces
	icu
	File coll.h.

Macros
#define	UTEXT_CURRENT32(ut)
	inline version of utext_current32(), for performance-critical situations. More...

#define	UTEXT_NEXT32(ut)
	inline version of utext_next32(), for performance-critical situations. More...

#define	UTEXT_PREVIOUS32(ut)
	inline version of utext_previous32(), for performance-critical situations. More...

#define	UTEXT_GETNATIVEINDEX(ut)
	inline version of utext_getNativeIndex(), for performance-critical situations. More...

#define	UTEXT_SETNATIVEINDEX(ut, ix)
	inline version of utext_setNativeIndex(), for performance-critical situations. More...

#define	UTEXT_INITIALIZER
	initializer to be used with local (stack) instances of a UText struct. More...

Typedefs
typedef struct UText	UText
	C typedef for struct UText. More...

typedef UText *	UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
	Function type declaration for UText.clone(). More...

typedef int64_t	UTextNativeLength(UText *ut)
	Function type declaration for UText.nativeLength(). More...

typedef UBool	UTextAccess(UText *ut, int64_t nativeIndex, UBool forward)
	Function type declaration for UText.access(). More...

typedef int32_t	UTextExtract(UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status)
	Function type declaration for UText.extract(). More...

typedef int32_t	UTextReplace(UText *ut, int64_t nativeStart, int64_t nativeLimit, const UChar *replacementText, int32_t replacmentLength, UErrorCode *status)
	Function type declaration for UText.replace(). More...

typedef void	UTextCopy(UText *ut, int64_t nativeStart, int64_t nativeLimit, int64_t nativeDest, UBool move, UErrorCode *status)
	Function type declaration for UText.copy(). More...

typedef int64_t	UTextMapOffsetToNative(const UText *ut)
	Function type declaration for UText.mapOffsetToNative(). More...

typedef int32_t	UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex)
	Function type declaration for UText.mapIndexToUTF16(). More...

typedef void	UTextClose(UText *ut)
	Function type declaration for UText.utextClose(). More...

typedef struct UTextFuncs	UTextFuncs
	Function dispatch table for UText. More...

Enumerations
enum	{ UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1 , UTEXT_PROVIDER_STABLE_CHUNKS = 2 , UTEXT_PROVIDER_WRITABLE = 3 , UTEXT_PROVIDER_HAS_META_DATA = 4 , UTEXT_PROVIDER_OWNS_TEXT = 5 }
	UText provider properties (bit field indexes). More...

enum	{ UTEXT_MAGIC = 0x345ad82c }

Functions
U_CAPI UText *	utext_close (UText *ut)
	Close function for UText instances. More...

U_CAPI UText *	utext_openUTF8 (UText *ut, const char *s, int64_t length, UErrorCode *status)
	Open a read-only UText implementation for UTF-8 strings. More...

U_CAPI UText *	utext_openUChars (UText *ut, const UChar *s, int64_t length, UErrorCode *status)
	Open a read-only UText for UChar * string. More...

U_CAPI UText *	utext_openUnicodeString (UText *ut, icu::UnicodeString *s, UErrorCode *status)
	Open a writable UText for a non-const UnicodeString. More...

U_CAPI UText *	utext_openConstUnicodeString (UText *ut, const icu::UnicodeString *s, UErrorCode *status)
	Open a UText for a const UnicodeString. More...

U_CAPI UText *	utext_openReplaceable (UText *ut, icu::Replaceable *rep, UErrorCode *status)
	Open a writable UText implementation for an ICU Replaceable object. More...

U_CAPI UText *	utext_openCharacterIterator (UText *ut, icu::CharacterIterator *ci, UErrorCode *status)
	Open a UText implementation over an ICU CharacterIterator. More...

U_CAPI UText *	utext_clone (UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status)
	Clone a UText. More...

U_CAPI UBool	utext_equals (const UText *a, const UText *b)
	Compare two UText objects for equality. More...

U_CAPI int64_t	utext_nativeLength (UText *ut)
	Get the length of the text. More...

U_CAPI UBool	utext_isLengthExpensive (const UText *ut)
	Return true if calculating the length of the text could be expensive. More...

U_CAPI UChar32	utext_char32At (UText *ut, int64_t nativeIndex)
	Returns the code point at the requested index, or U_SENTINEL (-1) if it is out of bounds. More...

U_CAPI UChar32	utext_current32 (UText *ut)
	Get the code point at the current iteration position, or U_SENTINEL (-1) if the iteration has reached the end of the input text. More...

U_CAPI UChar32	utext_next32 (UText *ut)
	Get the code point at the current iteration position of the UText, and advance the position to the first index following the character. More...

U_CAPI UChar32	utext_previous32 (UText *ut)
	Move the iterator position to the character (code point) whose index precedes the current position, and return that character. More...

U_CAPI UChar32	utext_next32From (UText *ut, int64_t nativeIndex)
	Set the iteration index and return the code point at that index. More...

U_CAPI UChar32	utext_previous32From (UText *ut, int64_t nativeIndex)
	Set the iteration index, and return the code point preceding the one specified by the initial index. More...

U_CAPI int64_t	utext_getNativeIndex (const UText *ut)
	Get the current iterator position, which can range from 0 to the length of the text. More...

U_CAPI void	utext_setNativeIndex (UText *ut, int64_t nativeIndex)
	Set the current iteration position to the nearest code point boundary at or preceding the specified index. More...

U_CAPI UBool	utext_moveIndex32 (UText *ut, int32_t delta)
	Move the iterator position by delta code points. More...

U_CAPI int64_t	utext_getPreviousNativeIndex (UText *ut)
	Get the native index of the character preceding the current position. More...

U_CAPI int32_t	utext_extract (UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status)
	Extract text from a UText into a UChar buffer. More...

U_CAPI UBool	utext_isWritable (const UText *ut)
	Return true if the text can be written (modified) with utext_replace() or utext_copy(). More...

U_CAPI UBool	utext_hasMetaData (const UText *ut)
	Test whether there is meta data associated with the text. More...

U_CAPI int32_t	utext_replace (UText *ut, int64_t nativeStart, int64_t nativeLimit, const UChar *replacementText, int32_t replacementLength, UErrorCode *status)
	Replace a range of the original text with a replacement text. More...

U_CAPI void	utext_copy (UText *ut, int64_t nativeStart, int64_t nativeLimit, int64_t destIndex, UBool move, UErrorCode *status)
	Copy or move a substring from one position to another within the text, while retaining any metadata associated with the text. More...

U_CAPI void	utext_freeze (UText *ut)

U_CAPI UText *	utext_setup (UText *ut, int32_t extraSpace, UErrorCode *status)
	Common function for use by Text Provider implementations to allocate and/or initialize a new UText struct. More...

Detailed Description

C API: Abstract Unicode Text API.

The Text Access API provides a means to allow text that is stored in alternative formats to work with ICU services. ICU normally operates on text that is stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type UnicodeString for C++ APIs.

ICU Text Access allows other formats, such as UTF-8 or non-contiguous UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services.

There are three general classes of usage for UText:

Application Level Use.  This is the simplest usage - applications would
use one of the utext_open() functions on their input text, and pass
the resulting UText to the desired ICU service.

Second is usage in ICU Services, such as break iteration, that will need to
operate on input presented to them as a UText.  These implementations
will need to use the iteration and related UText functions to gain
access to the actual text.

The third class of UText users are "text providers."  These are the
UText implementations for the various text storage formats.  An application
or system with a unique text storage format can implement a set of
UText provider functions for that format, which will then allow
ICU services to operate on that format.

Iterating over text

Here is sample code for a forward iteration over the contents of a UText

UChar32  c;
UText    *ut = whatever();
 
for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) {
   // do whatever with the codepoint c here.
}

And here is similar code to iterate in the reverse direction, from the end of the text towards the beginning.

UChar32  c;
UText    *ut = whatever();
int64_t  textLength = utext_nativeLength(ut);
for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) {
   // do whatever with the codepoint c here.
}

Characters and Indexing

Indexing into text by UText functions is nearly always in terms of the native indexing of the underlying text storage. The storage format could be UTF-8 or UTF-32, for example. When coding to the UText access API, no assumptions can be made regarding the size of characters, or how far an index may move when iterating between characters.

All indices supplied to UText functions are pinned to the length of the text. An out-of-bounds index is not considered to be an error, but is adjusted to be in the range 0 <= index <= length of input text.

When an index position is returned from a UText function, it will be a native index to the underlying text. In the case of multi-unit characters, it will always refer to the first position of the character, never to the interior. This is essentially the same thing as saying that a returned index will always point to a boundary between characters.

When a native index is supplied to a UText function, all indices that refer to any part of a multi-unit character representation are considered to be equivalent. In the case of multi-unit characters, an incoming index will be logically normalized to refer to the start of the character.

It is possible to test whether a native index is on a code point boundary by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). If the index is returned unchanged, it was on a code point boundary. If an adjusted index is returned, the original index referred to the interior of a character.

Conventions for calling UText functions

Most UText access functions have as their first parameter a (UText *) pointer, which specifies the UText to be used. Unless otherwise noted, the pointer must refer to a valid, open UText. Attempting to use a closed UText or passing a NULL pointer is a programming error and will produce undefined results or NULL pointer exceptions.

The UText_Open family of functions can either open an existing (closed) UText, or heap allocate a new UText. Here is sample code for creating a stack-allocated UText.

char     *s = whatever();  // A utf-8 string 
UErrorCode status = U_ZERO_ERROR;
UText    ut = UTEXT_INITIALIZER;
utext_openUTF8(&ut, s, -1, &status);
if (U_FAILURE(status)) {
    // error handling
} else {
    // work with the UText
}

Any existing UText passed to an open function must have been initialized, either by the UTEXT_INITIALIZER, or by having been originally heap-allocated by an open function. Passing NULL will cause the open function to heap-allocate and fully initialize a new UText.

Definition in file utext.h.

Macro Definition Documentation

◆ UTEXT_CURRENT32

#define UTEXT_CURRENT32 ( ut )

Value:

((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \

((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut))

utext_current32

U_CAPI UChar32 utext_current32(UText *ut)

Get the code point at the current iteration position, or U_SENTINEL (-1) if the iteration has reached...

inline version of utext_current32(), for performance-critical situations.

Get the code point at the current iteration position of the UText. Returns U_SENTINEL (-1) if the position is at the end of the text.

Internal:: Do not use. This API is for internal use only. ICU 4.4 technology preview

Definition at line 687 of file utext.h.

◆ UTEXT_GETNATIVEINDEX

#define UTEXT_GETNATIVEINDEX ( ut )

Value:

    ((ut)->chunkOffset <= (ut)->nativeIndexingLimit?   \
        (ut)->chunkNativeStart+(ut)->chunkOffset :     \
        (ut)->pFuncs->mapOffsetToNative(ut))

inline version of utext_getNativeIndex(), for performance-critical situations.

Get the current iterator position, which can range from 0 to the length of the text. The position is a native index into the input text, in whatever format it may have (possibly UTF-8 for example), and may not always be the same as the corresponding UChar (UTF-16) index. The returned position will always be aligned to a code point boundary.

Stable:: ICU 3.6

Definition at line 734 of file utext.h.

◆ UTEXT_INITIALIZER

#define UTEXT_INITIALIZER

Value:

                  {                                        \
                  UTEXT_MAGIC,          /* magic                */ \
                  0,                    /* flags                */ \
                  0,                    /* providerProps        */ \
                  sizeof(UText),        /* sizeOfStruct         */ \
                  0,                    /* chunkNativeLimit     */ \
                  0,                    /* extraSize            */ \
                  0,                    /* nativeIndexingLimit  */ \
                  0,                    /* chunkNativeStart     */ \
                  0,                    /* chunkOffset          */ \
                  0,                    /* chunkLength          */ \
                  NULL,                 /* chunkContents        */ \
                  NULL,                 /* pFuncs               */ \
                  NULL,                 /* pExtra               */ \
                  NULL,                 /* context              */ \
                  NULL, NULL, NULL,     /* p, q, r              */ \
                  NULL,                 /* privP                */ \
                  0, 0, 0,              /* a, b, c              */ \
                  0, 0, 0               /* privA,B,C,           */ \
                  }

initializer to be used with local (stack) instances of a UText struct.

UText structs must be initialized before passing them to one of the utext_open functions.

Stable:: ICU 3.6

Definition at line 1558 of file utext.h.

◆ UTEXT_NEXT32

#define UTEXT_NEXT32 ( ut )

Value:

((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \

((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut))

inline version of utext_next32(), for performance-critical situations.

Get the code point at the current iteration position of the UText, and advance the position to the first index following the character. This is a post-increment operation. Returns U_SENTINEL (-1) if the position is at the end of the text.

Stable:: ICU 3.4

Definition at line 703 of file utext.h.

◆ UTEXT_PREVIOUS32

#define UTEXT_PREVIOUS32 ( ut )

Value:

    ((ut)->chunkOffset > 0 && \
     (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \
          (ut)->chunkContents[--((ut)->chunkOffset)]  :  utext_previous32(ut))

inline version of utext_previous32(), for performance-critical situations.

Move the iterator position to the character (code point) whose index precedes the current position, and return that character. This is a pre-decrement operation. Returns U_SENTINEL (-1) if the position is at the start of the text.

Stable:: ICU 3.4

Definition at line 717 of file utext.h.

◆ UTEXT_SETNATIVEINDEX

#define UTEXT_SETNATIVEINDEX	(	ut,
		ix
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int64_t __offset = (ix) - (ut)->chunkNativeStart; \
    if (__offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \
        (ut)->chunkOffset=(int32_t)__offset; \
    } else { \
        utext_setNativeIndex((ut), (ix)); \
    } \
} UPRV_BLOCK_MACRO_END

inline version of utext_setNativeIndex(), for performance-critical situations.

Set the current iteration position to the nearest code point boundary at or preceding the specified index. The index is in the native units of the original input text. If the index is out of range, it will be pinned to be within the range of the input text.

Stable:: ICU 3.8

Definition at line 750 of file utext.h.

Typedef Documentation

◆ UText

typedef struct UText UText

C typedef for struct UText.

Stable:: ICU 3.6

Definition at line 1 of file utext.h.

◆ UTextAccess

typedef UBool UTextAccess(UText *ut, int64_t nativeIndex, UBool forward)

Function type declaration for UText.access().

Get the description of the text chunk containing the text at a requested native index. The UText's iteration position will be left at the requested index. If the index is out of bounds, the iteration position will be left at the start or end of the string, as appropriate.

Chunks must begin and end on code point boundaries. A single code point comprised of multiple storage units must never span a chunk boundary.

Parameters

ut	the UText being accessed.
nativeIndex	Requested index of the text to be accessed.
forward	If true, then the returned chunk must contain text starting from the index, so that start<=index<limit. If false, then the returned chunk must contain text before the index, so that start<index<=limit.

Returns: True if the requested index could be accessed. The chunk will contain the requested text. False value if a chunk cannot be accessed (the requested index is out of bounds).

See also: UText

Stable:: ICU 3.4

Definition at line 1024 of file utext.h.

◆ UTextClone

typedef UText* UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)

Function type declaration for UText.clone().

clone a UText. Much like opening a UText where the source text is itself another UText.

A deep clone will copy both the UText data structures and the underlying text. The original and cloned UText will operate completely independently; modifications made to the text in one will not affect the other. Text providers are not required to support deep clones. The user of clone() must check the status return and be prepared to handle failures.

A shallow clone replicates only the UText data structures; it does not make a copy of the underlying text. Shallow clones can be used as an efficient way to have multiple iterators active in a single text string that is not being modified.

A shallow clone operation must not fail except for truly exceptional conditions such as memory allocation failures.

A UText and its clone may be safely concurrently accessed by separate threads. This is true for both shallow and deep clones. It is the responsibility of the Text Provider to ensure that this thread safety constraint is met.

Parameters

dest	A UText struct to be filled in with the result of the clone operation, or NULL if the clone function should heap-allocate a new UText struct.
src	The UText to be cloned.
deep	true to request a deep clone, false for a shallow clone.
status	Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR should be returned if the text provider is unable to clone the original text.

Returns: The newly created clone, or NULL if the clone operation failed.

Stable:: ICU 3.4

Definition at line 984 of file utext.h.

◆ UTextClose

typedef void UTextClose(UText *ut)

Function type declaration for UText.utextClose().

A Text Provider close function is only required for provider types that make allocations in their open function (or other functions) that must be cleaned when the UText is closed.

The allocation of the UText struct itself and any "extra" storage associated with the UText is handled by the common UText implementation and does not require provider specific cleanup in a close function.

Most UText provider implementations do not need to implement this function.

Parameters

ut	A UText object to be closed.

Stable:: ICU 3.4

Definition at line 1182 of file utext.h.

◆ UTextCopy

typedef void UTextCopy(UText *ut, int64_t nativeStart, int64_t nativeLimit, int64_t nativeDest, UBool move, UErrorCode *status)

Function type declaration for UText.copy().

Copy or move a substring from one position to another within the text, while retaining any metadata associated with the text. This function is used to duplicate or reorder substrings. The destination index must not overlap the source range.

The text to be copied or moved is inserted at destIndex; it does not replace or overwrite any existing text.

This function need only be implemented for UText types that support writing.

When using this function, there should be only a single UText opened onto the underlying native text string. The function is responsible for updating the text chunk within the UText to reflect the updated iteration position, taking into account any changes to the underlying string's structure caused by the copy or move operation.

Parameters

ut	The UText representing the text to be operated on.
nativeStart	The index of the start of the region to be copied or moved
nativeLimit	The index of the character following the region to be copied or moved.
nativeDest	The destination index to which the source substring is copied or moved.
move	If true, then the substring is moved, not copied/duplicated.
status	receives any error status. Possible errors include U_NO_WRITE_PERMISSION

Stable:: ICU 3.4

Definition at line 1123 of file utext.h.

◆ UTextExtract

typedef int32_t UTextExtract(UText *ut, int64_t nativeStart, int64_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status)

Function type declaration for UText.extract().

Extract text from a UText into a UChar buffer. The range of text to be extracted is specified in the native indices of the UText provider. These may not necessarily be UTF-16 indices.

The size (number of 16 bit UChars) in the data to be extracted is returned. The full amount is returned, even when the specified buffer size is smaller.

The extracted string will (if you are a user) / must (if you are a text provider) be NUL-terminated if there is sufficient space in the destination buffer.

Parameters

ut	the UText from which to extract data.
nativeStart	the native index of the first character to extract.
nativeLimit	the native string index of the position following the last character to extract.
dest	the UChar (UTF-16) buffer into which the extracted text is placed
destCapacity	The size, in UChars, of the destination buffer. May be zero for precomputing the required size.
status	receives any error status. If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for preflighting.

Returns: Number of UChars in the data. Does not include a trailing NUL.

Stable:: ICU 3.4

Definition at line 1054 of file utext.h.

◆ UTextFuncs

typedef struct UTextFuncs UTextFuncs

Function dispatch table for UText.

See also: UTextFuncs

Definition at line 1182 of file utext.h.

◆ UTextMapNativeIndexToUTF16

typedef int32_t UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex)

Function type declaration for UText.mapIndexToUTF16().

Map from a native index to a UChar offset within a text chunk. Behavior is undefined if the native index does not fall within the current chunk.

This function is required only for text providers that do not use native UTF-16 indexes.

Parameters

ut	The UText containing the text chunk.
nativeIndex	Absolute (native) text index, chunk->start<=index<=chunk->limit.

Returns: Chunk-relative UTF-16 offset corresponding to the specified native index.

Stable:: ICU 3.4

Definition at line 1161 of file utext.h.

◆ UTextMapOffsetToNative

typedef int64_t UTextMapOffsetToNative(const UText *ut)

Function type declaration for UText.mapOffsetToNative().

Map from the current UChar offset within the current text chunk to the corresponding native index in the original source text.

This is required only for text providers that do not use native UTF-16 indexes.

Parameters

ut	the UText.

Returns: Absolute (native) index corresponding to chunkOffset in the current chunk. The returned native index should always be to a code point boundary.

Stable:: ICU 3.4

Definition at line 1143 of file utext.h.

◆ UTextNativeLength

typedef int64_t UTextNativeLength(UText *ut)

Function type declaration for UText.nativeLength().

Parameters

ut	the UText to get the length of.

Returns: the length, in the native units of the original text string.

See also: UText

Stable:: ICU 3.4

Definition at line 996 of file utext.h.

◆ UTextReplace

typedef int32_t UTextReplace(UText *ut, int64_t nativeStart, int64_t nativeLimit, const UChar *replacementText, int32_t replacmentLength, UErrorCode *status)

Function type declaration for UText.replace().

Replace a range of the original text with a replacement text.

Leaves the current iteration position at the position following the newly inserted replacement text.

This function need only be implemented on UText types that support writing.

When using this function, there should be only a single UText opened onto the underlying native text string. The function is responsible for updating the text chunk within the UText to reflect the updated iteration position, taking into account any changes to the underlying string's structure caused by the replace operation.

Parameters

ut	the UText representing the text to be operated on.
nativeStart	the index of the start of the region to be replaced
nativeLimit	the index of the character following the region to be replaced.
replacementText	pointer to the replacement text
replacmentLength	length of the replacement text in UChars, or -1 if the text is NUL terminated.
status	receives any error status. Possible errors include U_NO_WRITE_PERMISSION

Returns: The signed number of (native) storage units by which the length of the text expanded or contracted.

Stable:: ICU 3.4

Definition at line 1089 of file utext.h.

Enumeration Type Documentation

◆ anonymous enum

anonymous enum

UText provider properties (bit field indexes).

See also: UText

Stable:: ICU 3.4

Enumerator
UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE	It is potentially time consuming for the provider to determine the length of the text. Stable: ICU 3.4
UTEXT_PROVIDER_STABLE_CHUNKS	Text chunks remain valid and usable until the text object is modified or deleted, not just until the next time the access() function is called (which is the default). Stable: ICU 3.4
UTEXT_PROVIDER_WRITABLE	The provider supports modifying the text via the replace() and copy() functions. See also Replaceable Stable: ICU 3.4
UTEXT_PROVIDER_HAS_META_DATA	There is meta data associated with the text. See also Replaceable::hasMetaData() Stable: ICU 3.4
UTEXT_PROVIDER_OWNS_TEXT	Text provider owns the text storage. Generally occurs as the result of a deep clone of the UText. When closing the UText, the associated text must also be closed/deleted/freed/ whatever is appropriate. Stable: ICU 3.6

Definition at line 910 of file utext.h.

◆ anonymous enum

anonymous enum

Internal:: Do not use.

This API is for internal use only. Value used to help identify correctly initialized UText structs. Note: must be publicly visible so that UTEXT_INITIALIZER can access it.

Definition at line 1547 of file utext.h.

Function Documentation

◆ utext_char32At()

U_CAPI UChar32 utext_char32At	(	UText *	ut,
		int64_t	nativeIndex
	)

Returns the code point at the requested index, or U_SENTINEL (-1) if it is out of bounds.

If the specified index points to the interior of a multi-unit character - one of the trail bytes of a UTF-8 sequence, for example - the complete code point will be returned.

The iteration position will be set to the start of the returned code point.

This function is roughly equivalent to the sequence utext_setNativeIndex(index); utext_current32(); (There is a subtle difference if the index is out of bounds by being less than zero - utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() will return the char at zero. utext_char32At(negative index), on the other hand, will return the U_SENTINEL value of -1.)

Parameters

ut	the text to be accessed
nativeIndex	the native index of the character to be accessed. If the index points to other than the first unit of a multi-unit character, it will be adjusted to the start of the character.

Returns: the code point at the specified index.

Stable:: ICU 3.4

◆ utext_clone()

U_CAPI UText* utext_clone	(	UText *	dest,
		const UText *	src,
		UBool	deep,
		UBool	readOnly,
		UErrorCode *	status
	)

Clone a UText.

This is much like opening a UText where the source text is itself another UText.

A deep clone will copy both the UText data structures and the underlying text. The original and cloned UText will operate completely independently; modifications made to the text in one will not affect the other. Text providers are not required to support deep clones. The user of clone() must check the status return and be prepared to handle failures.

The standard UText implementations for UTF8, UChar *, UnicodeString and Replaceable all support deep cloning.

The UText returned from a deep clone will be writable, assuming that the text provider is able to support writing, even if the source UText had been made non-writable by means of utext_freeze().

A shallow clone replicates only the UText data structures; it does not make a copy of the underlying text. Shallow clones can be used as an efficient way to have multiple iterators active in a single text string that is not being modified.

A shallow clone operation will not fail, barring truly exceptional conditions such as memory allocation failures.

Shallow UText clones should be avoided if the UText functions that modify the text are expected to be used, either on the original or the cloned UText. Any such modifications can cause unpredictable behavior. Read Only shallow clones provide some protection against errors of this type by disabling text modification via the cloned UText.

A shallow clone made with the readOnly parameter == false will preserve the utext_isWritable() state of the source object. Note, however, that write operations must be avoided while more than one UText exists that refers to the same underlying text.

A UText and its clone may be safely concurrently accessed by separate threads. This is true for read access only with shallow clones, and for both read and write access with deep clones. It is the responsibility of the Text Provider to ensure that this thread safety constraint is met.

Parameters

dest	A UText struct to be filled in with the result of the clone operation, or NULL if the clone function should heap-allocate a new UText struct. If non-NULL, must refer to an already existing UText, which will then be reset to become the clone.
src	The UText to be cloned.
deep	true to request a deep clone, false for a shallow clone.
readOnly	true to request that the cloned UText have read only access to the underlying text.
status	Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR will be returned if the text provider is unable to clone the original text.

Returns: The newly created clone, or NULL if the clone operation failed.

Stable:: ICU 3.4

◆ utext_close()

U_CAPI UText* utext_close ( UText * ut )

Close function for UText instances.

Cleans up, releases any resources being held by an open UText.

If the UText was originally allocated by one of the utext_open functions, the storage associated with the utext will also be freed. If the UText storage originated with the application, as it would with a local or static instance, the storage will not be deleted.

An open UText can be reset to refer to a new string by using one of the utext_open() functions without first closing the UText.

Parameters

ut	The UText to be closed.

Returns: NULL if the UText struct was deleted by the close. If the UText struct was originally provided by the caller to the open function, it is returned by this function, and may be safely used again in a subsequent utext_open.

Stable:: ICU 3.4

◆ utext_copy()

U_CAPI void utext_copy	(	UText *	ut,
		int64_t	nativeStart,
		int64_t	nativeLimit,
		int64_t	destIndex,
		UBool	move,
		UErrorCode *	status
	)

Copy or move a substring from one position to another within the text, while retaining any metadata associated with the text.

This function is used to duplicate or reorder substrings. The destination index must not overlap the source range.

The text to be copied or moved is inserted at destIndex; it does not replace or overwrite any existing text.

The iteration position is left following the newly inserted text at the destination position.

This function is only available on UText types that support writing, that is, ones where utext_isWritable() returns true.

When using this function, there should be only a single UText opened onto the underlying native text string. Behavior after a copy operation on a UText is undefined for any other additional UTexts that refer to the modified string.

Parameters

ut	The UText representing the text to be operated on.
nativeStart	The native index of the start of the region to be copied or moved
nativeLimit	The native index of the character position following the region to be copied.
destIndex	The native destination index to which the source substring is copied or moved.
move	If true, then the substring is moved, not copied/duplicated.
status	receives any error status. Possible errors include U_NO_WRITE_PERMISSION

Stable:: ICU 3.4

◆ utext_current32()

U_CAPI UChar32 utext_current32 ( UText * ut )

Get the code point at the current iteration position, or U_SENTINEL (-1) if the iteration has reached the end of the input text.

Parameters

ut	the text to be accessed.

Returns: the Unicode code point at the current iterator position.

Stable:: ICU 3.4

◆ utext_equals()

U_CAPI UBool utext_equals	(	const UText *	a,
		const UText *	b
	)

Compare two UText objects for equality.

UTexts are equal if they are iterating over the same text, and have the same iteration position within the text. If either or both of the parameters are NULL, the comparison is false.

Parameters

a	The first of the two UTexts to compare.
b	The other UText to be compared.

Returns: true if the two UTexts are equal.

Stable:: ICU 3.6

◆ utext_extract()

U_CAPI int32_t utext_extract	(	UText *	ut,
		int64_t	nativeStart,
		int64_t	nativeLimit,
		UChar *	dest,
		int32_t	destCapacity,
		UErrorCode *	status
	)

Extract text from a UText into a UChar buffer.

The range of text to be extracted is specified in the native indices of the UText provider. These may not necessarily be UTF-16 indices.

The size (number of 16 bit UChars) of the data to be extracted is returned. The full number of UChars is returned, even when the extracted text is truncated because the specified buffer size is too small.

The extracted string will (if you are a user) / must (if you are a text provider) be NUL-terminated if there is sufficient space in the destination buffer. This terminating NUL is not included in the returned length.

The iteration index is left at the position following the last extracted character.

Parameters

ut	the UText from which to extract data.
nativeStart	the native index of the first character to extract. If the specified index is out of range, it will be pinned to be within 0 <= index <= textLength.
nativeLimit	the native string index of the position following the last character to extract. If the specified index is out of range, it will be pinned to be within 0 <= index <= textLength. nativeLimit must be >= nativeStart.
dest	the UChar (UTF-16) buffer into which the extracted text is placed
destCapacity	The size, in UChars, of the destination buffer. May be zero for precomputing the required size.
status	receives any error status. U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the buffer was too small. Returns number of UChars for preflighting.

Returns: Number of UChars in the data to be extracted. Does not include a trailing NUL.

Stable:: ICU 3.4

◆ utext_freeze()

U_CAPI void utext_freeze ( UText * ut )

Freeze a UText. This prevents any modification to the underlying text itself by means of functions operating on this UText.

Once frozen, a UText can not be unfrozen. The intent is to ensure that the text underlying a frozen UText wrapper cannot be modified via that UText.

Caution: freezing a UText will disable changes made via the specific frozen UText wrapper only; it will not have any effect on the ability to directly modify the text by bypassing the UText. Any such backdoor modifications are always an error while UText access is occurring because the underlying text can get out of sync with UText's buffering.

Parameters

ut	The UText to be frozen.

See also: utext_isWritable()

Stable:: ICU 3.6

◆ utext_getNativeIndex()

U_CAPI int64_t utext_getNativeIndex ( const UText * ut )

Get the current iterator position, which can range from 0 to the length of the text.

The position is a native index into the input text, in whatever format it may have (possibly UTF-8 for example), and may not always be the same as the corresponding UChar (UTF-16) index. The returned position will always be aligned to a code point boundary.

Parameters

ut	the text to be accessed.

Returns: the current index position, in the native units of the text provider.

Stable:: ICU 3.4

◆ utext_getPreviousNativeIndex()

U_CAPI int64_t utext_getPreviousNativeIndex ( UText * ut )

Get the native index of the character preceding the current position.

If the iteration position is already at the start of the text, zero is returned. The value returned is the same as that obtained from the following sequence, but without the side effect of changing the iteration position.

UText  *ut = whatever;
  ...
utext_previous32(ut);
utext_getNativeIndex(ut);

This function is most useful during forwards iteration, where it will get the native index of the character most recently returned from utext_next32().

Parameters

ut	the text to be accessed

Returns: the native index of the character preceding the current index position, or zero if the current position is at the start of the text.

Stable:: ICU 3.6

◆ utext_hasMetaData()

U_CAPI UBool utext_hasMetaData ( const UText * ut )

Test whether there is meta data associated with the text.

See also: Replaceable::hasMetaData()

Parameters

ut	The UText to be tested

Returns: true if the underlying text includes meta data.

Stable:: ICU 3.4

◆ utext_isLengthExpensive()

U_CAPI UBool utext_isLengthExpensive ( const UText * ut )

Return true if calculating the length of the text could be expensive.

Finding the length of NUL terminated strings is considered to be expensive.

Note that the value of this function may change as the result of other operations on a UText. Once the length of a string has been discovered, it will no longer be expensive to report it.

Parameters

ut	the text to be accessed.

Returns: true if determining the length of the text could be time consuming.

Stable:: ICU 3.4

◆ utext_isWritable()

U_CAPI UBool utext_isWritable ( const UText * ut )

Return true if the text can be written (modified) with utext_replace() or utext_copy().

For the text to be writable, the text provider must be of a type that supports writing and the UText must not be frozen.

Attempting to modify text when utext_isWritable() is false will fail - the text will not be modified, and an error will be returned from the function that attempted the modification.

Parameters

ut	the UText to be tested.

Returns: true if the text is modifiable.

See also: utext_freeze(); utext_replace(); utext_copy()

Stable:: ICU 3.4

◆ utext_moveIndex32()

U_CAPI UBool utext_moveIndex32	(	UText *	ut,
		int32_t	delta
	)

Move the iterator position by delta code points.

The number of code points is a signed number; a negative delta will move the iterator backwards, towards the start of the text.

The index is moved by delta code points forward or backward, but no further backward than to 0 and no further forward than to utext_nativeLength(). The resulting index value will be in between 0 and length, inclusive.

Parameters

ut	the text to be accessed.
delta	the signed number of code points to move the iteration position.

Returns: true if the position could be moved the requested number of positions while staying within the range [0 - text length].

Stable:: ICU 3.4

◆ utext_nativeLength()

U_CAPI int64_t utext_nativeLength ( UText * ut )

Get the length of the text.

Depending on the characteristics of the underlying text representation, this may be expensive.

See also: utext_isLengthExpensive()

Parameters

ut	the text to be accessed.

Returns: the length of the text, expressed in native units.

Stable:: ICU 3.4

◆ utext_next32()

U_CAPI UChar32 utext_next32 ( UText * ut )

Get the code point at the current iteration position of the UText, and advance the position to the first index following the character.

If the position is at the end of the text (the index following the last character, which is also the length of the text), return U_SENTINEL (-1) and do not advance the index.

This is a post-increment operation.

An inline macro version of this function, UTEXT_NEXT32(), is available for performance critical use.

Parameters

ut	the text to be accessed.

Returns: the Unicode code point at the iteration position.

See also: UTEXT_NEXT32

Stable:: ICU 3.4

◆ utext_next32From()

U_CAPI UChar32 utext_next32From	(	UText *	ut,
		int64_t	nativeIndex
	)

Set the iteration index and return the code point at that index.

Leave the iteration index at the start of the following code point.

This function is the most efficient and convenient way to begin a forward iteration. The results are identical to those from the sequence

utext_setNativeIndex();

utext_next32();

Parameters

ut	the text to be accessed.
nativeIndex	Iteration index, in the native units of the text provider.

Returns: Code point which starts at or before index, or U_SENTINEL (-1) if it is out of bounds.

Stable:: ICU 3.4

◆ utext_openCharacterIterator()

U_CAPI UText* utext_openCharacterIterator	(	UText *	ut,
		icu::CharacterIterator *	ci,
		UErrorCode *	status
	)

Open a UText implementation over an ICU CharacterIterator.

Parameters

ut	Pointer to a UText struct. If nullptr, a new UText will be created. If non-nullptr, must refer to an already existing UText, which will then be reset to reference the specified CharacterIterator.
ci	A Character Iterator.
status	Errors are returned here.

Returns: Pointer to the UText. If a UText was supplied as input, this will always be used and returned.

See also: Replaceable

Stable:: ICU 3.4

◆ utext_openConstUnicodeString()

U_CAPI UText* utext_openConstUnicodeString	(	UText *	ut,
		const icu::UnicodeString *	s,
		UErrorCode *	status
	)

Open a UText for a const UnicodeString.

The resulting UText will not be writable.

Parameters

ut	Pointer to a UText struct. If nullptr, a new UText will be created. If non-nullptr, must refer to an initialized UText struct, which will then be reset to reference the specified input string.
s	A const UnicodeString to be wrapped.
status	Errors are returned here.

Returns: Pointer to the UText. If a UText was supplied as input, this will always be used and returned.

Stable:: ICU 3.4

◆ utext_openReplaceable()

U_CAPI UText* utext_openReplaceable	(	UText *	ut,
		icu::Replaceable *	rep,
		UErrorCode *	status
	)

Open a writable UText implementation for an ICU Replaceable object.

Parameters

ut	Pointer to a UText struct. If nullptr, a new UText will be created. If non-nullptr, must refer to an already existing UText, which will then be reset to reference the specified replaceable text.
rep	A Replaceable text object.
status	Errors are returned here.

Returns: Pointer to the UText. If a UText was supplied as input, this will always be used and returned.

See also: Replaceable

Stable:: ICU 3.4

◆ utext_openUChars()

U_CAPI UText* utext_openUChars	(	UText *	ut,
		const UChar *	s,
		int64_t	length,
		UErrorCode *	status
	)

Open a read-only UText for a UChar * string.

Parameters

ut	Pointer to a UText struct. If NULL, a new UText will be created. If non-NULL, must refer to an initialized UText struct, which will then be reset to reference the specified UChar string.
s	A UChar (UTF-16) string
length	The number of UChars in the input string, or -1 if the string is zero terminated.
status	Errors are returned here.

Returns: A pointer to the UText. If a pre-allocated UText was provided, it will always be used and returned.

Stable:: ICU 3.4

◆ utext_openUnicodeString()

U_CAPI UText* utext_openUnicodeString	(	UText *	ut,
		icu::UnicodeString *	s,
		UErrorCode *	status
	)

Open a writable UText for a non-const UnicodeString.

Parameters

ut	Pointer to a UText struct. If nullptr, a new UText will be created. If non-nullptr, must refer to an initialized UText struct, which will then be reset to reference the specified input string.
s	A UnicodeString.
status	Errors are returned here.

Returns: Pointer to the UText. If a UText was supplied as input, this will always be used and returned.

Stable:: ICU 3.4

◆ utext_openUTF8()

U_CAPI UText* utext_openUTF8	(	UText *	ut,
		const char *	s,
		int64_t	length,
		UErrorCode *	status
	)

Open a read-only UText implementation for UTF-8 strings.

Any invalid UTF-8 in the input will be handled in this way: a sequence of bytes that has the form of a truncated, but otherwise valid, UTF-8 sequence will be replaced by a single Unicode replacement character, \uFFFD. Any other illegal bytes will each be replaced by a \uFFFD.

Parameters

ut	Pointer to a UText struct. If NULL, a new UText will be created. If non-NULL, must refer to an initialized UText struct, which will then be reset to reference the specified UTF-8 string.
s	A UTF-8 string. Must not be NULL.
length	The length of the UTF-8 string in bytes, or -1 if the string is zero terminated.
status	Errors are returned here.

Returns: A pointer to the UText. If a pre-allocated UText was provided, it will always be used and returned.

Stable:: ICU 3.4

◆ utext_previous32()

U_CAPI UChar32 utext_previous32 ( UText * ut )

Move the iterator position to the character (code point) whose index precedes the current position, and return that character.

This is a pre-decrement operation.

If the initial position is at the start of the text (index of 0) return U_SENTINEL (-1), and leave the position unchanged.

An inline macro version of this function, UTEXT_PREVIOUS32(), is available for performance critical use.

Parameters

ut	the text to be accessed.

Returns: the previous UChar32 code point, or U_SENTINEL (-1) if the iteration has reached the start of the text.

See also: UTEXT_PREVIOUS32

Stable:: ICU 3.4

◆ utext_previous32From()

U_CAPI UChar32 utext_previous32From	(	UText *	ut,
		int64_t	nativeIndex
	)

Set the iteration index, and return the code point preceding the one specified by the initial index.

Leave the iteration position at the start of the returned code point.

This function is the most efficient and convenient way to begin a backwards iteration.

Parameters

ut	the text to be accessed.
nativeIndex	Iteration index in the native units of the text provider.

Returns: Code point preceding the one at the initial index, or U_SENTINEL (-1) if it is out of bounds.

Stable:: ICU 3.4

◆ utext_replace()

U_CAPI int32_t utext_replace	(	UText *	ut,
		int64_t	nativeStart,
		int64_t	nativeLimit,
		const UChar *	replacementText,
		int32_t	replacementLength,
		UErrorCode *	status
	)

Replace a range of the original text with a replacement text.

Leaves the current iteration position at the position following the newly inserted replacement text.

This function is only available on UText types that support writing, that is, ones where utext_isWritable() returns true.

When using this function, there should be only a single UText opened onto the underlying native text string. Behavior after a replace operation on a UText is undefined for any other additional UTexts that refer to the modified string.

Parameters

ut	the UText representing the text to be operated on.
nativeStart	the native index of the start of the region to be replaced
nativeLimit	the native index of the character following the region to be replaced.
replacementText	pointer to the replacement text
replacementLength	length of the replacement text, or -1 if the text is NUL terminated.
status	receives any error status. Possible errors include U_NO_WRITE_PERMISSION

Returns: The signed number of (native) storage units by which the length of the text expanded or contracted.

Stable:: ICU 3.4

◆ utext_setNativeIndex()

U_CAPI void utext_setNativeIndex	(	UText *	ut,
		int64_t	nativeIndex
	)

Set the current iteration position to the nearest code point boundary at or preceding the specified index.

The index is in the native units of the original input text. If the index is out of range, it will be pinned to be within the range of the input text.

It will usually be more efficient to begin an iteration using the functions utext_next32From() or utext_previous32From() rather than utext_setNativeIndex().

Moving the index position to an adjacent character is best done with utext_next32(), utext_previous32() or utext_moveIndex32(). Attempting to do direct arithmetic on the index position is complicated by the fact that the size (in native units) of a character depends on the underlying representation of the character (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not easily knowable.

Parameters

ut	the text to be accessed.
nativeIndex	the native unit index of the new iteration position.

Stable:: ICU 3.4

◆ utext_setup()

U_CAPI UText* utext_setup	(	UText *	ut,
		int32_t	extraSpace,
		UErrorCode *	status
	)

Common function for use by Text Provider implementations to allocate and/or initialize a new UText struct.

To be called in the implementation of utext_open() functions. If the supplied UText parameter is null, a new UText struct will be allocated on the heap. If the supplied UText is already open, the provider's close function will be called so that the struct can be reused by the open that is in progress.

Parameters

ut	pointer to a UText struct to be re-used, or null if a new UText should be allocated.
extraSpace	The amount of additional space to be allocated as part of this UText, for use by types of providers that require additional storage.
status	Errors are returned here.

Returns: pointer to the UText, allocated if necessary, with extra space set up if requested.

Stable:: ICU 3.4

Data Structures

Namespaces

Macros

Typedefs

Enumerations

Functions

Detailed Description

Macro Definition Documentation

◆ UTEXT_CURRENT32

◆ UTEXT_GETNATIVEINDEX

◆ UTEXT_INITIALIZER

◆ UTEXT_NEXT32

◆ UTEXT_PREVIOUS32

◆ UTEXT_SETNATIVEINDEX

Typedef Documentation

◆ UText

◆ UTextAccess

◆ UTextClone

◆ UTextClose

◆ UTextCopy

◆ UTextExtract

◆ UTextFuncs

◆ UTextMapNativeIndexToUTF16

◆ UTextMapOffsetToNative

◆ UTextNativeLength

◆ UTextReplace

Enumeration Type Documentation

◆ anonymous enum

◆ anonymous enum

Function Documentation

◆ utext_char32At()

◆ utext_clone()

◆ utext_close()

◆ utext_copy()

◆ utext_current32()

◆ utext_equals()

◆ utext_extract()

◆ utext_freeze()

◆ utext_getNativeIndex()

◆ utext_getPreviousNativeIndex()

◆ utext_hasMetaData()

◆ utext_isLengthExpensive()

◆ utext_isWritable()

◆ utext_moveIndex32()

◆ utext_nativeLength()

◆ utext_next32()

◆ utext_next32From()

◆ utext_openCharacterIterator()

◆ utext_openConstUnicodeString()

◆ utext_openReplaceable()

◆ utext_openUChars()

◆ utext_openUnicodeString()

◆ utext_openUTF8()

◆ utext_previous32()

◆ utext_previous32From()

◆ utext_replace()

◆ utext_setNativeIndex()

◆ utext_setup()