ICU 78.1
utfstring.h
Go to the documentation of this file.
1 // © 2025 and later: Unicode, Inc. and others.
2 // License & terms of use: https://www.unicode.org/copyright.html
3 
4 // utfstring.h
5 // created: 2025jul18 Markus W. Scherer
6 
7 #ifndef __UTFSTRING_H__
8 #define __UTFSTRING_H__
9 
10 #include "unicode/utypes.h"
11 
12 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13 
14 #include "unicode/utf16.h"
15 
21 #ifndef U_HIDE_DRAFT_API
22 
23 namespace U_HEADER_ONLY_NAMESPACE {
24 namespace utfstring {
25 
26 // Write code points to strings -------------------------------------------- ***
27 
28 #ifndef U_IN_DOXYGEN
29 namespace prv {
30 
31 // This function, and the public wrappers,
32 // want to be U_FORCE_INLINE but the gcc-debug-build-and-test CI check failed with
33 // error: ‘always_inline’ function might not be inlinable [-Werror=attributes]
34 template<typename StringClass, bool validate>
35 inline StringClass &appendCodePoint(StringClass &s, uint32_t c) {
36  using Unit = typename StringClass::value_type;
37  if constexpr (sizeof(Unit) == 1) {
38  // UTF-8: Similar to U8_APPEND().
39  if (c <= 0x7f) {
40  s.push_back(static_cast<Unit>(c));
41  } else {
42  Unit buf[4];
43  uint8_t len;
44  if (c <= 0x7ff) {
45  len = 2;
46  buf[2] = (c >> 6) | 0xc0;
47  } else {
48  if (validate ?
49  c < 0xd800 ||
50  (c < 0xe000 || c > 0x10ffff ? (c = 0xfffd, true) : c <= 0xffff) :
51  c <= 0xffff) {
52  len = 3;
53  buf[1] = (c >> 12) | 0xe0;
54  } else {
55  len = 4;
56  buf[0] = (c >> 18) | 0xf0;
57  buf[1] = ((c >> 12) & 0x3f) | 0x80;
58  }
59  buf[2] = ((c >> 6) & 0x3f) | 0x80;
60  }
61  buf[3] = (c & 0x3f) | 0x80;
62  s.append(buf + 4 - len, len);
63  }
64  } else if constexpr (sizeof(Unit) == 2) {
65  // UTF-16: Similar to U16_APPEND().
66  if (validate ?
67  c < 0xd800 || (c < 0xe000 || c > 0x10ffff ? (c = 0xfffd, true) : c <= 0xffff) :
68  c <= 0xffff) {
69  s.push_back(static_cast<Unit>(c));
70  } else {
71  Unit buf[2] = { U16_LEAD(c), U16_TRAIL(c) };
72  s.append(buf, 2);
73  }
74  } else {
75  // UTF-32
76  s.push_back(!validate || U_IS_SCALAR_VALUE(c) ? c : 0xfffd);
77  }
78  return s;
79 }
80 
81 } // namespace prv
82 #endif // U_IN_DOXYGEN
83 
84 #ifndef U_HIDE_DRAFT_API
97 template<typename StringClass>
98 inline StringClass &appendOrFFFD(StringClass &s, UChar32 c) {
99  return prv::appendCodePoint<StringClass, true>(s, c);
100 }
101 
114 template<typename StringClass>
115 inline StringClass &appendUnsafe(StringClass &s, UChar32 c) {
116  return prv::appendCodePoint<StringClass, false>(s, c);
117 }
118 
130 template<typename StringClass>
131 inline StringClass encodeOrFFFD(UChar32 c) {
132  StringClass s;
133  prv::appendCodePoint<StringClass, true>(s, c);
134  return s;
135 }
136 
148 template<typename StringClass>
149 inline StringClass encodeUnsafe(UChar32 c) {
150  StringClass s;
151  prv::appendCodePoint<StringClass, false>(s, c);
152  return s;
153 }
154 #endif // U_HIDE_DRAFT_API
155 
156 } // namespace utfstring
157 } // namespace U_HEADER_ONLY_NAMESPACE
158 
159 #endif // U_HIDE_DRAFT_API
160 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
161 #endif // __UTFSTRING_H__
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:449
C API: 16-bit Unicode handling macros.
#define U16_TRAIL(supplementary)
Get the trail surrogate (0xdc00..0xdfff) for a supplementary code point (0x10000..0x10ffff).
Definition: utf16.h:132
#define U16_LEAD(supplementary)
Get the lead surrogate (0xd800..0xdbff) for a supplementary code point (0x10000..0x10ffff).
Definition: utf16.h:123
#define U_IS_SCALAR_VALUE(c)
Is c a Unicode scalar value, that is, a non-surrogate code point? Only scalar values can be represented in well-formed UTF-8/16/32.
Definition: utf.h:149
StringClass encodeUnsafe(UChar32 c)
Returns the code point as a string of code units.
Definition: utfstring.h:149
StringClass & appendOrFFFD(StringClass &s, UChar32 c)
Appends the code point to the string.
Definition: utfstring.h:98
StringClass & appendUnsafe(StringClass &s, UChar32 c)
Appends the code point to the string.
Definition: utfstring.h:115
StringClass encodeOrFFFD(UChar32 c)
Returns the code point as a string of code units.
Definition: utfstring.h:131
Basic definitions for ICU, for both C and C++ APIs.