ICU 78.1  78.1
utfiterator.h
Go to the documentation of this file.
1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: https://www.unicode.org/copyright.html
3 
4 // utfiterator.h
5 // created: 2024aug12 Markus W. Scherer
6 
7 #ifndef __UTFITERATOR_H__
8 #define __UTFITERATOR_H__
9 
10 #include "unicode/utypes.h"
11 
12 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13 
14 #include <iterator>
15 #if defined(__cpp_lib_ranges)
16 #include <ranges>
17 #endif
18 #include <string>
19 #include <string_view>
20 #include <type_traits>
21 #include "unicode/utf16.h"
22 #include "unicode/utf8.h"
23 #include "unicode/uversion.h"
24 
135 #ifndef U_HIDE_DRAFT_API
136 
149 typedef enum UTFIllFormedBehavior {
169 
170 namespace U_HEADER_ONLY_NAMESPACE {
171 
172 namespace prv {
#if U_CPLUSPLUS_VERSION >= 20

/** Value (code unit) type of an iterator. C++20 build: the standard alias. */
template<typename Iter>
using iter_value_t = typename std::iter_value_t<Iter>;

/** Difference type of an iterator. C++20 build: the standard alias. */
template<typename Iter>
using iter_difference_t = std::iter_difference_t<Iter>;

/** True if Iter satisfies the std::forward_iterator concept. */
template<typename Iter>
constexpr bool forward_iterator = std::forward_iterator<Iter>;

/** True if Iter satisfies the std::bidirectional_iterator concept. */
template<typename Iter>
constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;

/** True if Range satisfies the std::ranges::range concept. */
template<typename Range>
constexpr bool range = std::ranges::range<Range>;

#else

/** Value (code unit) type of an iterator, via std::iterator_traits. */
template<typename Iter>
using iter_value_t = typename std::iterator_traits<Iter>::value_type;

/** Difference type of an iterator, via std::iterator_traits. */
template<typename Iter>
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;

/** True if Iter's iterator_category is, or derives from, std::forward_iterator_tag. */
template<typename Iter>
constexpr bool forward_iterator =
    std::is_base_of_v<
        std::forward_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** True if Iter's iterator_category is, or derives from, std::bidirectional_iterator_tag. */
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of_v<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** Detection helper: false unless the specialization below applies. */
template<typename Range, typename = void>
struct range_type : std::false_type {};

/** Selected when Range has callable begin() and end() member functions. */
template<typename Range>
struct range_type<
    Range,
    std::void_t<decltype(std::declval<Range>().begin()),
                decltype(std::declval<Range>().end())>> : std::true_type {};

/**
 * True if Range has begin() and end() member functions.
 * C++17 counterpart of the C++20 definition in the other preprocessor branch;
 * the declaration had been lost after its template header.
 */
template<typename Range>
constexpr bool range = range_type<Range>::value;

#endif
235 
/** Type trait: by default, T is not a std::basic_string_view. */
template<typename T>
struct is_basic_string_view : std::bool_constant<false> {};

/** Type trait: matches every std::basic_string_view instantiation (no cv/ref stripping). */
template<typename... Args>
struct is_basic_string_view<std::basic_string_view<Args...>> : std::bool_constant<true> {};

/** Shorthand for is_basic_string_view<T>::value. */
template<typename T>
constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
245 
/**
 * Forward iterator that counts through consecutive code point values.
 * Dereferencing yields the current value; incrementing adds one and,
 * if skipSurrogates is true, jumps from 0xd800 directly to 0xe000.
 *
 * @tparam CP32 32-bit integer type that holds a code point
 * @tparam skipSurrogates if true, the range 0xd800..0xdfff is never produced
 *         when counting up from below it
 */
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    /** C++ iterator boilerplate */
    using value_type = CP32;
    /** C++ iterator boilerplate: operator*() returns by value. */
    using reference = CP32;
    /** C++ iterator boilerplate */
    using pointer = CP32 *;
    /** C++ iterator boilerplate */
    using difference_type = int32_t;
    /** C++ iterator boilerplate */
    using iterator_category = std::forward_iterator_tag;

    /** Starts at code point c. Intentionally implicit, as in the original. */
    inline CodePointsIterator(CP32 c) : c_(c) {}
    /** @return true if both iterators are at the same code point */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    /** @return true if the iterators are at different code points */
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    /** @return the current code point */
    inline CP32 operator*() const { return c_; }
    /** Pre-increment. @return *this, now at the next code point */
    inline CodePointsIterator &operator++() {
        ++c_;
        // Only the first surrogate needs to be detected:
        // Counting up reaches 0xd800 before any other surrogate value.
        if (skipSurrogates && c_ == 0xd800) {
            c_ = 0xe000;
        }
        return *this;
    }
    /** Post-increment. @return a copy of the iterator from before the increment */
    inline CodePointsIterator operator++(int) {
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};
288 
289 } // namespace prv
290 
301 template<typename CP32>
303  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
304 public:
312  auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
317  auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
318 };
319 
332 template<typename CP32>
334  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
335 public:
343  auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
348  auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
349 };
350 
366 template<typename CP32, typename UnitIter, typename = void>
368  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
369  using Unit = typename prv::iter_value_t<UnitIter>;
370 public:
372  UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
373  c_(codePoint), len_(length), start_(start), limit_(limit) {}
374 
376  UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
378  UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
379 
387  CP32 codePoint() const { return c_; }
388 
394  UnitIter begin() const { return start_; }
395 
401  UnitIter end() const { return limit_; }
402 
407  uint8_t length() const { return len_; }
408 
409 #if U_CPLUSPLUS_VERSION >= 20
415  template<std::contiguous_iterator Iter = UnitIter>
416  std::basic_string_view<Unit> stringView() const {
417  return std::basic_string_view<Unit>(begin(), end());
418  }
419 #else
425  template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
426  std::enable_if_t<std::is_pointer_v<Iter> ||
427  std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
428  std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
429  std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
430  std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
431  std::basic_string_view<Unit>>
432  stringView() const {
433  return std::basic_string_view<Unit>(&*start_, len_);
434  }
435 #endif
436 
437 private:
438  // Order of fields with padding and access frequency in mind.
439  CP32 c_;
440  uint8_t len_;
441  UnitIter start_;
442  UnitIter limit_;
443 };
444 
445 #ifndef U_IN_DOXYGEN
446 // Partial template specialization for single-pass input iterator.
447 // No UnitIter field, no getter for it, no stringView().
448 template<typename CP32, typename UnitIter>
449 class UnsafeCodeUnits<
450  CP32,
451  UnitIter,
452  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
453  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
454 public:
455  UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
456 
457  UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
458  UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
459 
460  CP32 codePoint() const { return c_; }
461 
462  uint8_t length() const { return len_; }
463 
464 private:
465  // Order of fields with padding and access frequency in mind.
466  CP32 c_;
467  uint8_t len_;
468 };
469 #endif // U_IN_DOXYGEN
470 
486 template<typename CP32, typename UnitIter, typename = void>
487 class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
488 public:
490  CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
491  UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
492 
494  CodeUnits(const CodeUnits &other) = default;
496  CodeUnits &operator=(const CodeUnits &other) = default;
497 
502  bool wellFormed() const { return ok_; }
503 
504 private:
505  bool ok_;
506 };
507 
508 #ifndef U_IN_DOXYGEN
509 // Partial template specialization for single-pass input iterator.
510 // No UnitIter field, no getter for it, no stringView().
511 template<typename CP32, typename UnitIter>
512 class CodeUnits<
513  CP32,
514  UnitIter,
515  std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
516  public UnsafeCodeUnits<CP32, UnitIter> {
517 public:
518  CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
519  UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
520 
521  CodeUnits(const CodeUnits &other) = default;
522  CodeUnits &operator=(const CodeUnits &other) = default;
523 
524  bool wellFormed() const { return ok_; }
525 
526 private:
527  bool ok_;
528 };
529 #endif // U_IN_DOXYGEN
530 
531 // Validating implementations ---------------------------------------------- ***
532 
533 #ifndef U_IN_DOXYGEN
// Validating low-level decoder, declared here and defined only through partial
// specializations that are selected by the size of UnitIter's value type
// (1, 2, or 4 bytes). The last, unnamed parameter exists for that enable_if selection.
534 template<typename CP32, UTFIllFormedBehavior behavior,
535  typename UnitIter, typename LimitIter = UnitIter, typename = void>
536 class UTFImpl;
537 
538 // Note: readAndInc() functions take both a p0 and a p iterator.
539 // They must have the same value.
540 // For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
541 // and readAndInc() copies p0 and the incremented p into the CodeUnits.
542 // For a single-pass UnitIter, which may be neither default-constructible nor copyable,
543 // the caller can pass p into both references, and readAndInc() does not use p0
544 // and constructs CodeUnits without them.
545 // Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
546 // which may not be possible for a single-pass iterator.
547 
548 // UTF-8
// Validating decoder for 8-bit code units (selected when the iterator's value type has size 1).
// All functions are static; the iterator p is passed by reference and moved past
// (inc/readAndInc) or before (dec/decAndRead) one code unit sequence.
549 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
550 class UTFImpl<
551  CP32, behavior,
552  UnitIter, LimitIter,
553  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
554  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
555  static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
556  "For 8-bit strings, the SURROGATE option does not have an equivalent.");
557 public:
558  // Handle ill-formed UTF-8
    // Returns the code point value to report for an ill-formed sequence:
    // U_SENTINEL for UTF_BEHAVIOR_NEGATIVE, 0xfffd for UTF_BEHAVIOR_FFFD.
    // UTF_BEHAVIOR_SURROGATE is excluded by the static_assert above.
559  U_FORCE_INLINE static CP32 sub() {
560  switch (behavior) {
561  case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
562  case UTF_BEHAVIOR_FFFD: return 0xfffd;
563  }
564  }
565 
    // Skips one sequence forward without computing the code point.
    // p is compared with limit before each trail byte is read.
    // For an invalid or truncated sequence, p stops after the bytes that were accepted.
566  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
567  // Very similar to U8_FWD_1().
568  uint8_t b = *p;
569  ++p;
570  if (U8_IS_LEAD(b) && p != limit) {
571  uint8_t t1 = *p;
572  if ((0xe0 <= b && b < 0xf0)) {
573  if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
574  ++p != limit && U8_IS_TRAIL(*p)) {
575  ++p;
576  }
577  } else if (b < 0xe0) {
578  if (U8_IS_TRAIL(t1)) {
579  ++p;
580  }
581  } else /* b >= 0xf0 */ {
582  if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
583  ++p != limit && U8_IS_TRAIL(*p) &&
584  ++p != limit && U8_IS_TRAIL(*p)) {
585  ++p;
586  }
587  }
588  }
589  }
590 
    // Skips one sequence backward without computing the code point.
    // Always moves p back by at least one byte; looks back up to three more bytes
    // (never before start) and moves p to a lead byte only if the bytes
    // from there pass the lead/first-trail checks below.
591  U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
592  // Very similar to U8_BACK_1().
593  uint8_t c = *--p;
594  if (U8_IS_TRAIL(c) && p != start) {
595  UnitIter p1 = p;
596  uint8_t b1 = *--p1;
597  if (U8_IS_LEAD(b1)) {
598  if (b1 < 0xe0 ||
599  (b1 < 0xf0 ?
600  U8_IS_VALID_LEAD3_AND_T1(b1, c) :
601  U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
602  p = p1;
603  return;
604  }
605  } else if (U8_IS_TRAIL(b1) && p1 != start) {
606  uint8_t b2 = *--p1;
607  if (0xe0 <= b2 && b2 <= 0xf4) {
608  if (b2 < 0xf0 ?
609  U8_IS_VALID_LEAD3_AND_T1(b2, b1) :
610  U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
611  p = p1;
612  return;
613  }
614  } else if (U8_IS_TRAIL(b2) && p1 != start) {
615  uint8_t b3 = *--p1;
616  if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
617  p = p1;
618  return;
619  }
620  }
621  }
622  }
623  }
624 
    // Decodes one sequence forward and moves p past it.
    // p0 must equal p on entry; for a multi-pass UnitIter the result records [p0, p),
    // for a single-pass UnitIter p0 is not used.
    // Returns the code point with wellFormed=true, or sub() with wellFormed=false;
    // length counts the code units that were consumed in either case.
625  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
626  UnitIter &p0, UnitIter &p, const LimitIter &limit) {
627  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
628  // Very similar to U8_NEXT_OR_FFFD().
629  CP32 c = uint8_t(*p);
630  ++p;
631  if (U8_IS_SINGLE(c)) {
632  if constexpr (isMultiPass) {
633  return {c, 1, true, p0, p};
634  } else {
635  return {c, 1, true};
636  }
637  }
    // length and p are advanced together inside the condition below,
    // so that they are also correct when validation fails part-way.
638  uint8_t length = 1;
639  uint8_t t = 0;
640  if (p != limit &&
641  // fetch/validate/assemble all but last trail byte
642  (c >= 0xe0 ?
643  (c < 0xf0 ? // U+0800..U+FFFF except surrogates
644  U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
645  (t &= 0x3f, 1)
646  : // U+10000..U+10FFFF
647  (c -= 0xf0) <= 4 &&
648  U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
649  (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
650  (t = *p - 0x80) <= 0x3f) &&
651  // valid second-to-last trail byte
652  (c = (c << 6) | t, ++length, ++p != limit)
653  : // U+0080..U+07FF
654  c >= 0xc2 && (c &= 0x1f, 1)) &&
655  // last trail byte
656  (t = *p - 0x80) <= 0x3f) {
657  c = (c << 6) | t;
658  ++length;
659  ++p;
660  if constexpr (isMultiPass) {
661  return {c, length, true, p0, p};
662  } else {
663  return {c, length, true};
664  }
665  }
    // Ill-formed: report the substitute value for the bytes consumed so far.
666  if constexpr (isMultiPass) {
667  return {sub(), length, false, p0, p};
668  } else {
669  return {sub(), length, false};
670  }
671  }
672 
    // Decodes the sequence that ends just before p and moves p to its first byte.
    // The result records [new p, original p). Never reads before start.
    // A valid but incomplete prefix of a longer sequence is reported as one
    // ill-formed unit of 2 or 3 bytes; anything else ill-formed as a single byte.
673  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
674  // Very similar to U8_PREV_OR_FFFD().
675  UnitIter p0 = p;
676  CP32 c = uint8_t(*--p);
677  if (U8_IS_SINGLE(c)) {
678  return {c, 1, true, p, p0};
679  }
680  if (U8_IS_TRAIL(c) && p != start) {
681  UnitIter p1 = p;
682  uint8_t b1 = *--p1;
683  if (U8_IS_LEAD(b1)) {
684  if (b1 < 0xe0) {
685  p = p1;
686  c = ((b1 - 0xc0) << 6) | (c & 0x3f);
687  return {c, 2, true, p, p0};
688  } else if (b1 < 0xf0 ?
689  U8_IS_VALID_LEAD3_AND_T1(b1, c) :
690  U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
691  // Truncated 3- or 4-byte sequence.
692  p = p1;
693  return {sub(), 2, false, p, p0};
694  }
695  } else if (U8_IS_TRAIL(b1) && p1 != start) {
696  // Extract the value bits from the last trail byte.
697  c &= 0x3f;
698  uint8_t b2 = *--p1;
699  if (0xe0 <= b2 && b2 <= 0xf4) {
700  if (b2 < 0xf0) {
701  b2 &= 0xf;
702  if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
703  p = p1;
704  c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
705  return {c, 3, true, p, p0};
706  }
707  } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
708  // Truncated 4-byte sequence.
709  p = p1;
710  return {sub(), 3, false, p, p0};
711  }
712  } else if (U8_IS_TRAIL(b2) && p1 != start) {
713  uint8_t b3 = *--p1;
714  if (0xf0 <= b3 && b3 <= 0xf4) {
715  b3 &= 7;
716  if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
717  p = p1;
718  c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
719  return {c, 4, true, p, p0};
720  }
721  }
722  }
723  }
724  }
    // No valid multi-byte sequence ends here: p was moved back by one byte only.
725  return {sub(), 1, false, p, p0};
726  }
727 };
728 
729 // UTF-16
730 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
731 class UTFImpl<
732  CP32, behavior,
733  UnitIter, LimitIter,
734  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
735  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
736 public:
737  // Handle ill-formed UTF-16: One unpaired surrogate.
738  U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
739  switch (behavior) {
740  case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
741  case UTF_BEHAVIOR_FFFD: return 0xfffd;
742  case UTF_BEHAVIOR_SURROGATE: return surrogate;
743  }
744  }
745 
746  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
747  // Very similar to U16_FWD_1().
748  auto c = *p;
749  ++p;
750  if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
751  ++p;
752  }
753  }
754 
755  U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
756  // Very similar to U16_BACK_1().
757  UnitIter p1;
758  if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
759  p = p1;
760  }
761  }
762 
763  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
764  UnitIter &p0, UnitIter &p, const LimitIter &limit) {
765  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
766  // Very similar to U16_NEXT_OR_FFFD().
767  CP32 c = static_cast<CP32>(*p);
768  ++p;
769  if (!U16_IS_SURROGATE(c)) {
770  if constexpr (isMultiPass) {
771  return {c, 1, true, p0, p};
772  } else {
773  return {c, 1, true};
774  }
775  } else {
776  uint16_t c2;
777  if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
778  ++p;
779  c = U16_GET_SUPPLEMENTARY(c, c2);
780  if constexpr (isMultiPass) {
781  return {c, 2, true, p0, p};
782  } else {
783  return {c, 2, true};
784  }
785  } else {
786  if constexpr (isMultiPass) {
787  return {sub(c), 1, false, p0, p};
788  } else {
789  return {sub(c), 1, false};
790  }
791  }
792  }
793  }
794 
795  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
796  // Very similar to U16_PREV_OR_FFFD().
797  UnitIter p0 = p;
798  CP32 c = static_cast<CP32>(*--p);
799  if (!U16_IS_SURROGATE(c)) {
800  return {c, 1, true, p, p0};
801  } else {
802  UnitIter p1;
803  uint16_t c2;
804  if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
805  p = p1;
806  c = U16_GET_SUPPLEMENTARY(c2, c);
807  return {c, 2, true, p, p0};
808  } else {
809  return {sub(c), 1, false, p, p0};
810  }
811  }
812  }
813 };
814 
815 // UTF-32: trivial, but still validating
816 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
817 class UTFImpl<
818  CP32, behavior,
819  UnitIter, LimitIter,
820  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
821  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
822 public:
823  // Handle ill-formed UTF-32
824  U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
825  switch (behavior) {
826  case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
827  case UTF_BEHAVIOR_FFFD: return 0xfffd;
828  case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? surrogate : 0xfffd;
829  }
830  }
831 
832  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
833  ++p;
834  }
835 
836  U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
837  --p;
838  }
839 
840  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
841  UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
842  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
843  uint32_t uc = *p;
844  CP32 c = uc;
845  ++p;
846  if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
847  if constexpr (isMultiPass) {
848  return {c, 1, true, p0, p};
849  } else {
850  return {c, 1, true};
851  }
852  } else {
853  if constexpr (isMultiPass) {
854  return {sub(uc < 0xe000, c), 1, false, p0, p};
855  } else {
856  return {sub(uc < 0xe000, c), 1, false};
857  }
858  }
859  }
860 
861  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
862  UnitIter p0 = p;
863  uint32_t uc = *--p;
864  CP32 c = uc;
865  if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
866  return {c, 1, true, p, p0};
867  } else {
868  return {sub(uc < 0xe000, c), 1, false, p, p0};
869  }
870  }
871 };
872 
873 // Non-validating implementations ------------------------------------------ ***
874 
// Non-validating low-level decoder, declared here and defined only through partial
// specializations that are selected by the size of UnitIter's value type (1, 2, or 4 bytes).
875 template<typename CP32, typename UnitIter, typename = void>
876 class UnsafeUTFImpl;
877 
878 // UTF-8
// Non-validating decoder for 8-bit code units (selected when the iterator's value type has size 1).
// None of these functions takes a start or limit: they read as many bytes as the
// lead byte (forward) or the run of trail bytes (backward) indicates.
879 template<typename CP32, typename UnitIter>
880 class UnsafeUTFImpl<
881  CP32,
882  UnitIter,
883  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
884  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
885 public:
    // Skips forward: one byte plus the trail byte count derived from the first byte.
886  U_FORCE_INLINE static void inc(UnitIter &p) {
887  // Very similar to U8_FWD_1_UNSAFE().
888  uint8_t b = *p;
889  std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
890  }
891 
    // Skips backward until p is on a byte that is not a trail byte.
892  U_FORCE_INLINE static void dec(UnitIter &p) {
893  // Very similar to U8_BACK_1_UNSAFE().
894  while (U8_IS_TRAIL(*--p)) {}
895  }
896 
    // Decodes one sequence forward and moves p past it.
    // The sequence length (1..4) is chosen from the first byte alone;
    // trail bytes are masked with 0x3f but not checked.
    // p0 must equal p on entry; it is recorded in the result only for multi-pass iterators.
897  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
898  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
899  // Very similar to U8_NEXT_UNSAFE().
900  CP32 c = uint8_t(*p);
901  ++p;
902  if (U8_IS_SINGLE(c)) {
903  if constexpr (isMultiPass) {
904  return {c, 1, p0, p};
905  } else {
906  return {c, 1};
907  }
908  } else if (c < 0xe0) {
909  c = ((c & 0x1f) << 6) | (*p & 0x3f);
910  ++p;
911  if constexpr (isMultiPass) {
912  return {c, 2, p0, p};
913  } else {
914  return {c, 2};
915  }
916  } else if (c < 0xf0) {
917  // No need for (c&0xf) because the upper bits are truncated
918  // after <<12 in the cast to uint16_t.
919  c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
920  ++p;
921  c |= *p & 0x3f;
922  ++p;
923  if constexpr (isMultiPass) {
924  return {c, 3, p0, p};
925  } else {
926  return {c, 3};
927  }
928  } else {
929  c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
930  ++p;
931  c |= (*p & 0x3f) << 6;
932  ++p;
933  c |= *p & 0x3f;
934  ++p;
935  if constexpr (isMultiPass) {
936  return {c, 4, p0, p};
937  } else {
938  return {c, 4};
939  }
940  }
941  }
942 
    // Decodes the sequence that ends just before p and moves p to its first byte.
    // The result records [new p, original p).
943  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
944  // Very similar to U8_PREV_UNSAFE().
945  UnitIter p0 = p;
946  CP32 c = uint8_t(*--p);
947  if (U8_IS_SINGLE(c)) {
948  return {c, 1, p, p0};
949  }
950  // U8_IS_TRAIL(c) if well-formed
951  c &= 0x3f;
    // count = number of trail bytes seen so far; shift = bit position for the next byte's bits.
952  uint8_t count = 1;
953  for (uint8_t shift = 6;;) {
954  uint8_t b = *--p;
955  if (b >= 0xc0) {
    // Lead byte: mask it according to the number of trail bytes, merge it, and stop.
956  U8_MASK_LEAD_BYTE(b, count);
957  c |= uint32_t{b} << shift;
958  break;
959  } else {
960  c |= (uint32_t{b} & 0x3f) << shift;
961  ++count;
962  shift += 6;
963  }
964  }
    // Add one for the lead byte, so that count is the total sequence length.
965  ++count;
966  return {c, count, p, p0};
967  }
968 };
969 
970 // UTF-16
971 template<typename CP32, typename UnitIter>
972 class UnsafeUTFImpl<
973  CP32,
974  UnitIter,
975  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
976  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
977 public:
978  U_FORCE_INLINE static void inc(UnitIter &p) {
979  // Very similar to U16_FWD_1_UNSAFE().
980  auto c = *p;
981  ++p;
982  if (U16_IS_LEAD(c)) {
983  ++p;
984  }
985  }
986 
987  U_FORCE_INLINE static void dec(UnitIter &p) {
988  // Very similar to U16_BACK_1_UNSAFE().
989  if (U16_IS_TRAIL(*--p)) {
990  --p;
991  }
992  }
993 
994  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
995  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
996  // Very similar to U16_NEXT_UNSAFE().
997  CP32 c = static_cast<CP32>(*p);
998  ++p;
999  if (!U16_IS_LEAD(c)) {
1000  if constexpr (isMultiPass) {
1001  return {c, 1, p0, p};
1002  } else {
1003  return {c, 1};
1004  }
1005  } else {
1006  uint16_t c2 = *p;
1007  ++p;
1008  c = U16_GET_SUPPLEMENTARY(c, c2);
1009  if constexpr (isMultiPass) {
1010  return {c, 2, p0, p};
1011  } else {
1012  return {c, 2};
1013  }
1014  }
1015  }
1016 
1017  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1018  // Very similar to U16_PREV_UNSAFE().
1019  UnitIter p0 = p;
1020  CP32 c = static_cast<CP32>(*--p);
1021  if (!U16_IS_TRAIL(c)) {
1022  return {c, 1, p, p0};
1023  } else {
1024  uint16_t c2 = *--p;
1025  c = U16_GET_SUPPLEMENTARY(c2, c);
1026  return {c, 2, p, p0};
1027  }
1028  }
1029 };
1030 
1031 // UTF-32: trivial
1032 template<typename CP32, typename UnitIter>
1033 class UnsafeUTFImpl<
1034  CP32,
1035  UnitIter,
1036  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1037  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1038 public:
1039  U_FORCE_INLINE static void inc(UnitIter &p) {
1040  ++p;
1041  }
1042 
1043  U_FORCE_INLINE static void dec(UnitIter &p) {
1044  --p;
1045  }
1046 
1047  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1048  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1049  CP32 c = *p;
1050  ++p;
1051  if constexpr (isMultiPass) {
1052  return {c, 1, p0, p};
1053  } else {
1054  return {c, 1};
1055  }
1056  }
1057 
1058  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1059  UnitIter p0 = p;
1060  CP32 c = *--p;
1061  return {c, 1, p, p0};
1062  }
1063 };
1064 
1065 #endif
1066 
1067 // Validating iterators ---------------------------------------------------- ***
1068 
1092 template<typename CP32, UTFIllFormedBehavior behavior,
1093  typename UnitIter, typename LimitIter = UnitIter, typename = void>
1095  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1096  using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1097 
1098  // Proxy type for operator->() (required by LegacyInputIterator)
1099  // so that we don't promise always returning CodeUnits.
1100  class Proxy {
1101  public:
1102  explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1103  CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1104  CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1105  private:
1107  };
1108 
1109 public:
1115  using pointer = Proxy;
1119  using iterator_category = std::conditional_t<
1120  prv::bidirectional_iterator<UnitIter>,
1121  std::bidirectional_iterator_tag,
1122  std::forward_iterator_tag>;
1123 
1137  U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1138  p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1150  U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1151  p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1163  U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
1169  U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1170 
1172  U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1174  U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1175 
1177  U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1180 
1186  U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1187  return getLogicalPosition() == other.getLogicalPosition();
1188  }
1194  U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1195 
1196  // Asymmetric equality & nonequality with a sentinel type.
1197 
1204  template<typename Sentinel> U_FORCE_INLINE friend
1205  std::enable_if_t<
1206  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1207  bool>
1208  operator==(const UTFIterator &iter, const Sentinel &s) {
1209  return iter.getLogicalPosition() == s;
1210  }
1211 
1212 #if U_CPLUSPLUS_VERSION < 20
1213  // C++17: Need to define all four combinations of == / != vs. parameter order.
1214  // Once we require C++20, we could remove all but the first == because
1215  // the compiler would generate the rest.
1216 
1223  template<typename Sentinel> U_FORCE_INLINE friend
1224  std::enable_if_t<
1225  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1226  bool>
1227  operator==(const Sentinel &s, const UTFIterator &iter) {
1228  return iter.getLogicalPosition() == s;
1229  }
1236  template<typename Sentinel> U_FORCE_INLINE friend
1237  std::enable_if_t<
1238  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1239  bool>
1240  operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1247  template<typename Sentinel> U_FORCE_INLINE friend
1248  std::enable_if_t<
1249  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1250  bool>
1251  operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1252 #endif // C++17
1253 
1261  if (state_ == 0) {
1262  UnitIter p0 = p_;
1263  units_ = Impl::readAndInc(p0, p_, limit_);
1264  state_ = 1;
1265  }
1266  return units_;
1267  }
1268 
1277  U_FORCE_INLINE Proxy operator->() const {
1278  if (state_ == 0) {
1279  UnitIter p0 = p_;
1280  units_ = Impl::readAndInc(p0, p_, limit_);
1281  state_ = 1;
1282  }
1283  return Proxy(units_);
1284  }
1285 
1292  U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1293  if (state_ > 0) {
1294  // operator*() called readAndInc() so p_ is already ahead.
1295  state_ = 0;
1296  } else if (state_ == 0) {
1297  Impl::inc(p_, limit_);
1298  } else /* state_ < 0 */ {
1299  // operator--() called decAndRead() so we know how far to skip.
1300  p_ = units_.end();
1301  state_ = 0;
1302  }
1303  return *this;
1304  }
1305 
1314  U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
1315  if (state_ > 0) {
1316  // operator*() called readAndInc() so p_ is already ahead.
1317  UTFIterator result(*this);
1318  state_ = 0;
1319  return result;
1320  } else if (state_ == 0) {
1321  UnitIter p0 = p_;
1322  units_ = Impl::readAndInc(p0, p_, limit_);
1323  UTFIterator result(*this);
1324  result.state_ = 1;
1325  // keep this->state_ == 0
1326  return result;
1327  } else /* state_ < 0 */ {
1328  UTFIterator result(*this);
1329  // operator--() called decAndRead() so we know how far to skip.
1330  p_ = units_.end();
1331  state_ = 0;
1332  return result;
1333  }
1334  }
1335 
1343  template<typename Iter = UnitIter>
1345  std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1346  operator--() { // pre-decrement
1347  if (state_ > 0) {
1348  // operator*() called readAndInc() so p_ is ahead of the logical position.
1349  p_ = units_.begin();
1350  }
1351  units_ = Impl::decAndRead(start_, p_);
1352  state_ = -1;
1353  return *this;
1354  }
1355 
1363  template<typename Iter = UnitIter>
1365  std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1366  operator--(int) { // post-decrement
1367  UTFIterator result(*this);
1368  operator--();
1369  return result;
1370  }
1371 
1372 private:
1373  friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1374 
1375  U_FORCE_INLINE UnitIter getLogicalPosition() const {
1376  return state_ <= 0 ? p_ : units_.begin();
1377  }
1378 
1379  // operator*() etc. are logically const.
1380  mutable UnitIter p_;
1381  // In a validating iterator, we need start_ & limit_ so that when we read a code point
1382  // (forward or backward) we can test if there are enough code units.
1383  UnitIter start_;
1384  LimitIter limit_;
1385  // Keep state so that we call readAndInc() only once for both operator*() and ++
1386  // to make it easy for the compiler to optimize.
1387  mutable CodeUnits<CP32, UnitIter> units_;
1388  // >0: units_ = readAndInc(), p_ = units limit
1389  // which means that p_ is ahead of its logical position
1390  // 0: initial state
1391  // <0: units_ = decAndRead(), p_ = units start
1392  mutable int8_t state_ = 0;
1393 };
1394 
1395 #ifndef U_IN_DOXYGEN
1396 // Partial template specialization for single-pass input iterator.
1397 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1398 class UTFIterator<
1399  CP32, behavior,
1400  UnitIter, LimitIter,
1401  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1402  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1403  using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1404 
1405  // Proxy type for post-increment return value, to make *iter++ work.
1406  // Also for operator->() (required by LegacyInputIterator)
1407  // so that we don't promise always returning CodeUnits.
1408  class Proxy {
1409  public:
1410  explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1411  CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1412  CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1413  private:
1414  CodeUnits<CP32, UnitIter> units_;
1415  };
1416 
1417 public:
1418  using value_type = CodeUnits<CP32, UnitIter>;
1419  using reference = value_type;
1420  using pointer = Proxy;
1421  using difference_type = prv::iter_difference_t<UnitIter>;
1422  using iterator_category = std::input_iterator_tag;
1423 
1424  U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1425 
1426  // Constructs an iterator start or limit sentinel.
1427  // Requires p to be copyable.
1428  U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1429 
1430  U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1431  U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1432 
1433  U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1434  U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1435 
1436  U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1437  return p_ == other.p_ && ahead_ == other.ahead_;
1438  // Strictly speaking, we should check if the logical position is the same.
1439  // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1440  }
1441  U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1442 
1443  template<typename Sentinel> U_FORCE_INLINE friend
1444  std::enable_if_t<
1445  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1446  bool>
1447  operator==(const UTFIterator &iter, const Sentinel &s) {
1448  return !iter.ahead_ && iter.p_ == s;
1449  }
1450 
1451 #if U_CPLUSPLUS_VERSION < 20
1452  template<typename Sentinel> U_FORCE_INLINE friend
1453  std::enable_if_t<
1454  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1455  bool>
1456  operator==(const Sentinel &s, const UTFIterator &iter) {
1457  return !iter.ahead_ && iter.p_ == s;
1458  }
1459 
1460  template<typename Sentinel> U_FORCE_INLINE friend
1461  std::enable_if_t<
1462  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1463  bool>
1464  operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1465 
1466  template<typename Sentinel> U_FORCE_INLINE friend
1467  std::enable_if_t<
1468  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1469  bool>
1470  operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1471 #endif // C++17
1472 
1473  U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1474  if (!ahead_) {
1475  units_ = Impl::readAndInc(p_, p_, limit_);
1476  ahead_ = true;
1477  }
1478  return units_;
1479  }
1480 
1481  U_FORCE_INLINE Proxy operator->() const {
1482  if (!ahead_) {
1483  units_ = Impl::readAndInc(p_, p_, limit_);
1484  ahead_ = true;
1485  }
1486  return Proxy(units_);
1487  }
1488 
1489  U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1490  if (ahead_) {
1491  // operator*() called readAndInc() so p_ is already ahead.
1492  ahead_ = false;
1493  } else {
1494  Impl::inc(p_, limit_);
1495  }
1496  return *this;
1497  }
1498 
1499  U_FORCE_INLINE Proxy operator++(int) { // post-increment
1500  if (ahead_) {
1501  // operator*() called readAndInc() so p_ is already ahead.
1502  ahead_ = false;
1503  } else {
1504  units_ = Impl::readAndInc(p_, p_, limit_);
1505  // keep this->ahead_ == false
1506  }
1507  return Proxy(units_);
1508  }
1509 
1510 private:
1511  // operator*() etc. are logically const.
1512  mutable UnitIter p_;
1513  // In a validating iterator, we need limit_ so that when we read a code point
1514  // we can test if there are enough code units.
1515  LimitIter limit_;
1516  // Keep state so that we call readAndInc() only once for both operator*() and ++
1517  // so that we can use a single-pass input iterator for UnitIter.
1518  mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1519  // true: units_ = readAndInc(), p_ = units limit
1520  // which means that p_ is ahead of its logical position
1521  // false: initial state
1522  mutable bool ahead_ = false;
1523 };
1524 #endif // U_IN_DOXYGEN
1525 
1526 } // namespace U_HEADER_ONLY_NAMESPACE
1527 
1528 #ifndef U_IN_DOXYGEN
1529 // Bespoke specialization of reverse_iterator.
1530 // The default implementation implements reverse operator*() and ++ in a way
1531 // that does most of the same work twice for reading variable-length sequences.
// Specialization of std::reverse_iterator for the validating UTFIterator.
// Instead of wrapping a forward iterator, it keeps its own code unit position p_
// and decodes backward with Impl::decAndRead(), caching the result in units_.
// state_ tells whether p_ is at the logical position (>=0) or behind it (<0).
// NOTE(review): CodeUnits_ and the header of the constructor that takes `iter`
// are used below but their declarations are not visible in this listing —
// confirm against the original header.
1532 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1533 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1534  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1535  using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1537
1538  // Proxy type for operator->() (required by LegacyInputIterator)
1539  // so that we don't promise always returning CodeUnits.
  // The Proxy stores its own copy of the decoded units.
1540  class Proxy {
1541  public:
1542  explicit Proxy(CodeUnits_ units) : units_(units) {}
1543  CodeUnits_ &operator*() { return units_; }
1544  CodeUnits_ *operator->() { return &units_; }
1545  private:
1546  CodeUnits_ units_;
1547  };
1548
1549 public:
1550  using value_type = CodeUnits_;
1551  using reference = value_type;
1552  using pointer = Proxy;
1554  using iterator_category = std::bidirectional_iterator_tag;
1555
  // Initializer list of the constructor from `iter`: starts at iter's logical position
  // and copies its start_ and limit_; units_ begins as an empty placeholder at p_.
1557  p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1558  units_(0, 0, false, p_, p_) {}
  // Default constructor: value-initializes all three positions.
1559  U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1560
1561  U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1562  U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1563
1564  U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1565  U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1566
  // Two reverse iterators are equal when their logical positions are equal,
  // regardless of whether either one has already decoded its code point.
1567  U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1568  return getLogicalPosition() == other.getLogicalPosition();
1569  }
1570  U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1571
  // Decodes backward only in the initial state; otherwise returns the cached units_.
1572  U_FORCE_INLINE CodeUnits_ operator*() const {
1573  if (state_ == 0) {
1574  units_ = Impl::decAndRead(start_, p_);
1575  state_ = -1;
1576  }
1577  return units_;
1578  }
1579
  // Same as operator*() but returns the result wrapped in a Proxy.
1580  U_FORCE_INLINE Proxy operator->() const {
1581  if (state_ == 0) {
1582  units_ = Impl::decAndRead(start_, p_);
1583  state_ = -1;
1584  }
1585  return Proxy(units_);
1586  }
1587
1588  U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
1589  if (state_ < 0) {
1590  // operator*() called decAndRead() so p_ is already behind.
1591  state_ = 0;
1592  } else if (state_ == 0) {
1593  Impl::dec(start_, p_);
1594  } else /* state_ > 0 */ {
1595  // operator--() called readAndInc() so we know how far to skip.
1596  p_ = units_.begin();
1597  state_ = 0;
1598  }
1599  return *this;
1600  }
1601
  // Returns a copy that refers to the code point being stepped over.
1602  U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
1603  if (state_ < 0) {
1604  // operator*() called decAndRead() so p_ is already behind.
1605  reverse_iterator result(*this);
1606  state_ = 0;
1607  return result;
1608  } else if (state_ == 0) {
1609  units_ = Impl::decAndRead(start_, p_);
1610  reverse_iterator result(*this);
1611  result.state_ = -1;
1612  // keep this->state_ == 0
1613  return result;
1614  } else /* state_ > 0 */ {
1615  reverse_iterator result(*this);
1616  // operator--() called readAndInc() so we know how far to skip.
1617  p_ = units_.begin();
1618  state_ = 0;
1619  return result;
1620  }
1621  }
1622
  // Decrementing a reverse iterator moves forward in the code units:
  // first restore p_ to the logical position, then read one code point forward.
1623  U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
1624  if (state_ < 0) {
1625  // operator*() called decAndRead() so p_ is behind the logical position.
1626  p_ = units_.end();
1627  }
1628  UnitIter p0 = p_;
1629  units_ = Impl::readAndInc(p0, p_, limit_);
1630  state_ = 1;
1631  return *this;
1632  }
1633
1634  U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
1635  reverse_iterator result(*this);
1636  operator--();
1637  return result;
1638  }
1639
1640 private:
  // When state_ < 0, p_ is at the start of the decoded units, so the logical
  // position is the end of those units.
1641  U_FORCE_INLINE UnitIter getLogicalPosition() const {
1642  return state_ >= 0 ? p_ : units_.end();
1643  }
1644
1645  // operator*() etc. are logically const.
1646  mutable UnitIter p_;
1647  // In a validating iterator, we need start_ & limit_ so that when we read a code point
1648  // (forward or backward) we can test if there are enough code units.
1649  UnitIter start_;
1650  UnitIter limit_;
1651  // Keep state so that we call decAndRead() only once for both operator*() and ++
1652  // to make it easy for the compiler to optimize.
1653  mutable CodeUnits_ units_;
1654  // >0: units_ = readAndInc(), p_ = units limit
1655  // 0: initial state
1656  // <0: units_ = decAndRead(), p_ = units start
1657  // which means that p_ is behind its logical position
1658  mutable int8_t state_ = 0;
1659 };
1660 #endif // U_IN_DOXYGEN
1661 
1662 namespace U_HEADER_ONLY_NAMESPACE {
1663 
// Factory function taking the start of the code unit range, the current position p,
// and the limit. CP32 and behavior must be given explicitly; UnitIter and LimitIter
// are deduced, with LimitIter defaulting to UnitIter. All three arguments are moved.
// NOTE(review): the beginning of the return expression is not visible in this listing;
// presumably it constructs a UTFIterator from the moved arguments — confirm.
1686 template<typename CP32, UTFIllFormedBehavior behavior,
1687  typename UnitIter, typename LimitIter = UnitIter>
1688 auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1690  std::move(start), std::move(p), std::move(limit));
1691 }
1692 
// Factory function taking only the current position p and the limit (no separate start).
// CP32 and behavior must be given explicitly; UnitIter and LimitIter are deduced,
// with LimitIter defaulting to UnitIter. Both arguments are moved.
// NOTE(review): the beginning of the return expression is not visible in this listing;
// presumably it constructs a UTFIterator from the moved arguments — confirm.
1713 template<typename CP32, UTFIllFormedBehavior behavior,
1714  typename UnitIter, typename LimitIter = UnitIter>
1715 auto utfIterator(UnitIter p, LimitIter limit) {
1717  std::move(p), std::move(limit));
1718 }
1719 
1720 // Note: We should only enable the following factory function for a copyable UnitIter.
1721 // In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1722 // but a function template partial specialization is not allowed.
1723 // In C++20, we might be able to require the std::copyable concept.
1724 
1744 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1745 auto utfIterator(UnitIter p) {
1746  return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1747 }
1748 
// Range wrapper that stores a code unit range (unitRange) and exposes it through
// utfIterator<CP32, behavior>: begin()/end() for forward iteration and
// rbegin()/rend() via std::make_reverse_iterator.
// NOTE(review): the class header line and one special member declaration are not
// visible in this listing — confirm against the original header.
1776 template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1778  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1779 public:
1784  UTFStringCodePoints() = default;
1785
  // Enabled when Range is not a reference type: the range is moved into the member.
1791  template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1792  explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
  // Enabled when Range is a reference type: the member binds to the given range.
  // The extra `typename = void` keeps this template distinct from the one above.
1801  template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1802  explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1803
1805  UTFStringCodePoints(const UTFStringCodePoints &other) = default;
1806
1809
  // Iterator at the first code point; it carries the unit range's end as its limit.
1814  auto begin() {
1815  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1816  }
1817
  // Const overload, available only when `const R` is itself a range.
1822  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1823  auto begin() const {
1824  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1825  }
1826
  // End of the code point range. The return type depends on the unit range:
  // its own sentinel if that differs from the iterator type, a three-argument
  // iterator for bidirectional code unit iterators, else a two-argument iterator.
1831  auto end() {
1832  using UnitIter = decltype(unitRange.begin());
1833  using LimitIter = decltype(unitRange.end());
1834  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1835  // Return the code unit sentinel.
1836  return unitRange.end();
1837  } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1838  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1839  } else {
1840  // The input iterator specialization has no three-argument constructor.
1841  return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1842  }
1843  }
1844
  // Const overload of end(), with the same three-way choice of return type.
1849  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1850  auto end() const {
1851  using UnitIter = decltype(unitRange.begin());
1852  using LimitIter = decltype(unitRange.end());
1853  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1854  // Return the code unit sentinel.
1855  return unitRange.end();
1856  } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1857  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1858  } else {
1859  // The input iterator specialization has no three-argument constructor.
1860  return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1861  }
1862  }
1863
  // Reverse iteration starts from end() ...
1868  auto rbegin() const {
1869  return std::make_reverse_iterator(end());
1870  }
1871
  // ... and stops at begin().
1876  auto rend() const {
1877  return std::make_reverse_iterator(begin());
1878  }
1879
1880 private:
  // The wrapped code unit range; held by value or by reference depending on Range.
1881  Range unitRange;
1882 };
1883 
// Callable adaptor whose operator() wraps a code unit range.
// With C++23 library support it derives from std::ranges::range_adaptor_closure.
// NOTE(review): the struct header line and the beginnings of two return expressions
// are not visible in this listing; presumably they construct UTFStringCodePoints
// like the visible else-branch does — confirm.
1885 template<typename CP32, UTFIllFormedBehavior behavior>
1887 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
1888  __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1889  : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1890 #endif
1891 {
  // Forwards unitRange into the wrapper.
1893  template<typename Range>
1894  auto operator()(Range &&unitRange) const {
1895 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
1897  std::forward<Range>(unitRange));
1898 #else
  // Fallback without the required ranges support: special-case basic_string_view.
1899  if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
1900  // Take basic_string_view by copy, not by reference. In C++20 this is handled by
1901  // all_t<Range>, which is Range if Range is a view.
1903  std::forward<Range>(unitRange));
1904  } else {
1905  return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1906  }
1907 #endif
1908  }
1909 };
1910 
1925 template<typename CP32, UTFIllFormedBehavior behavior>
1927 
1928 // Non-validating iterators ------------------------------------------------ ***
1929 
// Primary template of the non-validating code point iterator over code units.
// It decodes with UnsafeUTFImpl, caches the result in units_, and uses state_ to
// record whether p_ is ahead of (>0), at (0), or at the start of (<0) the cached units.
// The third template parameter (default void) is the hook used by the
// std::enable_if_t partial specialization.
// NOTE(review): the class header line, several type aliases, the special member
// declarations and the signatures of operator*(), operator++() and operator++(int)
// are not visible in this listing — confirm against the original header.
1951 template<typename CP32, typename UnitIter, typename = void>
1953  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1954  using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1955
1956  // Proxy type for operator->() (required by LegacyInputIterator)
1957  // so that we don't promise always returning UnsafeCodeUnits.
1958  class Proxy {
1959  public:
1960  explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1961  UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
1962  UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1963  private:
1965  };
1966
1967 public:
1973  using pointer = Proxy;
  // Bidirectional only if the underlying code unit iterator is; otherwise forward.
1977  using iterator_category = std::conditional_t<
1978  prv::bidirectional_iterator<UnitIter>,
1979  std::bidirectional_iterator_tag,
1980  std::forward_iterator_tag>;
1981
  // Starts at p; units_ begins as an empty placeholder at p.
1991  U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
  // Default constructor: value-initializes the position.
1997  U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
1998
2003
2008
  // Iterators are equal when their logical positions are equal,
  // regardless of whether either one has already decoded its code point.
2014  U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2015  return getLogicalPosition() == other.getLogicalPosition();
2016  }
2022  U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2023
  // Comparison with a code unit sentinel, i.e. any type other than this iterator
  // type or UnitIter: compares the logical position with the sentinel.
2030  template<typename Sentinel> U_FORCE_INLINE friend
2031  std::enable_if_t<
2032  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2033  bool>
2034  operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2035  return iter.getLogicalPosition() == s;
2036  }
2037
  // Before C++20 the reversed and negated sentinel comparisons are spelled out.
2038 #if U_CPLUSPLUS_VERSION < 20
2045  template<typename Sentinel> U_FORCE_INLINE friend
2046  std::enable_if_t<
2047  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2048  bool>
2049  operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2050  return iter.getLogicalPosition() == s;
2051  }
2058  template<typename Sentinel> U_FORCE_INLINE friend
2059  std::enable_if_t<
2060  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2061  bool>
2062  operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2069  template<typename Sentinel> U_FORCE_INLINE friend
2070  std::enable_if_t<
2071  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2072  bool>
2073  operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2074 #endif // C++17
2075
  // Body of the dereference operator (signature not visible here):
  // decodes forward only in the initial state, otherwise returns the cached units_.
2083  if (state_ == 0) {
2084  UnitIter p0 = p_;
2085  units_ = Impl::readAndInc(p0, p_);
2086  state_ = 1;
2087  }
2088  return units_;
2089  }
2090
  // Same as dereferencing but returns the result wrapped in a Proxy.
2099  U_FORCE_INLINE Proxy operator->() const {
2100  if (state_ == 0) {
2101  UnitIter p0 = p_;
2102  units_ = Impl::readAndInc(p0, p_);
2103  state_ = 1;
2104  }
2105  return Proxy(units_);
2106  }
2107
  // Body of the pre-increment operator (signature not visible here).
2115  if (state_ > 0) {
2116  // operator*() called readAndInc() so p_ is already ahead.
2117  state_ = 0;
2118  } else if (state_ == 0) {
2119  Impl::inc(p_);
2120  } else /* state_ < 0 */ {
2121  // operator--() called decAndRead() so we know how far to skip.
2122  p_ = units_.end();
2123  state_ = 0;
2124  }
2125  return *this;
2126  }
2127
  // Body of the post-increment operator (signature not visible here):
  // returns a copy that refers to the code point being stepped over.
2137  if (state_ > 0) {
2138  // operator*() called readAndInc() so p_ is already ahead.
2139  UnsafeUTFIterator result(*this);
2140  state_ = 0;
2141  return result;
2142  } else if (state_ == 0) {
2143  UnitIter p0 = p_;
2144  units_ = Impl::readAndInc(p0, p_);
2145  UnsafeUTFIterator result(*this);
2146  result.state_ = 1;
2147  // keep this->state_ == 0
2148  return result;
2149  } else /* state_ < 0 */ {
2150  UnsafeUTFIterator result(*this);
2151  // operator--() called decAndRead() so we know how far to skip.
2152  p_ = units_.end();
2153  state_ = 0;
2154  return result;
2155  }
2156  }
2157
  // Decrement operators exist only when UnitIter is bidirectional.
  // Pre-decrement first restores p_ to the logical position, then decodes backward.
2165  template<typename Iter = UnitIter>
2167  std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2168  operator--() { // pre-decrement
2169  if (state_ > 0) {
2170  // operator*() called readAndInc() so p_ is ahead of the logical position.
2171  p_ = units_.begin();
2172  }
2173  units_ = Impl::decAndRead(p_);
2174  state_ = -1;
2175  return *this;
2176  }
2177
2185  template<typename Iter = UnitIter>
2187  std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2188  operator--(int) { // post-decrement
2189  UnsafeUTFIterator result(*this);
2190  operator--();
2191  return result;
2192  }
2193
2194 private:
  // The reverse_iterator specialization is a friend of this class.
2195  friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2196
  // When state_ > 0, p_ is at the end of the decoded units, so the logical
  // position is the start of those units.
2197  U_FORCE_INLINE UnitIter getLogicalPosition() const {
2198  return state_ <= 0 ? p_ : units_.begin();
2199  }
2200
2201  // operator*() etc. are logically const.
2202  mutable UnitIter p_;
2203  // Keep state so that we call readAndInc() only once for both operator*() and ++
2204  // to make it easy for the compiler to optimize.
2205  mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2206  // >0: units_ = readAndInc(), p_ = units limit
2207  // which means that p_ is ahead of its logical position
2208  // 0: initial state
2209  // <0: units_ = decAndRead(), p_ = units start
2210  mutable int8_t state_ = 0;
2211 };
2212 
2213 #ifndef U_IN_DOXYGEN
2214 // Partial template specialization for single-pass input iterator.
2215 template<typename CP32, typename UnitIter>
2216 class UnsafeUTFIterator<
2217  CP32,
2218  UnitIter,
2219  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2220  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2221  using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2222 
2223  // Proxy type for post-increment return value, to make *iter++ work.
2224  // Also for operator->() (required by LegacyInputIterator)
2225  // so that we don't promise always returning UnsafeCodeUnits.
2226  class Proxy {
2227  public:
2228  explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2229  UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2230  UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2231  private:
2232  UnsafeCodeUnits<CP32, UnitIter> units_;
2233  };
2234 
2235 public:
2236  using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2237  using reference = value_type;
2238  using pointer = Proxy;
2239  using difference_type = prv::iter_difference_t<UnitIter>;
2240  using iterator_category = std::input_iterator_tag;
2241 
2242  U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2243 
2244  U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2245  U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
2246 
2247  U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2249 
2250  U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2251  return p_ == other.p_ && ahead_ == other.ahead_;
2252  // Strictly speaking, we should check if the logical position is the same.
2253  // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2254  }
2255  U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2256 
2257  template<typename Sentinel> U_FORCE_INLINE friend
2258  std::enable_if_t<
2259  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2260  bool>
2261  operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2262  return !iter.ahead_ && iter.p_ == s;
2263  }
2264 
2265 #if U_CPLUSPLUS_VERSION < 20
2266  template<typename Sentinel> U_FORCE_INLINE friend
2267  std::enable_if_t<
2268  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2269  bool>
2270  operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2271  return !iter.ahead_ && iter.p_ == s;
2272  }
2273 
2274  template<typename Sentinel> U_FORCE_INLINE friend
2275  std::enable_if_t<
2276  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2277  bool>
2278  operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2279 
2280  template<typename Sentinel> U_FORCE_INLINE friend
2281  std::enable_if_t<
2282  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2283  bool>
2284  operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2285 #endif // C++17
2286 
2287  U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2288  if (!ahead_) {
2289  units_ = Impl::readAndInc(p_, p_);
2290  ahead_ = true;
2291  }
2292  return units_;
2293  }
2294 
2295  U_FORCE_INLINE Proxy operator->() const {
2296  if (!ahead_) {
2297  units_ = Impl::readAndInc(p_, p_);
2298  ahead_ = true;
2299  }
2300  return Proxy(units_);
2301  }
2302 
2303  U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2304  if (ahead_) {
2305  // operator*() called readAndInc() so p_ is already ahead.
2306  ahead_ = false;
2307  } else {
2308  Impl::inc(p_);
2309  }
2310  return *this;
2311  }
2312 
2313  U_FORCE_INLINE Proxy operator++(int) { // post-increment
2314  if (ahead_) {
2315  // operator*() called readAndInc() so p_ is already ahead.
2316  ahead_ = false;
2317  } else {
2318  units_ = Impl::readAndInc(p_, p_);
2319  // keep this->ahead_ == false
2320  }
2321  return Proxy(units_);
2322  }
2323 
2324 private:
2325  // operator*() etc. are logically const.
2326  mutable UnitIter p_;
2327  // Keep state so that we call readAndInc() only once for both operator*() and ++
2328  // so that we can use a single-pass input iterator for UnitIter.
2329  mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2330  // true: units_ = readAndInc(), p_ = units limit
2331  // which means that p_ is ahead of its logical position
2332  // false: initial state
2333  mutable bool ahead_ = false;
2334 };
2335 #endif // U_IN_DOXYGEN
2336 
2337 } // namespace U_HEADER_ONLY_NAMESPACE
2338 
2339 #ifndef U_IN_DOXYGEN
2340 // Bespoke specialization of reverse_iterator.
2341 // The default implementation implements reverse operator*() and ++ in a way
2342 // that does most of the same work twice for reading variable-length sequences.
// Specialization of std::reverse_iterator for the non-validating UnsafeUTFIterator.
// It keeps its own code unit position p_ and decodes backward with Impl::decAndRead(),
// caching the result in units_. state_ tells whether p_ is at the logical
// position (>=0) or behind it (<0).
// NOTE(review): UnsafeCodeUnits_ and the header of the constructor that takes `iter`
// are used below but their declarations are not visible in this listing —
// confirm against the original header.
2343 template<typename CP32, typename UnitIter>
2344 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2345  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2346  using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2348
2349  // Proxy type for operator->() (required by LegacyInputIterator)
2350  // so that we don't promise always returning UnsafeCodeUnits.
  // The Proxy stores its own copy of the decoded units.
2351  class Proxy {
2352  public:
2353  explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2354  UnsafeCodeUnits_ &operator*() { return units_; }
2355  UnsafeCodeUnits_ *operator->() { return &units_; }
2356  private:
2357  UnsafeCodeUnits_ units_;
2358  };
2359
2360 public:
2361  using value_type = UnsafeCodeUnits_;
2362  using reference = value_type;
2363  using pointer = Proxy;
2365  using iterator_category = std::bidirectional_iterator_tag;
2366
  // Initializer list of the constructor from `iter`: starts at iter's logical position;
  // units_ begins as an empty placeholder at p_.
2368  p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
  // Default constructor: value-initializes the position.
2369  U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2370
2371  U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2372  U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2373
2374  U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2375  U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2376
  // Two reverse iterators are equal when their logical positions are equal,
  // regardless of whether either one has already decoded its code point.
2377  U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2378  return getLogicalPosition() == other.getLogicalPosition();
2379  }
2380  U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2381
  // Decodes backward only in the initial state; otherwise returns the cached units_.
2382  U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2383  if (state_ == 0) {
2384  units_ = Impl::decAndRead(p_);
2385  state_ = -1;
2386  }
2387  return units_;
2388  }
2389
  // Same as operator*() but returns the result wrapped in a Proxy.
2390  U_FORCE_INLINE Proxy operator->() const {
2391  if (state_ == 0) {
2392  units_ = Impl::decAndRead(p_);
2393  state_ = -1;
2394  }
2395  return Proxy(units_);
2396  }
2397
2398  U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
2399  if (state_ < 0) {
2400  // operator*() called decAndRead() so p_ is already behind.
2401  state_ = 0;
2402  } else if (state_ == 0) {
2403  Impl::dec(p_);
2404  } else /* state_ > 0 */ {
2405  // operator--() called readAndInc() so we know how far to skip.
2406  p_ = units_.begin();
2407  state_ = 0;
2408  }
2409  return *this;
2410  }
2411
  // Returns a copy that refers to the code point being stepped over.
2412  U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
2413  if (state_ < 0) {
2414  // operator*() called decAndRead() so p_ is already behind.
2415  reverse_iterator result(*this);
2416  state_ = 0;
2417  return result;
2418  } else if (state_ == 0) {
2419  units_ = Impl::decAndRead(p_);
2420  reverse_iterator result(*this);
2421  result.state_ = -1;
2422  // keep this->state_ == 0
2423  return result;
2424  } else /* state_ > 0 */ {
2425  reverse_iterator result(*this);
2426  // operator--() called readAndInc() so we know how far to skip.
2427  p_ = units_.begin();
2428  state_ = 0;
2429  return result;
2430  }
2431  }
2432
  // Decrementing a reverse iterator moves forward in the code units:
  // first restore p_ to the logical position, then read one code point forward.
2433  U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
2434  if (state_ < 0) {
2435  // operator*() called decAndRead() so p_ is behind the logical position.
2436  p_ = units_.end();
2437  }
2438  UnitIter p0 = p_;
2439  units_ = Impl::readAndInc(p0, p_);
2440  state_ = 1;
2441  return *this;
2442  }
2443
2444  U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
2445  reverse_iterator result(*this);
2446  operator--();
2447  return result;
2448  }
2449
2450 private:
  // When state_ < 0, p_ is at the start of the decoded units, so the logical
  // position is the end of those units.
2451  U_FORCE_INLINE UnitIter getLogicalPosition() const {
2452  return state_ >= 0 ? p_ : units_.end();
2453  }
2454
2455  // operator*() etc. are logically const.
2456  mutable UnitIter p_;
2457  // Keep state so that we call decAndRead() only once for both operator*() and ++
2458  // to make it easy for the compiler to optimize.
2459  mutable UnsafeCodeUnits_ units_;
2460  // >0: units_ = readAndInc(), p_ = units limit
2461  // 0: initial state
2462  // <0: units_ = decAndRead(), p_ = units start
2463  // which means that p_ is behind its logical position
2464  mutable int8_t state_ = 0;
2465 };
2466 #endif // U_IN_DOXYGEN
2467 
2468 namespace U_HEADER_ONLY_NAMESPACE {
2469 
2485 template<typename CP32, typename UnitIter>
2486 auto unsafeUTFIterator(UnitIter iter) {
2487  return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2488 }
2489 
// Range wrapper that stores a code unit range (unitRange) and exposes it through
// unsafeUTFIterator<CP32>: begin()/end() for forward iteration and
// rbegin()/rend() via std::make_reverse_iterator.
// NOTE(review): the class header line and the special member declarations are not
// visible in this listing — confirm against the original header.
2517 template<typename CP32, typename Range>
2519  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2520 public:
2526
  // Enabled when Range is not a reference type: the range is moved into the member.
2532  template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2533  explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
  // Enabled when Range is a reference type: the member binds to the given range.
  // The extra `typename = void` keeps this template distinct from the one above.
2542  template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2543  explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2544
2547
2550
  // Iterator at the first code point; no limit is passed to the non-validating iterator.
2555  auto begin() {
2556  return unsafeUTFIterator<CP32>(unitRange.begin());
2557  }
2558
  // Const overload, available only when `const R` is itself a range.
2563  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2564  auto begin() const {
2565  return unsafeUTFIterator<CP32>(unitRange.begin());
2566  }
2567
  // End of the code point range: the unit range's own sentinel if its type differs
  // from the iterator type, otherwise an iterator built from unitRange.end().
2572  auto end() {
2573  using UnitIter = decltype(unitRange.begin());
2574  using LimitIter = decltype(unitRange.end());
2575  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2576  // Return the code unit sentinel.
2577  return unitRange.end();
2578  } else {
2579  return unsafeUTFIterator<CP32>(unitRange.end());
2580  }
2581  }
2582
  // Const overload of end(), with the same choice of return type.
2587  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2588  auto end() const {
2589  using UnitIter = decltype(unitRange.begin());
2590  using LimitIter = decltype(unitRange.end());
2591  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2592  // Return the code unit sentinel.
2593  return unitRange.end();
2594  } else {
2595  return unsafeUTFIterator<CP32>(unitRange.end());
2596  }
2597  }
2598
  // Reverse iteration starts from end() ...
2603  auto rbegin() const {
2604  return std::make_reverse_iterator(end());
2605  }
2606
  // ... and stops at begin().
2611  auto rend() const {
2612  return std::make_reverse_iterator(begin());
2613  }
2614
2615 private:
  // The wrapped code unit range; held by value or by reference depending on Range.
2616  Range unitRange;
2617 };
2618 
// Callable adaptor whose operator() wraps a code unit range in
// UnsafeUTFStringCodePoints<CP32, ...>.
// With C++23 library support it derives from std::ranges::range_adaptor_closure.
// NOTE(review): the struct header line is not visible in this listing — confirm
// against the original header.
2620 template<typename CP32>
2622 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
2623  __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2624  : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2625 #endif
2626 {
  // Forwards unitRange into the wrapper. With sufficient ranges support the stored
  // type is std::ranges::views::all_t<Range>.
2628  template<typename Range>
2629  auto operator()(Range &&unitRange) const {
2630 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
2631  return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2632 #else
  // Fallback: store a basic_string_view by value, anything else as the deduced Range.
2633  if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
2634  // Take basic_string_view by copy, not by reference. In C++20 this is handled by
2635  // all_t<Range>, which is Range if Range is a view.
2636  return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
2637  } else {
2638  return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2639  }
2640 #endif
2641  }
2642 };
2643 
2644 
2657 template<typename CP32>
2659 
2660 } // namespace U_HEADER_ONLY_NAMESPACE
2661 
2662 
2663 #if defined(__cpp_lib_ranges)
// Specialization of the std::ranges::enable_borrowed_range variable template:
// its value is that of enable_borrowed_range<Range> for the wrapped code unit range.
// NOTE(review): the template argument being specialized is not visible in this listing;
// presumably the validating code point range taking <CP32, behavior, Range> — confirm.
2664 template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
2665 constexpr bool std::ranges::enable_borrowed_range<
2667  std::ranges::enable_borrowed_range<Range>;
2668 
// Specialization of the std::ranges::enable_borrowed_range variable template:
// its value is that of enable_borrowed_range<Range> for the wrapped code unit range.
// NOTE(review): the template argument being specialized is not visible in this listing;
// presumably the non-validating code point range taking <CP32, Range> — confirm.
2669 template <typename CP32, typename Range>
2670 constexpr bool std::ranges::enable_borrowed_range<
2672  std::ranges::enable_borrowed_range<Range>;
2673 #endif
2674 
2675 #endif // U_HIDE_DRAFT_API
2676 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2677 #endif // __UTFITERATOR_H__
A C++ "range" over all Unicode code points U+0000..U+10FFFF.
Definition: utfiterator.h:302
A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
Definition: utfiterator.h:333
Result of validating and decoding a code unit sequence for one code point.
Definition: utfiterator.h:487
CodeUnits & operator=(const CodeUnits &other)=default
Copy assignment operator.
CodeUnits(const CodeUnits &other)=default
Copy constructor.
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit)
Definition: utfiterator.h:490
Validating iterator over the code points in a Unicode string.
Definition: utfiterator.h:1094
U_FORCE_INLINE UTFIterator()
Default constructor.
Definition: utfiterator.h:1169
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UTFIterator &iter)
Definition: utfiterator.h:1227
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator > operator--(int)
Post-decrement operator.
Definition: utfiterator.h:1366
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:1277
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:1113
U_FORCE_INLINE UTFIterator & operator=(UTFIterator &&src) noexcept=default
Move assignment operator.
CodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
Definition: utfiterator.h:1111
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:1240
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:1122
U_FORCE_INLINE UTFIterator(UnitIter p)
Constructs an iterator start or limit sentinel.
Definition: utfiterator.h:1163
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UTFIterator &iter)
Definition: utfiterator.h:1251
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:1208
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator & > operator--()
Pre-decrement operator.
Definition: utfiterator.h:1346
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const
Definition: utfiterator.h:1194
Proxy pointer
C++ iterator boilerplate.
Definition: utfiterator.h:1115
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit)
Constructor with start <= p < limit.
Definition: utfiterator.h:1137
U_FORCE_INLINE UTFIterator(const UTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE CodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:1260
U_FORCE_INLINE UTFIterator operator++(int)
Post-increment operator.
Definition: utfiterator.h:1314
U_FORCE_INLINE UTFIterator & operator++()
Pre-increment operator.
Definition: utfiterator.h:1292
U_FORCE_INLINE UTFIterator & operator=(const UTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE bool operator==(const UTFIterator &other) const
Definition: utfiterator.h:1186
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit)
Constructor with start == p < limit.
Definition: utfiterator.h:1150
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:1117
A C++ "range" for validating iteration over all of the code points of a code unit range.
Definition: utfiterator.h:1777
UTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UTFStringCodePoints & operator=(const UTFStringCodePoints &other)=default
Copy assignment operator.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
Definition: utfiterator.h:1792
UTFStringCodePoints(const UTFStringCodePoints &other)=default
Copy constructor.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code u...
Definition: utfiterator.h:1802
Result of decoding a code unit sequence for one code point.
Definition: utfiterator.h:367
std::enable_if_t< std::is_pointer_v< Iter >||std::is_same_v< Iter, typename std::basic_string< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string< Unit >::const_iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::const_iterator >, std::basic_string_view< Unit > > stringView() const
Definition: utfiterator.h:432
UnsafeCodeUnits & operator=(const UnsafeCodeUnits &other)=default
Copy assignment operator.
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit)
Definition: utfiterator.h:372
UnsafeCodeUnits(const UnsafeCodeUnits &other)=default
Copy constructor.
Non-validating iterator over the code points in a Unicode string.
Definition: utfiterator.h:1952
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator & > operator--()
Pre-decrement operator.
Definition: utfiterator.h:2168
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UnsafeUTFIterator &iter)
Definition: utfiterator.h:2049
U_FORCE_INLINE UnsafeUTFIterator()
Default constructor.
Definition: utfiterator.h:1997
UnsafeCodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
Definition: utfiterator.h:1969
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const
Definition: utfiterator.h:2022
U_FORCE_INLINE UnsafeUTFIterator & operator=(const UnsafeUTFIterator &other)=default
Copy assignment operator.
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:1971
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UnsafeUTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:2062
U_FORCE_INLINE UnsafeCodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:2082
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:1980
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UnsafeUTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:2034
Proxy pointer
C++ iterator boilerplate.
Definition: utfiterator.h:1973
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:1975
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator > operator--(int)
Post-decrement operator.
Definition: utfiterator.h:2188
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:2099
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE UnsafeUTFIterator operator++(int)
Post-increment operator.
Definition: utfiterator.h:2136
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UnsafeUTFIterator & operator++()
Pre-increment operator.
Definition: utfiterator.h:2114
U_FORCE_INLINE UnsafeUTFIterator(UnitIter p)
Constructor; the iterator/pointer should be at a code point boundary.
Definition: utfiterator.h:1991
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UnsafeUTFIterator &iter)
Definition: utfiterator.h:2073
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const
Definition: utfiterator.h:2014
U_FORCE_INLINE UnsafeUTFIterator & operator=(UnsafeUTFIterator &&src) noexcept=default
Move assignment operator.
A C++ "range" for non-validating iteration over all of the code points of a code unit range.
Definition: utfiterator.h:2518
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code u...
Definition: utfiterator.h:2543
UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other)=default
Copy constructor.
UnsafeUTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
Definition: utfiterator.h:2533
UnsafeUTFStringCodePoints & operator=(const UnsafeUTFStringCodePoints &other)=default
Copy assignment operator.
int32_t difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:258
bool operator==(const CodePointsIterator &other) const
Definition: utfiterator.h:265
bool operator!=(const CodePointsIterator &other) const
Definition: utfiterator.h:267
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:254
CP32 value_type
C++ iterator boilerplate.
Definition: utfiterator.h:252
std::forward_iterator_tag iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:260
CP32 * pointer
C++ iterator boilerplate.
Definition: utfiterator.h:256
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:346
#define U_CPLUSPLUS_VERSION
0 if no C++; 1, 11, 14, ...
Definition: platform.h:464
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32...
Definition: umachine.h:469
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
Definition: umachine.h:135
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
Definition: utf16.h:93
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
Definition: utf16.h:84
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
Definition: utf16.h:112
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
Definition: utf16.h:75
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
Definition: utf16.h:59
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
Definition: utf16.h:67
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
Definition: utf8.h:71
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
Definition: utf8.h:98
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
Definition: utf8.h:115
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
Definition: utf8.h:173
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
Definition: utf8.h:91
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
Definition: utf8.h:108
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
Definition: utf8.h:181
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
Definition: utf8.h:81
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
Definition: utf8.h:190
auto unsafeUTFIterator(UnitIter iter)
UnsafeUTFIterator factory function.
Definition: utfiterator.h:2486
typename std::iterator_traits< Iter >::difference_type iter_difference_t
Definition: utfiterator.h:203
constexpr bool is_basic_string_view_v
Definition: utfiterator.h:244
constexpr bool forward_iterator
Definition: utfiterator.h:207
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit)
UTFIterator factory function for start <= p < limit.
Definition: utfiterator.h:1688
constexpr UTFStringCodePointsAdaptor< CP32, behavior > utfStringCodePoints
Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of cod...
Definition: utfiterator.h:1926
typename std::iterator_traits< Iter >::value_type iter_value_t
Definition: utfiterator.h:199
constexpr bool bidirectional_iterator
Definition: utfiterator.h:214
constexpr UnsafeUTFStringCodePointsAdaptor< CP32 > unsafeUTFStringCodePoints
Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range"...
Definition: utfiterator.h:2658
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
Definition: utfiterator.h:149
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
Definition: utfiterator.h:159
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point,...
Definition: utfiterator.h:167
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Definition: utfiterator.h:157
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.