ICU 78.3  78.3
utfiterator.h
Go to the documentation of this file.
1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: https://www.unicode.org/copyright.html
3 
4 // utfiterator.h
5 // created: 2024aug12 Markus W. Scherer
6 
7 #ifndef __UTFITERATOR_H__
8 #define __UTFITERATOR_H__
9 
10 #include "unicode/utypes.h"
11 
12 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13 
14 #include <iterator>
15 #if defined(__cpp_lib_ranges)
16 #include <ranges>
17 #endif
18 #include <string>
19 #include <string_view>
20 #include <type_traits>
21 #include "unicode/utf16.h"
22 #include "unicode/utf8.h"
23 #include "unicode/uversion.h"
24 
135 #ifndef U_HIDE_DRAFT_API
136 
/**
 * Selects which code point value a validating UTF iterator reports
 * when it encounters an ill-formed subsequence.
 *
 * NOTE(review): the enumerator list was missing from this listing and has been
 * restored from the three enumerators referenced by the sub() helpers in this file;
 * confirm the declaration order against the upstream header.
 */
typedef enum UTFIllFormedBehavior {
    /** Report a negative value (U_SENTINEL) instead of a code point. */
    UTF_BEHAVIOR_NEGATIVE,
    /** Report U+FFFD. */
    UTF_BEHAVIOR_FFFD,
    /**
     * 8-bit strings: rejected at compile time.
     * 16-bit strings: report the unpaired surrogate itself.
     * 32-bit strings: report the surrogate code point, or U+FFFD for other invalid values.
     */
    UTF_BEHAVIOR_SURROGATE
} UTFIllFormedBehavior;
169 
170 namespace U_HEADER_ONLY_NAMESPACE {
171 
172 namespace prv {
// Iterator/range vocabulary used by the rest of this header.
// With C++20 these simply forward to the standard concepts and aliases;
// before C++20 they are approximated with std::iterator_traits and SFINAE.
#if U_CPLUSPLUS_VERSION >= 20

/** Value type of an iterator; forwards to std::iter_value_t. */
template<typename Iter>
using iter_value_t = typename std::iter_value_t<Iter>;

/** Difference type of an iterator; forwards to std::iter_difference_t. */
template<typename Iter>
using iter_difference_t = std::iter_difference_t<Iter>;

/** True if Iter models std::forward_iterator. */
template<typename Iter>
constexpr bool forward_iterator = std::forward_iterator<Iter>;

/** True if Iter models std::bidirectional_iterator. */
template<typename Iter>
constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;

/** True if Range models std::ranges::range. */
template<typename Range>
constexpr bool range = std::ranges::range<Range>;

#else

/** Value type of an iterator, taken from std::iterator_traits. */
template<typename Iter>
using iter_value_t = typename std::iterator_traits<Iter>::value_type;

/** Difference type of an iterator, taken from std::iterator_traits. */
template<typename Iter>
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;

/** True if the iterator_category of Iter is, or derives from, std::forward_iterator_tag. */
template<typename Iter>
constexpr bool forward_iterator =
    std::is_base_of_v<
        std::forward_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** True if the iterator_category of Iter is, or derives from, std::bidirectional_iterator_tag. */
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of_v<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** Detection helper: false unless Range has both begin() and end() member functions. */
template<typename Range, typename = void>
struct range_type : std::false_type {};

/** Chosen by SFINAE when Range().begin() and Range().end() are both well-formed. */
template<typename Range>
struct range_type<
    Range,
    std::void_t<decltype(std::declval<Range>().begin()),
                decltype(std::declval<Range>().end())>> : std::true_type {};

/** True if Range has begin() and end() member functions. */
template<typename Range>
constexpr bool range = range_type<Range>::value;

#endif
235 
/** Type trait: false for every type that is not a std::basic_string_view. */
template<typename T>
struct is_basic_string_view : std::false_type {};

/** Type trait: true for any specialization of std::basic_string_view. */
template<typename... ViewArgs>
struct is_basic_string_view<std::basic_string_view<ViewArgs...>> : std::true_type {};

/** Shorthand for is_basic_string_view<T>::value. */
template<typename T>
constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
245 
/**
 * Forward iterator that simply counts upward through code point values.
 *
 * @tparam CP32 32-bit type used to hold a code point.
 * @tparam skipSurrogates if true, incrementing past U+D7FF jumps directly to U+E000,
 *         so that the surrogate range is never produced by operator++.
 */
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    // C++ iterator boilerplate.
    // All five member types are needed for std::iterator_traits to be usable before C++20.
    using value_type = CP32;
    using reference = value_type;
    using pointer = CP32 *;
    using difference_type = int32_t;
    using iterator_category = std::forward_iterator_tag;

    /** Creates an iterator positioned at code point c. */
    inline CodePointsIterator(CP32 c) : c_(c) {}
    /** Two iterators are equal if they are positioned at the same code point value. */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    /** Negation of operator==. */
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    /** Returns the current code point by value. */
    inline CP32 operator*() const { return c_; }
    /** Pre-increment: moves to the next code point, optionally skipping surrogates. */
    inline CodePointsIterator &operator++() {
        ++c_;
        if (skipSurrogates && c_ == 0xd800) {
            // Jump over U+D800..U+DFFF.
            c_ = 0xe000;
        }
        return *this;
    }
    /** Post-increment: advances and returns a copy of the previous position. */
    inline CodePointsIterator operator++(int) {
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};
288 
289 } // namespace prv
290 
301 template<typename CP32>
303  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
304 public:
312  auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
317  auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
318 };
319 
332 template<typename CP32>
334  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
335 public:
343  auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
348  auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
349 };
350 
366 template<typename CP32, typename UnitIter, typename = void>
368  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
369  using Unit = typename prv::iter_value_t<UnitIter>;
370 public:
372  UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
373  c_(codePoint), len_(length), start_(start), limit_(limit) {}
374 
376  UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
378  UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
379 
387  CP32 codePoint() const { return c_; }
388 
394  UnitIter begin() const { return start_; }
395 
401  UnitIter end() const { return limit_; }
402 
407  uint8_t length() const { return len_; }
408 
409 #if U_CPLUSPLUS_VERSION >= 20
415  template<std::contiguous_iterator Iter = UnitIter>
416  std::basic_string_view<Unit> stringView() const {
417  return std::basic_string_view<Unit>(begin(), end());
418  }
419 #else
425  template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
426  std::enable_if_t<std::is_pointer_v<Iter> ||
427  std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
428  std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
429  std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
430  std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
431  std::basic_string_view<Unit>>
432  stringView() const {
433  return std::basic_string_view<Unit>(&*start_, len_);
434  }
435 #endif
436 
437 private:
438  // Order of fields with padding and access frequency in mind.
439  CP32 c_;
440  uint8_t len_;
441  UnitIter start_;
442  UnitIter limit_;
443 };
444 
445 #ifndef U_IN_DOXYGEN
446 // Partial template specialization for single-pass input iterator.
447 // No UnitIter field, no getter for it, no stringView().
448 template<typename CP32, typename UnitIter>
449 class UnsafeCodeUnits<
450  CP32,
451  UnitIter,
452  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
453  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
454 public:
455  UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
456 
457  UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
458  UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
459 
460  CP32 codePoint() const { return c_; }
461 
462  uint8_t length() const { return len_; }
463 
464 private:
465  // Order of fields with padding and access frequency in mind.
466  CP32 c_;
467  uint8_t len_;
468 };
469 #endif // U_IN_DOXYGEN
470 
486 template<typename CP32, typename UnitIter, typename = void>
487 class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
488 public:
490  CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
491  UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
492 
494  CodeUnits(const CodeUnits &other) = default;
496  CodeUnits &operator=(const CodeUnits &other) = default;
497 
502  bool wellFormed() const { return ok_; }
503 
504 private:
505  bool ok_;
506 };
507 
508 #ifndef U_IN_DOXYGEN
509 // Partial template specialization for single-pass input iterator.
510 // No UnitIter field, no getter for it, no stringView().
511 template<typename CP32, typename UnitIter>
512 class CodeUnits<
513  CP32,
514  UnitIter,
515  std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
516  public UnsafeCodeUnits<CP32, UnitIter> {
517 public:
518  CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
519  UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
520 
521  CodeUnits(const CodeUnits &other) = default;
522  CodeUnits &operator=(const CodeUnits &other) = default;
523 
524  bool wellFormed() const { return ok_; }
525 
526 private:
527  bool ok_;
528 };
529 #endif // U_IN_DOXYGEN
530 
531 // Validating implementations ---------------------------------------------- ***
532 
533 #ifndef U_IN_DOXYGEN
534 template<typename CP32, UTFIllFormedBehavior behavior,
535  typename UnitIter, typename LimitIter = UnitIter, typename = void>
536 class UTFImpl;
537 
538 // Note: readAndInc() functions take both a p0 and a p iterator.
539 // They must have the same value.
540 // For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
541 // and readAndInc() copies p0 and the incremented p into the CodeUnits.
542 // For a single-pass UnitIter, which may not be default-constructible nor copyable,
543 // the caller can pass p into both references, and readAndInc() does not use p0
544 // and constructs CodeUnits without them.
545 // Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
546 // which may not be possible for a single-pass iterator.
547 
548 // UTF-8
// Validating UTF-8 implementation: this partial specialization is selected
// when the code unit type of UnitIter is one byte wide.
// All functions are static; the class has no state.
549 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
550 class UTFImpl<
551  CP32, behavior,
552  UnitIter, LimitIter,
553  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
554  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
555  static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
556  "For 8-bit strings, the SURROGATE option does not have an equivalent.");
557 public:
558  // Handle ill-formed UTF-8
  // Returns the code point value to report for an ill-formed byte sequence:
  // U_SENTINEL for UTF_BEHAVIOR_NEGATIVE, otherwise 0xfffd.
559  U_FORCE_INLINE static CP32 sub() {
560  if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
561  return U_SENTINEL;
562  } else {
563  static_assert(behavior == UTF_BEHAVIOR_FFFD);
564  return 0xfffd;
565  }
566  }
567 
  // Advances p past one code point or one ill-formed subsequence, without decoding it.
  // Requires p != limit on entry (the first byte is read unconditionally);
  // every further byte is read only after checking p != limit.
568  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
569  // Very similar to U8_FWD_1().
570  uint8_t b = *p;
571  ++p;
572  if (U8_IS_LEAD(b) && p != limit) {
573  uint8_t t1 = *p;
  // Lead byte of a 3-byte sequence.
574  if ((0xe0 <= b && b < 0xf0)) {
575  if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
576  ++p != limit && U8_IS_TRAIL(*p)) {
577  ++p;
578  }
  // Lead byte of a 2-byte sequence.
579  } else if (b < 0xe0) {
580  if (U8_IS_TRAIL(t1)) {
581  ++p;
582  }
583  } else /* b >= 0xf0 */ {
584  if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
585  ++p != limit && U8_IS_TRAIL(*p) &&
586  ++p != limit && U8_IS_TRAIL(*p)) {
587  ++p;
588  }
589  }
590  }
591  }
592 
  // Moves p backward over one code point or one ill-formed subsequence, without decoding it.
  // Requires p != start on entry (p is decremented unconditionally once);
  // looks back at most three more bytes and never before start.
  // If no acceptable lead byte is found, p stays one byte before its original position.
593  U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
594  // Very similar to U8_BACK_1().
595  uint8_t c = *--p;
596  if (U8_IS_TRAIL(c) && p != start) {
597  UnitIter p1 = p;
598  uint8_t b1 = *--p1;
599  if (U8_IS_LEAD(b1)) {
  // A 2-byte lead accepts any trail byte;
  // a 3- or 4-byte lead followed by only one acceptable trail byte is a truncated sequence.
600  if (b1 < 0xe0 ||
601  (b1 < 0xf0 ?
602  U8_IS_VALID_LEAD3_AND_T1(b1, c) :
603  U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
604  p = p1;
605  return;
606  }
607  } else if (U8_IS_TRAIL(b1) && p1 != start) {
608  uint8_t b2 = *--p1;
609  if (0xe0 <= b2 && b2 <= 0xf4) {
610  if (b2 < 0xf0 ?
611  U8_IS_VALID_LEAD3_AND_T1(b2, b1) :
612  U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
613  p = p1;
614  return;
615  }
616  } else if (U8_IS_TRAIL(b2) && p1 != start) {
617  uint8_t b3 = *--p1;
618  if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
619  p = p1;
620  return;
621  }
622  }
623  }
624  }
625  }
626 
  // Decodes the code point starting at p and advances p past it.
  // p0 must have the same value as p on entry; it is used only for multi-pass iterators,
  // where the returned CodeUnits records the range [p0, p).
  // For an ill-formed sequence the result carries sub(), wellFormed=false,
  // and the number of bytes that were consumed.
627  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
628  UnitIter &p0, UnitIter &p, const LimitIter &limit) {
629  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
630  // Very similar to U8_NEXT_OR_FFFD().
631  CP32 c = uint8_t(*p);
632  ++p;
633  if (U8_IS_SINGLE(c)) {
634  if constexpr (isMultiPass) {
635  return {c, 1, true, p0, p};
636  } else {
637  return {c, 1, true};
638  }
639  }
  // length counts the bytes consumed so far; t holds the most recently read trail byte.
  // The condition below reads, validates, and accumulates bytes as side effects,
  // so the order of its subexpressions matters.
640  uint8_t length = 1;
641  uint8_t t = 0;
642  if (p != limit &&
643  // fetch/validate/assemble all but last trail byte
644  (c >= 0xe0 ?
645  (c < 0xf0 ? // U+0800..U+FFFF except surrogates
646  U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
647  (t &= 0x3f, 1)
648  : // U+10000..U+10FFFF
649  (c -= 0xf0) <= 4 &&
650  U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
651  (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
652  (t = *p - 0x80) <= 0x3f) &&
653  // valid second-to-last trail byte
654  (c = (c << 6) | t, ++length, ++p != limit)
655  : // U+0080..U+07FF
656  c >= 0xc2 && (c &= 0x1f, 1)) &&
657  // last trail byte
658  (t = *p - 0x80) <= 0x3f) {
659  c = (c << 6) | t;
660  ++length;
661  ++p;
662  if constexpr (isMultiPass) {
663  return {c, length, true, p0, p};
664  } else {
665  return {c, length, true};
666  }
667  }
  // Ill-formed: p stops at the first byte that did not fit the sequence.
668  if constexpr (isMultiPass) {
669  return {sub(), length, false, p0, p};
670  } else {
671  return {sub(), length, false};
672  }
673  }
674 
  // Moves p backward over one code point and decodes it.
  // Requires p != start on entry. The returned CodeUnits records the range
  // from the new p to the original p.
  // For an ill-formed sequence the result carries sub() and wellFormed=false.
675  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
676  // Very similar to U8_PREV_OR_FFFD().
677  UnitIter p0 = p;
678  CP32 c = uint8_t(*--p);
679  if (U8_IS_SINGLE(c)) {
680  return {c, 1, true, p, p0};
681  }
682  if (U8_IS_TRAIL(c) && p != start) {
683  UnitIter p1 = p;
684  uint8_t b1 = *--p1;
685  if (U8_IS_LEAD(b1)) {
686  if (b1 < 0xe0) {
  // Complete 2-byte sequence.
687  p = p1;
688  c = ((b1 - 0xc0) << 6) | (c & 0x3f);
689  return {c, 2, true, p, p0};
690  } else if (b1 < 0xf0 ?
691  U8_IS_VALID_LEAD3_AND_T1(b1, c) :
692  U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
693  // Truncated 3- or 4-byte sequence.
694  p = p1;
695  return {sub(), 2, false, p, p0};
696  }
697  } else if (U8_IS_TRAIL(b1) && p1 != start) {
698  // Extract the value bits from the last trail byte.
699  c &= 0x3f;
700  uint8_t b2 = *--p1;
701  if (0xe0 <= b2 && b2 <= 0xf4) {
702  if (b2 < 0xf0) {
  // Keep only the value bits of the 3-byte lead before validating and assembling.
703  b2 &= 0xf;
704  if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
705  p = p1;
706  c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
707  return {c, 3, true, p, p0};
708  }
709  } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
710  // Truncated 4-byte sequence.
711  p = p1;
712  return {sub(), 3, false, p, p0};
713  }
714  } else if (U8_IS_TRAIL(b2) && p1 != start) {
715  uint8_t b3 = *--p1;
716  if (0xf0 <= b3 && b3 <= 0xf4) {
  // Keep only the value bits of the 4-byte lead before validating and assembling.
717  b3 &= 7;
718  if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
719  p = p1;
720  c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
721  return {c, 4, true, p, p0};
722  }
723  }
724  }
725  }
726  }
  // Nothing matched: report the single byte before the original position as ill-formed.
727  return {sub(), 1, false, p, p0};
728  }
729 };
730 
731 // UTF-16
732 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
733 class UTFImpl<
734  CP32, behavior,
735  UnitIter, LimitIter,
736  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
737  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
738 public:
739  // Handle ill-formed UTF-16: One unpaired surrogate.
740  U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
741  if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
742  return U_SENTINEL;
743  } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) {
744  return 0xfffd;
745  } else {
746  static_assert(behavior == UTF_BEHAVIOR_SURROGATE);
747  return surrogate;
748  }
749  }
750 
751  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
752  // Very similar to U16_FWD_1().
753  auto c = *p;
754  ++p;
755  if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
756  ++p;
757  }
758  }
759 
760  U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
761  // Very similar to U16_BACK_1().
762  UnitIter p1;
763  if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
764  p = p1;
765  }
766  }
767 
768  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
769  UnitIter &p0, UnitIter &p, const LimitIter &limit) {
770  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
771  // Very similar to U16_NEXT_OR_FFFD().
772  CP32 c = static_cast<CP32>(*p);
773  ++p;
774  if (!U16_IS_SURROGATE(c)) {
775  if constexpr (isMultiPass) {
776  return {c, 1, true, p0, p};
777  } else {
778  return {c, 1, true};
779  }
780  } else {
781  uint16_t c2;
782  if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
783  ++p;
784  c = U16_GET_SUPPLEMENTARY(c, c2);
785  if constexpr (isMultiPass) {
786  return {c, 2, true, p0, p};
787  } else {
788  return {c, 2, true};
789  }
790  } else {
791  if constexpr (isMultiPass) {
792  return {sub(c), 1, false, p0, p};
793  } else {
794  return {sub(c), 1, false};
795  }
796  }
797  }
798  }
799 
800  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
801  // Very similar to U16_PREV_OR_FFFD().
802  UnitIter p0 = p;
803  CP32 c = static_cast<CP32>(*--p);
804  if (!U16_IS_SURROGATE(c)) {
805  return {c, 1, true, p, p0};
806  } else {
807  UnitIter p1;
808  uint16_t c2;
809  if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
810  p = p1;
811  c = U16_GET_SUPPLEMENTARY(c2, c);
812  return {c, 2, true, p, p0};
813  } else {
814  return {sub(c), 1, false, p, p0};
815  }
816  }
817  }
818 };
819 
820 // UTF-32: trivial, but still validating
821 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
822 class UTFImpl<
823  CP32, behavior,
824  UnitIter, LimitIter,
825  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
826  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
827 public:
828  // Handle ill-formed UTF-32
829  U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
830  if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
831  return U_SENTINEL;
832  } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) {
833  return 0xfffd;
834  } else {
835  static_assert(behavior == UTF_BEHAVIOR_SURROGATE);
836  return forSurrogate ? surrogate : 0xfffd;
837  }
838  }
839 
840  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
841  ++p;
842  }
843 
844  U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
845  --p;
846  }
847 
848  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
849  UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
850  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
851  uint32_t uc = *p;
852  CP32 c = uc;
853  ++p;
854  if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
855  if constexpr (isMultiPass) {
856  return {c, 1, true, p0, p};
857  } else {
858  return {c, 1, true};
859  }
860  } else {
861  if constexpr (isMultiPass) {
862  return {sub(uc < 0xe000, c), 1, false, p0, p};
863  } else {
864  return {sub(uc < 0xe000, c), 1, false};
865  }
866  }
867  }
868 
869  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
870  UnitIter p0 = p;
871  uint32_t uc = *--p;
872  CP32 c = uc;
873  if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
874  return {c, 1, true, p, p0};
875  } else {
876  return {sub(uc < 0xe000, c), 1, false, p, p0};
877  }
878  }
879 };
880 
881 // Non-validating implementations ------------------------------------------ ***
882 
// Non-validating decoder, declared here and defined only through partial specializations
// that are selected by the size of the code unit type (1, 2, or 4 bytes).
// The unnamed last parameter is the SFINAE slot used for that selection.
template<typename CP32,
         typename UnitIter,
         typename = void>
class UnsafeUTFImpl;
885 
886 // UTF-8
887 template<typename CP32, typename UnitIter>
888 class UnsafeUTFImpl<
889  CP32,
890  UnitIter,
891  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
892  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
893 public:
894  U_FORCE_INLINE static void inc(UnitIter &p) {
895  // Very similar to U8_FWD_1_UNSAFE().
896  uint8_t b = *p;
897  std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
898  }
899 
900  U_FORCE_INLINE static void dec(UnitIter &p) {
901  // Very similar to U8_BACK_1_UNSAFE().
902  while (U8_IS_TRAIL(*--p)) {}
903  }
904 
905  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
906  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
907  // Very similar to U8_NEXT_UNSAFE().
908  CP32 c = uint8_t(*p);
909  ++p;
910  if (U8_IS_SINGLE(c)) {
911  if constexpr (isMultiPass) {
912  return {c, 1, p0, p};
913  } else {
914  return {c, 1};
915  }
916  } else if (c < 0xe0) {
917  c = ((c & 0x1f) << 6) | (*p & 0x3f);
918  ++p;
919  if constexpr (isMultiPass) {
920  return {c, 2, p0, p};
921  } else {
922  return {c, 2};
923  }
924  } else if (c < 0xf0) {
925  // No need for (c&0xf) because the upper bits are truncated
926  // after <<12 in the cast to uint16_t.
927  c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
928  ++p;
929  c |= *p & 0x3f;
930  ++p;
931  if constexpr (isMultiPass) {
932  return {c, 3, p0, p};
933  } else {
934  return {c, 3};
935  }
936  } else {
937  c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
938  ++p;
939  c |= (*p & 0x3f) << 6;
940  ++p;
941  c |= *p & 0x3f;
942  ++p;
943  if constexpr (isMultiPass) {
944  return {c, 4, p0, p};
945  } else {
946  return {c, 4};
947  }
948  }
949  }
950 
951  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
952  // Very similar to U8_PREV_UNSAFE().
953  UnitIter p0 = p;
954  CP32 c = uint8_t(*--p);
955  if (U8_IS_SINGLE(c)) {
956  return {c, 1, p, p0};
957  }
958  // U8_IS_TRAIL(c) if well-formed
959  c &= 0x3f;
960  uint8_t count = 1;
961  for (uint8_t shift = 6;;) {
962  uint8_t b = *--p;
963  if (b >= 0xc0) {
964  U8_MASK_LEAD_BYTE(b, count);
965  c |= uint32_t{b} << shift;
966  break;
967  } else {
968  c |= (uint32_t{b} & 0x3f) << shift;
969  ++count;
970  shift += 6;
971  }
972  }
973  ++count;
974  return {c, count, p, p0};
975  }
976 };
977 
978 // UTF-16
979 template<typename CP32, typename UnitIter>
980 class UnsafeUTFImpl<
981  CP32,
982  UnitIter,
983  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
984  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
985 public:
986  U_FORCE_INLINE static void inc(UnitIter &p) {
987  // Very similar to U16_FWD_1_UNSAFE().
988  auto c = *p;
989  ++p;
990  if (U16_IS_LEAD(c)) {
991  ++p;
992  }
993  }
994 
995  U_FORCE_INLINE static void dec(UnitIter &p) {
996  // Very similar to U16_BACK_1_UNSAFE().
997  if (U16_IS_TRAIL(*--p)) {
998  --p;
999  }
1000  }
1001 
1002  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1003  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1004  // Very similar to U16_NEXT_UNSAFE().
1005  CP32 c = static_cast<CP32>(*p);
1006  ++p;
1007  if (!U16_IS_LEAD(c)) {
1008  if constexpr (isMultiPass) {
1009  return {c, 1, p0, p};
1010  } else {
1011  return {c, 1};
1012  }
1013  } else {
1014  uint16_t c2 = *p;
1015  ++p;
1016  c = U16_GET_SUPPLEMENTARY(c, c2);
1017  if constexpr (isMultiPass) {
1018  return {c, 2, p0, p};
1019  } else {
1020  return {c, 2};
1021  }
1022  }
1023  }
1024 
1025  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1026  // Very similar to U16_PREV_UNSAFE().
1027  UnitIter p0 = p;
1028  CP32 c = static_cast<CP32>(*--p);
1029  if (!U16_IS_TRAIL(c)) {
1030  return {c, 1, p, p0};
1031  } else {
1032  uint16_t c2 = *--p;
1033  c = U16_GET_SUPPLEMENTARY(c2, c);
1034  return {c, 2, p, p0};
1035  }
1036  }
1037 };
1038 
1039 // UTF-32: trivial
1040 template<typename CP32, typename UnitIter>
1041 class UnsafeUTFImpl<
1042  CP32,
1043  UnitIter,
1044  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1045  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1046 public:
1047  U_FORCE_INLINE static void inc(UnitIter &p) {
1048  ++p;
1049  }
1050 
1051  U_FORCE_INLINE static void dec(UnitIter &p) {
1052  --p;
1053  }
1054 
1055  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1056  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1057  CP32 c = *p;
1058  ++p;
1059  if constexpr (isMultiPass) {
1060  return {c, 1, p0, p};
1061  } else {
1062  return {c, 1};
1063  }
1064  }
1065 
1066  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1067  UnitIter p0 = p;
1068  CP32 c = *--p;
1069  return {c, 1, p, p0};
1070  }
1071 };
1072 
1073 #endif
1074 
1075 // Validating iterators ---------------------------------------------------- ***
1076 
1100 template<typename CP32, UTFIllFormedBehavior behavior,
1101  typename UnitIter, typename LimitIter = UnitIter, typename = void>
1103  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1104  using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1105 
1106  // Proxy type for operator->() (required by LegacyInputIterator)
1107  // so that we don't promise always returning CodeUnits.
1108  class Proxy {
1109  public:
1110  explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1111  CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1112  CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1113  private:
1115  };
1116 
1117 public:
1123  using pointer = Proxy;
1127  using iterator_category = std::conditional_t<
1128  prv::bidirectional_iterator<UnitIter>,
1129  std::bidirectional_iterator_tag,
1130  std::forward_iterator_tag>;
1131 
1145  U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1146  p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1158  U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1159  p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1171  U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
1177  U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1178 
1180  U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1182  U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1183 
1185  U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1188 
1194  U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1195  return getLogicalPosition() == other.getLogicalPosition();
1196  }
1202  U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1203 
1204  // Asymmetric equality & nonequality with a sentinel type.
1205 
1212  template<typename Sentinel> U_FORCE_INLINE friend
1213  std::enable_if_t<
1214  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1215  bool>
1216  operator==(const UTFIterator &iter, const Sentinel &s) {
1217  return iter.getLogicalPosition() == s;
1218  }
1219 
1220 #if U_CPLUSPLUS_VERSION < 20
1221  // C++17: Need to define all four combinations of == / != vs. parameter order.
1222  // Once we require C++20, we could remove all but the first == because
1223  // the compiler would generate the rest.
1224 
1231  template<typename Sentinel> U_FORCE_INLINE friend
1232  std::enable_if_t<
1233  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1234  bool>
1235  operator==(const Sentinel &s, const UTFIterator &iter) {
1236  return iter.getLogicalPosition() == s;
1237  }
1244  template<typename Sentinel> U_FORCE_INLINE friend
1245  std::enable_if_t<
1246  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1247  bool>
1248  operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1255  template<typename Sentinel> U_FORCE_INLINE friend
1256  std::enable_if_t<
1257  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1258  bool>
1259  operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1260 #endif // C++17
1261 
1269  if (state_ == 0) {
1270  UnitIter p0 = p_;
1271  units_ = Impl::readAndInc(p0, p_, limit_);
1272  state_ = 1;
1273  }
1274  return units_;
1275  }
1276 
1285  U_FORCE_INLINE Proxy operator->() const {
1286  if (state_ == 0) {
1287  UnitIter p0 = p_;
1288  units_ = Impl::readAndInc(p0, p_, limit_);
1289  state_ = 1;
1290  }
1291  return Proxy(units_);
1292  }
1293 
1300  U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1301  if (state_ > 0) {
1302  // operator*() called readAndInc() so p_ is already ahead.
1303  state_ = 0;
1304  } else if (state_ == 0) {
1305  Impl::inc(p_, limit_);
1306  } else /* state_ < 0 */ {
1307  // operator--() called decAndRead() so we know how far to skip.
1308  p_ = units_.end();
1309  state_ = 0;
1310  }
1311  return *this;
1312  }
1313 
1322  U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
1323  if (state_ > 0) {
1324  // operator*() called readAndInc() so p_ is already ahead.
1325  UTFIterator result(*this);
1326  state_ = 0;
1327  return result;
1328  } else if (state_ == 0) {
1329  UnitIter p0 = p_;
1330  units_ = Impl::readAndInc(p0, p_, limit_);
1331  UTFIterator result(*this);
1332  result.state_ = 1;
1333  // keep this->state_ == 0
1334  return result;
1335  } else /* state_ < 0 */ {
1336  UTFIterator result(*this);
1337  // operator--() called decAndRead() so we know how far to skip.
1338  p_ = units_.end();
1339  state_ = 0;
1340  return result;
1341  }
1342  }
1343 
1351  template<typename Iter = UnitIter>
1353  std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1354  operator--() { // pre-decrement
1355  if (state_ > 0) {
1356  // operator*() called readAndInc() so p_ is ahead of the logical position.
1357  p_ = units_.begin();
1358  }
1359  units_ = Impl::decAndRead(start_, p_);
1360  state_ = -1;
1361  return *this;
1362  }
1363 
1371  template<typename Iter = UnitIter>
1373  std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1374  operator--(int) { // post-decrement
1375  UTFIterator result(*this);
1376  operator--();
1377  return result;
1378  }
1379 
1380 private:
1381  friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1382 
1383  U_FORCE_INLINE UnitIter getLogicalPosition() const {
1384  return state_ <= 0 ? p_ : units_.begin();
1385  }
1386 
1387  // operator*() etc. are logically const.
1388  mutable UnitIter p_;
1389  // In a validating iterator, we need start_ & limit_ so that when we read a code point
1390  // (forward or backward) we can test if there are enough code units.
1391  UnitIter start_;
1392  LimitIter limit_;
1393  // Keep state so that we call readAndInc() only once for both operator*() and ++
1394  // to make it easy for the compiler to optimize.
1395  mutable CodeUnits<CP32, UnitIter> units_;
1396  // >0: units_ = readAndInc(), p_ = units limit
1397  // which means that p_ is ahead of its logical position
1398  // 0: initial state
1399  // <0: units_ = decAndRead(), p_ = units start
1400  mutable int8_t state_ = 0;
1401 };
1402 
1403 #ifndef U_IN_DOXYGEN
1404 // Partial template specialization for single-pass input iterator.
1405 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1406 class UTFIterator<
1407  CP32, behavior,
1408  UnitIter, LimitIter,
1409  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1410  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1411  using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1412 
1413  // Proxy type for post-increment return value, to make *iter++ work.
1414  // Also for operator->() (required by LegacyInputIterator)
1415  // so that we don't promise always returning CodeUnits.
1416  class Proxy {
1417  public:
1418  explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1419  CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1420  CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1421  private:
1422  CodeUnits<CP32, UnitIter> units_;
1423  };
1424 
1425 public:
1426  using value_type = CodeUnits<CP32, UnitIter>;
1427  using reference = value_type;
1428  using pointer = Proxy;
1429  using difference_type = prv::iter_difference_t<UnitIter>;
1430  using iterator_category = std::input_iterator_tag;
1431 
1432  U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1433 
1434  // Constructs an iterator start or limit sentinel.
1435  // Requires p to be copyable.
1436  U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1437 
1438  U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1439  U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1440 
1441  U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1442  U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1443 
1444  U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1445  return p_ == other.p_ && ahead_ == other.ahead_;
1446  // Strictly speaking, we should check if the logical position is the same.
1447  // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1448  }
1449  U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1450 
1451  template<typename Sentinel> U_FORCE_INLINE friend
1452  std::enable_if_t<
1453  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1454  bool>
1455  operator==(const UTFIterator &iter, const Sentinel &s) {
1456  return !iter.ahead_ && iter.p_ == s;
1457  }
1458 
1459 #if U_CPLUSPLUS_VERSION < 20
1460  template<typename Sentinel> U_FORCE_INLINE friend
1461  std::enable_if_t<
1462  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1463  bool>
1464  operator==(const Sentinel &s, const UTFIterator &iter) {
1465  return !iter.ahead_ && iter.p_ == s;
1466  }
1467 
1468  template<typename Sentinel> U_FORCE_INLINE friend
1469  std::enable_if_t<
1470  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1471  bool>
1472  operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1473 
1474  template<typename Sentinel> U_FORCE_INLINE friend
1475  std::enable_if_t<
1476  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1477  bool>
1478  operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1479 #endif // C++17
1480 
1481  U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1482  if (!ahead_) {
1483  units_ = Impl::readAndInc(p_, p_, limit_);
1484  ahead_ = true;
1485  }
1486  return units_;
1487  }
1488 
1489  U_FORCE_INLINE Proxy operator->() const {
1490  if (!ahead_) {
1491  units_ = Impl::readAndInc(p_, p_, limit_);
1492  ahead_ = true;
1493  }
1494  return Proxy(units_);
1495  }
1496 
1497  U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1498  if (ahead_) {
1499  // operator*() called readAndInc() so p_ is already ahead.
1500  ahead_ = false;
1501  } else {
1502  Impl::inc(p_, limit_);
1503  }
1504  return *this;
1505  }
1506 
1507  U_FORCE_INLINE Proxy operator++(int) { // post-increment
1508  if (ahead_) {
1509  // operator*() called readAndInc() so p_ is already ahead.
1510  ahead_ = false;
1511  } else {
1512  units_ = Impl::readAndInc(p_, p_, limit_);
1513  // keep this->ahead_ == false
1514  }
1515  return Proxy(units_);
1516  }
1517 
1518 private:
1519  // operator*() etc. are logically const.
1520  mutable UnitIter p_;
1521  // In a validating iterator, we need limit_ so that when we read a code point
1522  // we can test if there are enough code units.
1523  LimitIter limit_;
1524  // Keep state so that we call readAndInc() only once for both operator*() and ++
1525  // so that we can use a single-pass input iterator for UnitIter.
1526  mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1527  // true: units_ = readAndInc(), p_ = units limit
1528  // which means that p_ is ahead of its logical position
1529  // false: initial state
1530  mutable bool ahead_ = false;
1531 };
1532 #endif // U_IN_DOXYGEN
1533 
1534 } // namespace U_HEADER_ONLY_NAMESPACE
1535 
1536 #ifndef U_IN_DOXYGEN
1537 // Bespoke specialization of reverse_iterator.
1538 // The default implementation implements reverse operator*() and ++ in a way
1539 // that does most of the same work twice for reading variable-length sequences.
1540 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1541 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1542  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1543  using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1545 
1546  // Proxy type for operator->() (required by LegacyInputIterator)
1547  // so that we don't promise always returning CodeUnits.
1548  class Proxy {
1549  public:
1550  explicit Proxy(CodeUnits_ units) : units_(units) {}
1551  CodeUnits_ &operator*() { return units_; }
1552  CodeUnits_ *operator->() { return &units_; }
1553  private:
1554  CodeUnits_ units_;
1555  };
1556 
1557 public:
1558  using value_type = CodeUnits_;
1559  using reference = value_type;
1560  using pointer = Proxy;
1562  using iterator_category = std::bidirectional_iterator_tag;
1563 
1565  p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1566  units_(0, 0, false, p_, p_) {}
1567  U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1568 
1569  U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1570  U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1571 
1572  U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1573  U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1574 
1575  U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1576  return getLogicalPosition() == other.getLogicalPosition();
1577  }
1578  U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1579 
1580  U_FORCE_INLINE CodeUnits_ operator*() const {
1581  if (state_ == 0) {
1582  units_ = Impl::decAndRead(start_, p_);
1583  state_ = -1;
1584  }
1585  return units_;
1586  }
1587 
1588  U_FORCE_INLINE Proxy operator->() const {
1589  if (state_ == 0) {
1590  units_ = Impl::decAndRead(start_, p_);
1591  state_ = -1;
1592  }
1593  return Proxy(units_);
1594  }
1595 
1596  U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
1597  if (state_ < 0) {
1598  // operator*() called decAndRead() so p_ is already behind.
1599  state_ = 0;
1600  } else if (state_ == 0) {
1601  Impl::dec(start_, p_);
1602  } else /* state_ > 0 */ {
1603  // operator--() called readAndInc() so we know how far to skip.
1604  p_ = units_.begin();
1605  state_ = 0;
1606  }
1607  return *this;
1608  }
1609 
1610  U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
1611  if (state_ < 0) {
1612  // operator*() called decAndRead() so p_ is already behind.
1613  reverse_iterator result(*this);
1614  state_ = 0;
1615  return result;
1616  } else if (state_ == 0) {
1617  units_ = Impl::decAndRead(start_, p_);
1618  reverse_iterator result(*this);
1619  result.state_ = -1;
1620  // keep this->state_ == 0
1621  return result;
1622  } else /* state_ > 0 */ {
1623  reverse_iterator result(*this);
1624  // operator--() called readAndInc() so we know how far to skip.
1625  p_ = units_.begin();
1626  state_ = 0;
1627  return result;
1628  }
1629  }
1630 
1631  U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
1632  if (state_ < 0) {
1633  // operator*() called decAndRead() so p_ is behind the logical position.
1634  p_ = units_.end();
1635  }
1636  UnitIter p0 = p_;
1637  units_ = Impl::readAndInc(p0, p_, limit_);
1638  state_ = 1;
1639  return *this;
1640  }
1641 
1642  U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
1643  reverse_iterator result(*this);
1644  operator--();
1645  return result;
1646  }
1647 
1648 private:
1649  U_FORCE_INLINE UnitIter getLogicalPosition() const {
1650  return state_ >= 0 ? p_ : units_.end();
1651  }
1652 
1653  // operator*() etc. are logically const.
1654  mutable UnitIter p_;
1655  // In a validating iterator, we need start_ & limit_ so that when we read a code point
1656  // (forward or backward) we can test if there are enough code units.
1657  UnitIter start_;
1658  UnitIter limit_;
1659  // Keep state so that we call decAndRead() only once for both operator*() and ++
1660  // to make it easy for the compiler to optimize.
1661  mutable CodeUnits_ units_;
1662  // >0: units_ = readAndInc(), p_ = units limit
1663  // 0: initial state
1664  // <0: units_ = decAndRead(), p_ = units start
1665  // which means that p_ is behind its logical position
1666  mutable int8_t state_ = 0;
1667 };
1668 #endif // U_IN_DOXYGEN
1669 
1670 namespace U_HEADER_ONLY_NAMESPACE {
1671 
1694 template<typename CP32, UTFIllFormedBehavior behavior,
1695  typename UnitIter, typename LimitIter = UnitIter>
1696 auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1698  std::move(start), std::move(p), std::move(limit));
1699 }
1700 
1721 template<typename CP32, UTFIllFormedBehavior behavior,
1722  typename UnitIter, typename LimitIter = UnitIter>
1723 auto utfIterator(UnitIter p, LimitIter limit) {
1725  std::move(p), std::move(limit));
1726 }
1727 
1728 // Note: We should only enable the following factory function for a copyable UnitIter.
1729 // In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1730 // but a function template partial specialization is not allowed.
1731 // In C++20, we might be able to require the std::copyable concept.
1732 
1752 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1753 auto utfIterator(UnitIter p) {
1754  return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1755 }
1756 
1784 template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1786  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1787 public:
1792  UTFStringCodePoints() = default;
1793 
1799  template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1800  explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
1809  template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1810  explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1811 
1813  UTFStringCodePoints(const UTFStringCodePoints &other) = default;
1814 
1817 
1822  auto begin() {
1823  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1824  }
1825 
1830  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1831  auto begin() const {
1832  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1833  }
1834 
1839  auto end() {
1840  using UnitIter = decltype(unitRange.begin());
1841  using LimitIter = decltype(unitRange.end());
1842  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1843  // Return the code unit sentinel.
1844  return unitRange.end();
1845  } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1846  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1847  } else {
1848  // The input iterator specialization has no three-argument constructor.
1849  return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1850  }
1851  }
1852 
1857  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1858  auto end() const {
1859  using UnitIter = decltype(unitRange.begin());
1860  using LimitIter = decltype(unitRange.end());
1861  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1862  // Return the code unit sentinel.
1863  return unitRange.end();
1864  } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1865  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1866  } else {
1867  // The input iterator specialization has no three-argument constructor.
1868  return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1869  }
1870  }
1871 
1876  auto rbegin() const {
1877  return std::make_reverse_iterator(end());
1878  }
1879 
1884  auto rend() const {
1885  return std::make_reverse_iterator(begin());
1886  }
1887 
1888 private:
1889  Range unitRange;
1890 };
1891 
1893 template<typename CP32, UTFIllFormedBehavior behavior>
1895 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
1896  __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1897  : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1898 #endif
1899 {
1901  template<typename Range>
1902  auto operator()(Range &&unitRange) const {
1903 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
1905  std::forward<Range>(unitRange));
1906 #else
1907  if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
1908  // Take basic_string_view by copy, not by reference. In C++20 this is handled by
1909  // all_t<Range>, which is Range if Range is a view.
1911  std::forward<Range>(unitRange));
1912  } else {
1913  return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1914  }
1915 #endif
1916  }
1917 };
1918 
1933 template<typename CP32, UTFIllFormedBehavior behavior>
1935 
1936 // Non-validating iterators ------------------------------------------------ ***
1937 
1959 template<typename CP32, typename UnitIter, typename = void>
1961  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1962  using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1963 
1964  // Proxy type for operator->() (required by LegacyInputIterator)
1965  // so that we don't promise always returning UnsafeCodeUnits.
1966  class Proxy {
1967  public:
1968  explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1969  UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
1970  UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1971  private:
1973  };
1974 
1975 public:
1981  using pointer = Proxy;
1985  using iterator_category = std::conditional_t<
1986  prv::bidirectional_iterator<UnitIter>,
1987  std::bidirectional_iterator_tag,
1988  std::forward_iterator_tag>;
1989 
1999  U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
2005  U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
2006 
2011 
2016 
2022  U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2023  return getLogicalPosition() == other.getLogicalPosition();
2024  }
2030  U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2031 
2038  template<typename Sentinel> U_FORCE_INLINE friend
2039  std::enable_if_t<
2040  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2041  bool>
2042  operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2043  return iter.getLogicalPosition() == s;
2044  }
2045 
2046 #if U_CPLUSPLUS_VERSION < 20
2053  template<typename Sentinel> U_FORCE_INLINE friend
2054  std::enable_if_t<
2055  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2056  bool>
2057  operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2058  return iter.getLogicalPosition() == s;
2059  }
2066  template<typename Sentinel> U_FORCE_INLINE friend
2067  std::enable_if_t<
2068  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2069  bool>
2070  operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2077  template<typename Sentinel> U_FORCE_INLINE friend
2078  std::enable_if_t<
2079  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2080  bool>
2081  operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2082 #endif // C++17
2083 
2091  if (state_ == 0) {
2092  UnitIter p0 = p_;
2093  units_ = Impl::readAndInc(p0, p_);
2094  state_ = 1;
2095  }
2096  return units_;
2097  }
2098 
2107  U_FORCE_INLINE Proxy operator->() const {
2108  if (state_ == 0) {
2109  UnitIter p0 = p_;
2110  units_ = Impl::readAndInc(p0, p_);
2111  state_ = 1;
2112  }
2113  return Proxy(units_);
2114  }
2115 
2123  if (state_ > 0) {
2124  // operator*() called readAndInc() so p_ is already ahead.
2125  state_ = 0;
2126  } else if (state_ == 0) {
2127  Impl::inc(p_);
2128  } else /* state_ < 0 */ {
2129  // operator--() called decAndRead() so we know how far to skip.
2130  p_ = units_.end();
2131  state_ = 0;
2132  }
2133  return *this;
2134  }
2135 
2145  if (state_ > 0) {
2146  // operator*() called readAndInc() so p_ is already ahead.
2147  UnsafeUTFIterator result(*this);
2148  state_ = 0;
2149  return result;
2150  } else if (state_ == 0) {
2151  UnitIter p0 = p_;
2152  units_ = Impl::readAndInc(p0, p_);
2153  UnsafeUTFIterator result(*this);
2154  result.state_ = 1;
2155  // keep this->state_ == 0
2156  return result;
2157  } else /* state_ < 0 */ {
2158  UnsafeUTFIterator result(*this);
2159  // operator--() called decAndRead() so we know how far to skip.
2160  p_ = units_.end();
2161  state_ = 0;
2162  return result;
2163  }
2164  }
2165 
2173  template<typename Iter = UnitIter>
2175  std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2176  operator--() { // pre-decrement
2177  if (state_ > 0) {
2178  // operator*() called readAndInc() so p_ is ahead of the logical position.
2179  p_ = units_.begin();
2180  }
2181  units_ = Impl::decAndRead(p_);
2182  state_ = -1;
2183  return *this;
2184  }
2185 
2193  template<typename Iter = UnitIter>
2195  std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2196  operator--(int) { // post-decrement
2197  UnsafeUTFIterator result(*this);
2198  operator--();
2199  return result;
2200  }
2201 
2202 private:
2203  friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2204 
2205  U_FORCE_INLINE UnitIter getLogicalPosition() const {
2206  return state_ <= 0 ? p_ : units_.begin();
2207  }
2208 
2209  // operator*() etc. are logically const.
2210  mutable UnitIter p_;
2211  // Keep state so that we call readAndInc() only once for both operator*() and ++
2212  // to make it easy for the compiler to optimize.
2213  mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2214  // >0: units_ = readAndInc(), p_ = units limit
2215  // which means that p_ is ahead of its logical position
2216  // 0: initial state
2217  // <0: units_ = decAndRead(), p_ = units start
2218  mutable int8_t state_ = 0;
2219 };
2220 
2221 #ifndef U_IN_DOXYGEN
2222 // Partial template specialization for single-pass input iterator.
2223 template<typename CP32, typename UnitIter>
2224 class UnsafeUTFIterator<
2225  CP32,
2226  UnitIter,
2227  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2228  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2229  using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2230 
2231  // Proxy type for post-increment return value, to make *iter++ work.
2232  // Also for operator->() (required by LegacyInputIterator)
2233  // so that we don't promise always returning UnsafeCodeUnits.
2234  class Proxy {
2235  public:
2236  explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2237  UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2238  UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2239  private:
2240  UnsafeCodeUnits<CP32, UnitIter> units_;
2241  };
2242 
2243 public:
2244  using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2245  using reference = value_type;
2246  using pointer = Proxy;
2247  using difference_type = prv::iter_difference_t<UnitIter>;
2248  using iterator_category = std::input_iterator_tag;
2249 
2250  U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2251 
2252  U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2253  U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
2254 
2255  U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2257 
2258  U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2259  return p_ == other.p_ && ahead_ == other.ahead_;
2260  // Strictly speaking, we should check if the logical position is the same.
2261  // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2262  }
2263  U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2264 
2265  template<typename Sentinel> U_FORCE_INLINE friend
2266  std::enable_if_t<
2267  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2268  bool>
2269  operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2270  return !iter.ahead_ && iter.p_ == s;
2271  }
2272 
2273 #if U_CPLUSPLUS_VERSION < 20
2274  template<typename Sentinel> U_FORCE_INLINE friend
2275  std::enable_if_t<
2276  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2277  bool>
2278  operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2279  return !iter.ahead_ && iter.p_ == s;
2280  }
2281 
2282  template<typename Sentinel> U_FORCE_INLINE friend
2283  std::enable_if_t<
2284  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2285  bool>
2286  operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2287 
2288  template<typename Sentinel> U_FORCE_INLINE friend
2289  std::enable_if_t<
2290  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2291  bool>
2292  operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2293 #endif // C++17
2294 
2295  U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2296  if (!ahead_) {
2297  units_ = Impl::readAndInc(p_, p_);
2298  ahead_ = true;
2299  }
2300  return units_;
2301  }
2302 
2303  U_FORCE_INLINE Proxy operator->() const {
2304  if (!ahead_) {
2305  units_ = Impl::readAndInc(p_, p_);
2306  ahead_ = true;
2307  }
2308  return Proxy(units_);
2309  }
2310 
2311  U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2312  if (ahead_) {
2313  // operator*() called readAndInc() so p_ is already ahead.
2314  ahead_ = false;
2315  } else {
2316  Impl::inc(p_);
2317  }
2318  return *this;
2319  }
2320 
2321  U_FORCE_INLINE Proxy operator++(int) { // post-increment
2322  if (ahead_) {
2323  // operator*() called readAndInc() so p_ is already ahead.
2324  ahead_ = false;
2325  } else {
2326  units_ = Impl::readAndInc(p_, p_);
2327  // keep this->ahead_ == false
2328  }
2329  return Proxy(units_);
2330  }
2331 
2332 private:
2333  // operator*() etc. are logically const.
2334  mutable UnitIter p_;
2335  // Keep state so that we call readAndInc() only once for both operator*() and ++
2336  // so that we can use a single-pass input iterator for UnitIter.
2337  mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2338  // true: units_ = readAndInc(), p_ = units limit
2339  // which means that p_ is ahead of its logical position
2340  // false: initial state
2341  mutable bool ahead_ = false;
2342 };
2343 #endif // U_IN_DOXYGEN
2344 
2345 } // namespace U_HEADER_ONLY_NAMESPACE
2346 
2347 #ifndef U_IN_DOXYGEN
2348 // Bespoke specialization of reverse_iterator.
2349 // The default implementation implements reverse operator*() and ++ in a way
2350 // that does most of the same work twice for reading variable-length sequences.
2351 template<typename CP32, typename UnitIter>
2352 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2353  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2354  using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2356 
2357  // Proxy type for operator->() (required by LegacyInputIterator)
2358  // so that we don't promise always returning UnsafeCodeUnits.
2359  class Proxy {
2360  public:
2361  explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2362  UnsafeCodeUnits_ &operator*() { return units_; }
2363  UnsafeCodeUnits_ *operator->() { return &units_; }
2364  private:
2365  UnsafeCodeUnits_ units_;
2366  };
2367 
2368 public:
2369  using value_type = UnsafeCodeUnits_;
2370  using reference = value_type;
2371  using pointer = Proxy;
2373  using iterator_category = std::bidirectional_iterator_tag;
2374 
2376  p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
2377  U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2378 
2379  U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2380  U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2381 
2382  U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2383  U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2384 
2385  U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2386  return getLogicalPosition() == other.getLogicalPosition();
2387  }
2388  U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2389 
2390  U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2391  if (state_ == 0) {
2392  units_ = Impl::decAndRead(p_);
2393  state_ = -1;
2394  }
2395  return units_;
2396  }
2397 
2398  U_FORCE_INLINE Proxy operator->() const {
2399  if (state_ == 0) {
2400  units_ = Impl::decAndRead(p_);
2401  state_ = -1;
2402  }
2403  return Proxy(units_);
2404  }
2405 
2406  U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
2407  if (state_ < 0) {
2408  // operator*() called decAndRead() so p_ is already behind.
2409  state_ = 0;
2410  } else if (state_ == 0) {
2411  Impl::dec(p_);
2412  } else /* state_ > 0 */ {
2413  // operator--() called readAndInc() so we know how far to skip.
2414  p_ = units_.begin();
2415  state_ = 0;
2416  }
2417  return *this;
2418  }
2419 
2420  U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
2421  if (state_ < 0) {
2422  // operator*() called decAndRead() so p_ is already behind.
2423  reverse_iterator result(*this);
2424  state_ = 0;
2425  return result;
2426  } else if (state_ == 0) {
2427  units_ = Impl::decAndRead(p_);
2428  reverse_iterator result(*this);
2429  result.state_ = -1;
2430  // keep this->state_ == 0
2431  return result;
2432  } else /* state_ > 0 */ {
2433  reverse_iterator result(*this);
2434  // operator--() called readAndInc() so we know how far to skip.
2435  p_ = units_.begin();
2436  state_ = 0;
2437  return result;
2438  }
2439  }
2440 
2441  U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
2442  if (state_ < 0) {
2443  // operator*() called decAndRead() so p_ is behind the logical position.
2444  p_ = units_.end();
2445  }
2446  UnitIter p0 = p_;
2447  units_ = Impl::readAndInc(p0, p_);
2448  state_ = 1;
2449  return *this;
2450  }
2451 
2452  U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
2453  reverse_iterator result(*this);
2454  operator--();
2455  return result;
2456  }
2457 
2458 private:
2459  U_FORCE_INLINE UnitIter getLogicalPosition() const {
2460  return state_ >= 0 ? p_ : units_.end();
2461  }
2462 
2463  // operator*() etc. are logically const.
2464  mutable UnitIter p_;
2465  // Keep state so that we call decAndRead() only once for both operator*() and ++
2466  // to make it easy for the compiler to optimize.
2467  mutable UnsafeCodeUnits_ units_;
2468  // >0: units_ = readAndInc(), p_ = units limit
2469  // 0: initial state
2470  // <0: units_ = decAndRead(), p_ = units start
2471  // which means that p_ is behind its logical position
2472  mutable int8_t state_ = 0;
2473 };
2474 #endif // U_IN_DOXYGEN
2475 
2476 namespace U_HEADER_ONLY_NAMESPACE {
2477 
2493 template<typename CP32, typename UnitIter>
2494 auto unsafeUTFIterator(UnitIter iter) {
2495  return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2496 }
2497 
2525 template<typename CP32, typename Range>
2527  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2528 public:
2534 
2540  template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2541  explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
2550  template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2551  explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2552 
2555 
2558 
2563  auto begin() {
2564  return unsafeUTFIterator<CP32>(unitRange.begin());
2565  }
2566 
2571  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2572  auto begin() const {
2573  return unsafeUTFIterator<CP32>(unitRange.begin());
2574  }
2575 
2580  auto end() {
2581  using UnitIter = decltype(unitRange.begin());
2582  using LimitIter = decltype(unitRange.end());
2583  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2584  // Return the code unit sentinel.
2585  return unitRange.end();
2586  } else {
2587  return unsafeUTFIterator<CP32>(unitRange.end());
2588  }
2589  }
2590 
2595  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2596  auto end() const {
2597  using UnitIter = decltype(unitRange.begin());
2598  using LimitIter = decltype(unitRange.end());
2599  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2600  // Return the code unit sentinel.
2601  return unitRange.end();
2602  } else {
2603  return unsafeUTFIterator<CP32>(unitRange.end());
2604  }
2605  }
2606 
2611  auto rbegin() const {
2612  return std::make_reverse_iterator(end());
2613  }
2614 
2619  auto rend() const {
2620  return std::make_reverse_iterator(begin());
2621  }
2622 
2623 private:
2624  Range unitRange;
2625 };
2626 
2628 template<typename CP32>
2630 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
2631  __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2632  : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2633 #endif
2634 {
2636  template<typename Range>
2637  auto operator()(Range &&unitRange) const {
2638 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
2639  return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2640 #else
2641  if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
2642  // Take basic_string_view by copy, not by reference. In C++20 this is handled by
2643  // all_t<Range>, which is Range if Range is a view.
2644  return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
2645  } else {
2646  return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2647  }
2648 #endif
2649  }
2650 };
2651 
2652 
2665 template<typename CP32>
2667 
2668 } // namespace U_HEADER_ONLY_NAMESPACE
2669 
2670 
2671 #if defined(__cpp_lib_ranges)
2672 template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
2673 constexpr bool std::ranges::enable_borrowed_range<
2675  std::ranges::enable_borrowed_range<Range>;
2676 
2677 template <typename CP32, typename Range>
2678 constexpr bool std::ranges::enable_borrowed_range<
2680  std::ranges::enable_borrowed_range<Range>;
2681 #endif
2682 
2683 #endif // U_HIDE_DRAFT_API
2684 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2685 #endif // __UTFITERATOR_H__
A C++ "range" over all Unicode code points U+0000..U+10FFFF.
Definition: utfiterator.h:302
A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
Definition: utfiterator.h:333
Result of validating and decoding a code unit sequence for one code point.
Definition: utfiterator.h:487
CodeUnits & operator=(const CodeUnits &other)=default
Copy assignment operator.
CodeUnits(const CodeUnits &other)=default
Copy constructor.
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit)
Definition: utfiterator.h:490
Validating iterator over the code points in a Unicode string.
Definition: utfiterator.h:1102
U_FORCE_INLINE UTFIterator()
Default constructor.
Definition: utfiterator.h:1177
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UTFIterator &iter)
Definition: utfiterator.h:1235
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator > operator--(int)
Post-decrement operator.
Definition: utfiterator.h:1374
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:1285
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:1121
U_FORCE_INLINE UTFIterator & operator=(UTFIterator &&src) noexcept=default
Move assignment operator.
CodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
Definition: utfiterator.h:1119
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:1248
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:1130
U_FORCE_INLINE UTFIterator(UnitIter p)
Constructs an iterator start or limit sentinel.
Definition: utfiterator.h:1171
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UTFIterator &iter)
Definition: utfiterator.h:1259
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:1216
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator & > operator--()
Pre-decrement operator.
Definition: utfiterator.h:1354
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const
Definition: utfiterator.h:1202
Proxy pointer
C++ iterator boilerplate.
Definition: utfiterator.h:1123
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit)
Constructor with start <= p < limit.
Definition: utfiterator.h:1145
U_FORCE_INLINE UTFIterator(const UTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE CodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:1268
U_FORCE_INLINE UTFIterator operator++(int)
Post-increment operator.
Definition: utfiterator.h:1322
U_FORCE_INLINE UTFIterator & operator++()
Pre-increment operator.
Definition: utfiterator.h:1300
U_FORCE_INLINE UTFIterator & operator=(const UTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE bool operator==(const UTFIterator &other) const
Definition: utfiterator.h:1194
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit)
Constructor with start == p < limit.
Definition: utfiterator.h:1158
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:1125
A C++ "range" for validating iteration over all of the code points of a code unit range.
Definition: utfiterator.h:1785
UTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UTFStringCodePoints & operator=(const UTFStringCodePoints &other)=default
Copy assignment operator.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
Definition: utfiterator.h:1800
UTFStringCodePoints(const UTFStringCodePoints &other)=default
Copy constructor.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
Definition: utfiterator.h:1810
Result of decoding a code unit sequence for one code point.
Definition: utfiterator.h:367
std::enable_if_t< std::is_pointer_v< Iter >||std::is_same_v< Iter, typename std::basic_string< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string< Unit >::const_iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::const_iterator >, std::basic_string_view< Unit > > stringView() const
Definition: utfiterator.h:432
UnsafeCodeUnits & operator=(const UnsafeCodeUnits &other)=default
Copy assignment operator.
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit)
Definition: utfiterator.h:372
UnsafeCodeUnits(const UnsafeCodeUnits &other)=default
Copy constructor.
Non-validating iterator over the code points in a Unicode string.
Definition: utfiterator.h:1960
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator & > operator--()
Pre-decrement operator.
Definition: utfiterator.h:2176
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UnsafeUTFIterator &iter)
Definition: utfiterator.h:2057
U_FORCE_INLINE UnsafeUTFIterator()
Default constructor.
Definition: utfiterator.h:2005
UnsafeCodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
Definition: utfiterator.h:1977
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const
Definition: utfiterator.h:2030
U_FORCE_INLINE UnsafeUTFIterator & operator=(const UnsafeUTFIterator &other)=default
Copy assignment operator.
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:1979
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UnsafeUTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:2070
U_FORCE_INLINE UnsafeCodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:2090
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:1988
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UnsafeUTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:2042
Proxy pointer
C++ iterator boilerplate.
Definition: utfiterator.h:1981
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:1983
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator > operator--(int)
Post-decrement operator.
Definition: utfiterator.h:2196
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:2107
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE UnsafeUTFIterator operator++(int)
Post-increment operator.
Definition: utfiterator.h:2144
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UnsafeUTFIterator & operator++()
Pre-increment operator.
Definition: utfiterator.h:2122
U_FORCE_INLINE UnsafeUTFIterator(UnitIter p)
Constructor; the iterator/pointer should be at a code point boundary.
Definition: utfiterator.h:1999
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UnsafeUTFIterator &iter)
Definition: utfiterator.h:2081
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const
Definition: utfiterator.h:2022
U_FORCE_INLINE UnsafeUTFIterator & operator=(UnsafeUTFIterator &&src) noexcept=default
Move assignment operator.
A C++ "range" for non-validating iteration over all of the code points of a code unit range.
Definition: utfiterator.h:2526
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
Definition: utfiterator.h:2551
UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other)=default
Copy constructor.
UnsafeUTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
Definition: utfiterator.h:2541
UnsafeUTFStringCodePoints & operator=(const UnsafeUTFStringCodePoints &other)=default
Copy assignment operator.
int32_t difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:258
bool operator==(const CodePointsIterator &other) const
Definition: utfiterator.h:265
bool operator!=(const CodePointsIterator &other) const
Definition: utfiterator.h:267
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:254
CP32 value_type
C++ iterator boilerplate.
Definition: utfiterator.h:252
std::forward_iterator_tag iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:260
CP32 * pointer
C++ iterator boilerplate.
Definition: utfiterator.h:256
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:346
#define U_CPLUSPLUS_VERSION
0 if no C++; 1, 11, 14, ...
Definition: platform.h:464
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32).
Definition: umachine.h:469
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
Definition: umachine.h:135
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
Definition: utf16.h:93
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
Definition: utf16.h:84
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
Definition: utf16.h:112
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
Definition: utf16.h:75
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
Definition: utf16.h:59
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
Definition: utf16.h:67
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
Definition: utf8.h:71
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
Definition: utf8.h:98
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
Definition: utf8.h:115
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
Definition: utf8.h:173
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
Definition: utf8.h:91
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
Definition: utf8.h:108
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
Definition: utf8.h:181
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
Definition: utf8.h:81
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
Definition: utf8.h:190
auto unsafeUTFIterator(UnitIter iter)
UnsafeUTFIterator factory function.
Definition: utfiterator.h:2494
typename std::iterator_traits< Iter >::difference_type iter_difference_t
Definition: utfiterator.h:203
constexpr bool is_basic_string_view_v
Definition: utfiterator.h:244
constexpr bool forward_iterator
Definition: utfiterator.h:207
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit)
UTFIterator factory function for start <= p < limit.
Definition: utfiterator.h:1696
constexpr UTFStringCodePointsAdaptor< CP32, behavior > utfStringCodePoints
Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code points in a code unit range.
Definition: utfiterator.h:1934
typename std::iterator_traits< Iter >::value_type iter_value_t
Definition: utfiterator.h:199
constexpr bool bidirectional_iterator
Definition: utfiterator.h:214
constexpr UnsafeUTFStringCodePointsAdaptor< CP32 > unsafeUTFStringCodePoints
Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range" of code points in a code unit range.
Definition: utfiterator.h:2666
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
Definition: utfiterator.h:149
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
Definition: utfiterator.h:159
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point, or U+FFFD if out of range.
Definition: utfiterator.h:167
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Definition: utfiterator.h:157
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.