ICU 78.1  78.1
utfiterator.h
Go to the documentation of this file.
1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: https://www.unicode.org/copyright.html
3 
4 // utfiterator.h
5 // created: 2024aug12 Markus W. Scherer
6 
7 #ifndef __UTFITERATOR_H__
8 #define __UTFITERATOR_H__
9 
10 #include "unicode/utypes.h"
11 
12 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13 
14 #include <iterator>
15 #if defined(__cpp_lib_ranges)
16 #include <ranges>
17 #endif
18 #include <string>
19 #include <string_view>
20 #include <type_traits>
21 #include "unicode/utf16.h"
22 #include "unicode/utf8.h"
23 #include "unicode/uversion.h"
24 
135 #ifndef U_HIDE_DRAFT_API
136 
149 typedef enum UTFIllFormedBehavior {
169 
170 namespace U_HEADER_ONLY_NAMESPACE {
171 
172 namespace prv {
#if U_CPLUSPLUS_VERSION >= 20

// C++20 and later: forward directly to the standard iterator/range vocabulary.

/** Value (code unit) type that Iter yields. */
template<typename Iter>
using iter_value_t = std::iter_value_t<Iter>;

/** Difference type of Iter. */
template<typename Iter>
using iter_difference_t = std::iter_difference_t<Iter>;

/** True if Iter models std::forward_iterator. */
template<typename Iter>
constexpr bool forward_iterator = std::forward_iterator<Iter>;

/** True if Iter models std::bidirectional_iterator. */
template<typename Iter>
constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;

/** True if Range models std::ranges::range. */
template<typename Range>
constexpr bool range = std::ranges::range<Range>;

#else

// Before C++20: same names, implemented with std::iterator_traits and
// a member begin()/end() detection idiom.

/** Value (code unit) type that Iter yields, from std::iterator_traits. */
template<typename Iter>
using iter_value_t = typename std::iterator_traits<Iter>::value_type;

/** Difference type of Iter, from std::iterator_traits. */
template<typename Iter>
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;

/** True if Iter's iterator_category is, or derives from, std::forward_iterator_tag. */
template<typename Iter>
constexpr bool forward_iterator =
    std::is_base_of_v<
        std::forward_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** True if Iter's iterator_category is, or derives from, std::bidirectional_iterator_tag. */
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of_v<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** Detection helper, default case: Range lacks a member begin() or end(). */
template<typename Range, typename = void>
struct range_type : std::false_type {};

/** Detection helper, selected when both range.begin() and range.end() are well-formed. */
template<typename Range>
struct range_type<
    Range,
    std::void_t<decltype(std::declval<Range>().begin()),
                decltype(std::declval<Range>().end())>> : std::true_type {};

/**
 * True if Range has member functions begin() and end().
 * Counterpart of the C++20 definition above so that both branches provide `range`.
 */
template<typename Range>
constexpr bool range = range_type<Range>::value;

#endif
235 
/**
 * Forward iterator over consecutive code point values.
 * Dereferencing yields the current value; incrementing moves to the next one.
 * If skipSurrogates is true, incrementing past 0xd7ff continues at 0xe000.
 *
 * @tparam CP32 32-bit code point type
 * @tparam skipSurrogates whether to skip the range 0xd800..0xdfff
 */
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    /** C++ iterator boilerplate */
    using value_type = CP32;
    /** C++ iterator boilerplate; values are returned by value, see operator*(). */
    using reference = CP32;
    /** C++ iterator boilerplate */
    using pointer = CP32 *;
    /** C++ iterator boilerplate */
    using difference_type = int32_t;
    /** C++ iterator boilerplate */
    using iterator_category = std::forward_iterator_tag;

    /** Starts the iteration at c. */
    inline CodePointsIterator(CP32 c) : c_(c) {}
    /** @return true if both iterators are at the same value */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    /** @return true if the iterators are at different values */
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    /** @return the current value */
    inline CP32 operator*() const { return c_; }
    /** Pre-increment: moves to the next value, optionally jumping over the surrogates. */
    inline CodePointsIterator &operator++() {
        ++c_;
        if (skipSurrogates && c_ == 0xd800) {
            c_ = 0xe000;
        }
        return *this;
    }
    /** Post-increment: moves to the next value and returns a copy of the previous state. */
    inline CodePointsIterator operator++(int) {
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};
278 
279 } // namespace prv
280 
// Range-like class template: begin()/end() return prv::CodePointsIterator<CP32, false>,
// that is, an iteration that does not skip the surrogate values.
// NOTE(review): the class-head line (listing line 292) is absent from this extract,
// so the class name is not visible here; restore it from the original header.
291 template<typename CP32>
293  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
294 public:
// Iterator positioned at the first value, 0.
302  auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
// Exclusive end iterator, positioned at 0x110000.
307  auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
308 };
309 
// Range-like class template: begin()/end() return prv::CodePointsIterator<CP32, true>,
// that is, an iteration that jumps from 0xd7ff to 0xe000 (surrogates are skipped).
// NOTE(review): the class-head line (listing line 323) is absent from this extract,
// so the class name is not visible here; restore it from the original header.
322 template<typename CP32>
324  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
325 public:
// Iterator positioned at the first value, 0.
333  auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
// Exclusive end iterator, positioned at 0x110000.
338  auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
339 };
340 
356 template<typename CP32, typename UnitIter, typename = void>
358  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
359  using Unit = typename prv::iter_value_t<UnitIter>;
360 public:
362  UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
363  c_(codePoint), len_(length), start_(start), limit_(limit) {}
364 
366  UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
368  UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
369 
377  CP32 codePoint() const { return c_; }
378 
384  UnitIter begin() const { return start_; }
385 
391  UnitIter end() const { return limit_; }
392 
397  uint8_t length() const { return len_; }
398 
399 #if U_CPLUSPLUS_VERSION >= 20
405  template<std::contiguous_iterator Iter = UnitIter>
406  std::basic_string_view<Unit> stringView() const {
407  return std::basic_string_view<Unit>(begin(), end());
408  }
409 #else
415  template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
416  std::enable_if_t<std::is_pointer_v<Iter> ||
417  std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
418  std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
419  std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
420  std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
421  std::basic_string_view<Unit>>
422  stringView() const {
423  return std::basic_string_view<Unit>(&*start_, len_);
424  }
425 #endif
426 
427 private:
428  // Order of fields with padding and access frequency in mind.
429  CP32 c_;
430  uint8_t len_;
431  UnitIter start_;
432  UnitIter limit_;
433 };
434 
435 #ifndef U_IN_DOXYGEN
436 // Partial template specialization for single-pass input iterator.
437 // No UnitIter field, no getter for it, no stringView().
438 template<typename CP32, typename UnitIter>
439 class UnsafeCodeUnits<
440  CP32,
441  UnitIter,
442  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
443  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
444 public:
445  UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
446 
447  UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
448  UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
449 
450  CP32 codePoint() const { return c_; }
451 
452  uint8_t length() const { return len_; }
453 
454 private:
455  // Order of fields with padding and access frequency in mind.
456  CP32 c_;
457  uint8_t len_;
458 };
459 #endif // U_IN_DOXYGEN
460 
476 template<typename CP32, typename UnitIter, typename = void>
477 class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
478 public:
480  CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
481  UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
482 
484  CodeUnits(const CodeUnits &other) = default;
486  CodeUnits &operator=(const CodeUnits &other) = default;
487 
492  bool wellFormed() const { return ok_; }
493 
494 private:
495  bool ok_;
496 };
497 
498 #ifndef U_IN_DOXYGEN
499 // Partial template specialization for single-pass input iterator.
500 // No UnitIter field, no getter for it, no stringView().
501 template<typename CP32, typename UnitIter>
502 class CodeUnits<
503  CP32,
504  UnitIter,
505  std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
506  public UnsafeCodeUnits<CP32, UnitIter> {
507 public:
508  CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
509  UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
510 
511  CodeUnits(const CodeUnits &other) = default;
512  CodeUnits &operator=(const CodeUnits &other) = default;
513 
514  bool wellFormed() const { return ok_; }
515 
516 private:
517  bool ok_;
518 };
519 #endif // U_IN_DOXYGEN
520 
521 // Validating implementations ---------------------------------------------- ***
522 
523 #ifndef U_IN_DOXYGEN
// Primary template of the validating implementation; declared but not defined here.
// The partial specializations below use the last (defaulted) parameter with
// std::enable_if_t to select on the size of UnitIter's code unit type.
524 template<typename CP32, UTFIllFormedBehavior behavior,
525  typename UnitIter, typename LimitIter = UnitIter, typename = void>
526 class UTFImpl;
527 
528 // Note: readAndInc() functions take both a p0 and a p iterator.
529 // They must have the same value.
530 // For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
531 // and readAndInc() copies p0 and the incremented p into the CodeUnits.
532 // For a single-pass UnitIter, which may be neither default-constructible nor copyable,
533 // the caller can pass p into both references, and readAndInc() does not use p0
534 // and constructs CodeUnits without them.
535 // Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
536 // which may not be possible for a single-pass iterator.
537 
538 // UTF-8
// Validating UTF-8 implementation: this partial specialization is selected when
// the code unit type of UnitIter (prv::iter_value_t<UnitIter>) has size 1.
// All members are static helpers that skip over, or decode, one code unit sequence.
539 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
540 class UTFImpl<
541  CP32, behavior,
542  UnitIter, LimitIter,
543  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
544  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
545  static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
546  "For 8-bit strings, the SURROGATE option does not have an equivalent.");
547 public:
548  // Handle ill-formed UTF-8
    // Returns the code point value reported for an ill-formed sequence:
    // U_SENTINEL for UTF_BEHAVIOR_NEGATIVE, 0xfffd for UTF_BEHAVIOR_FFFD.
    // The switch has no default; UTF_BEHAVIOR_SURROGATE is excluded by the static_assert above.
549  U_FORCE_INLINE static CP32 sub() {
550  switch (behavior) {
551  case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
552  case UTF_BEHAVIOR_FFFD: return 0xfffd;
553  }
554  }
555 
    // Advances p past one sequence without decoding it.
    // The first byte is always consumed; following bytes are consumed only while
    // they pass the lead/trail checks and p has not reached limit.
556  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
557  // Very similar to U8_FWD_1().
558  uint8_t b = *p;
559  ++p;
560  if (U8_IS_LEAD(b) && p != limit) {
561  uint8_t t1 = *p;
562  if ((0xe0 <= b && b < 0xf0)) {
    // Lead byte of a 3-byte sequence: check the first trail byte together with the lead.
563  if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
564  ++p != limit && U8_IS_TRAIL(*p)) {
565  ++p;
566  }
567  } else if (b < 0xe0) {
    // Lead byte of a 2-byte sequence.
568  if (U8_IS_TRAIL(t1)) {
569  ++p;
570  }
571  } else /* b >= 0xf0 */ {
    // Lead byte of a 4-byte sequence.
572  if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
573  ++p != limit && U8_IS_TRAIL(*p) &&
574  ++p != limit && U8_IS_TRAIL(*p)) {
575  ++p;
576  }
577  }
578  }
579  }
580 
    // Moves p backward over one sequence without decoding it.
    // p is first decremented by one byte. If that byte is a trail byte, up to three
    // preceding bytes (not going before start) are inspected through the copy p1;
    // p is moved further back only when a lead byte is found that passes the checks below.
    // Otherwise p stays one byte before its original position.
581  U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
582  // Very similar to U8_BACK_1().
583  uint8_t c = *--p;
584  if (U8_IS_TRAIL(c) && p != start) {
585  UnitIter p1 = p;
586  uint8_t b1 = *--p1;
587  if (U8_IS_LEAD(b1)) {
588  if (b1 < 0xe0 ||
589  (b1 < 0xf0 ?
590  U8_IS_VALID_LEAD3_AND_T1(b1, c) :
591  U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
592  p = p1;
593  return;
594  }
595  } else if (U8_IS_TRAIL(b1) && p1 != start) {
596  uint8_t b2 = *--p1;
597  if (0xe0 <= b2 && b2 <= 0xf4) {
598  if (b2 < 0xf0 ?
599  U8_IS_VALID_LEAD3_AND_T1(b2, b1) :
600  U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
601  p = p1;
602  return;
603  }
604  } else if (U8_IS_TRAIL(b2) && p1 != start) {
605  uint8_t b3 = *--p1;
606  if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
607  p = p1;
608  return;
609  }
610  }
611  }
612  }
613  }
614 
    // Decodes the sequence at p and advances p past the bytes that were consumed.
    // Returns CodeUnits with (code point, number of bytes consumed, wellFormed flag);
    // for a multi-pass UnitIter the result also holds p0 and the advanced p.
    // On failure the code point is sub() and the flag is false.
615  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
616  UnitIter &p0, UnitIter &p, const LimitIter &limit) {
617  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
618  // Very similar to U8_NEXT_OR_FFFD().
619  CP32 c = uint8_t(*p);
620  ++p;
621  if (U8_IS_SINGLE(c)) {
622  if constexpr (isMultiPass) {
623  return {c, 1, true, p0, p};
624  } else {
625  return {c, 1, true};
626  }
627  }
    // length counts the bytes consumed so far; t holds the current trail byte's value bits.
628  uint8_t length = 1;
629  uint8_t t = 0;
    // One condition validates and assembles the whole sequence. It relies on
    // left-to-right short-circuit evaluation: c, t, length and p are updated as
    // side effects only as far as validation succeeds.
630  if (p != limit &&
631  // fetch/validate/assemble all but last trail byte
632  (c >= 0xe0 ?
633  (c < 0xf0 ? // U+0800..U+FFFF except surrogates
634  U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
635  (t &= 0x3f, 1)
636  : // U+10000..U+10FFFF
637  (c -= 0xf0) <= 4 &&
638  U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
639  (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
640  (t = *p - 0x80) <= 0x3f) &&
641  // valid second-to-last trail byte
642  (c = (c << 6) | t, ++length, ++p != limit)
643  : // U+0080..U+07FF
644  c >= 0xc2 && (c &= 0x1f, 1)) &&
645  // last trail byte
646  (t = *p - 0x80) <= 0x3f) {
647  c = (c << 6) | t;
648  ++length;
649  ++p;
650  if constexpr (isMultiPass) {
651  return {c, length, true, p0, p};
652  } else {
653  return {c, length, true};
654  }
655  }
    // Ill-formed: report sub() with the number of bytes consumed before the failure.
656  if constexpr (isMultiPass) {
657  return {sub(), length, false, p0, p};
658  } else {
659  return {sub(), length, false};
660  }
661  }
662 
    // Moves p backward over one sequence and decodes it.
    // p0 keeps the original position and becomes the end of the returned units;
    // the updated p becomes their start. Bytes before start are never read.
663  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
664  // Very similar to U8_PREV_OR_FFFD().
665  UnitIter p0 = p;
666  CP32 c = uint8_t(*--p);
667  if (U8_IS_SINGLE(c)) {
668  return {c, 1, true, p, p0};
669  }
670  if (U8_IS_TRAIL(c) && p != start) {
671  UnitIter p1 = p;
672  uint8_t b1 = *--p1;
673  if (U8_IS_LEAD(b1)) {
674  if (b1 < 0xe0) {
    // Complete 2-byte sequence.
675  p = p1;
676  c = ((b1 - 0xc0) << 6) | (c & 0x3f);
677  return {c, 2, true, p, p0};
678  } else if (b1 < 0xf0 ?
679  U8_IS_VALID_LEAD3_AND_T1(b1, c) :
680  U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
681  // Truncated 3- or 4-byte sequence.
682  p = p1;
683  return {sub(), 2, false, p, p0};
684  }
685  } else if (U8_IS_TRAIL(b1) && p1 != start) {
686  // Extract the value bits from the last trail byte.
687  c &= 0x3f;
688  uint8_t b2 = *--p1;
689  if (0xe0 <= b2 && b2 <= 0xf4) {
690  if (b2 < 0xf0) {
691  b2 &= 0xf;
692  if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
    // Complete 3-byte sequence.
693  p = p1;
694  c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
695  return {c, 3, true, p, p0};
696  }
697  } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
698  // Truncated 4-byte sequence.
699  p = p1;
700  return {sub(), 3, false, p, p0};
701  }
702  } else if (U8_IS_TRAIL(b2) && p1 != start) {
703  uint8_t b3 = *--p1;
704  if (0xf0 <= b3 && b3 <= 0xf4) {
705  b3 &= 7;
706  if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
    // Complete 4-byte sequence.
707  p = p1;
708  c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
709  return {c, 4, true, p, p0};
710  }
711  }
712  }
713  }
714  }
    // Anything else: a single ill-formed byte.
715  return {sub(), 1, false, p, p0};
716  }
717 };
718 
719 // UTF-16
720 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
721 class UTFImpl<
722  CP32, behavior,
723  UnitIter, LimitIter,
724  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
725  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
726 public:
727  // Handle ill-formed UTF-16: One unpaired surrogate.
728  U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
729  switch (behavior) {
730  case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
731  case UTF_BEHAVIOR_FFFD: return 0xfffd;
732  case UTF_BEHAVIOR_SURROGATE: return surrogate;
733  }
734  }
735 
736  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
737  // Very similar to U16_FWD_1().
738  auto c = *p;
739  ++p;
740  if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
741  ++p;
742  }
743  }
744 
745  U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
746  // Very similar to U16_BACK_1().
747  UnitIter p1;
748  if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
749  p = p1;
750  }
751  }
752 
753  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
754  UnitIter &p0, UnitIter &p, const LimitIter &limit) {
755  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
756  // Very similar to U16_NEXT_OR_FFFD().
757  CP32 c = static_cast<CP32>(*p);
758  ++p;
759  if (!U16_IS_SURROGATE(c)) {
760  if constexpr (isMultiPass) {
761  return {c, 1, true, p0, p};
762  } else {
763  return {c, 1, true};
764  }
765  } else {
766  uint16_t c2;
767  if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
768  ++p;
769  c = U16_GET_SUPPLEMENTARY(c, c2);
770  if constexpr (isMultiPass) {
771  return {c, 2, true, p0, p};
772  } else {
773  return {c, 2, true};
774  }
775  } else {
776  if constexpr (isMultiPass) {
777  return {sub(c), 1, false, p0, p};
778  } else {
779  return {sub(c), 1, false};
780  }
781  }
782  }
783  }
784 
785  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
786  // Very similar to U16_PREV_OR_FFFD().
787  UnitIter p0 = p;
788  CP32 c = static_cast<CP32>(*--p);
789  if (!U16_IS_SURROGATE(c)) {
790  return {c, 1, true, p, p0};
791  } else {
792  UnitIter p1;
793  uint16_t c2;
794  if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
795  p = p1;
796  c = U16_GET_SUPPLEMENTARY(c2, c);
797  return {c, 2, true, p, p0};
798  } else {
799  return {sub(c), 1, false, p, p0};
800  }
801  }
802  }
803 };
804 
805 // UTF-32: trivial, but still validating
806 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
807 class UTFImpl<
808  CP32, behavior,
809  UnitIter, LimitIter,
810  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
811  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
812 public:
813  // Handle ill-formed UTF-32
814  U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
815  switch (behavior) {
816  case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
817  case UTF_BEHAVIOR_FFFD: return 0xfffd;
818  case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? surrogate : 0xfffd;
819  }
820  }
821 
822  U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
823  ++p;
824  }
825 
826  U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
827  --p;
828  }
829 
830  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
831  UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
832  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
833  uint32_t uc = *p;
834  CP32 c = uc;
835  ++p;
836  if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
837  if constexpr (isMultiPass) {
838  return {c, 1, true, p0, p};
839  } else {
840  return {c, 1, true};
841  }
842  } else {
843  if constexpr (isMultiPass) {
844  return {sub(uc < 0xe000, c), 1, false, p0, p};
845  } else {
846  return {sub(uc < 0xe000, c), 1, false};
847  }
848  }
849  }
850 
851  U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
852  UnitIter p0 = p;
853  uint32_t uc = *--p;
854  CP32 c = uc;
855  if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
856  return {c, 1, true, p, p0};
857  } else {
858  return {sub(uc < 0xe000, c), 1, false, p, p0};
859  }
860  }
861 };
862 
863 // Non-validating implementations ------------------------------------------ ***
864 
// Primary template of the non-validating implementation; declared but not defined here.
// The partial specializations below use the last (defaulted) parameter with
// std::enable_if_t to select on the size of UnitIter's code unit type.
865 template<typename CP32, typename UnitIter, typename = void>
866 class UnsafeUTFImpl;
867 
868 // UTF-8
869 template<typename CP32, typename UnitIter>
870 class UnsafeUTFImpl<
871  CP32,
872  UnitIter,
873  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
874  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
875 public:
876  U_FORCE_INLINE static void inc(UnitIter &p) {
877  // Very similar to U8_FWD_1_UNSAFE().
878  uint8_t b = *p;
879  std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
880  }
881 
882  U_FORCE_INLINE static void dec(UnitIter &p) {
883  // Very similar to U8_BACK_1_UNSAFE().
884  while (U8_IS_TRAIL(*--p)) {}
885  }
886 
887  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
888  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
889  // Very similar to U8_NEXT_UNSAFE().
890  CP32 c = uint8_t(*p);
891  ++p;
892  if (U8_IS_SINGLE(c)) {
893  if constexpr (isMultiPass) {
894  return {c, 1, p0, p};
895  } else {
896  return {c, 1};
897  }
898  } else if (c < 0xe0) {
899  c = ((c & 0x1f) << 6) | (*p & 0x3f);
900  ++p;
901  if constexpr (isMultiPass) {
902  return {c, 2, p0, p};
903  } else {
904  return {c, 2};
905  }
906  } else if (c < 0xf0) {
907  // No need for (c&0xf) because the upper bits are truncated
908  // after <<12 in the cast to uint16_t.
909  c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
910  ++p;
911  c |= *p & 0x3f;
912  ++p;
913  if constexpr (isMultiPass) {
914  return {c, 3, p0, p};
915  } else {
916  return {c, 3};
917  }
918  } else {
919  c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
920  ++p;
921  c |= (*p & 0x3f) << 6;
922  ++p;
923  c |= *p & 0x3f;
924  ++p;
925  if constexpr (isMultiPass) {
926  return {c, 4, p0, p};
927  } else {
928  return {c, 4};
929  }
930  }
931  }
932 
933  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
934  // Very similar to U8_PREV_UNSAFE().
935  UnitIter p0 = p;
936  CP32 c = uint8_t(*--p);
937  if (U8_IS_SINGLE(c)) {
938  return {c, 1, p, p0};
939  }
940  // U8_IS_TRAIL(c) if well-formed
941  c &= 0x3f;
942  uint8_t count = 1;
943  for (uint8_t shift = 6;;) {
944  uint8_t b = *--p;
945  if (b >= 0xc0) {
946  U8_MASK_LEAD_BYTE(b, count);
947  c |= uint32_t{b} << shift;
948  break;
949  } else {
950  c |= (uint32_t{b} & 0x3f) << shift;
951  ++count;
952  shift += 6;
953  }
954  }
955  ++count;
956  return {c, count, p, p0};
957  }
958 };
959 
960 // UTF-16
961 template<typename CP32, typename UnitIter>
962 class UnsafeUTFImpl<
963  CP32,
964  UnitIter,
965  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
966  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
967 public:
968  U_FORCE_INLINE static void inc(UnitIter &p) {
969  // Very similar to U16_FWD_1_UNSAFE().
970  auto c = *p;
971  ++p;
972  if (U16_IS_LEAD(c)) {
973  ++p;
974  }
975  }
976 
977  U_FORCE_INLINE static void dec(UnitIter &p) {
978  // Very similar to U16_BACK_1_UNSAFE().
979  if (U16_IS_TRAIL(*--p)) {
980  --p;
981  }
982  }
983 
984  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
985  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
986  // Very similar to U16_NEXT_UNSAFE().
987  CP32 c = static_cast<CP32>(*p);
988  ++p;
989  if (!U16_IS_LEAD(c)) {
990  if constexpr (isMultiPass) {
991  return {c, 1, p0, p};
992  } else {
993  return {c, 1};
994  }
995  } else {
996  uint16_t c2 = *p;
997  ++p;
998  c = U16_GET_SUPPLEMENTARY(c, c2);
999  if constexpr (isMultiPass) {
1000  return {c, 2, p0, p};
1001  } else {
1002  return {c, 2};
1003  }
1004  }
1005  }
1006 
1007  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1008  // Very similar to U16_PREV_UNSAFE().
1009  UnitIter p0 = p;
1010  CP32 c = static_cast<CP32>(*--p);
1011  if (!U16_IS_TRAIL(c)) {
1012  return {c, 1, p, p0};
1013  } else {
1014  uint16_t c2 = *--p;
1015  c = U16_GET_SUPPLEMENTARY(c2, c);
1016  return {c, 2, p, p0};
1017  }
1018  }
1019 };
1020 
1021 // UTF-32: trivial
1022 template<typename CP32, typename UnitIter>
1023 class UnsafeUTFImpl<
1024  CP32,
1025  UnitIter,
1026  std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1027  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1028 public:
1029  U_FORCE_INLINE static void inc(UnitIter &p) {
1030  ++p;
1031  }
1032 
1033  U_FORCE_INLINE static void dec(UnitIter &p) {
1034  --p;
1035  }
1036 
1037  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1038  constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1039  CP32 c = *p;
1040  ++p;
1041  if constexpr (isMultiPass) {
1042  return {c, 1, p0, p};
1043  } else {
1044  return {c, 1};
1045  }
1046  }
1047 
1048  U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1049  UnitIter p0 = p;
1050  CP32 c = *--p;
1051  return {c, 1, p, p0};
1052  }
1053 };
1054 
1055 #endif
1056 
1057 // Validating iterators ---------------------------------------------------- ***
1058 
// Validating code point iterator over a code unit range (primary template,
// for multi-pass UnitIter; single-pass iterators use the specialization below).
// It wraps a code unit iterator p_ and lazily decodes one CodeUnits value at a time
// through Impl (UTFImpl); state_ records whether p_ is at, ahead of, or at the start of
// the most recently decoded units (see the field comments at the end of the class).
// NOTE(review): several declaration lines are absent from this extract
// (listing lines 1084, 1096, 1101-1107 except 1105, 1169, 1250, 1334, 1354).
// Presumably these are the class head, the Proxy data member, the remaining member type
// aliases, the copy assignment operator, the operator*() head and function specifiers.
// Restore them from the original header before compiling.
1082 template<typename CP32, UTFIllFormedBehavior behavior,
1083  typename UnitIter, typename LimitIter = UnitIter, typename = void>
1085  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1086  using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1087 
1088  // Proxy type for operator->() (required by LegacyInputIterator)
1089  // so that we don't promise always returning CodeUnits.
1090  class Proxy {
1091  public:
1092  explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1093  CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1094  CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1095  private:
    // NOTE(review): the units_ member declaration (listing line 1096) is missing here.
1097  };
1098 
1099 public:
1105  using pointer = Proxy;
    // Bidirectional if the underlying code unit iterator is, otherwise forward.
1109  using iterator_category = std::conditional_t<
1110  prv::bidirectional_iterator<UnitIter>,
1111  std::bidirectional_iterator_tag,
1112  std::forward_iterator_tag>;
1113 
    // Iterator at position p within [start, limit).
    // units_ starts out as a placeholder (code point 0, length 0, not well-formed).
1127  U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1128  p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
    // Iterator at p, which is also used as the start of the range.
1140  U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1141  p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
    // Iterator whose start, position and limit are all p.
1153  U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
    // Default constructor: value-initializes the three iterators.
1159  U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1160 
1162  U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1164  U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1165 
1167  U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1170 
    // Two iterators are equal when their logical positions are equal,
    // regardless of whether either one has already read ahead.
1176  U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1177  return getLogicalPosition() == other.getLogicalPosition();
1178  }
1184  U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1185 
1186  // Asymmetric equality & nonequality with a sentinel type.
1187 
    // Enabled only when Sentinel is neither UTFIterator nor UnitIter;
    // compares the logical position with the sentinel.
1194  template<typename Sentinel> U_FORCE_INLINE friend
1195  std::enable_if_t<
1196  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1197  bool>
1198  operator==(const UTFIterator &iter, const Sentinel &s) {
1199  return iter.getLogicalPosition() == s;
1200  }
1201 
1202 #if U_CPLUSPLUS_VERSION < 20
1203  // C++17: Need to define all four combinations of == / != vs. parameter order.
1204  // Once we require C++20, we could remove all but the first == because
1205  // the compiler would generate the rest.
1206 
1213  template<typename Sentinel> U_FORCE_INLINE friend
1214  std::enable_if_t<
1215  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1216  bool>
1217  operator==(const Sentinel &s, const UTFIterator &iter) {
1218  return iter.getLogicalPosition() == s;
1219  }
1226  template<typename Sentinel> U_FORCE_INLINE friend
1227  std::enable_if_t<
1228  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1229  bool>
1230  operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1237  template<typename Sentinel> U_FORCE_INLINE friend
1238  std::enable_if_t<
1239  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1240  bool>
1241  operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1242 #endif // C++17
1243 
    // NOTE(review): the function head (listing line 1250) is missing; the body below
    // matches operator*() of the single-pass specialization — confirm against the header.
    // In the initial state this decodes the code point at p_, leaves p_ after it,
    // and caches the result in units_; otherwise the cached units_ is returned.
1251  if (state_ == 0) {
1252  UnitIter p0 = p_;
1253  units_ = Impl::readAndInc(p0, p_, limit_);
1254  state_ = 1;
1255  }
1256  return units_;
1257  }
1258 
    // Same lazy read as above, but returns a Proxy constructed from units_.
1267  U_FORCE_INLINE Proxy operator->() const {
1268  if (state_ == 0) {
1269  UnitIter p0 = p_;
1270  units_ = Impl::readAndInc(p0, p_, limit_);
1271  state_ = 1;
1272  }
1273  return Proxy(units_);
1274  }
1275 
    // Pre-increment: moves the logical position to the next code point,
    // reusing work already done by a previous read where possible.
1282  U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1283  if (state_ > 0) {
1284  // operator*() called readAndInc() so p_ is already ahead.
1285  state_ = 0;
1286  } else if (state_ == 0) {
1287  Impl::inc(p_, limit_);
1288  } else /* state_ < 0 */ {
1289  // operator--() called decAndRead() so we know how far to skip.
1290  p_ = units_.end();
1291  state_ = 0;
1292  }
1293  return *this;
1294  }
1295 
    // Post-increment: returns a copy that refers to the current code point,
    // then moves this iterator to the next one.
1304  U_FORCE_INLINE UTFIterator operator++(int) { // post-increment
1305  if (state_ > 0) {
1306  // operator*() called readAndInc() so p_ is already ahead.
1307  UTFIterator result(*this);
1308  state_ = 0;
1309  return result;
1310  } else if (state_ == 0) {
    // Read now so that the returned copy already holds the decoded units.
1311  UnitIter p0 = p_;
1312  units_ = Impl::readAndInc(p0, p_, limit_);
1313  UTFIterator result(*this);
1314  result.state_ = 1;
1315  // keep this->state_ == 0
1316  return result;
1317  } else /* state_ < 0 */ {
1318  UTFIterator result(*this);
1319  // operator--() called decAndRead() so we know how far to skip.
1320  p_ = units_.end();
1321  state_ = 0;
1322  return result;
1323  }
1324  }
1325 
    // Pre-decrement, available only when UnitIter is bidirectional.
    // Decodes the code point before the logical position and leaves p_ at its start.
1333  template<typename Iter = UnitIter>
1335  std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1336  operator--() { // pre-decrement
1337  if (state_ > 0) {
1338  // operator*() called readAndInc() so p_ is ahead of the logical position.
1339  p_ = units_.begin();
1340  }
1341  units_ = Impl::decAndRead(start_, p_);
1342  state_ = -1;
1343  return *this;
1344  }
1345 
    // Post-decrement, available only when UnitIter is bidirectional.
1353  template<typename Iter = UnitIter>
1355  std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1356  operator--(int) { // post-decrement
1357  UTFIterator result(*this);
1358  operator--();
1359  return result;
1360  }
1361 
1362 private:
1363  friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1364 
    // Position of the code point this iterator refers to:
    // p_ itself, unless a forward read has moved p_ past the cached units.
1365  U_FORCE_INLINE UnitIter getLogicalPosition() const {
1366  return state_ <= 0 ? p_ : units_.begin();
1367  }
1368 
1369  // operator*() etc. are logically const.
1370  mutable UnitIter p_;
1371  // In a validating iterator, we need start_ & limit_ so that when we read a code point
1372  // (forward or backward) we can test if there are enough code units.
1373  UnitIter start_;
1374  LimitIter limit_;
1375  // Keep state so that we call readAndInc() only once for both operator*() and ++
1376  // to make it easy for the compiler to optimize.
1377  mutable CodeUnits<CP32, UnitIter> units_;
1378  // >0: units_ = readAndInc(), p_ = units limit
1379  // which means that p_ is ahead of its logical position
1380  // 0: initial state
1381  // <0: units_ = decAndRead(), p_ = units start
1382  mutable int8_t state_ = 0;
1383 };
1384 
1385 #ifndef U_IN_DOXYGEN
1386 // Partial template specialization for single-pass input iterator.
// Chosen when UnitIter does not satisfy prv::forward_iterator (see the enable_if_t argument).
// Each code point is read from the unit iterator only once: operator*() / operator->() cache the
// decoded result in units_ and set ahead_; the increment operators then consume that cached read.
1387 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1388 class UTFIterator<
1389  CP32, behavior,
1390  UnitIter, LimitIter,
1391  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1392  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1393  using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1394 
1395  // Proxy type for post-increment return value, to make *iter++ work.
1396  // Also for operator->() (required by LegacyInputIterator)
1397  // so that we don't promise always returning CodeUnits.
1398  class Proxy {
1399  public:
      // Stores a copy: units_ below is a value member, not a reference,
      // so the Proxy does not alias the iterator's own cached units_.
1400  explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1401  CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1402  CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1403  private:
1404  CodeUnits<CP32, UnitIter> units_;
1405  };
1406 
1407 public:
      // Iterator traits. reference is the value type itself (results are returned by value),
      // and pointer is the Proxy returned by operator->().
1408  using value_type = CodeUnits<CP32, UnitIter>;
1409  using reference = value_type;
1410  using pointer = Proxy;
1411  using difference_type = prv::iter_difference_t<UnitIter>;
1412  using iterator_category = std::input_iterator_tag;
1413 
      // Constructs an iterator at p with the given limit; both arguments are moved into the members.
1414  U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1415 
1416  // Constructs an iterator start or limit sentinel.
1417  // Requires p to be copyable.
      // limit_ is initialized from the member p_ (not from the moved-from parameter);
      // p_ is declared before limit_, so it has already been initialized at that point.
1418  U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1419 
1420  U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1421  U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1422 
1423  U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1424  U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1425 
      // Two iterators compare equal only if both the raw unit position and the ahead_ flag match.
1426  U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1427  return p_ == other.p_ && ahead_ == other.ahead_;
1428  // Strictly speaking, we should check if the logical position is the same.
1429  // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1430  }
1431  U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1432 
      // Comparison with a code unit sentinel (any type other than UTFIterator or UnitIter).
      // Equal only when no decoded-but-unconsumed code point is pending (ahead_ is false)
      // and p_ itself compares equal to s.
1433  template<typename Sentinel> U_FORCE_INLINE friend
1434  std::enable_if_t<
1435  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1436  bool>
1437  operator==(const UTFIterator &iter, const Sentinel &s) {
1438  return !iter.ahead_ && iter.p_ == s;
1439  }
1440 
      // Before C++20 the compiler does not derive the reversed-operand and != forms
      // from operator==, so they are spelled out here.
1441 #if U_CPLUSPLUS_VERSION < 20
1442  template<typename Sentinel> U_FORCE_INLINE friend
1443  std::enable_if_t<
1444  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1445  bool>
1446  operator==(const Sentinel &s, const UTFIterator &iter) {
1447  return !iter.ahead_ && iter.p_ == s;
1448  }
1449 
1450  template<typename Sentinel> U_FORCE_INLINE friend
1451  std::enable_if_t<
1452  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1453  bool>
1454  operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1455 
1456  template<typename Sentinel> U_FORCE_INLINE friend
1457  std::enable_if_t<
1458  !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1459  bool>
1460  operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1461 #endif // C++17
1462 
      // Returns the code point at the current position by value.
      // The first call at a position reads via Impl::readAndInc() and caches the result;
      // repeated calls return the cached units_ without reading again.
1463  U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1464  if (!ahead_) {
1465  units_ = Impl::readAndInc(p_, p_, limit_);
1466  ahead_ = true;
1467  }
1468  return units_;
1469  }
1470 
      // Same lazy read as operator*(), but returns a Proxy holding a copy of the result.
1471  U_FORCE_INLINE Proxy operator->() const {
1472  if (!ahead_) {
1473  units_ = Impl::readAndInc(p_, p_, limit_);
1474  ahead_ = true;
1475  }
1476  return Proxy(units_);
1477  }
1478 
1479  U_FORCE_INLINE UTFIterator &operator++() { // pre-increment
1480  if (ahead_) {
1481  // operator*() called readAndInc() so p_ is already ahead.
1482  ahead_ = false;
1483  } else {
      // Nothing was read at this position: advance p_ via Impl::inc(); units_ is not updated.
1484  Impl::inc(p_, limit_);
1485  }
1486  return *this;
1487  }
1488 
      // Returns a Proxy (not an iterator copy) carrying the code point that is being stepped over.
1489  U_FORCE_INLINE Proxy operator++(int) { // post-increment
1490  if (ahead_) {
1491  // operator*() called readAndInc() so p_ is already ahead.
1492  ahead_ = false;
1493  } else {
      // No cached read yet: read now so that the returned Proxy has a value to hand out.
1494  units_ = Impl::readAndInc(p_, p_, limit_);
1495  // keep this->ahead_ == false
1496  }
1497  return Proxy(units_);
1498  }
1499 
1500 private:
1501  // operator*() etc. are logically const.
1502  mutable UnitIter p_;
1503  // In a validating iterator, we need limit_ so that when we read a code point
1504  // we can test if there are enough code units.
1505  LimitIter limit_;
1506  // Keep state so that we call readAndInc() only once for both operator*() and ++
1507  // so that we can use a single-pass input iterator for UnitIter.
1508  mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1509  // true: units_ = readAndInc(), p_ = units limit
1510  // which means that p_ is ahead of its logical position
1511  // false: initial state
1512  mutable bool ahead_ = false;
1513 };
1514 #endif // U_IN_DOXYGEN
1515 
1516 } // namespace U_HEADER_ONLY_NAMESPACE
1517 
1518 #ifndef U_IN_DOXYGEN
1519 // Bespoke specialization of reverse_iterator.
1520 // The default implementation implements reverse operator*() and ++ in a way
1521 // that does most of the same work twice for reading variable-length sequences.
// The state_ member records where p_ sits relative to the logical position
// (see the member comments at the bottom of the class).
// NOTE(review): this listing skips numbered lines 1526, 1543 and 1546, so the CodeUnits_ alias
// and the head of the converting constructor are not visible here (presumably also a
// difference_type alias) — confirm against the full header.
1522 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1523 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1524  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1525  using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1527 
1528  // Proxy type for operator->() (required by LegacyInputIterator)
1529  // so that we don't promise always returning CodeUnits.
1530  class Proxy {
1531  public:
1532  explicit Proxy(CodeUnits_ units) : units_(units) {}
1533  CodeUnits_ &operator*() { return units_; }
1534  CodeUnits_ *operator->() { return &units_; }
1535  private:
1536  CodeUnits_ units_;
1537  };
1538 
1539 public:
1540  using value_type = CodeUnits_;
1541  using reference = value_type;
1542  using pointer = Proxy;
1544  using iterator_category = std::bidirectional_iterator_tag;
1545 
      // Member initializers of a constructor taking `iter` (its head is not shown in this listing):
      // starts at iter's logical position and copies iter's start_ and limit_.
1547  p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1548  units_(0, 0, false, p_, p_) {}
1549  U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1550 
1551  U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1552  U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1553 
1554  U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1555  U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1556 
      // Equality is defined on the logical position, so it does not depend on state_ alone.
1557  U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1558  return getLogicalPosition() == other.getLogicalPosition();
1559  }
1560  U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1561 
      // Returns the code point before the logical position, by value.
      // Only in the initial state (state_ == 0) is Impl::decAndRead() called; otherwise
      // the already cached units_ is returned.
1562  U_FORCE_INLINE CodeUnits_ operator*() const {
1563  if (state_ == 0) {
1564  units_ = Impl::decAndRead(start_, p_);
1565  state_ = -1;
1566  }
1567  return units_;
1568  }
1569 
      // Same lazy read as operator*(), wrapped in a Proxy that holds a copy.
1570  U_FORCE_INLINE Proxy operator->() const {
1571  if (state_ == 0) {
1572  units_ = Impl::decAndRead(start_, p_);
1573  state_ = -1;
1574  }
1575  return Proxy(units_);
1576  }
1577 
1578  U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
1579  if (state_ < 0) {
1580  // operator*() called decAndRead() so p_ is already behind.
1581  state_ = 0;
1582  } else if (state_ == 0) {
1583  Impl::dec(start_, p_);
1584  } else /* state_ > 0 */ {
1585  // operator--() called readAndInc() so we know how far to skip.
1586  p_ = units_.begin();
1587  state_ = 0;
1588  }
1589  return *this;
1590  }
1591 
      // Returns a copy of the iterator as it was before the increment. In the state_ == 0 case
      // the copy is given state_ = -1 because its units_/p_ already reflect a decAndRead().
1592  U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
1593  if (state_ < 0) {
1594  // operator*() called decAndRead() so p_ is already behind.
1595  reverse_iterator result(*this);
1596  state_ = 0;
1597  return result;
1598  } else if (state_ == 0) {
1599  units_ = Impl::decAndRead(start_, p_);
1600  reverse_iterator result(*this);
1601  result.state_ = -1;
1602  // keep this->state_ == 0
1603  return result;
1604  } else /* state_ > 0 */ {
1605  reverse_iterator result(*this);
1606  // operator--() called readAndInc() so we know how far to skip.
1607  p_ = units_.begin();
1608  state_ = 0;
1609  return result;
1610  }
1611  }
1612 
      // Decrementing a reverse iterator moves forward in the underlying units:
      // p_ is first restored to the logical position if needed, then one code point is read forward.
      // The result is kept in units_ (state_ = 1) so a following ++ can jump back via units_.begin().
1613  U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
1614  if (state_ < 0) {
1615  // operator*() called decAndRead() so p_ is behind the logical position.
1616  p_ = units_.end();
1617  }
1618  UnitIter p0 = p_;
1619  units_ = Impl::readAndInc(p0, p_, limit_);
1620  state_ = 1;
1621  return *this;
1622  }
1623 
1624  U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
1625  reverse_iterator result(*this);
1626  operator--();
1627  return result;
1628  }
1629 
1630 private:
      // When state_ < 0, p_ was moved back by decAndRead(), so the logical position
      // is the end of the cached units; otherwise it is p_ itself.
1631  U_FORCE_INLINE UnitIter getLogicalPosition() const {
1632  return state_ >= 0 ? p_ : units_.end();
1633  }
1634 
1635  // operator*() etc. are logically const.
1636  mutable UnitIter p_;
1637  // In a validating iterator, we need start_ & limit_ so that when we read a code point
1638  // (forward or backward) we can test if there are enough code units.
1639  UnitIter start_;
1640  UnitIter limit_;
1641  // Keep state so that we call decAndRead() only once for both operator*() and ++
1642  // to make it easy for the compiler to optimize.
1643  mutable CodeUnits_ units_;
1644  // >0: units_ = readAndInc(), p_ = units limit
1645  // 0: initial state
1646  // <0: units_ = decAndRead(), p_ = units start
1647  // which means that p_ is behind its logical position
1648  mutable int8_t state_ = 0;
1649 };
1650 #endif // U_IN_DOXYGEN
1651 
1652 namespace U_HEADER_ONLY_NAMESPACE {
1653 
// Factory overload taking a start, a current position p, and a limit.
// CP32 and behavior must be given explicitly; UnitIter and LimitIter are deduced from the
// arguments. All three arguments are passed on with std::move.
// NOTE(review): listing line 1679 (presumably the `return UTFIterator<...>(` head of this
// statement) is not shown here — confirm against the full header.
1676 template<typename CP32, UTFIllFormedBehavior behavior,
1677  typename UnitIter, typename LimitIter = UnitIter>
1678 auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1680  std::move(start), std::move(p), std::move(limit));
1681 }
1682 
// Factory overload taking only a current position p and a limit (no separate start).
// CP32 and behavior must be given explicitly; UnitIter and LimitIter are deduced from the
// arguments. Both arguments are passed on with std::move.
// NOTE(review): listing line 1706 (presumably the `return UTFIterator<...>(` head of this
// statement) is not shown here — confirm against the full header.
1703 template<typename CP32, UTFIllFormedBehavior behavior,
1704  typename UnitIter, typename LimitIter = UnitIter>
1705 auto utfIterator(UnitIter p, LimitIter limit) {
1707  std::move(p), std::move(limit));
1708 }
1709 
1710 // Note: We should only enable the following factory function for a copyable UnitIter.
1711 // In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1712 // but a function template partial specialization is not allowed.
1713 // In C++20, we might be able to require the std::copyable concept.
1714 
// Single-argument factory overload: moves p into the explicit one-argument constructor of
// UTFIterator<CP32, behavior, UnitIter> and returns the result by value.
// CP32 and behavior must be given explicitly; UnitIter is deduced from the argument.
1734 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1735 auto utfIterator(UnitIter p) {
1736  return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1737 }
1738 
// Range wrapper: stores a code unit range in the member unitRange and offers
// begin()/end()/rbegin()/rend() built on utfIterator<CP32, behavior>().
// NOTE(review): this listing skips some numbered lines (e.g. 1752, which presumably holds the
// class head, and 1782/1783) — confirm against the full header.
1751 template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1753  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1754 public:
1759  UTFStringCodePoints() = default;
1760 
      // Enabled only when Range is not a reference type: takes the range by value
      // and moves it into the member.
1766  template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1767  explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
      // Enabled only when Range is a reference type: binds the member to the argument.
      // The extra `typename = void` parameter gives this template a signature distinct
      // from the constructor template above.
1776  template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1777  explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1778 
1780  UTFStringCodePoints(const UTFStringCodePoints &other) = default;
1781 
1784 
      // Iterator at the first code point: uses the two-argument factory with
      // unitRange.begin() as the position and unitRange.end() as the limit.
1789  auto begin() {
1790  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1791  }
1792 
      // Const overload; only enabled when `const R` satisfies prv::range.
1797  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1798  auto begin() const {
1799  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1800  }
1801 
      // End of the code point range. The return type depends on the unit range:
      // - begin()/end() of different types: the unit range's own sentinel is returned as is;
      // - same type and bidirectional: three-argument factory with unitRange.begin() as start
      //   and unitRange.end() as both position and limit;
      // - otherwise: two-argument factory with unitRange.end() as both position and limit.
1806  auto end() {
1807  using UnitIter = decltype(unitRange.begin());
1808  using LimitIter = decltype(unitRange.end());
1809  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1810  // Return the code unit sentinel.
1811  return unitRange.end();
1812  } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1813  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1814  } else {
1815  // The input iterator specialization has no three-argument constructor.
1816  return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1817  }
1818  }
1819 
      // Const overload of end() with the same three-way selection;
      // only enabled when `const R` satisfies prv::range.
1824  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1825  auto end() const {
1826  using UnitIter = decltype(unitRange.begin());
1827  using LimitIter = decltype(unitRange.end());
1828  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1829  // Return the code unit sentinel.
1830  return unitRange.end();
1831  } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1832  return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1833  } else {
1834  // The input iterator specialization has no three-argument constructor.
1835  return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1836  }
1837  }
1838 
      // Reverse iteration starts at the const end(), wrapped by std::make_reverse_iterator.
1843  auto rbegin() const {
1844  return std::make_reverse_iterator(end());
1845  }
1846 
      // Reverse iteration ends at the const begin(), wrapped by std::make_reverse_iterator.
1851  auto rend() const {
1852  return std::make_reverse_iterator(begin());
1853  }
1854 
1855 private:
      // The wrapped code unit range; a value or a reference depending on the Range parameter.
1856  Range unitRange;
1857 };
1858 
// Function object whose operator() wraps a code unit range in a UTFStringCodePoints.
// NOTE(review): listing lines 1861 (presumably the struct head), 1867 and 1871 (presumably the
// head of the first return statement) are not shown here — confirm against the full header.
1860 template<typename CP32, UTFIllFormedBehavior behavior>
     // The base class is added only for C++23 with the library feature-test macros below.
1862 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
1863  __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1864  : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1865 #endif
1866 {
      // Forwards unitRange into the wrapper. The fallback branch instantiates the wrapper
      // with the deduced Range type itself.
1868  template<typename Range>
1869  auto operator()(Range &&unitRange) const {
1870 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
1872  std::forward<Range>(unitRange));
1873 #else
1874  return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1875 #endif
1876  }
1877 };
1878 
1893 template<typename CP32, UTFIllFormedBehavior behavior>
1895 
1896 // Non-validating iterators ------------------------------------------------ ***
1897 
// Primary template of the non-validating code point iterator (it uses UnsafeUTFImpl and keeps
// no start or limit). The state_ member records whether p_ is at, ahead of, or at the start of
// the cached units_ (see the member comments at the bottom of the class).
// NOTE(review): this listing skips several numbered lines, apparently including the class head
// (1920), the Proxy data member (1932), some type aliases, the special member functions, and the
// signature lines of operator*() and both operator++ forms — confirm against the full header.
1919 template<typename CP32, typename UnitIter, typename = void>
1921  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1922  using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1923 
1924  // Proxy type for operator->() (required by LegacyInputIterator)
1925  // so that we don't promise always returning UnsafeCodeUnits.
1926  class Proxy {
1927  public:
1928  explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1929  UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
1930  UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1931  private:
1933  };
1934 
1935 public:
1941  using pointer = Proxy;
      // Bidirectional if the unit iterator is bidirectional, otherwise forward.
1945  using iterator_category = std::conditional_t<
1946  prv::bidirectional_iterator<UnitIter>,
1947  std::bidirectional_iterator_tag,
1948  std::forward_iterator_tag>;
1949 
      // Starts at p with an empty cached units_ (zero code point, zero length, range [p, p)).
1959  U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
      // Default constructor: value-initializes p_ and uses it for the empty cached units_.
1965  U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
1966 
1971 
1976 
      // Equality is defined on the logical position rather than on the raw p_.
1982  U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
1983  return getLogicalPosition() == other.getLogicalPosition();
1984  }
1990  U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
1991 
      // Comparison with a code unit sentinel (any type other than UnsafeUTFIterator or UnitIter):
      // compares the logical position with s.
1998  template<typename Sentinel> U_FORCE_INLINE friend
1999  std::enable_if_t<
2000  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2001  bool>
2002  operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2003  return iter.getLogicalPosition() == s;
2004  }
2005 
      // Before C++20 the compiler does not derive the reversed-operand and != forms
      // from operator==, so they are spelled out here.
2006 #if U_CPLUSPLUS_VERSION < 20
2013  template<typename Sentinel> U_FORCE_INLINE friend
2014  std::enable_if_t<
2015  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2016  bool>
2017  operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2018  return iter.getLogicalPosition() == s;
2019  }
2026  template<typename Sentinel> U_FORCE_INLINE friend
2027  std::enable_if_t<
2028  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2029  bool>
2030  operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2037  template<typename Sentinel> U_FORCE_INLINE friend
2038  std::enable_if_t<
2039  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2040  bool>
2041  operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2042 #endif // C++17
2043 
      // NOTE(review): the signature line is not shown in this listing; judging by the parallel
      // operator->() below, this is presumably the body of operator*() const.
      // In the initial state it reads forward once (p0 is a copy of p_ used as the read start),
      // caches the result and marks p_ as ahead; otherwise it returns the cached units_.
2051  if (state_ == 0) {
2052  UnitIter p0 = p_;
2053  units_ = Impl::readAndInc(p0, p_);
2054  state_ = 1;
2055  }
2056  return units_;
2057  }
2058 
      // Same lazy read, returned as a Proxy constructed from units_.
2067  U_FORCE_INLINE Proxy operator->() const {
2068  if (state_ == 0) {
2069  UnitIter p0 = p_;
2070  units_ = Impl::readAndInc(p0, p_);
2071  state_ = 1;
2072  }
2073  return Proxy(units_);
2074  }
2075 
      // NOTE(review): signature line not shown; it returns *this, so this is presumably
      // the body of the pre-increment operator.
2083  if (state_ > 0) {
2084  // operator*() called readAndInc() so p_ is already ahead.
2085  state_ = 0;
2086  } else if (state_ == 0) {
2087  Impl::inc(p_);
2088  } else /* state_ < 0 */ {
2089  // operator--() called decAndRead() so we know how far to skip.
2090  p_ = units_.end();
2091  state_ = 0;
2092  }
2093  return *this;
2094  }
2095 
      // NOTE(review): signature line not shown; it returns a copy made before advancing, so this
      // is presumably the body of the post-increment operator. In the state_ == 0 case the copy
      // is given state_ = 1 because its units_/p_ already reflect a readAndInc().
2105  if (state_ > 0) {
2106  // operator*() called readAndInc() so p_ is already ahead.
2107  UnsafeUTFIterator result(*this);
2108  state_ = 0;
2109  return result;
2110  } else if (state_ == 0) {
2111  UnitIter p0 = p_;
2112  units_ = Impl::readAndInc(p0, p_);
2113  UnsafeUTFIterator result(*this);
2114  result.state_ = 1;
2115  // keep this->state_ == 0
2116  return result;
2117  } else /* state_ < 0 */ {
2118  UnsafeUTFIterator result(*this);
2119  // operator--() called decAndRead() so we know how far to skip.
2120  p_ = units_.end();
2121  state_ = 0;
2122  return result;
2123  }
2124  }
2125 
      // Only available when the unit iterator is bidirectional (enable_if_t on Iter).
      // p_ is first restored to the logical position if needed, then one code point is read
      // backward and kept in units_ (state_ = -1) so a following ++ can skip via units_.end().
2133  template<typename Iter = UnitIter>
2135  std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2136  operator--() { // pre-decrement
2137  if (state_ > 0) {
2138  // operator*() called readAndInc() so p_ is ahead of the logical position.
2139  p_ = units_.begin();
2140  }
2141  units_ = Impl::decAndRead(p_);
2142  state_ = -1;
2143  return *this;
2144  }
2145 
2153  template<typename Iter = UnitIter>
2155  std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2156  operator--(int) { // post-decrement
2157  UnsafeUTFIterator result(*this);
2158  operator--();
2159  return result;
2160  }
2161 
2162 private:
      // The reverse_iterator specialization reads this class's private getLogicalPosition().
2163  friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2164 
      // When state_ > 0, p_ was advanced by readAndInc(), so the logical position
      // is the start of the cached units; otherwise it is p_ itself.
2165  U_FORCE_INLINE UnitIter getLogicalPosition() const {
2166  return state_ <= 0 ? p_ : units_.begin();
2167  }
2168 
2169  // operator*() etc. are logically const.
2170  mutable UnitIter p_;
2171  // Keep state so that we call readAndInc() only once for both operator*() and ++
2172  // to make it easy for the compiler to optimize.
2173  mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2174  // >0: units_ = readAndInc(), p_ = units limit
2175  // which means that p_ is ahead of its logical position
2176  // 0: initial state
2177  // <0: units_ = decAndRead(), p_ = units start
2178  mutable int8_t state_ = 0;
2179 };
2180 
2181 #ifndef U_IN_DOXYGEN
2182 // Partial template specialization for single-pass input iterator.
// Chosen when UnitIter does not satisfy prv::forward_iterator (see the enable_if_t argument).
// Each code point is read from the unit iterator only once: operator*() / operator->() cache the
// decoded result in units_ and set ahead_; the increment operators then consume that cached read.
// NOTE(review): listing line 2216 is skipped (presumably the copy assignment operator) —
// confirm against the full header.
2183 template<typename CP32, typename UnitIter>
2184 class UnsafeUTFIterator<
2185  CP32,
2186  UnitIter,
2187  std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2188  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2189  using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2190 
2191  // Proxy type for post-increment return value, to make *iter++ work.
2192  // Also for operator->() (required by LegacyInputIterator)
2193  // so that we don't promise always returning UnsafeCodeUnits.
2194  class Proxy {
2195  public:
      // Stores a copy: units_ below is a value member, not a reference,
      // so the Proxy does not alias the iterator's own cached units_.
2196  explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2197  UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2198  UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2199  private:
2200  UnsafeCodeUnits<CP32, UnitIter> units_;
2201  };
2202 
2203 public:
      // Iterator traits. reference is the value type itself (results are returned by value),
      // and pointer is the Proxy returned by operator->().
2204  using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2205  using reference = value_type;
2206  using pointer = Proxy;
2207  using difference_type = prv::iter_difference_t<UnitIter>;
2208  using iterator_category = std::input_iterator_tag;
2209 
      // Moves p into p_; units_ and ahead_ keep their default member initializers.
2210  U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2211 
2212  U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2213  U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
2214 
2215  U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2217 
      // Two iterators compare equal only if both the raw unit position and the ahead_ flag match.
2218  U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2219  return p_ == other.p_ && ahead_ == other.ahead_;
2220  // Strictly speaking, we should check if the logical position is the same.
2221  // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2222  }
2223  U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2224 
      // Comparison with a code unit sentinel (any type other than UnsafeUTFIterator or UnitIter).
      // Equal only when no decoded-but-unconsumed code point is pending (ahead_ is false)
      // and p_ itself compares equal to s.
2225  template<typename Sentinel> U_FORCE_INLINE friend
2226  std::enable_if_t<
2227  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2228  bool>
2229  operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2230  return !iter.ahead_ && iter.p_ == s;
2231  }
2232 
      // Before C++20 the compiler does not derive the reversed-operand and != forms
      // from operator==, so they are spelled out here.
2233 #if U_CPLUSPLUS_VERSION < 20
2234  template<typename Sentinel> U_FORCE_INLINE friend
2235  std::enable_if_t<
2236  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2237  bool>
2238  operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2239  return !iter.ahead_ && iter.p_ == s;
2240  }
2241 
2242  template<typename Sentinel> U_FORCE_INLINE friend
2243  std::enable_if_t<
2244  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2245  bool>
2246  operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2247 
2248  template<typename Sentinel> U_FORCE_INLINE friend
2249  std::enable_if_t<
2250  !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2251  bool>
2252  operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2253 #endif // C++17
2254 
      // Returns the code point at the current position by value.
      // The first call at a position reads via Impl::readAndInc() and caches the result;
      // repeated calls return the cached units_ without reading again.
2255  U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2256  if (!ahead_) {
2257  units_ = Impl::readAndInc(p_, p_);
2258  ahead_ = true;
2259  }
2260  return units_;
2261  }
2262 
      // Same lazy read as operator*(), but returns a Proxy holding a copy of the result.
2263  U_FORCE_INLINE Proxy operator->() const {
2264  if (!ahead_) {
2265  units_ = Impl::readAndInc(p_, p_);
2266  ahead_ = true;
2267  }
2268  return Proxy(units_);
2269  }
2270 
2271  U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment
2272  if (ahead_) {
2273  // operator*() called readAndInc() so p_ is already ahead.
2274  ahead_ = false;
2275  } else {
      // Nothing was read at this position: advance p_ via Impl::inc(); units_ is not updated.
2276  Impl::inc(p_);
2277  }
2278  return *this;
2279  }
2280 
      // Returns a Proxy (not an iterator copy) carrying the code point that is being stepped over.
2281  U_FORCE_INLINE Proxy operator++(int) { // post-increment
2282  if (ahead_) {
2283  // operator*() called readAndInc() so p_ is already ahead.
2284  ahead_ = false;
2285  } else {
      // No cached read yet: read now so that the returned Proxy has a value to hand out.
2286  units_ = Impl::readAndInc(p_, p_);
2287  // keep this->ahead_ == false
2288  }
2289  return Proxy(units_);
2290  }
2291 
2292 private:
2293  // operator*() etc. are logically const.
2294  mutable UnitIter p_;
2295  // Keep state so that we call readAndInc() only once for both operator*() and ++
2296  // so that we can use a single-pass input iterator for UnitIter.
2297  mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2298  // true: units_ = readAndInc(), p_ = units limit
2299  // which means that p_ is ahead of its logical position
2300  // false: initial state
2301  mutable bool ahead_ = false;
2302 };
2303 #endif // U_IN_DOXYGEN
2304 
2305 } // namespace U_HEADER_ONLY_NAMESPACE
2306 
2307 #ifndef U_IN_DOXYGEN
2308 // Bespoke specialization of reverse_iterator.
2309 // The default implementation implements reverse operator*() and ++ in a way
2310 // that does most of the same work twice for reading variable-length sequences.
// The state_ member records where p_ sits relative to the logical position
// (see the member comments at the bottom of the class).
// NOTE(review): this listing skips numbered lines 2315, 2332 and 2335, so the UnsafeCodeUnits_
// alias and the head of the converting constructor are not visible here (presumably also a
// difference_type alias) — confirm against the full header.
2311 template<typename CP32, typename UnitIter>
2312 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2313  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2314  using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2316 
2317  // Proxy type for operator->() (required by LegacyInputIterator)
2318  // so that we don't promise always returning UnsafeCodeUnits.
2319  class Proxy {
2320  public:
2321  explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2322  UnsafeCodeUnits_ &operator*() { return units_; }
2323  UnsafeCodeUnits_ *operator->() { return &units_; }
2324  private:
2325  UnsafeCodeUnits_ units_;
2326  };
2327 
2328 public:
2329  using value_type = UnsafeCodeUnits_;
2330  using reference = value_type;
2331  using pointer = Proxy;
2333  using iterator_category = std::bidirectional_iterator_tag;
2334 
      // Member initializers of a constructor taking `iter` (its head is not shown in this listing):
      // starts at iter's logical position with an empty cached units_.
2336  p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
2337  U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2338 
2339  U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2340  U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2341 
2342  U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2343  U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2344 
      // Equality is defined on the logical position, so it does not depend on state_ alone.
2345  U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2346  return getLogicalPosition() == other.getLogicalPosition();
2347  }
2348  U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2349 
      // Returns the code point before the logical position, by value.
      // Only in the initial state (state_ == 0) is Impl::decAndRead() called; otherwise
      // the already cached units_ is returned.
2350  U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2351  if (state_ == 0) {
2352  units_ = Impl::decAndRead(p_);
2353  state_ = -1;
2354  }
2355  return units_;
2356  }
2357 
      // Same lazy read as operator*(), wrapped in a Proxy that holds a copy.
2358  U_FORCE_INLINE Proxy operator->() const {
2359  if (state_ == 0) {
2360  units_ = Impl::decAndRead(p_);
2361  state_ = -1;
2362  }
2363  return Proxy(units_);
2364  }
2365 
2366  U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment
2367  if (state_ < 0) {
2368  // operator*() called decAndRead() so p_ is already behind.
2369  state_ = 0;
2370  } else if (state_ == 0) {
2371  Impl::dec(p_);
2372  } else /* state_ > 0 */ {
2373  // operator--() called readAndInc() so we know how far to skip.
2374  p_ = units_.begin();
2375  state_ = 0;
2376  }
2377  return *this;
2378  }
2379 
      // Returns a copy of the iterator as it was before the increment. In the state_ == 0 case
      // the copy is given state_ = -1 because its units_/p_ already reflect a decAndRead().
2380  U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment
2381  if (state_ < 0) {
2382  // operator*() called decAndRead() so p_ is already behind.
2383  reverse_iterator result(*this);
2384  state_ = 0;
2385  return result;
2386  } else if (state_ == 0) {
2387  units_ = Impl::decAndRead(p_);
2388  reverse_iterator result(*this);
2389  result.state_ = -1;
2390  // keep this->state_ == 0
2391  return result;
2392  } else /* state_ > 0 */ {
2393  reverse_iterator result(*this);
2394  // operator--() called readAndInc() so we know how far to skip.
2395  p_ = units_.begin();
2396  state_ = 0;
2397  return result;
2398  }
2399  }
2400 
      // Decrementing a reverse iterator moves forward in the underlying units:
      // p_ is first restored to the logical position if needed, then one code point is read forward.
      // The result is kept in units_ (state_ = 1) so a following ++ can jump back via units_.begin().
2401  U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement
2402  if (state_ < 0) {
2403  // operator*() called decAndRead() so p_ is behind the logical position.
2404  p_ = units_.end();
2405  }
2406  UnitIter p0 = p_;
2407  units_ = Impl::readAndInc(p0, p_);
2408  state_ = 1;
2409  return *this;
2410  }
2411 
2412  U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement
2413  reverse_iterator result(*this);
2414  operator--();
2415  return result;
2416  }
2417 
2418 private:
      // When state_ < 0, p_ was moved back by decAndRead(), so the logical position
      // is the end of the cached units; otherwise it is p_ itself.
2419  U_FORCE_INLINE UnitIter getLogicalPosition() const {
2420  return state_ >= 0 ? p_ : units_.end();
2421  }
2422 
2423  // operator*() etc. are logically const.
2424  mutable UnitIter p_;
2425  // Keep state so that we call decAndRead() only once for both operator*() and ++
2426  // to make it easy for the compiler to optimize.
2427  mutable UnsafeCodeUnits_ units_;
2428  // >0: units_ = readAndInc(), p_ = units limit
2429  // 0: initial state
2430  // <0: units_ = decAndRead(), p_ = units start
2431  // which means that p_ is behind its logical position
2432  mutable int8_t state_ = 0;
2433 };
2434 #endif // U_IN_DOXYGEN
2435 
2436 namespace U_HEADER_ONLY_NAMESPACE {
2437 
// Factory function: moves iter into the explicit one-argument constructor of
// UnsafeUTFIterator<CP32, UnitIter> and returns the result by value.
// CP32 must be given explicitly; UnitIter is deduced from the argument.
2453 template<typename CP32, typename UnitIter>
2454 auto unsafeUTFIterator(UnitIter iter) {
2455  return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2456 }
2457 
// Range wrapper: stores a code unit range in the member unitRange and offers
// begin()/end()/rbegin()/rend() built on unsafeUTFIterator<CP32>().
// NOTE(review): this listing skips some numbered lines (e.g. 2470, which presumably holds the
// class head, and the lines between 2472 and 2484 and around 2499/2502, presumably special
// member functions) — confirm against the full header.
2469 template<typename CP32, typename Range>
2471  static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2472 public:
2478 
      // Enabled only when Range is not a reference type: takes the range by value
      // and moves it into the member.
2484  template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2485  explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
      // Enabled only when Range is a reference type: binds the member to the argument.
      // The extra `typename = void` parameter gives this template a signature distinct
      // from the constructor template above.
2494  template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2495  explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2496 
2499 
2502 
      // Iterator at the first code point; only unitRange.begin() is passed (no limit).
2507  auto begin() {
2508  return unsafeUTFIterator<CP32>(unitRange.begin());
2509  }
2510 
      // Const overload; only enabled when `const R` satisfies prv::range.
2515  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2516  auto begin() const {
2517  return unsafeUTFIterator<CP32>(unitRange.begin());
2518  }
2519 
      // End of the code point range. If begin() and end() of the unit range have different
      // types, the unit range's own sentinel is returned as is; otherwise unitRange.end()
      // is wrapped in an UnsafeUTFIterator.
2524  auto end() {
2525  using UnitIter = decltype(unitRange.begin());
2526  using LimitIter = decltype(unitRange.end());
2527  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2528  // Return the code unit sentinel.
2529  return unitRange.end();
2530  } else {
2531  return unsafeUTFIterator<CP32>(unitRange.end());
2532  }
2533  }
2534 
      // Const overload of end() with the same selection;
      // only enabled when `const R` satisfies prv::range.
2539  template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2540  auto end() const {
2541  using UnitIter = decltype(unitRange.begin());
2542  using LimitIter = decltype(unitRange.end());
2543  if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2544  // Return the code unit sentinel.
2545  return unitRange.end();
2546  } else {
2547  return unsafeUTFIterator<CP32>(unitRange.end());
2548  }
2549  }
2550 
      // Reverse iteration starts at the const end(), wrapped by std::make_reverse_iterator.
2555  auto rbegin() const {
2556  return std::make_reverse_iterator(end());
2557  }
2558 
      // Reverse iteration ends at the const begin(), wrapped by std::make_reverse_iterator.
2563  auto rend() const {
2564  return std::make_reverse_iterator(begin());
2565  }
2566 
2567 private:
      // The wrapped code unit range; a value or a reference depending on the Range parameter.
2568  Range unitRange;
2569 };
2570 
// Function object whose operator() wraps a code unit range in an UnsafeUTFStringCodePoints.
// NOTE(review): listing lines 2573 (presumably the struct head) and 2579 are not shown here —
// confirm against the full header.
2572 template<typename CP32>
     // The base class is added only for C++23 with the library feature-test macros below.
2574 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
2575  __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2576  : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2577 #endif
2578 {
      // Forwards unitRange into the wrapper. With a sufficiently new <ranges> the wrapper is
      // instantiated with std::ranges::views::all_t<Range>; otherwise with the deduced Range itself.
2580  template<typename Range>
2581  auto operator()(Range &&unitRange) const {
2582 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2.
2583  return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2584 #else
2585  return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2586 #endif
2587  }
2588 };
2589 
2590 
2603 template<typename CP32>
2605 
2606 } // namespace U_HEADER_ONLY_NAMESPACE
2607 
2608 #endif // U_HIDE_DRAFT_API
2609 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2610 #endif // __UTFITERATOR_H__
A C++ "range" over all Unicode code points U+0000..U+10FFFF.
Definition: utfiterator.h:292
A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
Definition: utfiterator.h:323
Result of validating and decoding a code unit sequence for one code point.
Definition: utfiterator.h:477
CodeUnits & operator=(const CodeUnits &other)=default
Copy assignment operator.
CodeUnits(const CodeUnits &other)=default
Copy constructor.
CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit)
Definition: utfiterator.h:480
Validating iterator over the code points in a Unicode string.
Definition: utfiterator.h:1084
U_FORCE_INLINE UTFIterator()
Default constructor.
Definition: utfiterator.h:1159
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UTFIterator &iter)
Definition: utfiterator.h:1217
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator > operator--(int)
Post-decrement operator.
Definition: utfiterator.h:1356
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:1267
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:1103
U_FORCE_INLINE UTFIterator & operator=(UTFIterator &&src) noexcept=default
Move assignment operator.
CodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
Definition: utfiterator.h:1101
U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:1230
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:1112
U_FORCE_INLINE UTFIterator(UnitIter p)
Constructs an iterator start or limit sentinel.
Definition: utfiterator.h:1153
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UTFIterator &iter)
Definition: utfiterator.h:1241
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:1198
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UTFIterator & > operator--()
Pre-decrement operator.
Definition: utfiterator.h:1336
U_FORCE_INLINE bool operator!=(const UTFIterator &other) const
Definition: utfiterator.h:1184
Proxy pointer
C++ iterator boilerplate.
Definition: utfiterator.h:1105
U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit)
Constructor with start <= p < limit.
Definition: utfiterator.h:1127
U_FORCE_INLINE UTFIterator(const UTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE CodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:1250
U_FORCE_INLINE UTFIterator operator++(int)
Post-increment operator.
Definition: utfiterator.h:1304
U_FORCE_INLINE UTFIterator & operator++()
Pre-increment operator.
Definition: utfiterator.h:1282
U_FORCE_INLINE UTFIterator & operator=(const UTFIterator &other)=default
Copy assignment operator.
U_FORCE_INLINE bool operator==(const UTFIterator &other) const
Definition: utfiterator.h:1176
U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit)
Constructor with start == p < limit.
Definition: utfiterator.h:1140
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:1107
A C++ "range" for validating iteration over all of the code points of a code unit range.
Definition: utfiterator.h:1752
UTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UTFStringCodePoints & operator=(const UTFStringCodePoints &other)=default
Copy assignment operator.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
Definition: utfiterator.h:1767
UTFStringCodePoints(const UTFStringCodePoints &other)=default
Copy constructor.
UTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
Definition: utfiterator.h:1777
Result of decoding a code unit sequence for one code point.
Definition: utfiterator.h:357
std::enable_if_t< std::is_pointer_v< Iter >||std::is_same_v< Iter, typename std::basic_string< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string< Unit >::const_iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::iterator >||std::is_same_v< Iter, typename std::basic_string_view< Unit >::const_iterator >, std::basic_string_view< Unit > > stringView() const
Definition: utfiterator.h:422
UnsafeCodeUnits & operator=(const UnsafeCodeUnits &other)=default
Copy assignment operator.
UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit)
Definition: utfiterator.h:362
UnsafeCodeUnits(const UnsafeCodeUnits &other)=default
Copy constructor.
Non-validating iterator over the code points in a Unicode string.
Definition: utfiterator.h:1920
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator & > operator--()
Pre-decrement operator.
Definition: utfiterator.h:2136
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const Sentinel &s, const UnsafeUTFIterator &iter)
Definition: utfiterator.h:2017
U_FORCE_INLINE UnsafeUTFIterator()
Default constructor.
Definition: utfiterator.h:1965
UnsafeCodeUnits< CP32, UnitIter > value_type
C++ iterator boilerplate.
Definition: utfiterator.h:1937
U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const
Definition: utfiterator.h:1990
U_FORCE_INLINE UnsafeUTFIterator & operator=(const UnsafeUTFIterator &other)=default
Copy assignment operator.
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:1939
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const UnsafeUTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:2030
U_FORCE_INLINE UnsafeCodeUnits< CP32, UnitIter > operator*() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:2050
std::conditional_t< prv::bidirectional_iterator< UnitIter >, std::bidirectional_iterator_tag, std::forward_iterator_tag > iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:1948
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator==(const UnsafeUTFIterator &iter, const Sentinel &s)
Definition: utfiterator.h:2002
Proxy pointer
C++ iterator boilerplate.
Definition: utfiterator.h:1941
prv::iter_difference_t< UnitIter > difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:1943
U_FORCE_INLINE std::enable_if_t< prv::bidirectional_iterator< Iter >, UnsafeUTFIterator > operator--(int)
Post-decrement operator.
Definition: utfiterator.h:2156
U_FORCE_INLINE Proxy operator->() const
Decodes the code unit sequence at the current position.
Definition: utfiterator.h:2067
U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept=default
Move constructor.
U_FORCE_INLINE UnsafeUTFIterator operator++(int)
Post-increment operator.
Definition: utfiterator.h:2104
U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other)=default
Copy constructor.
U_FORCE_INLINE UnsafeUTFIterator & operator++()
Pre-increment operator.
Definition: utfiterator.h:2082
U_FORCE_INLINE UnsafeUTFIterator(UnitIter p)
Constructor; the iterator/pointer should be at a code point boundary.
Definition: utfiterator.h:1959
U_FORCE_INLINE friend std::enable_if_t< !std::is_same_v< Sentinel, UnsafeUTFIterator > &&!std::is_same_v< Sentinel, UnitIter >, bool > operator!=(const Sentinel &s, const UnsafeUTFIterator &iter)
Definition: utfiterator.h:2041
U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const
Definition: utfiterator.h:1982
U_FORCE_INLINE UnsafeUTFIterator & operator=(UnsafeUTFIterator &&src) noexcept=default
Move assignment operator.
A C++ "range" for non-validating iteration over all of the code points of a code unit range.
Definition: utfiterator.h:2470
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string, keeping a reference to the code unit range.
Definition: utfiterator.h:2495
UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other)=default
Copy constructor.
UnsafeUTFStringCodePoints()=default
Constructs an empty C++ "range" object.
UnsafeUTFStringCodePoints(Range unitRange)
Constructs a C++ "range" object over the code points in the string.
Definition: utfiterator.h:2485
UnsafeUTFStringCodePoints & operator=(const UnsafeUTFStringCodePoints &other)=default
Copy assignment operator.
int32_t difference_type
C++ iterator boilerplate.
Definition: utfiterator.h:248
bool operator==(const CodePointsIterator &other) const
Definition: utfiterator.h:255
bool operator!=(const CodePointsIterator &other) const
Definition: utfiterator.h:257
value_type reference
C++ iterator boilerplate.
Definition: utfiterator.h:244
CP32 value_type
C++ iterator boilerplate.
Definition: utfiterator.h:242
std::forward_iterator_tag iterator_category
C++ iterator boilerplate.
Definition: utfiterator.h:250
CP32 * pointer
C++ iterator boilerplate.
Definition: utfiterator.h:246
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:346
#define U_CPLUSPLUS_VERSION
0 if no C++; 1, 11, 14, ...
Definition: platform.h:464
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32).
Definition: umachine.h:469
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
Definition: umachine.h:135
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
Definition: utf16.h:93
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
Definition: utf16.h:84
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
Definition: utf16.h:112
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
Definition: utf16.h:75
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
Definition: utf16.h:59
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
Definition: utf16.h:67
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
Definition: utf8.h:71
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
Definition: utf8.h:98
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
Definition: utf8.h:115
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
Definition: utf8.h:173
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
Definition: utf8.h:91
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
Definition: utf8.h:108
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
Definition: utf8.h:181
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
Definition: utf8.h:81
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
Definition: utf8.h:190
auto unsafeUTFIterator(UnitIter iter)
UnsafeUTFIterator factory function.
Definition: utfiterator.h:2454
typename std::iterator_traits< Iter >::difference_type iter_difference_t
Definition: utfiterator.h:203
constexpr bool forward_iterator
Definition: utfiterator.h:207
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit)
UTFIterator factory function for start <= p < limit.
Definition: utfiterator.h:1678
constexpr UTFStringCodePointsAdaptor< CP32, behavior > utfStringCodePoints
Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code points.
Definition: utfiterator.h:1894
typename std::iterator_traits< Iter >::value_type iter_value_t
Definition: utfiterator.h:199
constexpr bool bidirectional_iterator
Definition: utfiterator.h:214
constexpr UnsafeUTFStringCodePointsAdaptor< CP32 > unsafeUTFStringCodePoints
Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a "range" of code points.
Definition: utfiterator.h:2604
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
Definition: utfiterator.h:149
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
Definition: utfiterator.h:159
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point,...
Definition: utfiterator.h:167
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Definition: utfiterator.h:157
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.