diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 1faff8765d33..72f02de9cc3b 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -1258,6 +1258,9 @@ strings + + strings + strings diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h new file mode 100644 index 000000000000..78252ddebc14 --- /dev/null +++ b/icu4c/source/common/unicode/utfiter.h @@ -0,0 +1,672 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: https://www.unicode.org/copyright.html + +// utf16cppiter.h +// created: 2024aug12 Markus W. Scherer + +#ifndef __UTF16CPPITER_H__ +#define __UTF16CPPITER_H__ + +// TODO: For experimentation outside of ICU, comment out this include. +// Experimentally conditional code below checks for UTYPES_H and +// otherwise uses copies of bits of ICU. +#include "unicode/utypes.h" + +#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H) + +#include +#ifdef UTYPES_H +#include "unicode/utf16.h" +#include "unicode/uversion.h" +#else +// TODO: Remove checks for UTYPES_H and replacement definitions. +// unicode/utypes.h etc. +#include +typedef int32_t UChar32; +constexpr UChar32 U_SENTINEL = -1; +// unicode/uversion.h +#define U_HEADER_ONLY_NAMESPACE header +namespace header {} +// unicode/utf.h +#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) +// unicode/utf16.h +#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) +#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) +#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) +#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) +#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) +#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) +#define U16_GET_SUPPLEMENTARY(lead, trail) \ + (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) +#endif + +/** + * \file + * \brief C++ header-only API: C++ iterators over Unicode 16-bit strings (=UTF-16 if well-formed). + */ + +#ifndef U_HIDE_DRAFT_API + +// Some defined behaviors for handling ill-formed Unicode strings. +// TODO: For 8-bit strings, the SURROGATE option does not have an equivalent -- static_assert. +typedef enum UIllFormedBehavior { + U_BEHAVIOR_NEGATIVE, + U_BEHAVIOR_FFFD, + U_BEHAVIOR_SURROGATE +} UIllFormedBehavior; + +namespace U_HEADER_ONLY_NAMESPACE { + +/** + * Result of validating and decoding a minimal Unicode code unit sequence. + * Returned from validating Unicode string code point iterators. + * + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-8: char or char8_t or uint8_t; + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class CodeUnits { + using Unit = typename std::iterator_traits::value_type; +public: + // @internal + CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter data) : + c(codePoint), len(length), ok(wellFormed), p(data) {} + + CodeUnits(const CodeUnits &other) = default; + CodeUnits &operator=(const CodeUnits &other) = default; + + UChar32 codePoint() const { return c; } + + bool wellFormed() const { return ok; } + + UnitIter data() const { return p; } + + int32_t length() const { return len; } + + template + std::enable_if_t< + std::is_pointer_v, + std::basic_string_view> + stringView() const { + return std::basic_string_view(p, len); + } + +private: + // Order of fields with padding and access frequency in mind. + CP32 c; + uint8_t len; + bool ok; + UnitIter p; +}; + +/** + * Result of decoding a minimal Unicode code unit sequence which must be well-formed. + * Returned from non-validating Unicode string code point iterators. + * + * @tparam Unit Code unit type: + * UTF-8: char or char8_t or uint8_t; + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class UnsafeCodeUnits { +public: + // @internal + UnsafeCodeUnits(CP32 codePoint, uint8_t length, const Unit *data) : + c(codePoint), len(length), p(data) {} + + UnsafeCodeUnits(const UnsafeCodeUnits &other) = default; + UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default; + + UChar32 codePoint() const { return c; } + + const Unit *data() const { return p; } + + int32_t length() const { return len; } + + std::basic_string_view stringView() const { + return std::basic_string_view(p, len); + } + + // TODO: std::optional maybeCodePoint() const ? (nullopt if ill-formed) + +private: + // Order of fields with padding and access frequency in mind. + CP32 c; + uint8_t len; + const Unit *p; +}; + +/** + * Internal base class for public U16Iterator & U16ReverseIterator. + * Not intended for public subclassing. + * + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO + * @internal + */ +template +class U16IteratorBase { +protected: + // @internal + U16IteratorBase(UnitIter start, UnitIter p, UnitIter limit) : + start(start), current(p), limit(limit) {} + // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. + // Test pointers for == or != but not < or >. + + // @internal + U16IteratorBase(const U16IteratorBase &other) = default; + // @internal + U16IteratorBase &operator=(const U16IteratorBase &other) = default; + + // @internal + bool operator==(const U16IteratorBase &other) const { return current == other.current; } + // @internal + bool operator!=(const U16IteratorBase &other) const { return !operator==(other); } + + // @internal + void dec() { + // TODO: assert current != limit -- more precisely: start <= current < limit + // Very similar to U16_BACK_1(). + if (U16_IS_TRAIL(*(--current)) && current != start && U16_IS_LEAD(*(current - 1))) { + --current; + } + } + + // @internal + CodeUnits readAndInc(UnitIter &p) const { + // TODO: assert p != limit -- more precisely: start <= p < limit + // Very similar to U16_NEXT_OR_FFFD(). + UnitIter p0 = p; + CP32 c = *p++; + if (!U16_IS_SURROGATE(c)) { + return {c, 1, true, p0}; + } else { + uint16_t c2; + if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) { + ++p; + c = U16_GET_SUPPLEMENTARY(c, c2); + return {c, 2, true, p0}; + } else { + return {sub(c), 1, false, p0}; + } + } + } + + // @internal + CodeUnits decAndRead(UnitIter &p) const { + // TODO: assert p != limit -- more precisely: start <= p < limit + // Very similar to U16_PREV_OR_FFFD(). + CP32 c = *--p; + if (!U16_IS_SURROGATE(c)) { + return {c, 1, true, p}; + } else { + UnitIter p1; + uint16_t c2; + if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p--, U16_IS_LEAD(c2 = *p1))) { + p = p1; + c = U16_GET_SUPPLEMENTARY(c2, c); + return {c, 2, true, p}; + } else { + return {sub(c), 1, false, p}; + } + } + } + + // Handle ill-formed UTF-16: One unpaired surrogate. + // @internal + CP32 sub(CP32 surrogate) const { + switch (behavior) { + case U_BEHAVIOR_NEGATIVE: return U_SENTINEL; + case U_BEHAVIOR_FFFD: return 0xfffd; + case U_BEHAVIOR_SURROGATE: return surrogate; + } + } + + // In a validating iterator, we need start & limit so that when we read a code point + // (forward or backward) we can test if there are enough code units. + // @internal + const UnitIter start; + // @internal + UnitIter current; + // @internal + const UnitIter limit; +}; + +/** + * Validating bidirectional iterator over the code points in a Unicode 16-bit string. + * + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO + * @draft ICU 77 + */ +template +class U16Iterator : private U16IteratorBase { + // FYI: We need to qualify all accesses to super class members because of private inheritance. + using Super = U16IteratorBase; +public: + // TODO: make private, make friends + U16Iterator(UnitIter start, UnitIter p, UnitIter limit) : + Super(start, p, limit) {} + + U16Iterator(const U16Iterator &other) = default; + U16Iterator &operator=(const U16Iterator &other) = default; + + bool operator==(const U16Iterator &other) const { return Super::operator==(other); } + bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); } + + CodeUnits operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + UnitIter p = Super::current; + return Super::readAndInc(p); + } + + U16Iterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::readAndInc(Super::current); + return *this; + } + + U16Iterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16Iterator result(*this); + Super::readAndInc(Super::current); + return result; + } + + U16Iterator &operator--() { // pre-decrement + return Super::dec(); + } + + U16Iterator operator--(int) { // post-decrement + U16Iterator result(*this); + Super::dec(); + return result; + } +}; + +/** + * Validating reverse iterator over the code points in a Unicode 16-bit string. + * Not bidirectional, but optimized for reverse iteration. + * + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO + * @draft ICU 77 + */ +template +class U16ReverseIterator : private U16IteratorBase { + using Super = U16IteratorBase; +public: + // TODO: make private, make friends + U16ReverseIterator(UnitIter start, UnitIter p, UnitIter limit) : + Super(start, p, limit) {} + + U16ReverseIterator(const U16ReverseIterator &other) = default; + U16ReverseIterator &operator=(const U16ReverseIterator &other) = default; + + bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); } + bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); } + + CodeUnits operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + UnitIter p = Super::current; + return Super::decAndRead(p); + } + + U16ReverseIterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::decAndRead(Super::current); + return *this; + } + + U16ReverseIterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16ReverseIterator result(*this); + Super::decAndRead(Super::current); + return result; + } +}; + +/** + * A C++ "range" for validating iteration over all of the code points of a 16-bit Unicode string. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO + * @draft ICU 77 + */ +template +class U16StringCodePoints { +public: + /** + * Constructs a C++ "range" object over the code points in the string. + * @draft ICU 77 + */ + U16StringCodePoints(std::basic_string_view s) : s(s) {} + + /** @draft ICU 77 */ + U16StringCodePoints(const U16StringCodePoints &other) = default; + + /** @draft ICU 77 */ + U16StringCodePoints &operator=(const U16StringCodePoints &other) = default; + + /** @draft ICU 77 */ + U16Iterator begin() const { + return {s.data(), s.data(), s.data() + s.length()}; + } + + /** @draft ICU 77 */ + U16Iterator end() const { + const Unit16 *limit = s.data() + s.length(); + return {s.data(), limit, limit}; + } + + /** @draft ICU 77 */ + U16ReverseIterator rbegin() const { + const Unit16 *limit = s.data() + s.length(); + return {s.data(), limit, limit}; + } + + /** @draft ICU 77 */ + U16ReverseIterator rend() const { + return {s.data(), s.data(), s.data() + s.length()}; + } + +private: + std::basic_string_view s; +}; + +// ------------------------------------------------------------------------- *** + +/** + * Internal base class for public U16UnsafeIterator & U16UnsafeReverseIterator. + * Not intended for public subclassing. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @internal + */ +template +class U16UnsafeIteratorBase { +protected: + // @internal + U16UnsafeIteratorBase(const Unit16 *p) : current(p) {} + // Test pointers for == or != but not < or >. + + // @internal + U16UnsafeIteratorBase(const U16UnsafeIteratorBase &other) = default; + // @internal + U16UnsafeIteratorBase &operator=(const U16UnsafeIteratorBase &other) = default; + + // @internal + bool operator==(const U16UnsafeIteratorBase &other) const { return current == other.current; } + // @internal + bool operator!=(const U16UnsafeIteratorBase &other) const { return !operator==(other); } + + // @internal + void dec() { + // Very similar to U16_BACK_1_UNSAFE(). + if (U16_IS_TRAIL(*(--current))) { + --current; + } + } + + // @internal + UnsafeCodeUnits readAndInc(const Unit16 *&p) const { + // Very similar to U16_NEXT_UNSAFE(). + const Unit16 *p0 = p; + CP32 c = *p++; + if (!U16_IS_LEAD(c)) { + return {c, 1, p0}; + } else { + c = U16_GET_SUPPLEMENTARY(c, *p++); + return {c, 2, p0}; + } + } + + // @internal + UnsafeCodeUnits decAndRead(const Unit16 *&p) const { + // Very similar to U16_PREV_UNSAFE(). + CP32 c = *--p; + if (!U16_IS_TRAIL(c)) { + return {c, 1, p}; + } else { + c = U16_GET_SUPPLEMENTARY(*--p, c); + return {c, 2, p}; + } + } + + // @internal + const Unit16 *current; +}; + +/** + * Non-validating bidirectional iterator over the code points in a UTF-16 string. + * The string must be well-formed. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class U16UnsafeIterator : private U16UnsafeIteratorBase { + // FYI: We need to qualify all accesses to super class members because of private inheritance. + using Super = U16UnsafeIteratorBase; +public: + // TODO: make private, make friends + U16UnsafeIterator(const Unit16 *p) : Super(p) {} + + U16UnsafeIterator(const U16UnsafeIterator &other) = default; + U16UnsafeIterator &operator=(const U16UnsafeIterator &other) = default; + + bool operator==(const U16UnsafeIterator &other) const { return Super::operator==(other); } + bool operator!=(const U16UnsafeIterator &other) const { return !Super::operator==(other); } + + UnsafeCodeUnits operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = Super::current; + return Super::readAndInc(p); + } + + U16UnsafeIterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::readAndInc(Super::current); + return *this; + } + + U16UnsafeIterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16UnsafeIterator result(*this); + Super::readAndInc(Super::current); + return result; + } + + U16UnsafeIterator &operator--() { // pre-decrement + return Super::dec(); + } + + U16UnsafeIterator operator--(int) { // post-decrement + U16UnsafeIterator result(*this); + Super::dec(); + return result; + } +}; + +/** + * Non-validating reverse iterator over the code points in a UTF-16 string. + * Not bidirectional, but optimized for reverse iteration. + * The string must be well-formed. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class U16UnsafeReverseIterator : private U16UnsafeIteratorBase { + using Super = U16UnsafeIteratorBase; +public: + // TODO: make private, make friends + U16UnsafeReverseIterator(const Unit16 *p) : Super(p) {} + + U16UnsafeReverseIterator(const U16UnsafeReverseIterator &other) = default; + U16UnsafeReverseIterator &operator=(const U16UnsafeReverseIterator &other) = default; + + bool operator==(const U16UnsafeReverseIterator &other) const { return Super::operator==(other); } + bool operator!=(const U16UnsafeReverseIterator &other) const { return !Super::operator==(other); } + + UnsafeCodeUnits operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = Super::current; + return Super::decAndRead(p); + } + + U16UnsafeReverseIterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::decAndRead(Super::current); + return *this; + } + + U16UnsafeReverseIterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16UnsafeReverseIterator result(*this); + Super::decAndRead(Super::current); + return result; + } +}; + +/** + * A C++ "range" for non-validating iteration over all of the code points of a UTF-16 string. + * The string must be well-formed. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class U16UnsafeStringCodePoints { +public: + /** + * Constructs a C++ "range" object over the code points in the string. + * @draft ICU 77 + */ + U16UnsafeStringCodePoints(std::basic_string_view s) : s(s) {} + + /** @draft ICU 77 */ + U16UnsafeStringCodePoints(const U16UnsafeStringCodePoints &other) = default; + U16UnsafeStringCodePoints &operator=(const U16UnsafeStringCodePoints &other) = default; + + /** @draft ICU 77 */ + U16UnsafeIterator begin() const { + return {s.data()}; + } + + /** @draft ICU 77 */ + U16UnsafeIterator end() const { + return {s.data() + s.length()}; + } + + /** @draft ICU 77 */ + U16UnsafeReverseIterator rbegin() const { + return {s.data() + s.length()}; + } + + /** @draft ICU 77 */ + U16UnsafeReverseIterator rend() const { + return {s.data()}; + } + +private: + std::basic_string_view s; +}; + +// ------------------------------------------------------------------------- *** + +// TODO: UTF-8 + +// TODO: remove experimental sample code +#ifndef UTYPES_H +int32_t rangeLoop(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + for (auto units : range) { + sum += units.codePoint(); + } + return sum; +} + +int32_t loopIterPlusPlus(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + auto iter = range.begin(); + auto limit = range.end(); + while (iter != limit) { + sum += (*iter++).codePoint(); + } + return sum; +} + +int32_t reverseLoop(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { + sum += (*iter).codePoint(); + } + return sum; +} + +int32_t unsafeRangeLoop(std::u16string_view s) { + header::U16UnsafeStringCodePoints range(s); + int32_t sum = 0; + for (auto units : range) { + sum += units.codePoint(); + } + return sum; +} + +int32_t unsafeReverseLoop(std::u16string_view s) { + header::U16UnsafeStringCodePoints range(s); + int32_t sum = 0; + for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { + sum += (*iter).codePoint(); + } + return sum; +} +#endif + +} // namespace U_HEADER_ONLY_NAMESPACE + +#endif // U_HIDE_DRAFT_API +#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API +#endif // __UTF16CPPITER_H__ diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 81ad55578072..8a12daa2f5de 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -75,7 +75,7 @@ numbertest_parse.o numbertest_doubleconversion.o numbertest_skeletons.o \ static_unisets_test.o numfmtdatadriventest.o numbertest_range.o erarulestest.o \ formattedvaluetest.o formatted_string_builder_test.o numbertest_permutation.o \ units_data_test.o units_router_test.o units_test.o displayoptions_test.o \ -numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o +numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o utfitertest.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index b58b29b3d4e7..476b4b3b5934 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -223,6 +223,7 @@ + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index d5c23d5e4cb5..7fc0c646647a 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -490,6 +490,9 @@ strings + + strings + strings diff --git a/icu4c/source/test/intltest/itutil.cpp b/icu4c/source/test/intltest/itutil.cpp index 4585792126d6..b9df5935414c 100644 --- a/icu4c/source/test/intltest/itutil.cpp +++ b/icu4c/source/test/intltest/itutil.cpp @@ -48,6 +48,7 @@ extern IntlTest *createPluralMapTest(); extern IntlTest *createStaticUnicodeSetsTest(); #endif static IntlTest *createUHashTest(); +extern IntlTest *createU16IteratorTest(); void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par ) { @@ -84,6 +85,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* & TESTCASE_AUTO_CREATE_CLASS(LocaleMatcherTest); TESTCASE_AUTO_CREATE_CLASS(UHashTest); TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest); + TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest); TESTCASE_AUTO_END; } diff --git a/icu4c/source/test/intltest/utfitertest.cpp b/icu4c/source/test/intltest/utfitertest.cpp new file mode 100644 index 000000000000..a8bda260bc4d --- /dev/null +++ b/icu4c/source/test/intltest/utfitertest.cpp @@ -0,0 +1,209 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: https://www.unicode.org/copyright.html + +// utfitertest.cpp +// created: 2024aug12 Markus W. Scherer + +#include + +// Test header-only ICU C++ APIs. Do not use other ICU C++ APIs. +// Non-default configuration: +#define U_SHOW_CPLUSPLUS_API 0 +// Default configuration: +// #define U_SHOW_CPLUSPLUS_HEADER_API 1 + +#include "unicode/utypes.h" +#include "unicode/utfiter.h" +#include "intltest.h" + +// Makes u"literal"sv std::u16string_view literals possible. +// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv +using namespace std::string_view_literals; + +using U_HEADER_ONLY_NAMESPACE::U16Iterator; +using U_HEADER_ONLY_NAMESPACE::U16StringCodePoints; + +template +class FwdIter { +public: + typedef Unit value_type; + + FwdIter(const Unit *data) : p(data) {} + + bool operator==(const FwdIter &other) const { return p == other.p; } + bool operator!=(const FwdIter &other) const { return !operator==(other); } + + Unit operator*() const { return *p; } + FwdIter &operator++() { // pre-increment + ++p; + return *this; + } + FwdIter operator++(int) { // post-increment + FwdIter result(*this); + ++p; + return result; + } + +private: + const Unit *p; +}; + +class U16IteratorTest : public IntlTest { +public: + U16IteratorTest() {} + + void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override; + + void testGood(); + void testNegative(); + void testFFFD(); + void testSurrogate(); + void testFwdIter(); +}; + +extern IntlTest *createU16IteratorTest() { + return new U16IteratorTest(); +} + +void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { + if(exec) { + logln("TestSuite U16IteratorTest: "); + } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(testGood); + TESTCASE_AUTO(testNegative); + TESTCASE_AUTO(testFFFD); + TESTCASE_AUTO(testSurrogate); + TESTCASE_AUTO(testFwdIter); + TESTCASE_AUTO_END; +} + +void U16IteratorTest::testGood() { + IcuTestErrorCode errorCode(*this, "testGood"); + std::u16string_view good(u"abçカ🚴"sv); + U16StringCodePoints range(good); + auto iter = range.begin(); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + assertTrue("iter[1] * stringView()", units.stringView() == u"b"sv); + ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] * codePoint", u'カ', (*iter).codePoint()); + ++iter; + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 2, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + assertTrue("iter[4] * stringView()", units.stringView() == u"🚴"sv); + assertTrue("iter == endIter", iter == range.end()); +} + +void U16IteratorTest::testNegative() { + IcuTestErrorCode errorCode(*this, "testNegative"); + static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; + std::u16string_view bad(badChars, 5); + U16StringCodePoints range(bad); + auto iter = range.begin(); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", -1, units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertFalse("iter[1] * wellFormed", units.wellFormed()); + auto sv = units.stringView(); + assertEquals("iter[1] * stringView().length()", 1, sv.length()); + assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + // TODO: test units.data() + ++iter; + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment + units = *iter++; // post-increment + assertEquals("iter[3] * codePoint", -1, units.codePoint()); + assertFalse("iter[3] * wellFormed", units.wellFormed()); + assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment + assertTrue("iter == endIter", iter == range.end()); +} + +void U16IteratorTest::testFFFD() { + IcuTestErrorCode errorCode(*this, "testFFFD"); + static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; + std::u16string_view bad(badChars, 5); + U16StringCodePoints range(bad); + auto iter = range.begin(); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", 0xfffd, units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertFalse("iter[1] * wellFormed", units.wellFormed()); + auto sv = units.stringView(); + assertEquals("iter[1] * stringView().length()", 1, sv.length()); + assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + ++iter; + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment + units = *iter++; // post-increment + assertEquals("iter[3] * codePoint", 0xfffd, units.codePoint()); + assertFalse("iter[3] * wellFormed", units.wellFormed()); + assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment + assertTrue("iter == endIter", iter == range.end()); +} + +void U16IteratorTest::testSurrogate() { + IcuTestErrorCode errorCode(*this, "testSurrogate"); + static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; + std::u16string_view bad(badChars, 5); + U16StringCodePoints range(bad); + auto iter = range.begin(); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", 0xd900, units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertFalse("iter[1] * wellFormed", units.wellFormed()); + auto sv = units.stringView(); + assertEquals("iter[1] * stringView().length()", 1, sv.length()); + assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + ++iter; + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment + units = *iter++; // post-increment + assertEquals("iter[3] * codePoint", 0xdc05, units.codePoint()); + assertFalse("iter[3] * wellFormed", units.wellFormed()); + assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment + assertTrue("iter == endIter", iter == range.end()); +} + +void U16IteratorTest::testFwdIter() { + IcuTestErrorCode errorCode(*this, "testFwdIter"); + std::u16string_view good(u"abçカ🚴"sv); + FwdIter goodBegin(good.data()); + FwdIter goodLimit(good.data() + good.length()); + U16Iterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin( + goodBegin, goodBegin, goodLimit); + U16Iterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit( + goodBegin, goodLimit, goodLimit); + // TODO: U16StringCodePoints range(good); + auto iter = rangeBegin; + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + // No units.stringView() when the unit iterator is not a pointer. + assertTrue("iter[1] * data()[0]", *units.data() == u'b'); + ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] * codePoint", u'カ', (*iter).codePoint()); + ++iter; + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 2, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + FwdIter data = units.data(); + assertTrue("iter[4] * data()[0]", *data++ == u"🚴"[0]); + assertTrue("iter[4] * data()[1]", *data == u"🚴"[1]); + assertTrue("iter == endIter", iter == rangeLimit); +}