From 74e9b6f7ca25bfad5bf645d1e8cb841bea2fedb5 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 12 Aug 2024 14:59:58 -0700 Subject: [PATCH 01/23] U16Iterator experiment --- icu4c/source/common/unicode/utf16cppiter.h | 144 ++++++++++++++++++ icu4c/source/test/intltest/Makefile.in | 2 +- icu4c/source/test/intltest/itutil.cpp | 2 + icu4c/source/test/intltest/utfcppitertest.cpp | 59 +++++++ 4 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 icu4c/source/common/unicode/utf16cppiter.h create mode 100644 icu4c/source/test/intltest/utfcppitertest.cpp diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h new file mode 100644 index 000000000000..5fb0b87dae06 --- /dev/null +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -0,0 +1,144 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: https://www.unicode.org/copyright.html + +// utf16cppiter.h +// created: 2024aug12 Markus W. Scherer + +#ifndef __UTF16CPPITER_H__ +#define __UTF16CPPITER_H__ + +#include + +#include "unicode/utypes.h" + +#if U_SHOW_CPLUSPLUS_API + +#include "unicode/utf16.h" +#include "unicode/uversion.h" + +/** + * \file + * \brief C++ API: C++ iterators over Unicode 16-bit strings (=UTF-16 if well-formed). + */ + +namespace U_HEADER_ONLY_NAMESPACE { + +// Some defined behaviors for handling ill-formed 16-bit strings. +// TODO: Maybe share with 8-bit strings, but the SURROGATE option does not have an equivalent there. +// +// TODO: A possible alternative to an enum might be some kind of function template +// which would be fully customizable. +// The operator*() return value might then want to be a template parameter as well. +// For example, for a well-formed sequence, the return value could be +// a tuple of (code point, well-formed), or a string view, or... +// (And then the caller could choose between UChar32 and char32_t.) +// However, all of that would make the API more complex and daunting. +enum U16IllFormedBehavior { + U16_BEHAVIOR_NEGATIVE, + U16_BEHAVIOR_FFFD, + U16_BEHAVIOR_SURROGATE +}; + +// Validating iterator over the code points in a Unicode 16-bit string. +// TODO: all @draft ICU 76 +template +class U16Iterator { +public: + // TODO: make private, make friends + U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : + start(start), p(p), limit(limit) { + if (p != limit) { + readOneForward(); + } + } + // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. + // Test pointers for == or != but not < or >. + + U16Iterator(const U16Iterator &other) = default; + U16Iterator(U16Iterator &&other) noexcept = default; + + bool operator==(const U16Iterator &other) const { return p == other.p; } + bool operator!=(const U16Iterator &other) const { return !operator==(other); } + + UChar32 operator*() const { + return c; + } + + // TODO: good function names? + // It would be nice to avoid a prefix like "current", "one", "cp", + // but just length() on the iterator could be confusing. + int32_t currentLength() const { return len; } + + std::basic_string_view currentView() const { + return std::basic_string_view(p, len); + } + + bool currentIsWellFormed() const { return ok; } + + U16Iterator &operator++() { // pre-increment + // TODO: think about switching directions etc. + // Assume that readOneForward() was called and set `len`. + // Skip the current code point, then read the next one. + p += len; + if (p != limit) { + readOneForward(); + } + return *this; + } + + U16Iterator operator++(int) { // post-increment + U16Iterator result(*this); + // TODO: think about switching directions etc. + // Assume that readOneForward() was called and set `len`. + // Skip the current code point, then read the next one. + p += len; + if (p != limit) { + readOneForward(); + } + return result; + } + +private: + void readOneForward() { + // see U16_NEXT_OR_FFFD() + c = *p; + len = 1; + ok = true; + if (U16_IS_SURROGATE(c)) { + uint16_t c2; + if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { + c = U16_GET_SUPPLEMENTARY(c, c2); + len = 2; + } else { + // TODO: U16IllFormedBehavior + c = 0xfffd; + ok = false; + } + } + } + + // In a validating iterator, we need start & limit so that when we read a code point + // (forward or backward) we can test if there are enough code units. + const Unit16 *start; + const Unit16 *p; + const Unit16 *limit; + UChar32 c = 0; + int8_t len = 0; + bool ok = false; +}; + +// ------------------------------------------------------------------------- *** + +// TODO: Non-validating iterator over the code points in a Unicode 16-bit string. +// Assumes well-formed UTF-16. Otherwise the behavior is undefined. +// TODO: all @draft ICU 76 +// template +// class U16UnsafeIterator +// TODO: only p, no start, no limit +// TODO: can/should we read the code point only in operator*()? +// if we read it in the constructor, then we would still need start/limit... + +} // namespace U_HEADER_ONLY_NAMESPACE + +#endif // U_SHOW_CPLUSPLUS_API +#endif // __UTF16CPPITER_H__ diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 81ad55578072..64f36bd061f8 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -75,7 +75,7 @@ numbertest_parse.o numbertest_doubleconversion.o numbertest_skeletons.o \ static_unisets_test.o numfmtdatadriventest.o numbertest_range.o erarulestest.o \ formattedvaluetest.o formatted_string_builder_test.o numbertest_permutation.o \ units_data_test.o units_router_test.o units_test.o displayoptions_test.o \ -numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o +numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o utfcppitertest.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/test/intltest/itutil.cpp b/icu4c/source/test/intltest/itutil.cpp index 4585792126d6..b9df5935414c 100644 --- a/icu4c/source/test/intltest/itutil.cpp +++ b/icu4c/source/test/intltest/itutil.cpp @@ -48,6 +48,7 @@ extern IntlTest *createPluralMapTest(); extern IntlTest *createStaticUnicodeSetsTest(); #endif static IntlTest *createUHashTest(); +extern IntlTest *createU16IteratorTest(); void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par ) { @@ -84,6 +85,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* & TESTCASE_AUTO_CREATE_CLASS(LocaleMatcherTest); TESTCASE_AUTO_CREATE_CLASS(UHashTest); TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest); + TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest); TESTCASE_AUTO_END; } diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp new file mode 100644 index 000000000000..c0a914b579c1 --- /dev/null +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -0,0 +1,59 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: https://www.unicode.org/copyright.html + +// utfcppitertest.cpp +// created: 2024aug12 Markus W. Scherer + +#include + +#include "unicode/utypes.h" +#include "unicode/utf16cppiter.h" +#include "intltest.h" + +// Makes u"literal"sv std::u16string_view literals possible. +// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv +using namespace std::string_view_literals; + +using U_HEADER_ONLY_NAMESPACE::U16Iterator; +using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE; + +class U16IteratorTest : public IntlTest { +public: + U16IteratorTest() {} + + void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override; + + void testExperiment(); +}; + +extern IntlTest *createU16IteratorTest() { + return new U16IteratorTest(); +} + +void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { + if(exec) { + logln("TestSuite U16IteratorTest: "); + } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(testExperiment); + TESTCASE_AUTO_END; +} + +void U16IteratorTest::testExperiment() { + IcuTestErrorCode errorCode(*this, "testExperiment"); + std::u16string_view good(u"abçカ🚴"sv); + const char16_t *goodLimit = good.data() + good.length(); + U16Iterator goodIter(good.data(), good.data(), goodLimit); + assertEquals("goodIter[0] *", u'a', *goodIter); + ++goodIter; // pre-increment + assertEquals("goodIter[1] *", u'b', *goodIter); + ++goodIter; + assertEquals("goodIter[2] *", u'ç', *goodIter++); // post-increment + assertEquals("goodIter[3] *", u'カ', *goodIter); + ++goodIter; + assertEquals("goodIter[4] *", U'🚴', *goodIter++); + U16Iterator goodEndIter(good.data(), goodLimit, goodLimit); + assertTrue("goodIter == goodEndIter", goodIter == goodEndIter); + + // TODO: test ill-formed, and much more... +} From 6568b04c7017473667dfb8d8b5de7bf07499d073 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 12 Aug 2024 19:17:23 -0700 Subject: [PATCH 02/23] U16Iterator op*() returns U16OneSeq --- icu4c/source/common/common.vcxproj.filters | 3 + icu4c/source/common/unicode/utf16cppiter.h | 136 +++++++++++------- icu4c/source/test/intltest/intltest.vcxproj | 1 + .../test/intltest/intltest.vcxproj.filters | 3 + icu4c/source/test/intltest/utfcppitertest.cpp | 16 ++- 5 files changed, 100 insertions(+), 59 deletions(-) diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 1faff8765d33..72f02de9cc3b 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -1258,6 +1258,9 @@ strings + + strings + strings diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 5fb0b87dae06..582ce1d1b6de 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -39,92 +39,122 @@ enum U16IllFormedBehavior { U16_BEHAVIOR_SURROGATE }; -// Validating iterator over the code points in a Unicode 16-bit string. -// TODO: all @draft ICU 76 +/** + * A code unit sequence for one code point returned by U16Iterator. + * + * TODO: check doxygen syntax for template parameters + * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t + * @draft ICU 76 + */ +template +class U16OneSeq { +public: + U16OneSeq(const U16OneSeq &other) = default; + + const Unit16 *data() { return p; } + int32_t length() const { return len; } + + std::basic_string_view stringView() const { + return std::basic_string_view(p, len); + } + + bool isWellFormed() const { return ok; } + + UChar32 codePoint() const { return c; } + + // TODO: std::optional maybeCodePoint() const ? (nullopt if !ok) + +private: + // TODO: Why can't we just use Unit16 here? + // error: declaration of 'Unit16' shadows template parameter + template + friend class U16Iterator; + + U16OneSeq(const Unit16 *p) : p(p) {} + + void fwd1() { p += len; } + + void readOneForward(const Unit16 *limit) { + if (p == limit) { + len = 0; + return; + } + // see U16_NEXT_OR_FFFD() + c = *p; + len = 1; + ok = true; + if (U16_IS_SURROGATE(c)) { + uint16_t c2; + if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { + c = U16_GET_SUPPLEMENTARY(c, c2); + len = 2; + } else { + // TODO: U16IllFormedBehavior + c = 0xfffd; + ok = false; + } + } + } + + const Unit16 *p; + UChar32 c = 0; + int8_t len = 0; + bool ok = false; +}; + +/** + * Validating iterator over the code points in a Unicode 16-bit string. + * + * TODO: check doxygen syntax for template parameters + * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t + * @param U16IllFormedBehavior TODO + * @draft ICU 76 + */ template class U16Iterator { public: // TODO: make private, make friends U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : - start(start), p(p), limit(limit) { - if (p != limit) { - readOneForward(); - } + start(start), limit(limit), seq(p) { + seq.readOneForward(limit); } // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. U16Iterator(const U16Iterator &other) = default; - U16Iterator(U16Iterator &&other) noexcept = default; - bool operator==(const U16Iterator &other) const { return p == other.p; } + bool operator==(const U16Iterator &other) const { return seq.p == other.seq.p; } bool operator!=(const U16Iterator &other) const { return !operator==(other); } - UChar32 operator*() const { - return c; - } - - // TODO: good function names? - // It would be nice to avoid a prefix like "current", "one", "cp", - // but just length() on the iterator could be confusing. - int32_t currentLength() const { return len; } - - std::basic_string_view currentView() const { - return std::basic_string_view(p, len); + const U16OneSeq &operator*() const { + return seq; } - bool currentIsWellFormed() const { return ok; } - U16Iterator &operator++() { // pre-increment // TODO: think about switching directions etc. - // Assume that readOneForward() was called and set `len`. + // Assume that readOneForward() was called and set seq.len. // Skip the current code point, then read the next one. - p += len; - if (p != limit) { - readOneForward(); - } + seq.fwd1(); + seq.readOneForward(limit); return *this; } U16Iterator operator++(int) { // post-increment U16Iterator result(*this); // TODO: think about switching directions etc. - // Assume that readOneForward() was called and set `len`. + // Assume that readOneForward() was called and set seq.len. // Skip the current code point, then read the next one. - p += len; - if (p != limit) { - readOneForward(); - } + seq.fwd1(); + seq.readOneForward(limit); return result; } private: - void readOneForward() { - // see U16_NEXT_OR_FFFD() - c = *p; - len = 1; - ok = true; - if (U16_IS_SURROGATE(c)) { - uint16_t c2; - if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { - c = U16_GET_SUPPLEMENTARY(c, c2); - len = 2; - } else { - // TODO: U16IllFormedBehavior - c = 0xfffd; - ok = false; - } - } - } - // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. const Unit16 *start; - const Unit16 *p; const Unit16 *limit; - UChar32 c = 0; - int8_t len = 0; - bool ok = false; + U16OneSeq seq; }; // ------------------------------------------------------------------------- *** diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index b58b29b3d4e7..8d9bba021508 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -223,6 +223,7 @@ + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index d5c23d5e4cb5..0abc4608d1a6 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -490,6 +490,9 @@ strings + + strings + strings diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index c0a914b579c1..f71f23327386 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -14,8 +14,9 @@ // https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv using namespace std::string_view_literals; -using U_HEADER_ONLY_NAMESPACE::U16Iterator; using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE; +using U_HEADER_ONLY_NAMESPACE::U16Iterator; +using U_HEADER_ONLY_NAMESPACE::U16OneSeq; class U16IteratorTest : public IntlTest { public: @@ -44,14 +45,17 @@ void U16IteratorTest::testExperiment() { std::u16string_view good(u"abçカ🚴"sv); const char16_t *goodLimit = good.data() + good.length(); U16Iterator goodIter(good.data(), good.data(), goodLimit); - assertEquals("goodIter[0] *", u'a', *goodIter); + assertEquals("goodIter[0] * codePoint()", u'a', (*goodIter).codePoint()); ++goodIter; // pre-increment - assertEquals("goodIter[1] *", u'b', *goodIter); + assertEquals("goodIter[1] * codePoint()", u'b', (*goodIter).codePoint()); ++goodIter; - assertEquals("goodIter[2] *", u'ç', *goodIter++); // post-increment - assertEquals("goodIter[3] *", u'カ', *goodIter); + assertEquals("goodIter[2] * codePoint()", u'ç', (*goodIter++).codePoint()); // post-increment + assertEquals("goodIter[3] * codePoint()", u'カ', (*goodIter).codePoint()); ++goodIter; - assertEquals("goodIter[4] *", U'🚴', *goodIter++); + const U16OneSeq &seq = *goodIter++; + assertEquals("goodIter[4] * codePoint()", U'🚴', seq.codePoint()); + assertEquals("goodIter[4] * length()", 2, seq.length()); + assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv); U16Iterator goodEndIter(good.data(), goodLimit, goodLimit); assertTrue("goodIter == goodEndIter", goodIter == goodEndIter); From 1bcd5ee309172aaeed8467fde4dda82f7244bdc6 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 23 Dec 2024 14:26:16 -0800 Subject: [PATCH 03/23] header-only --- icu4c/source/common/unicode/utf16cppiter.h | 15 +++++++++------ icu4c/source/test/intltest/utfcppitertest.cpp | 6 ++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 582ce1d1b6de..0050b60fee49 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -11,16 +11,18 @@ #include "unicode/utypes.h" -#if U_SHOW_CPLUSPLUS_API +#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API #include "unicode/utf16.h" #include "unicode/uversion.h" /** * \file - * \brief C++ API: C++ iterators over Unicode 16-bit strings (=UTF-16 if well-formed). + * \brief C++ header-only API: C++ iterators over Unicode 16-bit strings (=UTF-16 if well-formed). */ +#ifndef U_HIDE_DRAFT_API + namespace U_HEADER_ONLY_NAMESPACE { // Some defined behaviors for handling ill-formed 16-bit strings. @@ -44,7 +46,7 @@ enum U16IllFormedBehavior { * * TODO: check doxygen syntax for template parameters * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t - * @draft ICU 76 + * @draft ICU 77 */ template class U16OneSeq { @@ -108,7 +110,7 @@ class U16OneSeq { * TODO: check doxygen syntax for template parameters * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t * @param U16IllFormedBehavior TODO - * @draft ICU 76 + * @draft ICU 77 */ template class U16Iterator { @@ -161,7 +163,7 @@ class U16Iterator { // TODO: Non-validating iterator over the code points in a Unicode 16-bit string. // Assumes well-formed UTF-16. Otherwise the behavior is undefined. -// TODO: all @draft ICU 76 +// TODO: all @draft ICU 77 // template // class U16UnsafeIterator // TODO: only p, no start, no limit @@ -170,5 +172,6 @@ class U16Iterator { } // namespace U_HEADER_ONLY_NAMESPACE -#endif // U_SHOW_CPLUSPLUS_API +#endif // U_HIDE_DRAFT_API +#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API #endif // __UTF16CPPITER_H__ diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index f71f23327386..c1162dde1594 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -6,6 +6,12 @@ #include +// Test header-only ICU C++ APIs. Do not use other ICU C++ APIs. +// Non-default configuration: +#define U_SHOW_CPLUSPLUS_API 0 +// Default configuration: +// #define U_SHOW_CPLUSPLUS_HEADER_API 1 + #include "unicode/utypes.h" #include "unicode/utf16cppiter.h" #include "intltest.h" From 20f890be31682c7467bbc4c03bd0b05aa92a5aa2 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 23 Dec 2024 16:55:31 -0800 Subject: [PATCH 04/23] operator* read on the fly --- icu4c/source/common/unicode/utf16cppiter.h | 114 +++++++----------- icu4c/source/test/intltest/utfcppitertest.cpp | 12 +- 2 files changed, 51 insertions(+), 75 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 0050b60fee49..9b03ab4486bc 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -41,6 +41,8 @@ enum U16IllFormedBehavior { U16_BEHAVIOR_SURROGATE }; +// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t. + /** * A code unit sequence for one code point returned by U16Iterator. * @@ -49,59 +51,18 @@ enum U16IllFormedBehavior { * @draft ICU 77 */ template -class U16OneSeq { -public: - U16OneSeq(const U16OneSeq &other) = default; - - const Unit16 *data() { return p; } - int32_t length() const { return len; } +struct U16OneSeq { + // Order of fields with padding and access frequency in mind. + UChar32 codePoint = 0; + uint8_t length = 0; + bool isWellFormed = false; + const Unit16 *data; std::basic_string_view stringView() const { - return std::basic_string_view(p, len); - } - - bool isWellFormed() const { return ok; } - - UChar32 codePoint() const { return c; } - - // TODO: std::optional maybeCodePoint() const ? (nullopt if !ok) - -private: - // TODO: Why can't we just use Unit16 here? - // error: declaration of 'Unit16' shadows template parameter - template - friend class U16Iterator; - - U16OneSeq(const Unit16 *p) : p(p) {} - - void fwd1() { p += len; } - - void readOneForward(const Unit16 *limit) { - if (p == limit) { - len = 0; - return; - } - // see U16_NEXT_OR_FFFD() - c = *p; - len = 1; - ok = true; - if (U16_IS_SURROGATE(c)) { - uint16_t c2; - if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { - c = U16_GET_SUPPLEMENTARY(c, c2); - len = 2; - } else { - // TODO: U16IllFormedBehavior - c = 0xfffd; - ok = false; - } - } + return std::basic_string_view(data, length); } - const Unit16 *p; - UChar32 c = 0; - int8_t len = 0; - bool ok = false; + // TODO: std::optional maybeCodePoint() const ? (nullopt if !isWellFormed) }; /** @@ -117,53 +78,68 @@ class U16Iterator { public: // TODO: make private, make friends U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : - start(start), limit(limit), seq(p) { - seq.readOneForward(limit); - } + start(start), p(p), limit(limit) {} // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. U16Iterator(const U16Iterator &other) = default; - bool operator==(const U16Iterator &other) const { return seq.p == other.seq.p; } + bool operator==(const U16Iterator &other) const { return p == other.p; } bool operator!=(const U16Iterator &other) const { return !operator==(other); } - const U16OneSeq &operator*() const { - return seq; + const U16OneSeq operator*() const { + // TODO: assert p != limit -- more precisely: start <= p < limit + // Similar to U16_NEXT_OR_FFFD(). + UChar32 c = *p; + if (!U16_IS_SURROGATE(c)) { + return {c, 1, true, p}; + } else { + uint16_t c2; + if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { + c = U16_GET_SUPPLEMENTARY(c, c2); + return {c, 2, true, p}; + } else { + // TODO: U16IllFormedBehavior + return {0xfffd, 1, false, p}; + } + } } U16Iterator &operator++() { // pre-increment - // TODO: think about switching directions etc. - // Assume that readOneForward() was called and set seq.len. - // Skip the current code point, then read the next one. - seq.fwd1(); - seq.readOneForward(limit); + // TODO: assert p != limit -- more precisely: start <= p < limit + // Similar to U16_FWD_1(). + if (U16_IS_LEAD(*p++) && p != limit && U16_IS_TRAIL(*p)) { + ++p; + } return *this; } U16Iterator operator++(int) { // post-increment + // TODO: assert p != limit -- more precisely: start <= p < limit U16Iterator result(*this); - // TODO: think about switching directions etc. - // Assume that readOneForward() was called and set seq.len. - // Skip the current code point, then read the next one. - seq.fwd1(); - seq.readOneForward(limit); + // More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler + // amortize work between operator*() and operator++(int) in typical *it++ usage. + // Otherwise this is slightly less efficient because it tests a lead surrogate twice. + UChar32 c = *p++; + if (U16_IS_SURROGATE(c) && + U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) { + ++p; + } return result; } private: // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. - const Unit16 *start; - const Unit16 *limit; - U16OneSeq seq; + const Unit16 *const start; + const Unit16 *p; + const Unit16 *const limit; }; // ------------------------------------------------------------------------- *** // TODO: Non-validating iterator over the code points in a Unicode 16-bit string. // Assumes well-formed UTF-16. Otherwise the behavior is undefined. -// TODO: all @draft ICU 77 // template // class U16UnsafeIterator // TODO: only p, no start, no limit diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index c1162dde1594..0ae44937d294 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -51,16 +51,16 @@ void U16IteratorTest::testExperiment() { std::u16string_view good(u"abçカ🚴"sv); const char16_t *goodLimit = good.data() + good.length(); U16Iterator goodIter(good.data(), good.data(), goodLimit); - assertEquals("goodIter[0] * codePoint()", u'a', (*goodIter).codePoint()); + assertEquals("goodIter[0] * codePoint", u'a', (*goodIter).codePoint); ++goodIter; // pre-increment - assertEquals("goodIter[1] * codePoint()", u'b', (*goodIter).codePoint()); + assertEquals("goodIter[1] * codePoint", u'b', (*goodIter).codePoint); ++goodIter; - assertEquals("goodIter[2] * codePoint()", u'ç', (*goodIter++).codePoint()); // post-increment - assertEquals("goodIter[3] * codePoint()", u'カ', (*goodIter).codePoint()); + assertEquals("goodIter[2] * codePoint", u'ç', (*goodIter++).codePoint); // post-increment + assertEquals("goodIter[3] * codePoint", u'カ', (*goodIter).codePoint); ++goodIter; const U16OneSeq &seq = *goodIter++; - assertEquals("goodIter[4] * codePoint()", U'🚴', seq.codePoint()); - assertEquals("goodIter[4] * length()", 2, seq.length()); + assertEquals("goodIter[4] * codePoint", U'🚴', seq.codePoint); + assertEquals("goodIter[4] * length", 2, seq.length); assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv); U16Iterator goodEndIter(good.data(), goodLimit, goodLimit); assertTrue("goodIter == goodEndIter", goodIter == goodEndIter); From 7dc31d2e5961f63dbfe6c1352a535b4d1d21459b Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 11:18:07 -0800 Subject: [PATCH 05/23] fix hdrtest --- icu4c/source/common/unicode/utf16cppiter.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 9b03ab4486bc..0967904e574d 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -7,12 +7,11 @@ #ifndef __UTF16CPPITER_H__ #define __UTF16CPPITER_H__ -#include - #include "unicode/utypes.h" #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API +#include #include "unicode/utf16.h" #include "unicode/uversion.h" From b381c2b569d051e7e765027c086e77028268fbd4 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 12:07:24 -0800 Subject: [PATCH 06/23] U16IllFormedBehavior --- icu4c/source/common/unicode/utf16cppiter.h | 14 +- icu4c/source/test/intltest/utfcppitertest.cpp | 126 +++++++++++++++--- 2 files changed, 117 insertions(+), 23 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 0967904e574d..4205f2d5a0fe 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -86,7 +86,7 @@ class U16Iterator { bool operator==(const U16Iterator &other) const { return p == other.p; } bool operator!=(const U16Iterator &other) const { return !operator==(other); } - const U16OneSeq operator*() const { + U16OneSeq operator*() const { // TODO: assert p != limit -- more precisely: start <= p < limit // Similar to U16_NEXT_OR_FFFD(). UChar32 c = *p; @@ -98,8 +98,7 @@ class U16Iterator { c = U16_GET_SUPPLEMENTARY(c, c2); return {c, 2, true, p}; } else { - // TODO: U16IllFormedBehavior - return {0xfffd, 1, false, p}; + return {sub(c), 1, false, p}; } } } @@ -128,6 +127,15 @@ class U16Iterator { } private: + // Handle ill-formed UTF-16: One unpaired surrogate. + UChar32 sub(UChar32 surrogate) const { + switch (behavior) { + case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL; + case U16_BEHAVIOR_FFFD: return 0xfffd; + case U16_BEHAVIOR_SURROGATE: return surrogate; + } + } + // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. const Unit16 *const start; diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index 0ae44937d294..eb698bc9f699 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -21,6 +21,8 @@ using namespace std::string_view_literals; using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE; +using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_FFFD; +using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_SURROGATE; using U_HEADER_ONLY_NAMESPACE::U16Iterator; using U_HEADER_ONLY_NAMESPACE::U16OneSeq; @@ -30,7 +32,10 @@ class U16IteratorTest : public IntlTest { void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override; - void testExperiment(); + void testGood(); + void testNegative(); + void testFFFD(); + void testSurrogate(); }; extern IntlTest *createU16IteratorTest() { @@ -42,28 +47,109 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam logln("TestSuite U16IteratorTest: "); } TESTCASE_AUTO_BEGIN; - TESTCASE_AUTO(testExperiment); + TESTCASE_AUTO(testGood); + TESTCASE_AUTO(testNegative); + TESTCASE_AUTO(testFFFD); + TESTCASE_AUTO(testSurrogate); TESTCASE_AUTO_END; } -void U16IteratorTest::testExperiment() { - IcuTestErrorCode errorCode(*this, "testExperiment"); +void U16IteratorTest::testGood() { + IcuTestErrorCode errorCode(*this, "testGood"); std::u16string_view good(u"abçカ🚴"sv); - const char16_t *goodLimit = good.data() + good.length(); - U16Iterator goodIter(good.data(), good.data(), goodLimit); - assertEquals("goodIter[0] * codePoint", u'a', (*goodIter).codePoint); - ++goodIter; // pre-increment - assertEquals("goodIter[1] * codePoint", u'b', (*goodIter).codePoint); - ++goodIter; - assertEquals("goodIter[2] * codePoint", u'ç', (*goodIter++).codePoint); // post-increment - assertEquals("goodIter[3] * codePoint", u'カ', (*goodIter).codePoint); - ++goodIter; - const U16OneSeq &seq = *goodIter++; - assertEquals("goodIter[4] * codePoint", U'🚴', seq.codePoint); - assertEquals("goodIter[4] * length", 2, seq.length); - assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv); - U16Iterator goodEndIter(good.data(), goodLimit, goodLimit); - assertTrue("goodIter == goodEndIter", goodIter == goodEndIter); + const char16_t *limit = good.data() + good.length(); + U16Iterator iter(good.data(), good.data(), limit); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + ++iter; // pre-increment + U16OneSeq seq = *iter; + assertEquals("iter[1] * codePoint", u'b', seq.codePoint); + assertEquals("iter[1] * length", 1, seq.length); + assertTrue("iter[1] * isWellFormed", seq.isWellFormed); + assertTrue("iter[1] * stringView()", seq.stringView() == u"b"sv); + ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint); // post-increment + assertEquals("iter[3] * codePoint", u'カ', (*iter).codePoint); + ++iter; + seq = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', seq.codePoint); + assertEquals("iter[4] * length", 2, seq.length); + assertTrue("iter[4] * isWellFormed", seq.isWellFormed); + assertTrue("iter[4] * stringView()", seq.stringView() == u"🚴"sv); + U16Iterator endIter(good.data(), limit, limit); + assertTrue("iter == endIter", iter == endIter); +} + +void U16IteratorTest::testNegative() { + IcuTestErrorCode errorCode(*this, "testNegative"); + static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; + std::u16string_view bad(badChars, 5); + const char16_t *limit = bad.data() + bad.length(); + U16Iterator iter(bad.data(), bad.data(), limit); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + ++iter; // pre-increment + U16OneSeq seq = *iter; + assertEquals("iter[1] * codePoint", -1, seq.codePoint); + assertEquals("iter[1] * length", 1, seq.length); + assertFalse("iter[1] * isWellFormed", seq.isWellFormed); + auto sv = seq.stringView(); + assertEquals("iter[1] * stringView().length()", 1, sv.length()); + assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + ++iter; + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint); // post-increment + seq = *iter++; // post-increment + assertEquals("iter[3] * codePoint", -1, seq.codePoint); + assertFalse("iter[3] * isWellFormed", seq.isWellFormed); + assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment + U16Iterator endIter(bad.data(), limit, limit); + assertTrue("iter == endIter", iter == endIter); +} + +void U16IteratorTest::testFFFD() { + IcuTestErrorCode errorCode(*this, "testFFFD"); + static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; + std::u16string_view bad(badChars, 5); + const char16_t *limit = bad.data() + bad.length(); + U16Iterator iter(bad.data(), bad.data(), limit); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + ++iter; // pre-increment + U16OneSeq seq = *iter; + assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint); + assertEquals("iter[1] * length", 1, seq.length); + assertFalse("iter[1] * isWellFormed", seq.isWellFormed); + auto sv = seq.stringView(); + assertEquals("iter[1] * stringView().length()", 1, sv.length()); + assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + ++iter; + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint); // post-increment + seq = *iter++; // post-increment + assertEquals("iter[3] * codePoint", 0xfffd, seq.codePoint); + assertFalse("iter[3] * isWellFormed", seq.isWellFormed); + assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment + U16Iterator endIter(bad.data(), limit, limit); + assertTrue("iter == endIter", iter == endIter); +} - // TODO: test ill-formed, and much more... +void U16IteratorTest::testSurrogate() { + IcuTestErrorCode errorCode(*this, "testSurrogate"); + static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; + std::u16string_view bad(badChars, 5); + const char16_t *limit = bad.data() + bad.length(); + U16Iterator iter(bad.data(), bad.data(), limit); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + ++iter; // pre-increment + U16OneSeq seq = *iter; + assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint); + assertEquals("iter[1] * length", 1, seq.length); + assertFalse("iter[1] * isWellFormed", seq.isWellFormed); + auto sv = seq.stringView(); + assertEquals("iter[1] * stringView().length()", 1, sv.length()); + assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + ++iter; + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint); // post-increment + seq = *iter++; // post-increment + assertEquals("iter[3] * codePoint", 0xdc05, seq.codePoint); + assertFalse("iter[3] * isWellFormed", seq.isWellFormed); + assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment + U16Iterator endIter(bad.data(), limit, limit); + assertTrue("iter == endIter", iter == endIter); } From 6851e8db32692469db22a4d5a440f1b4883b4aed Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 12:24:04 -0800 Subject: [PATCH 07/23] C++ range: U16StringCodePoints --- icu4c/source/common/unicode/utf16cppiter.h | 33 +++++++++++++++++++ icu4c/source/test/intltest/utfcppitertest.cpp | 29 ++++++++-------- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 4205f2d5a0fe..a6603203dd32 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -143,6 +143,39 @@ class U16Iterator { const Unit16 *const limit; }; +/** + * A C++ "range" for iterating over all of the code points of a 16-bit Unicode string. + * + * @return a code point iterator. + * @draft ICU 77 + */ +template +class U16StringCodePoints { +public: + /** + * Constructs a C++ "range" object over the code points in the string. + * @draft ICU 77 + */ + U16StringCodePoints(std::basic_string_view s) : s(s) {} + + /** @draft ICU 77 */ + U16StringCodePoints(const U16StringCodePoints &other) = default; + + /** @draft ICU 77 */ + U16Iterator begin() const { + return {s.data(), s.data(), s.data() + s.length()}; + } + + /** @draft ICU 77 */ + U16Iterator end() const { + const Unit16 *limit = s.data() + s.length(); + return {s.data(), limit, limit}; + } + +private: + std::basic_string_view s; +}; + // ------------------------------------------------------------------------- *** // TODO: Non-validating iterator over the code points in a Unicode 16-bit string. diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index eb698bc9f699..8e429d8a74b2 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -25,6 +25,7 @@ using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_FFFD; using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_SURROGATE; using U_HEADER_ONLY_NAMESPACE::U16Iterator; using U_HEADER_ONLY_NAMESPACE::U16OneSeq; +using U_HEADER_ONLY_NAMESPACE::U16StringCodePoints; class U16IteratorTest : public IntlTest { public: @@ -57,8 +58,8 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam void U16IteratorTest::testGood() { IcuTestErrorCode errorCode(*this, "testGood"); std::u16string_view good(u"abçカ🚴"sv); - const char16_t *limit = good.data() + good.length(); - U16Iterator iter(good.data(), good.data(), limit); + U16StringCodePoints range(good); + auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment U16OneSeq seq = *iter; @@ -75,16 +76,15 @@ void U16IteratorTest::testGood() { assertEquals("iter[4] * length", 2, seq.length); assertTrue("iter[4] * isWellFormed", seq.isWellFormed); assertTrue("iter[4] * stringView()", seq.stringView() == u"🚴"sv); - U16Iterator endIter(good.data(), limit, limit); - assertTrue("iter == endIter", iter == endIter); + assertTrue("iter == endIter", iter == range.end()); } void U16IteratorTest::testNegative() { IcuTestErrorCode errorCode(*this, "testNegative"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - const char16_t *limit = bad.data() + bad.length(); - U16Iterator iter(bad.data(), bad.data(), limit); + U16StringCodePoints range(bad); + auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment U16OneSeq seq = *iter; @@ -100,16 +100,15 @@ void U16IteratorTest::testNegative() { assertEquals("iter[3] * codePoint", -1, seq.codePoint); assertFalse("iter[3] * isWellFormed", seq.isWellFormed); assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment - U16Iterator endIter(bad.data(), limit, limit); - assertTrue("iter == endIter", iter == endIter); + assertTrue("iter == endIter", iter == range.end()); } void U16IteratorTest::testFFFD() { IcuTestErrorCode errorCode(*this, "testFFFD"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - const char16_t *limit = bad.data() + bad.length(); - U16Iterator iter(bad.data(), bad.data(), limit); + U16StringCodePoints range(bad); + auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment U16OneSeq seq = *iter; @@ -125,16 +124,15 @@ void U16IteratorTest::testFFFD() { assertEquals("iter[3] * codePoint", 0xfffd, seq.codePoint); assertFalse("iter[3] * isWellFormed", seq.isWellFormed); assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment - U16Iterator endIter(bad.data(), limit, limit); - assertTrue("iter == endIter", iter == endIter); + assertTrue("iter == endIter", iter == range.end()); } void U16IteratorTest::testSurrogate() { IcuTestErrorCode errorCode(*this, "testSurrogate"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - const char16_t *limit = bad.data() + bad.length(); - U16Iterator iter(bad.data(), bad.data(), limit); + U16StringCodePoints range(bad); + auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment U16OneSeq seq = *iter; @@ -150,6 +148,5 @@ void U16IteratorTest::testSurrogate() { assertEquals("iter[3] * codePoint", 0xdc05, seq.codePoint); assertFalse("iter[3] * isWellFormed", seq.isWellFormed); assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment - U16Iterator endIter(bad.data(), limit, limit); - assertTrue("iter == endIter", iter == endIter); + assertTrue("iter == endIter", iter == range.end()); } From 64ea1100ea4d956036316b5140ee97bd305330af Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 12:45:17 -0800 Subject: [PATCH 08/23] template param: code point type --- icu4c/source/common/unicode/utf16cppiter.h | 44 +++++++++---------- icu4c/source/test/intltest/utfcppitertest.cpp | 16 +++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index a6603203dd32..56116378fef2 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -29,30 +29,26 @@ namespace U_HEADER_ONLY_NAMESPACE { // // TODO: A possible alternative to an enum might be some kind of function template // which would be fully customizable. -// The operator*() return value might then want to be a template parameter as well. -// For example, for a well-formed sequence, the return value could be -// a tuple of (code point, well-formed), or a string view, or... -// (And then the caller could choose between UChar32 and char32_t.) -// However, all of that would make the API more complex and daunting. enum U16IllFormedBehavior { U16_BEHAVIOR_NEGATIVE, U16_BEHAVIOR_FFFD, U16_BEHAVIOR_SURROGATE }; -// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t. - /** * A code unit sequence for one code point returned by U16Iterator. + * TODO: Share with UTF-8? * * TODO: check doxygen syntax for template parameters - * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t + * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE * @draft ICU 77 */ -template +template struct U16OneSeq { // Order of fields with padding and access frequency in mind. - UChar32 codePoint = 0; + CP32 codePoint = 0; uint8_t length = 0; bool isWellFormed = false; const Unit16 *data; @@ -61,18 +57,20 @@ struct U16OneSeq { return std::basic_string_view(data, length); } - // TODO: std::optional maybeCodePoint() const ? (nullopt if !isWellFormed) + // TODO: std::optional maybeCodePoint() const ? (nullopt if !isWellFormed) }; /** * Validating iterator over the code points in a Unicode 16-bit string. * * TODO: check doxygen syntax for template parameters - * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t + * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE * @param U16IllFormedBehavior TODO * @draft ICU 77 */ -template +template class U16Iterator { public: // TODO: make private, make friends @@ -86,10 +84,10 @@ class U16Iterator { bool operator==(const U16Iterator &other) const { return p == other.p; } bool operator!=(const U16Iterator &other) const { return !operator==(other); } - U16OneSeq operator*() const { + U16OneSeq operator*() const { // TODO: assert p != limit -- more precisely: start <= p < limit // Similar to U16_NEXT_OR_FFFD(). - UChar32 c = *p; + CP32 c = *p; if (!U16_IS_SURROGATE(c)) { return {c, 1, true, p}; } else { @@ -118,7 +116,7 @@ class U16Iterator { // More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler // amortize work between operator*() and operator++(int) in typical *it++ usage. // Otherwise this is slightly less efficient because it tests a lead surrogate twice. - UChar32 c = *p++; + CP32 c = *p++; if (U16_IS_SURROGATE(c) && U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) { ++p; @@ -126,9 +124,13 @@ class U16Iterator { return result; } + // TODO: operator--() + // TODO: maybe fused readAndInc()? + // TODO: maybe fused decAndRead()? + private: // Handle ill-formed UTF-16: One unpaired surrogate. - UChar32 sub(UChar32 surrogate) const { + CP32 sub(CP32 surrogate) const { switch (behavior) { case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL; case U16_BEHAVIOR_FFFD: return 0xfffd; @@ -149,7 +151,7 @@ class U16Iterator { * @return a code point iterator. * @draft ICU 77 */ -template +template class U16StringCodePoints { public: /** @@ -162,12 +164,12 @@ class U16StringCodePoints { U16StringCodePoints(const U16StringCodePoints &other) = default; /** @draft ICU 77 */ - U16Iterator begin() const { + U16Iterator begin() const { return {s.data(), s.data(), s.data() + s.length()}; } /** @draft ICU 77 */ - U16Iterator end() const { + U16Iterator end() const { const Unit16 *limit = s.data() + s.length(); return {s.data(), limit, limit}; } @@ -183,8 +185,6 @@ class U16StringCodePoints { // template // class U16UnsafeIterator // TODO: only p, no start, no limit -// TODO: can/should we read the code point only in operator*()? -// if we read it in the constructor, then we would still need start/limit... } // namespace U_HEADER_ONLY_NAMESPACE diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index 8e429d8a74b2..15db94e613b0 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -58,11 +58,11 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam void U16IteratorTest::testGood() { IcuTestErrorCode errorCode(*this, "testGood"); std::u16string_view good(u"abçカ🚴"sv); - U16StringCodePoints range(good); + U16StringCodePoints range(good); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment - U16OneSeq seq = *iter; + auto seq = *iter; assertEquals("iter[1] * codePoint", u'b', seq.codePoint); assertEquals("iter[1] * length", 1, seq.length); assertTrue("iter[1] * isWellFormed", seq.isWellFormed); @@ -83,11 +83,11 @@ void U16IteratorTest::testNegative() { IcuTestErrorCode errorCode(*this, "testNegative"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - U16StringCodePoints range(bad); + U16StringCodePoints range(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment - U16OneSeq seq = *iter; + auto seq = *iter; assertEquals("iter[1] * codePoint", -1, seq.codePoint); assertEquals("iter[1] * length", 1, seq.length); assertFalse("iter[1] * isWellFormed", seq.isWellFormed); @@ -107,11 +107,11 @@ void U16IteratorTest::testFFFD() { IcuTestErrorCode errorCode(*this, "testFFFD"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - U16StringCodePoints range(bad); + U16StringCodePoints range(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment - U16OneSeq seq = *iter; + auto seq = *iter; assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint); assertEquals("iter[1] * length", 1, seq.length); assertFalse("iter[1] * isWellFormed", seq.isWellFormed); @@ -131,11 +131,11 @@ void U16IteratorTest::testSurrogate() { IcuTestErrorCode errorCode(*this, "testSurrogate"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - U16StringCodePoints range(bad); + U16StringCodePoints range(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment - U16OneSeq seq = *iter; + auto seq = *iter; assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint); assertEquals("iter[1] * length", 1, seq.length); assertFalse("iter[1] * isWellFormed", seq.isWellFormed); From 7bbeefcdd35426b279a2f58b6ff49cc5ae4cd1d7 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 17:19:08 -0800 Subject: [PATCH 09/23] make it work outside of ICU --- icu4c/source/common/unicode/utf16cppiter.h | 26 +++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 56116378fef2..41bc9cf4e230 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -7,13 +7,37 @@ #ifndef __UTF16CPPITER_H__ #define __UTF16CPPITER_H__ +// TODO: For experimentation outside of ICU, comment out this include. +// Experimentally conditional code below checks for UTYPES_H and +// otherwise uses copies of bits of ICU. #include "unicode/utypes.h" -#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API +#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H) #include +#ifdef UTYPES_H #include "unicode/utf16.h" #include "unicode/uversion.h" +#else +// TODO: Remove checks for UTYPES_H and replacement definitions. +// unicode/utypes.h etc. +#include +typedef int32_t UChar32; +constexpr UChar32 U_SENTINEL = -1; +// unicode/uversion.h +#define U_HEADER_ONLY_NAMESPACE header +namespace header {} +// unicode/utf.h +#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) +// unicode/utf16.h +#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) +#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) +#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) +#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) +#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) +#define U16_GET_SUPPLEMENTARY(lead, trail) \ + (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) +#endif /** * \file From 43e99e082ef3015df1481e900e8ca2c5de3e8247 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 17:24:02 -0800 Subject: [PATCH 10/23] experimental sample code --- icu4c/source/common/unicode/utf16cppiter.h | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 41bc9cf4e230..96bc0ad1cbef 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -210,6 +210,29 @@ class U16StringCodePoints { // class U16UnsafeIterator // TODO: only p, no start, no limit +// TODO: remove experimental sample code +#ifndef UTYPES_H +int32_t rangeLoop(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + for (auto seq : range) { + sum += seq.codePoint; + } + return sum; +} + +int32_t loopIterPlusPlus(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + auto iter = range.begin(); + auto limit = range.end(); + while (iter != limit) { + sum += (*iter++).codePoint; + } + return sum; +} +#endif + } // namespace U_HEADER_ONLY_NAMESPACE #endif // U_HIDE_DRAFT_API From bfc722e398c53a7e82e1f32e4937af87efc5b625 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 26 Dec 2024 17:38:51 -0800 Subject: [PATCH 11/23] pre=post-inc, fused readAndInc() --- icu4c/source/common/unicode/utf16cppiter.h | 51 +++++++++++++++++----- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 96bc0ad1cbef..14f04c0111a3 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -127,16 +127,42 @@ class U16Iterator { U16Iterator &operator++() { // pre-increment // TODO: assert p != limit -- more precisely: start <= p < limit - // Similar to U16_FWD_1(). - if (U16_IS_LEAD(*p++) && p != limit && U16_IS_TRAIL(*p)) { - ++p; - } + inc(); return *this; } U16Iterator operator++(int) { // post-increment // TODO: assert p != limit -- more precisely: start <= p < limit U16Iterator result(*this); + inc(); + return result; + } + + // Fused/optimized *iter++ + U16OneSeq readAndInc() { + // TODO: assert p != limit -- more precisely: start <= p < limit + // Very similar to U16_NEXT_OR_FFFD(). + const Unit16 *p0 = p; + CP32 c = *p++; + if (!U16_IS_SURROGATE(c)) { + return {c, 1, true, p0}; + } else { + uint16_t c2; + if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) { + ++p; + c = U16_GET_SUPPLEMENTARY(c, c2); + return {c, 2, true, p0}; + } else { + return {sub(c), 1, false, p0}; + } + } + } + + // TODO: operator--() + // TODO: maybe fused decAndRead()? + +private: + void inc() { // More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler // amortize work between operator*() and operator++(int) in typical *it++ usage. // Otherwise this is slightly less efficient because it tests a lead surrogate twice. @@ -145,14 +171,8 @@ class U16Iterator { U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) { ++p; } - return result; } - // TODO: operator--() - // TODO: maybe fused readAndInc()? - // TODO: maybe fused decAndRead()? - -private: // Handle ill-formed UTF-16: One unpaired surrogate. CP32 sub(CP32 surrogate) const { switch (behavior) { @@ -231,6 +251,17 @@ int32_t loopIterPlusPlus(std::u16string_view s) { } return sum; } + +int32_t loopReadAndInc(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + auto iter = range.begin(); + auto limit = range.end(); + while (iter != limit) { + sum += iter.readAndInc().codePoint; + } + return sum; +} #endif } // namespace U_HEADER_ONLY_NAMESPACE From c156434b2caf446af0d0ba1d09b70332388112a1 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 2 Jan 2025 13:59:10 -0800 Subject: [PATCH 12/23] readAndInc() for all --- icu4c/source/common/unicode/utf16cppiter.h | 70 ++++++++++------------ 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 14f04c0111a3..ae513a3d5b27 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -99,47 +99,58 @@ class U16Iterator { public: // TODO: make private, make friends U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : - start(start), p(p), limit(limit) {} + start(start), current(p), limit(limit) {} // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. U16Iterator(const U16Iterator &other) = default; - bool operator==(const U16Iterator &other) const { return p == other.p; } + bool operator==(const U16Iterator &other) const { return current == other.current; } bool operator!=(const U16Iterator &other) const { return !operator==(other); } U16OneSeq operator*() const { - // TODO: assert p != limit -- more precisely: start <= p < limit - // Similar to U16_NEXT_OR_FFFD(). - CP32 c = *p; - if (!U16_IS_SURROGATE(c)) { - return {c, 1, true, p}; - } else { - uint16_t c2; - if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { - c = U16_GET_SUPPLEMENTARY(c, c2); - return {c, 2, true, p}; - } else { - return {sub(c), 1, false, p}; - } - } + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = current; + return readAndInc(p); } U16Iterator &operator++() { // pre-increment - // TODO: assert p != limit -- more precisely: start <= p < limit - inc(); + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + readAndInc(current); return *this; } U16Iterator operator++(int) { // post-increment - // TODO: assert p != limit -- more precisely: start <= p < limit + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. U16Iterator result(*this); - inc(); + readAndInc(current); return result; } - // Fused/optimized *iter++ + // Explicitly fused/optimized *iter++ U16OneSeq readAndInc() { + return readAndInc(current); + } + + // Same as pre-increment operator++() but slightly faster if used by itself. + // operator++() should be used together with operator*() for best compiler optimization. + U16Iterator &inc() { + // TODO: assert current != limit -- more precisely: start <= current < limit + // Very similar to U16_FWD_1(). + if (U16_IS_LEAD(*current++) && current != limit && U16_IS_TRAIL(*current)) { + ++current; + } + return *this; + } + + // TODO: operator--() + // TODO: maybe fused decAndRead()? + +private: + U16OneSeq readAndInc(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_NEXT_OR_FFFD(). const Unit16 *p0 = p; @@ -158,21 +169,6 @@ class U16Iterator { } } - // TODO: operator--() - // TODO: maybe fused decAndRead()? - -private: - void inc() { - // More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler - // amortize work between operator*() and operator++(int) in typical *it++ usage. - // Otherwise this is slightly less efficient because it tests a lead surrogate twice. - CP32 c = *p++; - if (U16_IS_SURROGATE(c) && - U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) { - ++p; - } - } - // Handle ill-formed UTF-16: One unpaired surrogate. CP32 sub(CP32 surrogate) const { switch (behavior) { @@ -185,7 +181,7 @@ class U16Iterator { // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. const Unit16 *const start; - const Unit16 *p; + const Unit16 *current; const Unit16 *const limit; }; From e0cf8f7a6bce488b1f104d00cdd251b82356cf1f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 2 Jan 2025 14:55:05 -0800 Subject: [PATCH 13/23] bidirectional --- icu4c/source/common/unicode/utf16cppiter.h | 46 ++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index ae513a3d5b27..dea877887ab0 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -85,7 +85,7 @@ struct U16OneSeq { }; /** - * Validating iterator over the code points in a Unicode 16-bit string. + * Validating, bidirectional iterator over the code points in a Unicode 16-bit string. * * TODO: check doxygen syntax for template parameters * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t @@ -130,6 +130,16 @@ class U16Iterator { return result; } + U16Iterator &operator--() { // pre-decrement + return dec(); + } + + U16Iterator operator--(int) { // post-decrement + U16Iterator result(*this); + dec(); + return result; + } + // Explicitly fused/optimized *iter++ U16OneSeq readAndInc() { return readAndInc(current); @@ -146,8 +156,20 @@ class U16Iterator { return *this; } - // TODO: operator--() - // TODO: maybe fused decAndRead()? + // Explicitly fused/optimized *--iter + U16OneSeq decAndRead() { + return decAndRead(current); + } + + // Same as pre-decrement operator--(), for API symmetry. + U16Iterator &dec() { + // TODO: assert current != limit -- more precisely: start <= current < limit + // Very similar to U16_BACK_1(). + if (U16_IS_TRAIL(*(--current)) && current != start && U16_IS_LEAD(*(current - 1))) { + --current; + } + return *this; + } private: U16OneSeq readAndInc(const Unit16 *&p) const { @@ -169,6 +191,24 @@ class U16Iterator { } } + U16OneSeq decAndRead(const Unit16 *&p) const { + // TODO: assert p != limit -- more precisely: start <= p < limit + // Very similar to U16_PREV_OR_FFFD(). + CP32 c = *--p; + if (!U16_IS_SURROGATE(c)) { + return {c, 1, true, p}; + } else { + uint16_t c2; + if (U16_IS_SURROGATE_TRAIL(c) && p != start && U16_IS_LEAD(c2 = *(p - 1))) { + --p; + c = U16_GET_SUPPLEMENTARY(c2, c); + return {c, 2, true, p}; + } else { + return {sub(c), 1, false, p}; + } + } + } + // Handle ill-formed UTF-16: One unpaired surrogate. CP32 sub(CP32 surrogate) const { switch (behavior) { From ca4787eeffa2a6c248d126a36295c16999e0f8d0 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 2 Jan 2025 15:54:08 -0800 Subject: [PATCH 14/23] efficient rbegin() & rend() --- icu4c/source/common/unicode/utf16cppiter.h | 235 +++++++++++++++------ 1 file changed, 169 insertions(+), 66 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index dea877887ab0..c42c4f1823a3 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -34,6 +34,7 @@ namespace header {} #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) +#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) #define U16_GET_SUPPLEMENTARY(lead, trail) \ (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) @@ -85,93 +86,43 @@ struct U16OneSeq { }; /** - * Validating, bidirectional iterator over the code points in a Unicode 16-bit string. - * - * TODO: check doxygen syntax for template parameters - * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t - * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE - * @param U16IllFormedBehavior TODO - * @draft ICU 77 + * Internal base class for public U16Iterator & U16ReverseIterator. + * Not intended for public subclassing. + * @internal */ template -class U16Iterator { -public: - // TODO: make private, make friends - U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : +class U16IteratorBase { +protected: + // @internal + U16IteratorBase(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : start(start), current(p), limit(limit) {} // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. - U16Iterator(const U16Iterator &other) = default; - - bool operator==(const U16Iterator &other) const { return current == other.current; } - bool operator!=(const U16Iterator &other) const { return !operator==(other); } - - U16OneSeq operator*() const { - // Call the same function in both operator*() and operator++() so that an - // optimizing compiler can easily eliminate redundant work when alternating between the two. - const Unit16 *p = current; - return readAndInc(p); - } - - U16Iterator &operator++() { // pre-increment - // Call the same function in both operator*() and operator++() so that an - // optimizing compiler can easily eliminate redundant work when alternating between the two. - readAndInc(current); - return *this; - } - - U16Iterator operator++(int) { // post-increment - // Call the same function in both operator*() and operator++() so that an - // optimizing compiler can easily eliminate redundant work when alternating between the two. - U16Iterator result(*this); - readAndInc(current); - return result; - } - - U16Iterator &operator--() { // pre-decrement - return dec(); - } - - U16Iterator operator--(int) { // post-decrement - U16Iterator result(*this); - dec(); - return result; - } - - // Explicitly fused/optimized *iter++ - U16OneSeq readAndInc() { - return readAndInc(current); - } + // @internal + bool operator==(const U16IteratorBase &other) const { return current == other.current; } + // @internal + bool operator!=(const U16IteratorBase &other) const { return !operator==(other); } - // Same as pre-increment operator++() but slightly faster if used by itself. - // operator++() should be used together with operator*() for best compiler optimization. - U16Iterator &inc() { + // @internal + void inc() { // TODO: assert current != limit -- more precisely: start <= current < limit // Very similar to U16_FWD_1(). if (U16_IS_LEAD(*current++) && current != limit && U16_IS_TRAIL(*current)) { ++current; } - return *this; } - // Explicitly fused/optimized *--iter - U16OneSeq decAndRead() { - return decAndRead(current); - } - - // Same as pre-decrement operator--(), for API symmetry. - U16Iterator &dec() { + // @internal + void dec() { // TODO: assert current != limit -- more precisely: start <= current < limit // Very similar to U16_BACK_1(). if (U16_IS_TRAIL(*(--current)) && current != start && U16_IS_LEAD(*(current - 1))) { --current; } - return *this; } -private: + // @internal U16OneSeq readAndInc(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_NEXT_OR_FFFD(). @@ -191,6 +142,7 @@ class U16Iterator { } } + // @internal U16OneSeq decAndRead(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_PREV_OR_FFFD(). @@ -210,6 +162,7 @@ class U16Iterator { } // Handle ill-formed UTF-16: One unpaired surrogate. + // @internal CP32 sub(CP32 surrogate) const { switch (behavior) { case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL; @@ -220,11 +173,141 @@ class U16Iterator { // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. + // @internal const Unit16 *const start; + // @internal const Unit16 *current; + // @internal const Unit16 *const limit; }; +/** + * Validating bidirectional iterator over the code points in a Unicode 16-bit string. + * + * TODO: check doxygen syntax for template parameters + * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE + * @param U16IllFormedBehavior TODO + * @draft ICU 77 + */ +template +class U16Iterator : private U16IteratorBase { + // FYI: We need to qualify all accesses to super class members because of private inheritance. + using Super = U16IteratorBase; +public: + // TODO: make private, make friends + U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : + Super(start, p, limit) {} + + U16Iterator(const U16Iterator &other) = default; + + bool operator==(const U16Iterator &other) const { return Super::operator==(other); } + bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); } + + U16OneSeq operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = Super::current; + return Super::readAndInc(p); + } + + U16Iterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::readAndInc(Super::current); + return *this; + } + + U16Iterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16Iterator result(*this); + Super::readAndInc(Super::current); + return result; + } + + U16Iterator &operator--() { // pre-decrement + return Super::dec(); + } + + U16Iterator operator--(int) { // post-decrement + U16Iterator result(*this); + Super::dec(); + return result; + } + + // Same as pre-increment operator++() but slightly faster if used by itself. + // operator++() should be used together with operator*() for best compiler optimization. + U16Iterator &inc() { + Super::inc(); + return *this; + } + + // Same as pre-decrement operator--(), for API symmetry. + U16Iterator &dec() { + Super::dec(); + return *this; + } + + // Explicitly fused/optimized *iter++ + U16OneSeq readAndInc() { + return Super::readAndInc(Super::current); + } + + // Explicitly fused/optimized *--iter + U16OneSeq decAndRead() { + return Super::decAndRead(Super::current); + } +}; + +/** + * Validating reverse iterator over the code points in a Unicode 16-bit string. + * Not bidirectional, but optimized for reverse iteration. + * + * TODO: check doxygen syntax for template parameters + * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE + * @param U16IllFormedBehavior TODO + * @draft ICU 77 + */ +template +class U16ReverseIterator : private U16IteratorBase { + using Super = U16IteratorBase; +public: + // TODO: make private, make friends + U16ReverseIterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : + Super(start, p, limit) {} + + U16ReverseIterator(const U16ReverseIterator &other) = default; + + bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); } + bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); } + + U16OneSeq operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = Super::current; + return Super::decAndRead(p); + } + + U16ReverseIterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::decAndRead(Super::current); + return *this; + } + + U16ReverseIterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16ReverseIterator result(*this); + Super::decAndRead(Super::current); + return result; + } +}; + /** * A C++ "range" for iterating over all of the code points of a 16-bit Unicode string. * @@ -254,6 +337,17 @@ class U16StringCodePoints { return {s.data(), limit, limit}; } + /** @draft ICU 77 */ + U16ReverseIterator rbegin() const { + const Unit16 *limit = s.data() + s.length(); + return {s.data(), limit, limit}; + } + + /** @draft ICU 77 */ + U16ReverseIterator rend() const { + return {s.data(), s.data(), s.data() + s.length()}; + } + private: std::basic_string_view s; }; @@ -298,6 +392,15 @@ int32_t loopReadAndInc(std::u16string_view s) { } return sum; } + +int32_t reverseLoop(std::u16string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { + sum += (*iter).codePoint; + } + return sum; +} #endif } // namespace U_HEADER_ONLY_NAMESPACE From a24b710f336243852c21495e74ea97ceba53f4db Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 2 Jan 2025 16:11:26 -0800 Subject: [PATCH 15/23] doxygen tparam --- icu4c/source/common/unicode/utf16cppiter.h | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index c42c4f1823a3..7b0870d00d08 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -64,10 +64,9 @@ enum U16IllFormedBehavior { * A code unit sequence for one code point returned by U16Iterator. * TODO: Share with UTF-8? * - * TODO: check doxygen syntax for template parameters - * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t - * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE * @draft ICU 77 */ template @@ -184,11 +183,10 @@ class U16IteratorBase { /** * Validating bidirectional iterator over the code points in a Unicode 16-bit string. * - * TODO: check doxygen syntax for template parameters - * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t - * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE - * @param U16IllFormedBehavior TODO + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE + * @tparam U16IllFormedBehavior TODO * @draft ICU 77 */ template @@ -265,11 +263,10 @@ class U16Iterator : private U16IteratorBase { * Validating reverse iterator over the code points in a Unicode 16-bit string. * Not bidirectional, but optimized for reverse iteration. * - * TODO: check doxygen syntax for template parameters - * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t - * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE - * @param U16IllFormedBehavior TODO + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE + * @tparam U16IllFormedBehavior TODO * @draft ICU 77 */ template @@ -311,7 +308,10 @@ class U16ReverseIterator : private U16IteratorBase { /** * A C++ "range" for iterating over all of the code points of a 16-bit Unicode string. * - * @return a code point iterator. + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U16_BEHAVIOR_NEGATIVE + * @tparam U16IllFormedBehavior TODO * @draft ICU 77 */ template From 633fafafda122c9d49f226a6052f65bd246b704c Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 14:39:37 -0800 Subject: [PATCH 16/23] remove non-standard iter API --- icu4c/source/common/unicode/utf16cppiter.h | 52 ---------------------- 1 file changed, 52 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 7b0870d00d08..e9c58d2ab0b0 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -103,24 +103,6 @@ class U16IteratorBase { // @internal bool operator!=(const U16IteratorBase &other) const { return !operator==(other); } - // @internal - void inc() { - // TODO: assert current != limit -- more precisely: start <= current < limit - // Very similar to U16_FWD_1(). - if (U16_IS_LEAD(*current++) && current != limit && U16_IS_TRAIL(*current)) { - ++current; - } - } - - // @internal - void dec() { - // TODO: assert current != limit -- more precisely: start <= current < limit - // Very similar to U16_BACK_1(). - if (U16_IS_TRAIL(*(--current)) && current != start && U16_IS_LEAD(*(current - 1))) { - --current; - } - } - // @internal U16OneSeq readAndInc(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit @@ -234,29 +216,6 @@ class U16Iterator : private U16IteratorBase { Super::dec(); return result; } - - // Same as pre-increment operator++() but slightly faster if used by itself. - // operator++() should be used together with operator*() for best compiler optimization. - U16Iterator &inc() { - Super::inc(); - return *this; - } - - // Same as pre-decrement operator--(), for API symmetry. - U16Iterator &dec() { - Super::dec(); - return *this; - } - - // Explicitly fused/optimized *iter++ - U16OneSeq readAndInc() { - return Super::readAndInc(Super::current); - } - - // Explicitly fused/optimized *--iter - U16OneSeq decAndRead() { - return Super::decAndRead(Super::current); - } }; /** @@ -382,17 +341,6 @@ int32_t loopIterPlusPlus(std::u16string_view s) { return sum; } -int32_t loopReadAndInc(std::u16string_view s) { - header::U16StringCodePoints range(s); - int32_t sum = 0; - auto iter = range.begin(); - auto limit = range.end(); - while (iter != limit) { - sum += iter.readAndInc().codePoint; - } - return sum; -} - int32_t reverseLoop(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; From 70ef2fa0d37f4a75315d78fab8248d45cca8a4cf Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 14:48:30 -0800 Subject: [PATCH 17/23] C enum UIllFormedBehavior will be shared with 8-bit --- icu4c/source/common/unicode/utf16cppiter.h | 53 +++++++++---------- icu4c/source/test/intltest/utfcppitertest.cpp | 11 ++-- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index e9c58d2ab0b0..3c60f461b954 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -47,18 +47,15 @@ namespace header {} #ifndef U_HIDE_DRAFT_API -namespace U_HEADER_ONLY_NAMESPACE { +// Some defined behaviors for handling ill-formed Unicode strings. +// TODO: For 8-bit strings, the SURROGATE option does not have an equivalent -- static_assert. +typedef enum UIllFormedBehavior { + U_BEHAVIOR_NEGATIVE, + U_BEHAVIOR_FFFD, + U_BEHAVIOR_SURROGATE +} UIllFormedBehavior; -// Some defined behaviors for handling ill-formed 16-bit strings. -// TODO: Maybe share with 8-bit strings, but the SURROGATE option does not have an equivalent there. -// -// TODO: A possible alternative to an enum might be some kind of function template -// which would be fully customizable. -enum U16IllFormedBehavior { - U16_BEHAVIOR_NEGATIVE, - U16_BEHAVIOR_FFFD, - U16_BEHAVIOR_SURROGATE -}; +namespace U_HEADER_ONLY_NAMESPACE { /** * A code unit sequence for one code point returned by U16Iterator. @@ -66,7 +63,7 @@ enum U16IllFormedBehavior { * * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE + * should be signed if U_BEHAVIOR_NEGATIVE * @draft ICU 77 */ template @@ -89,7 +86,7 @@ struct U16OneSeq { * Not intended for public subclassing. * @internal */ -template +template class U16IteratorBase { protected: // @internal @@ -146,9 +143,9 @@ class U16IteratorBase { // @internal CP32 sub(CP32 surrogate) const { switch (behavior) { - case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL; - case U16_BEHAVIOR_FFFD: return 0xfffd; - case U16_BEHAVIOR_SURROGATE: return surrogate; + case U_BEHAVIOR_NEGATIVE: return U_SENTINEL; + case U_BEHAVIOR_FFFD: return 0xfffd; + case U_BEHAVIOR_SURROGATE: return surrogate; } } @@ -167,11 +164,11 @@ class U16IteratorBase { * * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE - * @tparam U16IllFormedBehavior TODO + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO * @draft ICU 77 */ -template +template class U16Iterator : private U16IteratorBase { // FYI: We need to qualify all accesses to super class members because of private inheritance. using Super = U16IteratorBase; @@ -224,11 +221,11 @@ class U16Iterator : private U16IteratorBase { * * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE - * @tparam U16IllFormedBehavior TODO + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO * @draft ICU 77 */ -template +template class U16ReverseIterator : private U16IteratorBase { using Super = U16IteratorBase; public: @@ -269,11 +266,11 @@ class U16ReverseIterator : private U16IteratorBase { * * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U16_BEHAVIOR_NEGATIVE - * @tparam U16IllFormedBehavior TODO + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO * @draft ICU 77 */ -template +template class U16StringCodePoints { public: /** @@ -322,7 +319,7 @@ class U16StringCodePoints { // TODO: remove experimental sample code #ifndef UTYPES_H int32_t rangeLoop(std::u16string_view s) { - header::U16StringCodePoints range(s); + header::U16StringCodePoints range(s); int32_t sum = 0; for (auto seq : range) { sum += seq.codePoint; @@ -331,7 +328,7 @@ int32_t rangeLoop(std::u16string_view s) { } int32_t loopIterPlusPlus(std::u16string_view s) { - header::U16StringCodePoints range(s); + header::U16StringCodePoints range(s); int32_t sum = 0; auto iter = range.begin(); auto limit = range.end(); @@ -342,7 +339,7 @@ int32_t loopIterPlusPlus(std::u16string_view s) { } int32_t reverseLoop(std::u16string_view s) { - header::U16StringCodePoints range(s); + header::U16StringCodePoints range(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { sum += (*iter).codePoint; diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index 15db94e613b0..9fc7c1ab5969 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -20,9 +20,6 @@ // https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv using namespace std::string_view_literals; -using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE; -using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_FFFD; -using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_SURROGATE; using U_HEADER_ONLY_NAMESPACE::U16Iterator; using U_HEADER_ONLY_NAMESPACE::U16OneSeq; using U_HEADER_ONLY_NAMESPACE::U16StringCodePoints; @@ -58,7 +55,7 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam void U16IteratorTest::testGood() { IcuTestErrorCode errorCode(*this, "testGood"); std::u16string_view good(u"abçカ🚴"sv); - U16StringCodePoints range(good); + U16StringCodePoints range(good); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment @@ -83,7 +80,7 @@ void U16IteratorTest::testNegative() { IcuTestErrorCode errorCode(*this, "testNegative"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - U16StringCodePoints range(bad); + U16StringCodePoints range(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment @@ -107,7 +104,7 @@ void U16IteratorTest::testFFFD() { IcuTestErrorCode errorCode(*this, "testFFFD"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - U16StringCodePoints range(bad); + U16StringCodePoints range(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment @@ -131,7 +128,7 @@ void U16IteratorTest::testSurrogate() { IcuTestErrorCode errorCode(*this, "testSurrogate"); static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - U16StringCodePoints range(bad); + U16StringCodePoints range(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); ++iter; // pre-increment From da93999f6d0bac632e0910c033299be37b510f6d Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 14:56:42 -0800 Subject: [PATCH 18/23] CodeUnits result will be shared with 8-bit --- icu4c/source/common/unicode/utf16cppiter.h | 30 +++++++++++-------- icu4c/source/test/intltest/utfcppitertest.cpp | 1 - 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 3c60f461b954..94d217abdb11 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -58,24 +58,25 @@ typedef enum UIllFormedBehavior { namespace U_HEADER_ONLY_NAMESPACE { /** - * A code unit sequence for one code point returned by U16Iterator. - * TODO: Share with UTF-8? + * Result of decoding a minimal Unicode code unit sequence. * - * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam Unit Code unit type: + * UTF-8: char or char8_t or uint8_t; + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if U_BEHAVIOR_NEGATIVE * @draft ICU 77 */ -template -struct U16OneSeq { +template +struct CodeUnits { // Order of fields with padding and access frequency in mind. CP32 codePoint = 0; uint8_t length = 0; bool isWellFormed = false; - const Unit16 *data; + const Unit *data; - std::basic_string_view stringView() const { - return std::basic_string_view(data, length); + std::basic_string_view stringView() const { + return std::basic_string_view(data, length); } // TODO: std::optional maybeCodePoint() const ? (nullopt if !isWellFormed) @@ -84,6 +85,11 @@ struct U16OneSeq { /** * Internal base class for public U16Iterator & U16ReverseIterator. * Not intended for public subclassing. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior TODO * @internal */ template @@ -101,7 +107,7 @@ class U16IteratorBase { bool operator!=(const U16IteratorBase &other) const { return !operator==(other); } // @internal - U16OneSeq readAndInc(const Unit16 *&p) const { + CodeUnits readAndInc(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_NEXT_OR_FFFD(). const Unit16 *p0 = p; @@ -121,7 +127,7 @@ class U16IteratorBase { } // @internal - U16OneSeq decAndRead(const Unit16 *&p) const { + CodeUnits decAndRead(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_PREV_OR_FFFD(). CP32 c = *--p; @@ -182,7 +188,7 @@ class U16Iterator : private U16IteratorBase { bool operator==(const U16Iterator &other) const { return Super::operator==(other); } bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); } - U16OneSeq operator*() const { + CodeUnits operator*() const { // Call the same function in both operator*() and operator++() so that an // optimizing compiler can easily eliminate redundant work when alternating between the two. const Unit16 *p = Super::current; @@ -238,7 +244,7 @@ class U16ReverseIterator : private U16IteratorBase { bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); } bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); } - U16OneSeq operator*() const { + CodeUnits operator*() const { // Call the same function in both operator*() and operator++() so that an // optimizing compiler can easily eliminate redundant work when alternating between the two. const Unit16 *p = Super::current; diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index 9fc7c1ab5969..cbb9c9728ba7 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -21,7 +21,6 @@ using namespace std::string_view_literals; using U_HEADER_ONLY_NAMESPACE::U16Iterator; -using U_HEADER_ONLY_NAMESPACE::U16OneSeq; using U_HEADER_ONLY_NAMESPACE::U16StringCodePoints; class U16IteratorTest : public IntlTest { From 5c6e1a6a76cf8da739dc5a54daf9a59d042c8555 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 15:16:35 -0800 Subject: [PATCH 19/23] CodeUnits: getters / private fields --- icu4c/source/common/unicode/utf16cppiter.h | 41 ++++++--- icu4c/source/test/intltest/utfcppitertest.cpp | 86 +++++++++---------- 2 files changed, 72 insertions(+), 55 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 94d217abdb11..529bc844d70d 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -68,18 +68,35 @@ namespace U_HEADER_ONLY_NAMESPACE { * @draft ICU 77 */ template -struct CodeUnits { - // Order of fields with padding and access frequency in mind. - CP32 codePoint = 0; - uint8_t length = 0; - bool isWellFormed = false; - const Unit *data; +class CodeUnits { +public: + // @internal + CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, const Unit *data) : + c(codePoint), len(length), ok(wellFormed), p(data) {} + + CodeUnits(const CodeUnits &other) = default; + CodeUnits &operator=(const CodeUnits &other) = default; + + UChar32 codePoint() const { return c; } + + bool wellFormed() const { return ok; } + + const Unit *data() const { return p; } + + int32_t length() const { return len; } std::basic_string_view stringView() const { - return std::basic_string_view(data, length); + return std::basic_string_view(p, len); } - // TODO: std::optional maybeCodePoint() const ? (nullopt if !isWellFormed) + // TODO: std::optional maybeCodePoint() const ? (nullopt if ill-formed) + +private: + // Order of fields with padding and access frequency in mind. + CP32 c; + uint8_t len; + bool ok; + const Unit *p; }; /** @@ -327,8 +344,8 @@ class U16StringCodePoints { int32_t rangeLoop(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; - for (auto seq : range) { - sum += seq.codePoint; + for (auto units : range) { + sum += units.codePoint(); } return sum; } @@ -339,7 +356,7 @@ int32_t loopIterPlusPlus(std::u16string_view s) { auto iter = range.begin(); auto limit = range.end(); while (iter != limit) { - sum += (*iter++).codePoint; + sum += (*iter++).codePoint(); } return sum; } @@ -348,7 +365,7 @@ int32_t reverseLoop(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { - sum += (*iter).codePoint; + sum += (*iter).codePoint(); } return sum; } diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index cbb9c9728ba7..16ed3d0c627a 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -56,22 +56,22 @@ void U16IteratorTest::testGood() { std::u16string_view good(u"abçカ🚴"sv); U16StringCodePoints range(good); auto iter = range.begin(); - assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); ++iter; // pre-increment - auto seq = *iter; - assertEquals("iter[1] * codePoint", u'b', seq.codePoint); - assertEquals("iter[1] * length", 1, seq.length); - assertTrue("iter[1] * isWellFormed", seq.isWellFormed); - assertTrue("iter[1] * stringView()", seq.stringView() == u"b"sv); + auto units = *iter; + assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + assertTrue("iter[1] * stringView()", units.stringView() == u"b"sv); ++iter; - assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint); // post-increment - assertEquals("iter[3] * codePoint", u'カ', (*iter).codePoint); + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] * codePoint", u'カ', (*iter).codePoint()); ++iter; - seq = *iter++; - assertEquals("iter[4] * codePoint", U'🚴', seq.codePoint); - assertEquals("iter[4] * length", 2, seq.length); - assertTrue("iter[4] * isWellFormed", seq.isWellFormed); - assertTrue("iter[4] * stringView()", seq.stringView() == u"🚴"sv); + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 2, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + assertTrue("iter[4] * stringView()", units.stringView() == u"🚴"sv); assertTrue("iter == endIter", iter == range.end()); } @@ -81,20 +81,20 @@ void U16IteratorTest::testNegative() { std::u16string_view bad(badChars, 5); U16StringCodePoints range(bad); auto iter = range.begin(); - assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); ++iter; // pre-increment - auto seq = *iter; - assertEquals("iter[1] * codePoint", -1, seq.codePoint); - assertEquals("iter[1] * length", 1, seq.length); - assertFalse("iter[1] * isWellFormed", seq.isWellFormed); - auto sv = seq.stringView(); + auto units = *iter; + assertEquals("iter[1] * codePoint", -1, units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertFalse("iter[1] * wellFormed", units.wellFormed()); + auto sv = units.stringView(); assertEquals("iter[1] * stringView().length()", 1, sv.length()); assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); ++iter; - assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint); // post-increment - seq = *iter++; // post-increment - assertEquals("iter[3] * codePoint", -1, seq.codePoint); - assertFalse("iter[3] * isWellFormed", seq.isWellFormed); + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment + units = *iter++; // post-increment + assertEquals("iter[3] * codePoint", -1, units.codePoint()); + assertFalse("iter[3] * wellFormed", units.wellFormed()); assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment assertTrue("iter == endIter", iter == range.end()); } @@ -105,20 +105,20 @@ void U16IteratorTest::testFFFD() { std::u16string_view bad(badChars, 5); U16StringCodePoints range(bad); auto iter = range.begin(); - assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); ++iter; // pre-increment - auto seq = *iter; - assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint); - assertEquals("iter[1] * length", 1, seq.length); - assertFalse("iter[1] * isWellFormed", seq.isWellFormed); - auto sv = seq.stringView(); + auto units = *iter; + assertEquals("iter[1] * codePoint", 0xfffd, units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertFalse("iter[1] * wellFormed", units.wellFormed()); + auto sv = units.stringView(); assertEquals("iter[1] * stringView().length()", 1, sv.length()); assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); ++iter; - assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint); // post-increment - seq = *iter++; // post-increment - assertEquals("iter[3] * codePoint", 0xfffd, seq.codePoint); - assertFalse("iter[3] * isWellFormed", seq.isWellFormed); + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment + units = *iter++; // post-increment + assertEquals("iter[3] * codePoint", 0xfffd, units.codePoint()); + assertFalse("iter[3] * wellFormed", units.wellFormed()); assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment assertTrue("iter == endIter", iter == range.end()); } @@ -129,20 +129,20 @@ void U16IteratorTest::testSurrogate() { std::u16string_view bad(badChars, 5); U16StringCodePoints range(bad); auto iter = range.begin(); - assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); ++iter; // pre-increment - auto seq = *iter; - assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint); - assertEquals("iter[1] * length", 1, seq.length); - assertFalse("iter[1] * isWellFormed", seq.isWellFormed); - auto sv = seq.stringView(); + auto units = *iter; + assertEquals("iter[1] * codePoint", 0xd900, units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertFalse("iter[1] * wellFormed", units.wellFormed()); + auto sv = units.stringView(); assertEquals("iter[1] * stringView().length()", 1, sv.length()); assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); ++iter; - assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint); // post-increment - seq = *iter++; // post-increment - assertEquals("iter[3] * codePoint", 0xdc05, seq.codePoint); - assertFalse("iter[3] * isWellFormed", seq.isWellFormed); + assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment + units = *iter++; // post-increment + assertEquals("iter[3] * codePoint", 0xdc05, units.codePoint()); + assertFalse("iter[3] * wellFormed", units.wellFormed()); assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment assertTrue("iter == endIter", iter == range.end()); } From 84dc5f46195f9e22bb0fe307fa197aa16ef78d63 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 16:05:05 -0800 Subject: [PATCH 20/23] unsafe=well-formed iterators --- icu4c/source/common/unicode/utf16cppiter.h | 284 ++++++++++++++++++++- 1 file changed, 277 insertions(+), 7 deletions(-) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 529bc844d70d..777cea6174e4 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -58,7 +58,8 @@ typedef enum UIllFormedBehavior { namespace U_HEADER_ONLY_NAMESPACE { /** - * Result of decoding a minimal Unicode code unit sequence. + * Result of validating and decoding a minimal Unicode code unit sequence. + * Returned from validating Unicode string code point iterators. * * @tparam Unit Code unit type: * UTF-8: char or char8_t or uint8_t; @@ -99,6 +100,46 @@ class CodeUnits { const Unit *p; }; +/** + * Result of decoding a minimal Unicode code unit sequence which must be well-formed. + * Returned from non-validating Unicode string code point iterators. + * + * @tparam Unit Code unit type: + * UTF-8: char or char8_t or uint8_t; + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class UnsafeCodeUnits { +public: + // @internal + UnsafeCodeUnits(CP32 codePoint, uint8_t length, const Unit *data) : + c(codePoint), len(length), p(data) {} + + UnsafeCodeUnits(const UnsafeCodeUnits &other) = default; + UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default; + + UChar32 codePoint() const { return c; } + + const Unit *data() const { return p; } + + int32_t length() const { return len; } + + std::basic_string_view stringView() const { + return std::basic_string_view(p, len); + } + + // TODO: std::optional maybeCodePoint() const ? (nullopt if ill-formed) + +private: + // Order of fields with padding and access frequency in mind. + CP32 c; + uint8_t len; + const Unit *p; +}; + /** * Internal base class for public U16Iterator & U16ReverseIterator. * Not intended for public subclassing. @@ -118,6 +159,11 @@ class U16IteratorBase { // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. + // @internal + U16IteratorBase(const U16IteratorBase &other) = default; + // @internal + U16IteratorBase &operator=(const U16IteratorBase &other) = default; + // @internal bool operator==(const U16IteratorBase &other) const { return current == other.current; } // @internal @@ -201,6 +247,7 @@ class U16Iterator : private U16IteratorBase { Super(start, p, limit) {} U16Iterator(const U16Iterator &other) = default; + U16Iterator &operator=(const U16Iterator &other) = default; bool operator==(const U16Iterator &other) const { return Super::operator==(other); } bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); } @@ -257,6 +304,7 @@ class U16ReverseIterator : private U16IteratorBase { Super(start, p, limit) {} U16ReverseIterator(const U16ReverseIterator &other) = default; + U16ReverseIterator &operator=(const U16ReverseIterator &other) = default; bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); } bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); } @@ -285,7 +333,7 @@ class U16ReverseIterator : private U16IteratorBase { }; /** - * A C++ "range" for iterating over all of the code points of a 16-bit Unicode string. + * A C++ "range" for validating iteration over all of the code points of a 16-bit Unicode string. * * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; @@ -305,6 +353,9 @@ class U16StringCodePoints { /** @draft ICU 77 */ U16StringCodePoints(const U16StringCodePoints &other) = default; + /** @draft ICU 77 */ + U16StringCodePoints &operator=(const U16StringCodePoints &other) = default; + /** @draft ICU 77 */ U16Iterator begin() const { return {s.data(), s.data(), s.data() + s.length()}; @@ -333,11 +384,212 @@ class U16StringCodePoints { // ------------------------------------------------------------------------- *** -// TODO: Non-validating iterator over the code points in a Unicode 16-bit string. -// Assumes well-formed UTF-16. Otherwise the behavior is undefined. -// template -// class U16UnsafeIterator -// TODO: only p, no start, no limit +/** + * Internal base class for public U16UnsafeIterator & U16UnsafeReverseIterator. + * Not intended for public subclassing. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @internal + */ +template +class U16UnsafeIteratorBase { +protected: + // @internal + U16UnsafeIteratorBase(const Unit16 *p) : current(p) {} + // Test pointers for == or != but not < or >. + + // @internal + U16UnsafeIteratorBase(const U16UnsafeIteratorBase &other) = default; + // @internal + U16UnsafeIteratorBase &operator=(const U16UnsafeIteratorBase &other) = default; + + // @internal + bool operator==(const U16UnsafeIteratorBase &other) const { return current == other.current; } + // @internal + bool operator!=(const U16UnsafeIteratorBase &other) const { return !operator==(other); } + + // @internal + UnsafeCodeUnits readAndInc(const Unit16 *&p) const { + // Very similar to U16_NEXT_UNSAFE(). + const Unit16 *p0 = p; + CP32 c = *p++; + if (!U16_IS_LEAD(c)) { + return {c, 1, p0}; + } else { + c = U16_GET_SUPPLEMENTARY(c, *p++); + return {c, 2, p0}; + } + } + + // @internal + UnsafeCodeUnits decAndRead(const Unit16 *&p) const { + // Very similar to U16_PREV_UNSAFE(). + CP32 c = *--p; + if (!U16_IS_TRAIL(c)) { + return {c, 1, p}; + } else { + c = U16_GET_SUPPLEMENTARY(*--p, c); + return {c, 2, p}; + } + } + + // @internal + const Unit16 *current; +}; + +/** + * Non-validating bidirectional iterator over the code points in a UTF-16 string. + * The string must be well-formed. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class U16UnsafeIterator : private U16UnsafeIteratorBase { + // FYI: We need to qualify all accesses to super class members because of private inheritance. + using Super = U16UnsafeIteratorBase; +public: + // TODO: make private, make friends + U16UnsafeIterator(const Unit16 *p) : Super(p) {} + + U16UnsafeIterator(const U16UnsafeIterator &other) = default; + U16UnsafeIterator &operator=(const U16UnsafeIterator &other) = default; + + bool operator==(const U16UnsafeIterator &other) const { return Super::operator==(other); } + bool operator!=(const U16UnsafeIterator &other) const { return !Super::operator==(other); } + + UnsafeCodeUnits operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = Super::current; + return Super::readAndInc(p); + } + + U16UnsafeIterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::readAndInc(Super::current); + return *this; + } + + U16UnsafeIterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16UnsafeIterator result(*this); + Super::readAndInc(Super::current); + return result; + } + + U16UnsafeIterator &operator--() { // pre-decrement + return Super::dec(); + } + + U16UnsafeIterator operator--(int) { // post-decrement + U16UnsafeIterator result(*this); + Super::dec(); + return result; + } +}; + +/** + * Non-validating reverse iterator over the code points in a UTF-16 string. + * Not bidirectional, but optimized for reverse iteration. + * The string must be well-formed. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class U16UnsafeReverseIterator : private U16UnsafeIteratorBase { + using Super = U16UnsafeIteratorBase; +public: + // TODO: make private, make friends + U16UnsafeReverseIterator(const Unit16 *p) : Super(p) {} + + U16UnsafeReverseIterator(const U16UnsafeReverseIterator &other) = default; + U16UnsafeReverseIterator &operator=(const U16UnsafeReverseIterator &other) = default; + + bool operator==(const U16UnsafeReverseIterator &other) const { return Super::operator==(other); } + bool operator!=(const U16UnsafeReverseIterator &other) const { return !Super::operator==(other); } + + UnsafeCodeUnits operator*() const { + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + const Unit16 *p = Super::current; + return Super::decAndRead(p); + } + + U16UnsafeReverseIterator &operator++() { // pre-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + Super::decAndRead(Super::current); + return *this; + } + + U16UnsafeReverseIterator operator++(int) { // post-increment + // Call the same function in both operator*() and operator++() so that an + // optimizing compiler can easily eliminate redundant work when alternating between the two. + U16UnsafeReverseIterator result(*this); + Super::decAndRead(Super::current); + return result; + } +}; + +/** + * A C++ "range" for non-validating iteration over all of the code points of a UTF-16 string. + * The string must be well-formed. + * + * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @draft ICU 77 + */ +template +class U16UnsafeStringCodePoints { +public: + /** + * Constructs a C++ "range" object over the code points in the string. + * @draft ICU 77 + */ + U16UnsafeStringCodePoints(std::basic_string_view s) : s(s) {} + + /** @draft ICU 77 */ + U16UnsafeStringCodePoints(const U16UnsafeStringCodePoints &other) = default; + U16UnsafeStringCodePoints &operator=(const U16UnsafeStringCodePoints &other) = default; + + /** @draft ICU 77 */ + U16UnsafeIterator begin() const { + return {s.data()}; + } + + /** @draft ICU 77 */ + U16UnsafeIterator end() const { + return {s.data() + s.length()}; + } + + /** @draft ICU 77 */ + U16UnsafeReverseIterator rbegin() const { + return {s.data() + s.length()}; + } + + /** @draft ICU 77 */ + U16UnsafeReverseIterator rend() const { + return {s.data()}; + } + +private: + std::basic_string_view s; +}; + +// ------------------------------------------------------------------------- *** + +// TODO: UTF-8 // TODO: remove experimental sample code #ifndef UTYPES_H @@ -369,6 +621,24 @@ int32_t reverseLoop(std::u16string_view s) { } return sum; } + +int32_t unsafeRangeLoop(std::u16string_view s) { + header::U16UnsafeStringCodePoints range(s); + int32_t sum = 0; + for (auto units : range) { + sum += units.codePoint(); + } + return sum; +} + +int32_t unsafeReverseLoop(std::u16string_view s) { + header::U16UnsafeStringCodePoints range(s); + int32_t sum = 0; + for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { + sum += (*iter).codePoint(); + } + return sum; +} #endif } // namespace U_HEADER_ONLY_NAMESPACE From 8bea75e17ba91277e5c023aaebbfa43de38688da Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 16:10:05 -0800 Subject: [PATCH 21/23] restore base dec() (oops) --- icu4c/source/common/unicode/utf16cppiter.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 777cea6174e4..bcb7221fd5b3 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -169,6 +169,15 @@ class U16IteratorBase { // @internal bool operator!=(const U16IteratorBase &other) const { return !operator==(other); } + // @internal + void dec() { + // TODO: assert current != limit -- more precisely: start <= current < limit + // Very similar to U16_BACK_1(). + if (U16_IS_TRAIL(*(--current)) && current != start && U16_IS_LEAD(*(current - 1))) { + --current; + } + } + // @internal CodeUnits readAndInc(const Unit16 *&p) const { // TODO: assert p != limit -- more precisely: start <= p < limit @@ -410,6 +419,14 @@ class U16UnsafeIteratorBase { // @internal bool operator!=(const U16UnsafeIteratorBase &other) const { return !operator==(other); } + // @internal + void dec() { + // Very similar to U16_BACK_1_UNSAFE(). + if (U16_IS_TRAIL(*(--current))) { + --current; + } + } + // @internal UnsafeCodeUnits readAndInc(const Unit16 *&p) const { // Very similar to U16_NEXT_UNSAFE(). From 5281d61f6d0c2315792c5e82d91eb650f61eaa2f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 6 Jan 2025 17:37:42 -0800 Subject: [PATCH 22/23] rename to utfiter.h, also test --- icu4c/source/common/unicode/{utf16cppiter.h => utfiter.h} | 0 icu4c/source/test/intltest/Makefile.in | 2 +- icu4c/source/test/intltest/intltest.vcxproj | 2 +- icu4c/source/test/intltest/intltest.vcxproj.filters | 2 +- .../test/intltest/{utfcppitertest.cpp => utfitertest.cpp} | 4 ++-- 5 files changed, 5 insertions(+), 5 deletions(-) rename icu4c/source/common/unicode/{utf16cppiter.h => utfiter.h} (100%) rename icu4c/source/test/intltest/{utfcppitertest.cpp => utfitertest.cpp} (99%) diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utfiter.h similarity index 100% rename from icu4c/source/common/unicode/utf16cppiter.h rename to icu4c/source/common/unicode/utfiter.h diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 64f36bd061f8..8a12daa2f5de 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -75,7 +75,7 @@ numbertest_parse.o numbertest_doubleconversion.o numbertest_skeletons.o \ static_unisets_test.o numfmtdatadriventest.o numbertest_range.o erarulestest.o \ formattedvaluetest.o formatted_string_builder_test.o numbertest_permutation.o \ units_data_test.o units_router_test.o units_test.o displayoptions_test.o \ -numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o utfcppitertest.o +numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o utfitertest.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index 8d9bba021508..476b4b3b5934 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -223,7 +223,7 @@ - + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index 0abc4608d1a6..7fc0c646647a 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -490,7 +490,7 @@ strings - + strings diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfitertest.cpp similarity index 99% rename from icu4c/source/test/intltest/utfcppitertest.cpp rename to icu4c/source/test/intltest/utfitertest.cpp index 16ed3d0c627a..8f40229abe16 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfitertest.cpp @@ -1,7 +1,7 @@ // © 2024 and later: Unicode, Inc. and others. // License & terms of use: https://www.unicode.org/copyright.html -// utfcppitertest.cpp +// utfitertest.cpp // created: 2024aug12 Markus W. Scherer #include @@ -13,7 +13,7 @@ // #define U_SHOW_CPLUSPLUS_HEADER_API 1 #include "unicode/utypes.h" -#include "unicode/utf16cppiter.h" +#include "unicode/utfiter.h" #include "intltest.h" // Makes u"literal"sv std::u16string_view literals possible. From 035c2c1fba9b106dc166a2273dff371705fb93c8 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 7 Jan 2025 16:32:35 -0800 Subject: [PATCH 23/23] validating iter based on other iter --- icu4c/source/common/unicode/utfiter.h | 81 ++++++++++++---------- icu4c/source/test/intltest/utfitertest.cpp | 61 ++++++++++++++++ 2 files changed, 105 insertions(+), 37 deletions(-) diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h index bcb7221fd5b3..78252ddebc14 100644 --- a/icu4c/source/common/unicode/utfiter.h +++ b/icu4c/source/common/unicode/utfiter.h @@ -61,18 +61,19 @@ namespace U_HEADER_ONLY_NAMESPACE { * Result of validating and decoding a minimal Unicode code unit sequence. * Returned from validating Unicode string code point iterators. * - * @tparam Unit Code unit type: + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: * UTF-8: char or char8_t or uint8_t; * UTF-16: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if U_BEHAVIOR_NEGATIVE * @draft ICU 77 */ -template +template class CodeUnits { + using Unit = typename std::iterator_traits::value_type; public: // @internal - CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, const Unit *data) : + CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter data) : c(codePoint), len(length), ok(wellFormed), p(data) {} CodeUnits(const CodeUnits &other) = default; @@ -82,22 +83,24 @@ class CodeUnits { bool wellFormed() const { return ok; } - const Unit *data() const { return p; } + UnitIter data() const { return p; } int32_t length() const { return len; } - std::basic_string_view stringView() const { + template + std::enable_if_t< + std::is_pointer_v, + std::basic_string_view> + stringView() const { return std::basic_string_view(p, len); } - // TODO: std::optional maybeCodePoint() const ? (nullopt if ill-formed) - private: // Order of fields with padding and access frequency in mind. CP32 c; uint8_t len; bool ok; - const Unit *p; + UnitIter p; }; /** @@ -144,17 +147,18 @@ class UnsafeCodeUnits { * Internal base class for public U16Iterator & U16ReverseIterator. * Not intended for public subclassing. * - * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if U_BEHAVIOR_NEGATIVE * @tparam UIllFormedBehavior TODO * @internal */ -template +template class U16IteratorBase { protected: // @internal - U16IteratorBase(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : + U16IteratorBase(UnitIter start, UnitIter p, UnitIter limit) : start(start), current(p), limit(limit) {} // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. @@ -179,10 +183,10 @@ class U16IteratorBase { } // @internal - CodeUnits readAndInc(const Unit16 *&p) const { + CodeUnits readAndInc(UnitIter &p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_NEXT_OR_FFFD(). - const Unit16 *p0 = p; + UnitIter p0 = p; CP32 c = *p++; if (!U16_IS_SURROGATE(c)) { return {c, 1, true, p0}; @@ -199,16 +203,17 @@ class U16IteratorBase { } // @internal - CodeUnits decAndRead(const Unit16 *&p) const { + CodeUnits decAndRead(UnitIter &p) const { // TODO: assert p != limit -- more precisely: start <= p < limit // Very similar to U16_PREV_OR_FFFD(). CP32 c = *--p; if (!U16_IS_SURROGATE(c)) { return {c, 1, true, p}; } else { + UnitIter p1; uint16_t c2; - if (U16_IS_SURROGATE_TRAIL(c) && p != start && U16_IS_LEAD(c2 = *(p - 1))) { - --p; + if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p--, U16_IS_LEAD(c2 = *p1))) { + p = p1; c = U16_GET_SUPPLEMENTARY(c2, c); return {c, 2, true, p}; } else { @@ -230,29 +235,30 @@ class U16IteratorBase { // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. // @internal - const Unit16 *const start; + const UnitIter start; // @internal - const Unit16 *current; + UnitIter current; // @internal - const Unit16 *const limit; + const UnitIter limit; }; /** * Validating bidirectional iterator over the code points in a Unicode 16-bit string. * - * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if U_BEHAVIOR_NEGATIVE * @tparam UIllFormedBehavior TODO * @draft ICU 77 */ -template -class U16Iterator : private U16IteratorBase { +template +class U16Iterator : private U16IteratorBase { // FYI: We need to qualify all accesses to super class members because of private inheritance. - using Super = U16IteratorBase; + using Super = U16IteratorBase; public: // TODO: make private, make friends - U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : + U16Iterator(UnitIter start, UnitIter p, UnitIter limit) : Super(start, p, limit) {} U16Iterator(const U16Iterator &other) = default; @@ -261,10 +267,10 @@ class U16Iterator : private U16IteratorBase { bool operator==(const U16Iterator &other) const { return Super::operator==(other); } bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); } - CodeUnits operator*() const { + CodeUnits operator*() const { // Call the same function in both operator*() and operator++() so that an // optimizing compiler can easily eliminate redundant work when alternating between the two. - const Unit16 *p = Super::current; + UnitIter p = Super::current; return Super::readAndInc(p); } @@ -298,18 +304,19 @@ class U16Iterator : private U16IteratorBase { * Validating reverse iterator over the code points in a Unicode 16-bit string. * Not bidirectional, but optimized for reverse iteration. * - * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if U_BEHAVIOR_NEGATIVE * @tparam UIllFormedBehavior TODO * @draft ICU 77 */ -template -class U16ReverseIterator : private U16IteratorBase { - using Super = U16IteratorBase; +template +class U16ReverseIterator : private U16IteratorBase { + using Super = U16IteratorBase; public: // TODO: make private, make friends - U16ReverseIterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : + U16ReverseIterator(UnitIter start, UnitIter p, UnitIter limit) : Super(start, p, limit) {} U16ReverseIterator(const U16ReverseIterator &other) = default; @@ -318,10 +325,10 @@ class U16ReverseIterator : private U16IteratorBase { bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); } bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); } - CodeUnits operator*() const { + CodeUnits operator*() const { // Call the same function in both operator*() and operator++() so that an // optimizing compiler can easily eliminate redundant work when alternating between the two. - const Unit16 *p = Super::current; + UnitIter p = Super::current; return Super::decAndRead(p); } @@ -366,24 +373,24 @@ class U16StringCodePoints { U16StringCodePoints &operator=(const U16StringCodePoints &other) = default; /** @draft ICU 77 */ - U16Iterator begin() const { + U16Iterator begin() const { return {s.data(), s.data(), s.data() + s.length()}; } /** @draft ICU 77 */ - U16Iterator end() const { + U16Iterator end() const { const Unit16 *limit = s.data() + s.length(); return {s.data(), limit, limit}; } /** @draft ICU 77 */ - U16ReverseIterator rbegin() const { + U16ReverseIterator rbegin() const { const Unit16 *limit = s.data() + s.length(); return {s.data(), limit, limit}; } /** @draft ICU 77 */ - U16ReverseIterator rend() const { + U16ReverseIterator rend() const { return {s.data(), s.data(), s.data() + s.length()}; } diff --git a/icu4c/source/test/intltest/utfitertest.cpp b/icu4c/source/test/intltest/utfitertest.cpp index 8f40229abe16..a8bda260bc4d 100644 --- a/icu4c/source/test/intltest/utfitertest.cpp +++ b/icu4c/source/test/intltest/utfitertest.cpp @@ -23,6 +23,31 @@ using namespace std::string_view_literals; using U_HEADER_ONLY_NAMESPACE::U16Iterator; using U_HEADER_ONLY_NAMESPACE::U16StringCodePoints; +template +class FwdIter { +public: + typedef Unit value_type; + + FwdIter(const Unit *data) : p(data) {} + + bool operator==(const FwdIter &other) const { return p == other.p; } + bool operator!=(const FwdIter &other) const { return !operator==(other); } + + Unit operator*() const { return *p; } + FwdIter &operator++() { // pre-increment + ++p; + return *this; + } + FwdIter operator++(int) { // post-increment + FwdIter result(*this); + ++p; + return result; + } + +private: + const Unit *p; +}; + class U16IteratorTest : public IntlTest { public: U16IteratorTest() {} @@ -33,6 +58,7 @@ class U16IteratorTest : public IntlTest { void testNegative(); void testFFFD(); void testSurrogate(); + void testFwdIter(); }; extern IntlTest *createU16IteratorTest() { @@ -48,6 +74,7 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam TESTCASE_AUTO(testNegative); TESTCASE_AUTO(testFFFD); TESTCASE_AUTO(testSurrogate); + TESTCASE_AUTO(testFwdIter); TESTCASE_AUTO_END; } @@ -90,6 +117,7 @@ void U16IteratorTest::testNegative() { auto sv = units.stringView(); assertEquals("iter[1] * stringView().length()", 1, sv.length()); assertEquals("iter[1] * stringView()[0]", 0xd900, sv[0]); + // TODO: test units.data() ++iter; assertEquals("iter[2] * codePoint", u'b', (*iter++).codePoint()); // post-increment units = *iter++; // post-increment @@ -146,3 +174,36 @@ void U16IteratorTest::testSurrogate() { assertEquals("iter[4] * stringView()", u"ç", (*iter++).stringView()); // post-increment assertTrue("iter == endIter", iter == range.end()); } + +void U16IteratorTest::testFwdIter() { + IcuTestErrorCode errorCode(*this, "testFwdIter"); + std::u16string_view good(u"abçカ🚴"sv); + FwdIter goodBegin(good.data()); + FwdIter goodLimit(good.data() + good.length()); + U16Iterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin( + goodBegin, goodBegin, goodLimit); + U16Iterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit( + goodBegin, goodLimit, goodLimit); + // TODO: U16StringCodePoints range(good); + auto iter = rangeBegin; + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + // No units.stringView() when the unit iterator is not a pointer. + assertTrue("iter[1] * data()[0]", *units.data() == u'b'); + ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] * codePoint", u'カ', (*iter).codePoint()); + ++iter; + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 2, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + FwdIter data = units.data(); + assertTrue("iter[4] * data()[0]", *data++ == u"🚴"[0]); + assertTrue("iter[4] * data()[1]", *data == u"🚴"[1]); + assertTrue("iter == endIter", iter == rangeLimit); +}