From 59615c93f694227eaacf242d22a549c316557cdf Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Tue, 9 Aug 2022 14:36:16 +0300 Subject: [PATCH] ICU-22115 Merge passthrough and canonical combining class data into the NFD trie for ICU4X --- .../tools/icuexportdata/icuexportdata.cpp | 369 ++++++++++-------- 1 file changed, 215 insertions(+), 154 deletions(-) diff --git a/icu4c/source/tools/icuexportdata/icuexportdata.cpp b/icu4c/source/tools/icuexportdata/icuexportdata.cpp index b1f2df5577b3..a167b277f40e 100644 --- a/icu4c/source/tools/icuexportdata/icuexportdata.cpp +++ b/icu4c/source/tools/icuexportdata/icuexportdata.cpp @@ -380,7 +380,7 @@ void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_ fclose(f); } -void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector& pendingTrieInsertions) { +void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector& pendingTrieInsertions, char16_t passthroughCap) { IcuToolErrorCode status("icuexportdata: writeDecompositionData"); FILE* f = prepareOutputFile(basename); @@ -392,7 +392,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) { const PendingDescriptor& pending = pendingTrieInsertions[i]; uint32_t additional = 0; - if (!(pending.descriptor & 0xFFFF0000)) { + if (!(pending.descriptor & 0xFFFE0000)) { uint32_t offset = pending.descriptor & 0xFFF; if (!pending.supplementary) { if (offset >= baseSize16) { @@ -419,7 +419,15 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t handleError(status, basename); } } - umutablecptrie_set(builder.getAlias(), pending.scalar, pending.descriptor + additional, status); + // It turns out it's better to swap the halves compared to the initial + // idea in order to put special marker values close to zero so that + // an important marker value becomes 1, so it's efficient to compare + // "1 or 0". Unfortunately, going through all the code to swap + // things is too error prone, so let's do the swapping here in one + // place. + uint32_t oldTrieValue = pending.descriptor + additional; + uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16); + umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status); } LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( builder.getAlias(), @@ -460,9 +468,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t USet* iotaCheck = uset_cloneAsThawed(reference); uset_removeAll(iotaCheck, uset); - if (uset_equals(iotaCheck, iotaSubscript)) { - flags |= (1 << 1); - } else if (!uset_isEmpty(iotaCheck)) { + if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) { // The result was neither empty nor contained exactly // the iota subscript. The ICU4X normalizer doesn't // know how to deal with this case. @@ -475,6 +481,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t uset_close(halfWidthVoicing); fprintf(f, "flags = 0x%X\n", flags); + fprintf(f, "cap = 0x%X\n", passthroughCap); } fprintf(f, "[trie]\n"); usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); @@ -482,110 +489,64 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t handleError(status, basename); } -void writeNopCompositionPassThrough(const char* basename) { - IcuToolErrorCode status("icuexportdata: writeNopCompositionPassThrough"); - FILE* f = prepareOutputFile(basename); - - fprintf(f, "first = 0x0\n"); - - LocalUMutableCPTriePointer builder(umutablecptrie_open(0xFF, 0xFF, status)); - - LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( - builder.getAlias(), - trieType, - UCPTRIE_VALUE_BITS_8, - status)); - handleError(status, basename); +// Special marker for the NFKD form of U+FDFA +const int32_t FDFA_MARKER = 3; - fprintf(f, "[trie]\n"); - usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +// Special marker for characters whose decomposition starts with a non-starter +// and the decomposition isn't the character itself. +const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2; - fclose(f); - handleError(status, basename); -} +// Special marker for starters that decompose to themselves but that may +// combine backwards under canonical composition +const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1; -void writePotentialCompositionPassThrough(const char* basename, const Normalizer2* norm, const USet* decompositionStartsWithNonStarter, const USet* decompositionStartsWithBackwardCombiningStarter, USet* potentialPassthroughAndNotBackwardCombining) { - IcuToolErrorCode status("icuexportdata: writePotentialCompositionPassThrough"); - FILE* f = prepareOutputFile(basename); +/// Marker that a complex decomposition isn't round-trippable +/// under re-composition. +const uint32_t NON_ROUND_TRIP_MARKER = 1; - const Normalizer2* nfc = nullptr; - if (!norm) { - // UTS 46 case - norm = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status); - nfc = Normalizer2::getNFCInstance(status); +UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) { + if (knownToRoundTrip) { + return TRUE; } - for (UChar32 c = 0; c <= 0x10FFFF; ++c) { - if (c >= 0xD800 && c < 0xE000) { - // Surrogate - continue; - } - if (uset_contains(decompositionStartsWithNonStarter, c) || uset_contains(decompositionStartsWithBackwardCombiningStarter, c)) { - continue; - } - UnicodeString src; - UnicodeString dst; - src.append(c); - norm->normalize(src, dst, status); - if (nfc && (dst.isEmpty() || (dst == u"\uFFFD" && c != 0xFFFD))) { - // UTS 46 ignored and disallowed fall back to NFC for data - // overlap. - dst.truncate(0); - nfc->normalize(src, dst, status); - } - if (src == dst) { - uset_add(potentialPassthroughAndNotBackwardCombining, c); - } + // Nuktas, Hebrew presentation forms and polytonic Greek with oxia + // are special-cased in ICU4X. + if (c >= 0xFB1D && c <= 0xFB4E) { + // Hebrew presentation forms + return TRUE; } - - // There are fancier ways to do this, but let's keep things - // very simple: Deliberately not working this into the above - // loop and not extracting this from the inversion list - // directly. - for (UChar32 c = 0; c <= 0x10FFFF; ++c) { - if (!uset_contains(potentialPassthroughAndNotBackwardCombining, c)) { - fprintf(f, "first = 0x%X\n", c); - break; - } + if (c >= 0x1F71 && c <= 0x1FFB) { + // Polytonic Greek with oxia + return TRUE; } - - // 8 bits per trie value. Default is 0, which means pass-through. - // That is, the lookup key isn't actually a UChar32 but a UChar32 - // divided by 8, but that's still in range, so things work despite - // the data structure not being meant to be used like this. - LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); - - for (int32_t i = 0; i < ((0x10FFFF + 1)/8); ++i) { - uint32_t trieVal = 0; - for (int32_t j = 0; j < 8; ++j) { - UChar32 c = i*8 + j; - if (!uset_contains(potentialPassthroughAndNotBackwardCombining, c)) { - trieVal |= (1 << j); - } - } - if (trieVal) { - umutablecptrie_set(builder.getAlias(), UChar32(i), trieVal, status); - } + if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) { + // Nukta + return TRUE; } - - LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( - builder.getAlias(), - trieType, - UCPTRIE_VALUE_BITS_8, - status)); - handleError(status, basename); - - fprintf(f, "[trie]\n"); - usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); - - fclose(f); - handleError(status, basename); + // To avoid more branchiness, 4 characters that decompose to + // a BMP starter followed by a BMP non-starter are excluded + // from being encoded directly into the trie value and are + // handled as complex decompositions instead. These are: + // U+0F76 TIBETAN VOWEL SIGN VOCALIC R + // U+0F78 TIBETAN VOWEL SIGN VOCALIC L + // U+212B ANGSTROM SIGN + // U+2ADC FORKING + return FALSE; } // Computes data for canonical decompositions -void computeDecompositions(const char* basename, const USet* backwardCombiningStarters, std::vector& storage16, std::vector& storage32, USet* decompositionStartsWithNonStarter, USet* decompositionStartsWithBackwardCombiningStarter, std::vector& pendingTrieInsertions) { +void computeDecompositions(const char* basename, + const USet* backwardCombiningStarters, + std::vector& storage16, + std::vector& storage32, + USet* decompositionStartsWithNonStarter, + USet* decompositionStartsWithBackwardCombiningStarter, + std::vector& pendingTrieInsertions, + UChar32& decompositionPassthroughBound, + UChar32& compositionPassthroughBound) { IcuToolErrorCode status("icuexportdata: computeDecompositions"); const Normalizer2* mainNormalizer; const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status); + const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status); FILE* f = NULL; std::vector nonRecursive32; LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status)); @@ -637,6 +598,10 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt } UnicodeString src; UnicodeString dst; + // True if we're building non-NFD or we're building NFD but + // the `c` round trips to NFC. + // False if we're building NFD and `c` does not round trip to NFC. + UBool nonNfdOrRoundTrips = TRUE; src.append(c); if (mainNormalizer != nfdNormalizer) { UnicodeString inter; @@ -644,6 +609,9 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt nfdNormalizer->normalize(inter, dst, status); } else { nfdNormalizer->normalize(src, dst, status); + UnicodeString nfc; + nfcNormalizer->normalize(dst, nfc, status); + nonNfdOrRoundTrips = (src == nfc); } int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { @@ -670,19 +638,36 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } - bool startsWithNonStarter = u_getCombiningClass(utf32[0]); - if (startsWithNonStarter) { + uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]); + bool specialNonStarterDecomposition = false; + bool startsWithBackwardCombiningStarter = false; + if (firstCombiningClass) { + decompositionPassthroughBound = c; + compositionPassthroughBound = c; uset_add(decompositionStartsWithNonStarter, c); - if (src != dst && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) { - // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X. - status.set(U_INTERNAL_PROGRAM_ERROR); - handleError(status, basename); + if (src != dst) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) { + specialNonStarterDecomposition = true; + } else { + // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } } - } else if (uset_contains(backwardCombiningStarters, c)) { + } else if (uset_contains(backwardCombiningStarters, utf32[0])) { + compositionPassthroughBound = c; + startsWithBackwardCombiningStarter = true; uset_add(decompositionStartsWithBackwardCombiningStarter, c); } - if (c != 2 && len == 1 && utf32[0] == 2) { - // 2 is reserved as a marker for decomposition starts with non-starter. + if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) { status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } @@ -692,14 +677,24 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt if (dst == nfd) { continue; } - } else if (startsWithNonStarter) { - // Insert a special marker + decompositionPassthroughBound = c; + compositionPassthroughBound = c; + } else if (firstCombiningClass) { len = 1; - utf32[0] = 2; // magic value (1 is reserved for U+FDFA) + if (specialNonStarterDecomposition) { + utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value + } else { + // Use the surrogate range to store the canonical combining class + utf32[0] = 0xD800 | UChar32(firstCombiningClass); + } } else { if (src == dst) { + if (startsWithBackwardCombiningStarter) { + pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, FALSE}); + } continue; } + decompositionPassthroughBound = c; // ICU4X hard-codes ANGSTROM SIGN if (c != 0x212B) { UnicodeString raw; @@ -725,14 +720,14 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } - uint32_t shifted = uint32_t(rawUtf32[0]) << 16; - umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, shifted, status); + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status); } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) { if (!rawUtf32[0] || !rawUtf32[1]) { status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } - uint32_t bmpPair = uint32_t(rawUtf32[0]) << 16 | uint32_t(rawUtf32[1]); + // Swapped for consistency with the primary trie + uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]); umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status); } else { // Let's add 1 to index to make it always non-zero to distinguish @@ -744,33 +739,53 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } - umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index, status); + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status); } } } } + if (!nonNfdOrRoundTrips) { + compositionPassthroughBound = c; + } if (len == 1 && utf32[0] <= 0xFFFF) { - if (utf32[0] == 1) { - // 1 is reserved as a marker for the expansion of U+FDFA. - status.set(U_INTERNAL_PROGRAM_ERROR); - handleError(status, basename); - } - // U+0345 is hard-coded in ICU4X - if (!(c == 0x0345 && utf32[0] == 0x03B9)) { - pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE}); + if (startsWithBackwardCombiningStarter) { + if (mainNormalizer == nfdNormalizer) { + // Not supposed to happen in NFD + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) { + // Other than conjoining jamo vowels and trails + // unsupported for non-NFD. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } } - } else if (len == 2 && utf32[0] <= 0xFFFF && utf32[1] <= 0xFFFF && !u_getCombiningClass(utf32[0]) && u_getCombiningClass(utf32[1])) { + pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE}); + } else if (len == 2 && + utf32[0] <= 0xFFFF && + utf32[1] <= 0xFFFF && + !u_getCombiningClass(utf32[0]) && + u_getCombiningClass(utf32[1]) && + permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) { for (int32_t i = 0; i < len; ++i) { if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { // Assert that iota subscript and half-width voicing marks never occur in these // expansions in the normalization forms where they are special. - printf("HER c: %X\n", c); status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } } + if (startsWithBackwardCombiningStarter) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), FALSE}); } else { + if (startsWithBackwardCombiningStarter) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + UBool supplementary = FALSE; UBool nonInitialStarter = FALSE; for (int32_t i = 0; i < len; ++i) { @@ -797,7 +812,7 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt if (len == 18 && c == 0xFDFA) { // Special marker for the one character whose decomposition // is too long. - pendingTrieInsertions.push_back({c, 1 << 16, supplementary}); + pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary}); continue; } else { status.set(U_INTERNAL_PROGRAM_ERROR); @@ -900,7 +915,12 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt } } } - pendingTrieInsertions.push_back({c, descriptor, supplementary}); + + uint32_t nonRoundTripMarker = 0; + if (!nonNfdOrRoundTrips) { + nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16); + } + pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary}); } } if (storage16.size() + storage32.size() > 0xFFF) { @@ -1187,7 +1207,22 @@ int exportNorm() { USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty(); USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); std::vector nfdPendingTrieInsertions; - computeDecompositions("nfd", backwardCombiningStarters, storage16, storage32, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfdPendingTrieInsertions); + UChar32 nfdBound = 0x10FFFF; + UChar32 nfcBound = 0x10FFFF; + computeDecompositions("nfd", + backwardCombiningStarters, + storage16, + storage32, + nfdDecompositionStartsWithNonStarter, + nfdDecompositionStartsWithBackwardCombiningStarter, + nfdPendingTrieInsertions, + nfdBound, + nfcBound); + if (!(nfdBound == 0xC0 && nfcBound == 0x300)) { + // Unexpected bounds for NFD/NFC. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } uint32_t baseSize16 = storage16.size(); uint32_t baseSize32 = storage32.size(); @@ -1195,47 +1230,73 @@ int exportNorm() { USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty(); USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); std::vector nfkdPendingTrieInsertions; - computeDecompositions("nfkd", backwardCombiningStarters, storage16, storage32, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkdPendingTrieInsertions); + UChar32 nfkdBound = 0x10FFFF; + UChar32 nfkcBound = 0x10FFFF; + computeDecompositions("nfkd", + backwardCombiningStarters, + storage16, + storage32, + nfkdDecompositionStartsWithNonStarter, + nfkdDecompositionStartsWithBackwardCombiningStarter, + nfkdPendingTrieInsertions, + nfkdBound, + nfkcBound); + if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + if (nfkcBound > 0xC0) { + if (nfkdBound != 0xC0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } else { + if (nfkdBound != nfkcBound) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty(); USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); std::vector uts46PendingTrieInsertions; - computeDecompositions("uts46d", backwardCombiningStarters, storage16, storage32, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PendingTrieInsertions); + UChar32 uts46dBound = 0x10FFFF; + UChar32 uts46Bound = 0x10FFFF; + computeDecompositions("uts46d", + backwardCombiningStarters, + storage16, + storage32, + uts46DecompositionStartsWithNonStarter, + uts46DecompositionStartsWithBackwardCombiningStarter, + uts46PendingTrieInsertions, + uts46dBound, + uts46Bound); + if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + if (uts46Bound > 0xC0) { + if (uts46dBound != 0xC0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } else { + if (uts46dBound != uts46Bound) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } uint32_t supplementSize16 = storage16.size() - baseSize16; uint32_t supplementSize32 = storage32.size() - baseSize32; - writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions); - writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions); - writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions); + writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound)); + writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound)); + writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound)); writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32); writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32); - USet* nfcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty(); - const Normalizer2* nfc = Normalizer2::getNFCInstance(status); - writePotentialCompositionPassThrough("nfc", nfc, nfdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithBackwardCombiningStarter, nfcPotentialPassthroughAndNotBackwardCombining); - - USet* nfkcPotentialPassthroughAndNotBackwardCombining = uset_openEmpty(); - const Normalizer2* nfkc = Normalizer2::getNFKCInstance(status); - writePotentialCompositionPassThrough("nfkc", nfkc, nfkdDecompositionStartsWithNonStarter, nfkdDecompositionStartsWithBackwardCombiningStarter, nfkcPotentialPassthroughAndNotBackwardCombining); - - USet* uts46PotentialPassthroughAndNotBackwardCombining = uset_openEmpty(); - writePotentialCompositionPassThrough("uts46", nullptr, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PotentialPassthroughAndNotBackwardCombining); - - writeNopCompositionPassThrough("passthroughnop"); - - // Check that NFKC set has no characters that NFC doesn't also have. - uset_removeAll(nfkcPotentialPassthroughAndNotBackwardCombining, nfcPotentialPassthroughAndNotBackwardCombining); - if (!uset_isEmpty(nfkcPotentialPassthroughAndNotBackwardCombining)) { - status.set(U_INTERNAL_PROGRAM_ERROR); - handleError(status, "exportNorm"); - } - - uset_close(nfcPotentialPassthroughAndNotBackwardCombining); - uset_close(nfkcPotentialPassthroughAndNotBackwardCombining); - uset_close(uts46PotentialPassthroughAndNotBackwardCombining); - uset_close(nfdDecompositionStartsWithNonStarter); uset_close(nfkdDecompositionStartsWithNonStarter); uset_close(uts46DecompositionStartsWithNonStarter);