From 7d2a3267ecabd8b1c4c378cff7f56706319cb0b9 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Sat, 26 Aug 2023 21:13:00 +0700 Subject: [PATCH 01/21] P2675R1 generator converted from C++ to Python --- .../format_width_estimate_intervals.cpp | 209 ------------------ .../format_width_estimate_intervals.py | 184 +++++++++++++++ 2 files changed, 184 insertions(+), 209 deletions(-) delete mode 100644 tools/unicode_properties_parse/format_width_estimate_intervals.cpp create mode 100644 tools/unicode_properties_parse/format_width_estimate_intervals.py diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.cpp b/tools/unicode_properties_parse/format_width_estimate_intervals.cpp deleted file mode 100644 index 06946779b7..0000000000 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.cpp +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -// The following code generates data for `_Width_estimate_intervals_v2` in . - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; - -void verify(bool test, const char* msg, source_location loc = source_location::current()) { - if (!test) { - cerr << "Error at line " << loc.line() << ": " << msg << endl; - exit(EXIT_FAILURE); - } -} -constexpr const char* impl_assertion_failed = "impl assertion failed"; - -struct range_u { - uint32_t from; - uint32_t to; - constexpr range_u(uint32_t f, uint32_t t) : from(f), to(t) {} - constexpr explicit range_u(uint32_t v) : from(v), to(v) {} -}; - -enum class width_u : bool { is_1 = false, is_2 = true }; - -class table_u { -private: - // A valid Unicode code point won't exceed `max_u`. - static constexpr uint32_t max_u = 0x10'ffff; - vector table; - -public: - table_u() : table(max_u + 1, width_u::is_1) {} - - void fill_range(const range_u rng, const width_u width) { - const auto [from, to] = rng; - verify(from <= to, impl_assertion_failed); - verify(to <= max_u, impl_assertion_failed); - for (uint32_t u = from; u <= to; ++u) { - table[u] = width; - } - } - - void print_intervals() const { - // Print table for `_Width_estimate_intervals_v2`. - int c = 0; - width_u last = table[0]; - for (uint32_t u = 0; u <= max_u; ++u) { - if (table[u] != last) { - cout << "0x" << hex << uppercase << u << "u, "; - if (++c == 12) { - c = 0; - cout << endl; - } - } - last = table[u]; - } - cout << endl; - } - - void print_clusters_1_vs_2(const table_u& other) const { - vector cluster_table(max_u + 1, false); - for (uint32_t u = 0; u <= max_u; ++u) { - if (table[u] == width_u::is_1 && other.table[u] == width_u::is_2) { - cluster_table[u] = true; - } - } - - for (uint32_t u = 0; u <= max_u; ++u) { - if (cluster_table[u]) { - const uint32_t from = u; - uint32_t to = from; - while (to + 1 <= max_u && cluster_table[to + 1]) { - ++to; - } - if (from == to) { - cout << hex << uppercase << "U+" << from << endl; - } else { - cout << hex << uppercase << "U+" << from << "..U+" << to << endl; - } - u = to; - } - } - } -}; - -table_u get_table_cpp20() { - static constexpr range_u std_wide_ranges_cpp20[]{ - {0x1100, 0x115F}, - {0x2329, 0x232A}, - {0x2E80, 0x303E}, - {0x3040, 0xA4CF}, - {0xAC00, 0xD7A3}, - {0xF900, 0xFAFF}, - {0xFE10, 0xFE19}, - {0xFE30, 0xFE6F}, - {0xFF00, 0xFF60}, - {0xFFE0, 0xFFE6}, - {0x1F300, 0x1F64F}, - {0x1F900, 0x1F9FF}, - {0x20000, 0x2FFFD}, - {0x30000, 0x3FFFD}, - }; - - table_u table; - for (const range_u& rng : std_wide_ranges_cpp20) { - table.fill_range(rng, width_u::is_2); - } - return table; -} - -// Read data from "EastAsianWidth.txt". -// The latest version can be found at: -// https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt -// The current implementation works for: -// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt -// To make this function work, the file should not contain a BOM. -table_u read_from(ifstream& source) { - table_u table; - - // "The unassigned code points in the following blocks default to "W":" - static constexpr range_u default_wide_ranges[]{ - {0x4E00, 0x9FFF}, {0x3400, 0x4DBF}, {0xF900, 0xFAFF}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}}; - for (const range_u& rng : default_wide_ranges) { - table.fill_range(rng, width_u::is_2); - } - - // Read explicitly assigned ranges. - // The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment". - auto get_width = [](const string& str) { - if (str == "F" || str == "W") { - return width_u::is_2; - } else { - verify(str == "A" || str == "H" || str == "N" || str == "Na", impl_assertion_failed); - return width_u::is_1; - } - }; - auto get_value = [](const string& str) { - uint32_t value{}; - const auto [end_ptr, ec] = from_chars(str.data(), str.data() + str.size(), value, 16); - verify(end_ptr == str.data() + str.size(), impl_assertion_failed); - verify(ec == errc{}, impl_assertion_failed); - return value; - }; - - verify(!!source, "invalid path"); - string line; - const regex reg(R"(([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*)"); - while (getline(source, line)) { - if (!line.empty() && !line.starts_with("#")) { - smatch match; - verify(regex_match(line, match, reg), "invalid line"); - verify(match[1].matched, impl_assertion_failed); - verify(match[3].matched, impl_assertion_failed); - const width_u width = get_width(match[3].str()); - const uint32_t from = get_value(match[1].str()); - if (match[2].matched) { - // range (HEX..HEX) - const string match2 = match[2].str(); - verify(match2.starts_with(".."), impl_assertion_failed); - table.fill_range({from, get_value(match2.substr(2))}, width); - } else { - // single character (HEX) - table.fill_range(range_u{from}, width); - } - } - } - - return table; -} - -table_u get_table_cpp23(ifstream& source) { - table_u table = read_from(source); - - // Override with ranges specified by the C++ standard. - static constexpr range_u std_wide_ranges_cpp23[]{{0x4DC0, 0x4DFF}, {0x1F300, 0x1F5FF}, {0x1F900, 0x1F9FF}}; - for (const range_u& rng : std_wide_ranges_cpp23) { - table.fill_range(rng, width_u::is_2); - } - - return table; -} - -int main() { - cout << "Old table:\n"; - const table_u old_table = get_table_cpp20(); - old_table.print_intervals(); - - cout << "\nNew table:\nInput path for EastAsianWidth.txt: "; - string path; - getline(cin, path); - ifstream source(path); - const table_u new_table = get_table_cpp23(source); - new_table.print_intervals(); - - cout << "\nWas 1, now 2:\n"; - old_table.print_clusters_1_vs_2(new_table); - cout << "\nWas 2, now 1:\n"; - new_table.print_clusters_1_vs_2(old_table); -} diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py new file mode 100644 index 0000000000..353ea6975e --- /dev/null +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -0,0 +1,184 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# The following code generates data for _Width_estimate_intervals_v2 in . + +import re +from dataclasses import dataclass +from typing import TextIO + + +@dataclass +class range_u: + from_: int + to_: int = None + + def __post_init__(self): + if self.to_ is None: + self.to_ = self.from_ + + +class width_u: + is_1: bool = False + is_2: bool = True + + +class table_u: + # A valid Unicode code point won't exceed max_u. + max_u = 0x10FFFF + + def __init__(self): + self.table = [width_u.is_1] * (self.max_u + 1) + + def fill_range(self, rng: range_u, width: bool): + from_, to_ = rng.from_, rng.to_ + assert from_ <= to_, "impl assertion failed" + assert to_ <= self.max_u, "impl assertion failed" + for u in range(from_, to_ + 1): + self.table[u] = width + + def print_intervals(self): + # Print table for _Width_estimate_intervals_v2. + c = 0 + last = self.table[0] + for u in range(self.max_u + 1): + if self.table[u] != last: + print(f"0x{u:X}u, ", end="") + if c == 11: + c = 0 + print() + else: + c += 1 + last = self.table[u] + print() + + def print_clusters_1_vs_2(self, other): + cluster_table = [False] * (self.max_u + 1) + for u in range(self.max_u + 1): + if self.table[u] == width_u.is_1 and other.table[u] == width_u.is_2: + cluster_table[u] = True + + u = 0 + while u < self.max_u + 1: + if cluster_table[u]: + from_ = u + to_ = from_ + while to_ + 1 <= self.max_u and cluster_table[to_ + 1]: + to_ += 1 + if from_ == to_: + print(f"U+{from_:X}") + else: + print(f"U+{from_:X}..U+{to_:X}") + u = to_ + u += 1 + + +def get_table_cpp20() -> table_u: + std_wide_ranges_cpp20 = [ + range_u(0x1100, 0x115F), + range_u(0x2329, 0x232A), + range_u(0x2E80, 0x303E), + range_u(0x3040, 0xA4CF), + range_u(0xAC00, 0xD7A3), + range_u(0xF900, 0xFAFF), + range_u(0xFE10, 0xFE19), + range_u(0xFE30, 0xFE6F), + range_u(0xFF00, 0xFF60), + range_u(0xFFE0, 0xFFE6), + range_u(0x1F300, 0x1F64F), + range_u(0x1F900, 0x1F9FF), + range_u(0x20000, 0x2FFFD), + range_u(0x30000, 0x3FFFD), + ] + + table = table_u() + for rng in std_wide_ranges_cpp20: + table.fill_range(rng, width_u.is_2) + return table + + +# Read data from "EastAsianWidth.txt". +# The latest version can be found at: +# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt +# The current implementation works for: +# https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt +# To make this function work, the file should not contain a BOM. +def read_from(source: TextIO) -> table_u: + table = table_u() + + # "The unassigned code points in the following blocks default to "W":" + default_wide_ranges = [ + range_u(0x4E00, 0x9FFF), + range_u(0x3400, 0x4DBF), + range_u(0xF900, 0xFAFF), + range_u(0x20000, 0x2FFFD), + range_u(0x30000, 0x3FFFD), + ] + for rng in default_wide_ranges: + table.fill_range(rng, width_u.is_2) + + # Read explicitly assigned ranges. + # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment". + def get_width(str: str): + if str == "F" or str == "W": + return width_u.is_2 + else: + assert str == "A" or str == "H" or str == "N" or str == "Na" + return width_u.is_1 + + if not source: + raise ValueError("invalid path") + reg = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") + for line in source: + line = line.strip() + if line and not line.startswith("#"): + match = reg.fullmatch(line) + assert match, "invalid line" + from_val = int(match.group(1), base=16) + width = get_width(match.group(3)) + if match.group(2): + # range (HEX..HEX) + to_val = int(match.group(2)[2:], base=16) + table.fill_range(range_u(from_val, to_val), width) + else: + # single character (HEX) + table.fill_range(range_u(from_val), width) + + return table + + +def get_table_cpp23(source: TextIO) -> table_u: + table = read_from(source) + + # Override with ranges specified by the C++ standard. + std_wide_ranges_cpp23 = [ + range_u(0x4DC0, 0x4DFF), + range_u(0x1F300, 0x1F5FF), + range_u(0x1F900, 0x1F9FF), + ] + + for rng in std_wide_ranges_cpp23: + table.fill_range(rng, width_u.is_2) + + return table + + +def main(): + print("Old table:") + old_table = get_table_cpp20() + old_table.print_intervals() + + print("\nNew table:\nInput path for EastAsianWidth.txt: ", end="") + path = input() + with open(path) as source: + new_table = get_table_cpp23(source) + new_table.print_intervals() + + print("\nWas 1, now 2:") + old_table.print_clusters_1_vs_2(new_table) + print("\nWas 2, now 1:") + new_table.print_clusters_1_vs_2(old_table) + + +if __name__ == "__main__": + main() From 2ba4369af6b78d0735dddd0b72dfd276c8bfbc62 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Sat, 26 Aug 2023 22:16:15 +0700 Subject: [PATCH 02/21] change a comment Co-authored-by: achabense <60953653+achabense@users.noreply.github.com> --- stl/inc/format | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/format b/stl/inc/format index 5b976f2f5d..8ac6721c4e 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -979,7 +979,7 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() { #endif // ^^^ EDG workaround ^^^ } -// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.cpp +// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py // in the https://github.com/microsoft/stl repository. inline constexpr char32_t _Width_estimate_intervals_v2[] = { // 0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu, From aec3166f4938b724fa7413e10e0ac75756265a91 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Sat, 26 Aug 2023 22:22:22 +0700 Subject: [PATCH 03/21] add some spaces --- .../format_width_estimate_intervals.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 353ea6975e..be185cde8b 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -25,7 +25,7 @@ class width_u: class table_u: # A valid Unicode code point won't exceed max_u. - max_u = 0x10FFFF + max_u: int = 0x10FFFF def __init__(self): self.table = [width_u.is_1] * (self.max_u + 1) @@ -50,6 +50,7 @@ def print_intervals(self): else: c += 1 last = self.table[u] + print() def print_clusters_1_vs_2(self, other): @@ -94,6 +95,7 @@ def get_table_cpp20() -> table_u: table = table_u() for rng in std_wide_ranges_cpp20: table.fill_range(rng, width_u.is_2) + return table From 0956d67e5389b8a3608a7720d7a2e926f65a3eb4 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Sat, 26 Aug 2023 22:46:49 +0700 Subject: [PATCH 04/21] remove unneded if --- .../unicode_properties_parse/format_width_estimate_intervals.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index be185cde8b..9f6515bf70 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -128,8 +128,6 @@ def get_width(str: str): assert str == "A" or str == "H" or str == "N" or str == "Na" return width_u.is_1 - if not source: - raise ValueError("invalid path") reg = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") for line in source: line = line.strip() From 6daea61cc7e90eb276ffa9791dc8fe4b51f31bfd Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Sun, 17 Sep 2023 20:55:03 +0700 Subject: [PATCH 05/21] use a slice instead of a custom class, also remove input --- .../format_width_estimate_intervals.py | 72 ++++++++----------- 1 file changed, 31 insertions(+), 41 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 9f6515bf70..7ae08201da 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -6,16 +6,7 @@ import re from dataclasses import dataclass from typing import TextIO - - -@dataclass -class range_u: - from_: int - to_: int = None - - def __post_init__(self): - if self.to_ is None: - self.to_ = self.from_ +from pathlib import Path class width_u: @@ -30,12 +21,11 @@ class table_u: def __init__(self): self.table = [width_u.is_1] * (self.max_u + 1) - def fill_range(self, rng: range_u, width: bool): - from_, to_ = rng.from_, rng.to_ + def fill_range(self, rng: tuple, width: bool): + from_, to_ = rng[0], rng[1] assert from_ <= to_, "impl assertion failed" assert to_ <= self.max_u, "impl assertion failed" - for u in range(from_, to_ + 1): - self.table[u] = width + self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1) def print_intervals(self): # Print table for _Width_estimate_intervals_v2. @@ -76,20 +66,20 @@ def print_clusters_1_vs_2(self, other): def get_table_cpp20() -> table_u: std_wide_ranges_cpp20 = [ - range_u(0x1100, 0x115F), - range_u(0x2329, 0x232A), - range_u(0x2E80, 0x303E), - range_u(0x3040, 0xA4CF), - range_u(0xAC00, 0xD7A3), - range_u(0xF900, 0xFAFF), - range_u(0xFE10, 0xFE19), - range_u(0xFE30, 0xFE6F), - range_u(0xFF00, 0xFF60), - range_u(0xFFE0, 0xFFE6), - range_u(0x1F300, 0x1F64F), - range_u(0x1F900, 0x1F9FF), - range_u(0x20000, 0x2FFFD), - range_u(0x30000, 0x3FFFD), + (0x1100, 0x115F), + (0x2329, 0x232A), + (0x2E80, 0x303E), + (0x3040, 0xA4CF), + (0xAC00, 0xD7A3), + (0xF900, 0xFAFF), + (0xFE10, 0xFE19), + (0xFE30, 0xFE6F), + (0xFF00, 0xFF60), + (0xFFE0, 0xFFE6), + (0x1F300, 0x1F64F), + (0x1F900, 0x1F9FF), + (0x20000, 0x2FFFD), + (0x30000, 0x3FFFD), ] table = table_u() @@ -110,11 +100,11 @@ def read_from(source: TextIO) -> table_u: # "The unassigned code points in the following blocks default to "W":" default_wide_ranges = [ - range_u(0x4E00, 0x9FFF), - range_u(0x3400, 0x4DBF), - range_u(0xF900, 0xFAFF), - range_u(0x20000, 0x2FFFD), - range_u(0x30000, 0x3FFFD), + (0x4E00, 0x9FFF), + (0x3400, 0x4DBF), + (0xF900, 0xFAFF), + (0x20000, 0x2FFFD), + (0x30000, 0x3FFFD), ] for rng in default_wide_ranges: table.fill_range(rng, width_u.is_2) @@ -139,10 +129,10 @@ def get_width(str: str): if match.group(2): # range (HEX..HEX) to_val = int(match.group(2)[2:], base=16) - table.fill_range(range_u(from_val, to_val), width) + table.fill_range((from_val, to_val), width) else: # single character (HEX) - table.fill_range(range_u(from_val), width) + table.fill_range((from_val, from_val), width) return table @@ -152,9 +142,9 @@ def get_table_cpp23(source: TextIO) -> table_u: # Override with ranges specified by the C++ standard. std_wide_ranges_cpp23 = [ - range_u(0x4DC0, 0x4DFF), - range_u(0x1F300, 0x1F5FF), - range_u(0x1F900, 0x1F9FF), + (0x4DC0, 0x4DFF), + (0x1F300, 0x1F5FF), + (0x1F900, 0x1F9FF), ] for rng in std_wide_ranges_cpp23: @@ -168,10 +158,10 @@ def main(): old_table = get_table_cpp20() old_table.print_intervals() - print("\nNew table:\nInput path for EastAsianWidth.txt: ", end="") - path = input() - with open(path) as source: + path = Path(__file__).absolute().with_name("EastAsianWidth.txt") + with open(path, mode="rt", encoding="utf-8") as source: new_table = get_table_cpp23(source) + print("New table:") new_table.print_intervals() print("\nWas 1, now 2:") From 83ef03abeff8500ed3fb79373f086ce036eccf7c Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Sun, 17 Sep 2023 21:02:18 +0700 Subject: [PATCH 06/21] use enumerate --- .../format_width_estimate_intervals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 7ae08201da..331a40f3d7 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -31,15 +31,15 @@ def print_intervals(self): # Print table for _Width_estimate_intervals_v2. c = 0 last = self.table[0] - for u in range(self.max_u + 1): - if self.table[u] != last: + for u, el in enumerate(self.table): + if el != last: print(f"0x{u:X}u, ", end="") if c == 11: c = 0 print() else: c += 1 - last = self.table[u] + last = el print() From 207e428979b1f6aff871c1ad3f45202e9e196380 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Thu, 21 Sep 2023 21:28:33 +0700 Subject: [PATCH 07/21] apply code review suggestions --- .../format_width_estimate_intervals.py | 103 ++++++++++-------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 331a40f3d7..d14078e1a6 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -4,57 +4,73 @@ # The following code generates data for _Width_estimate_intervals_v2 in . import re -from dataclasses import dataclass +from enum import Enum from typing import TextIO from pathlib import Path -class width_u: - is_1: bool = False - is_2: bool = True +LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") -class table_u: - # A valid Unicode code point won't exceed max_u. - max_u: int = 0x10FFFF +class UnicodeWidth(Enum): + IS_1: int = 1 + IS_2: int = 2 + + +class UnicodeTable: + # A valid Unicode code point won't exceed MAX_UNICODE_POINT. + MAX_UNICODE_POINT: int = 0x10FFFF + UNICODE_TABLE_SIZE: int = MAX_UNICODE_POINT + 1 def __init__(self): - self.table = [width_u.is_1] * (self.max_u + 1) + self.table = [UnicodeWidth.IS_1] * (self.UNICODE_TABLE_SIZE) def fill_range(self, rng: tuple, width: bool): - from_, to_ = rng[0], rng[1] + from_, to_ = rng assert from_ <= to_, "impl assertion failed" - assert to_ <= self.max_u, "impl assertion failed" + assert to_ <= self.MAX_UNICODE_POINT, "impl assertion failed" self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1) def print_intervals(self): - # Print table for _Width_estimate_intervals_v2. - c = 0 - last = self.table[0] - for u, el in enumerate(self.table): - if el != last: + """ + Generates _Width_estimate_intervals_v2. + That is, starting from the second code point ([1]) + until and including the last code point([MAX_UNICODE_POINT]), + for code point [U], if width[U]!=width[U-1], we print I to indicate the new range. + """ + printed_elements_on_one_line = 0 + assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed" + for u in range(1, self.UNICODE_TABLE_SIZE): + if self.table[u] != self.table[u - 1]: print(f"0x{u:X}u, ", end="") - if c == 11: - c = 0 + if printed_elements_on_one_line == 11: + printed_elements_on_one_line = 0 print() else: - c += 1 - last = el + printed_elements_on_one_line += 1 print() def print_clusters_1_vs_2(self, other): - cluster_table = [False] * (self.max_u + 1) - for u in range(self.max_u + 1): - if self.table[u] == width_u.is_1 and other.table[u] == width_u.is_2: + """ + Print all ranges, in closed-end form + (to match with the standard/data file/and the annex in the paper), + that self.width[range] are all 1 and other.width[range] are all 2. + """ + cluster_table = [False] * (self.UNICODE_TABLE_SIZE) + for u in range(self.UNICODE_TABLE_SIZE): + if ( + self.table[u] == UnicodeWidth.IS_1 + and other.table[u] == UnicodeWidth.IS_2 + ): cluster_table[u] = True u = 0 - while u < self.max_u + 1: + while u < self.UNICODE_TABLE_SIZE: if cluster_table[u]: from_ = u to_ = from_ - while to_ + 1 <= self.max_u and cluster_table[to_ + 1]: + while to_ + 1 <= self.MAX_UNICODE_POINT and cluster_table[to_ + 1]: to_ += 1 if from_ == to_: print(f"U+{from_:X}") @@ -64,7 +80,7 @@ def print_clusters_1_vs_2(self, other): u += 1 -def get_table_cpp20() -> table_u: +def get_table_cpp20() -> UnicodeTable: std_wide_ranges_cpp20 = [ (0x1100, 0x115F), (0x2329, 0x232A), @@ -82,21 +98,23 @@ def get_table_cpp20() -> table_u: (0x30000, 0x3FFFD), ] - table = table_u() + table = UnicodeTable() for rng in std_wide_ranges_cpp20: - table.fill_range(rng, width_u.is_2) + table.fill_range(rng, UnicodeWidth.IS_2) return table -# Read data from "EastAsianWidth.txt". -# The latest version can be found at: -# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt -# The current implementation works for: -# https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt -# To make this function work, the file should not contain a BOM. -def read_from(source: TextIO) -> table_u: - table = table_u() +def read_from(source: TextIO) -> UnicodeTable: + """ + Read data from "EastAsianWidth.txt". + The latest version can be found at: + https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt + The current implementation works for: + https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt + To make this function work, the file should not contain a BOM. + """ + table = UnicodeTable() # "The unassigned code points in the following blocks default to "W":" default_wide_ranges = [ @@ -107,22 +125,21 @@ def read_from(source: TextIO) -> table_u: (0x30000, 0x3FFFD), ] for rng in default_wide_ranges: - table.fill_range(rng, width_u.is_2) + table.fill_range(rng, UnicodeWidth.IS_2) # Read explicitly assigned ranges. # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment". def get_width(str: str): if str == "F" or str == "W": - return width_u.is_2 + return UnicodeWidth.IS_2 else: assert str == "A" or str == "H" or str == "N" or str == "Na" - return width_u.is_1 + return UnicodeWidth.IS_1 - reg = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") for line in source: line = line.strip() if line and not line.startswith("#"): - match = reg.fullmatch(line) + match = LINE_REGEX.fullmatch(line) assert match, "invalid line" from_val = int(match.group(1), base=16) width = get_width(match.group(3)) @@ -132,12 +149,12 @@ def get_width(str: str): table.fill_range((from_val, to_val), width) else: # single character (HEX) - table.fill_range((from_val, from_val), width) + table.table[from_val] = width return table -def get_table_cpp23(source: TextIO) -> table_u: +def get_table_cpp23(source: TextIO) -> UnicodeTable: table = read_from(source) # Override with ranges specified by the C++ standard. @@ -148,7 +165,7 @@ def get_table_cpp23(source: TextIO) -> table_u: ] for rng in std_wide_ranges_cpp23: - table.fill_range(rng, width_u.is_2) + table.fill_range(rng, UnicodeWidth.IS_2) return table From 7c75b614f674e68b2e4206b0cba50d3920b35497 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Thu, 21 Sep 2023 21:47:53 +0700 Subject: [PATCH 08/21] rephrase some comments --- .../format_width_estimate_intervals.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index d14078e1a6..1caabd31d7 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -34,9 +34,9 @@ def fill_range(self, rng: tuple, width: bool): def print_intervals(self): """ Generates _Width_estimate_intervals_v2. - That is, starting from the second code point ([1]) - until and including the last code point([MAX_UNICODE_POINT]), - for code point [U], if width[U]!=width[U-1], we print I to indicate the new range. + It begins from the second code point and continues up to the last one, encompassing it as well. + Whenever a code point's width differs from the previous one, + the function displays the code point's index to indicate the start of a new range. """ printed_elements_on_one_line = 0 assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed" @@ -53,9 +53,9 @@ def print_intervals(self): def print_clusters_1_vs_2(self, other): """ - Print all ranges, in closed-end form - (to match with the standard/data file/and the annex in the paper), - that self.width[range] are all 1 and other.width[range] are all 2. + Print closed-end ranges for all code points + where the width is consistently 1 in the self.table range and 2 in the other.table range. + This output is consistent with the standard/data file and the annex in the paper """ cluster_table = [False] * (self.UNICODE_TABLE_SIZE) for u in range(self.UNICODE_TABLE_SIZE): From c8b9454e405e15847e9666a6dbc6aaacf9e46326 Mon Sep 17 00:00:00 2001 From: Igor Zhukov Date: Thu, 21 Sep 2023 21:55:52 +0700 Subject: [PATCH 09/21] change the comments a bit --- .../format_width_estimate_intervals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 1caabd31d7..690f451041 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -34,9 +34,9 @@ def fill_range(self, rng: tuple, width: bool): def print_intervals(self): """ Generates _Width_estimate_intervals_v2. - It begins from the second code point and continues up to the last one, encompassing it as well. + It begins from the second code point and continues up to the last one, including it as well. Whenever a code point's width differs from the previous one, - the function displays the code point's index to indicate the start of a new range. + the function print the code point to indicate the start of a new range. """ printed_elements_on_one_line = 0 assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed" @@ -53,7 +53,7 @@ def print_intervals(self): def print_clusters_1_vs_2(self, other): """ - Print closed-end ranges for all code points + Print all ranges, in closed-end form where the width is consistently 1 in the self.table range and 2 in the other.table range. This output is consistent with the standard/data file and the annex in the paper """ From 294f157981dcb9071adc08e190e3cdec5cbe85bb Mon Sep 17 00:00:00 2001 From: achabense <60953653+achabense@users.noreply.github.com> Date: Fri, 22 Sep 2023 00:26:41 +0800 Subject: [PATCH 10/21] > restore the location of `LINE_REGEX` > add comment for `fill_range` > rename some variables > type: bool~>int > refine names for the two function; simplify comments > better message --- .../format_width_estimate_intervals.py | 66 ++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 690f451041..e5fa1c7250 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -9,38 +9,30 @@ from pathlib import Path -LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") - - class UnicodeWidth(Enum): IS_1: int = 1 IS_2: int = 2 -class UnicodeTable: - # A valid Unicode code point won't exceed MAX_UNICODE_POINT. - MAX_UNICODE_POINT: int = 0x10FFFF - UNICODE_TABLE_SIZE: int = MAX_UNICODE_POINT + 1 +class UnicodeWidthTable: + # A valid Unicode code point won't exceed MAX_CODE_POINT. + MAX_CODE_POINT: int = 0x10FFFF + TABLE_SIZE: int = MAX_CODE_POINT + 1 def __init__(self): - self.table = [UnicodeWidth.IS_1] * (self.UNICODE_TABLE_SIZE) + self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE) - def fill_range(self, rng: tuple, width: bool): + # "rng" denotes a right-closed range. + def fill_range(self, rng: tuple, width: int): from_, to_ = rng assert from_ <= to_, "impl assertion failed" - assert to_ <= self.MAX_UNICODE_POINT, "impl assertion failed" + assert to_ <= self.MAX_CODE_POINT, "impl assertion failed" self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1) - def print_intervals(self): - """ - Generates _Width_estimate_intervals_v2. - It begins from the second code point and continues up to the last one, including it as well. - Whenever a code point's width differs from the previous one, - the function print the code point to indicate the start of a new range. - """ + def print_width_estimate_intervals(self): printed_elements_on_one_line = 0 assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed" - for u in range(1, self.UNICODE_TABLE_SIZE): + for u in range(1, self.TABLE_SIZE): if self.table[u] != self.table[u - 1]: print(f"0x{u:X}u, ", end="") if printed_elements_on_one_line == 11: @@ -51,14 +43,10 @@ def print_intervals(self): print() - def print_clusters_1_vs_2(self, other): - """ - Print all ranges, in closed-end form - where the width is consistently 1 in the self.table range and 2 in the other.table range. - This output is consistent with the standard/data file and the annex in the paper - """ - cluster_table = [False] * (self.UNICODE_TABLE_SIZE) - for u in range(self.UNICODE_TABLE_SIZE): + # Print all ranges (right-closed), where self's width is 1 and other's width is 2. + def print_ranges_1_vs_2(self, other): + cluster_table = [False] * (self.TABLE_SIZE) + for u in range(self.TABLE_SIZE): if ( self.table[u] == UnicodeWidth.IS_1 and other.table[u] == UnicodeWidth.IS_2 @@ -66,11 +54,11 @@ def print_clusters_1_vs_2(self, other): cluster_table[u] = True u = 0 - while u < self.UNICODE_TABLE_SIZE: + while u < self.TABLE_SIZE: if cluster_table[u]: from_ = u to_ = from_ - while to_ + 1 <= self.MAX_UNICODE_POINT and cluster_table[to_ + 1]: + while to_ + 1 <= self.MAX_CODE_POINT and cluster_table[to_ + 1]: to_ += 1 if from_ == to_: print(f"U+{from_:X}") @@ -80,7 +68,7 @@ def print_clusters_1_vs_2(self, other): u += 1 -def get_table_cpp20() -> UnicodeTable: +def get_table_cpp20() -> UnicodeWidthTable: std_wide_ranges_cpp20 = [ (0x1100, 0x115F), (0x2329, 0x232A), @@ -98,14 +86,14 @@ def get_table_cpp20() -> UnicodeTable: (0x30000, 0x3FFFD), ] - table = UnicodeTable() + table = UnicodeWidthTable() for rng in std_wide_ranges_cpp20: table.fill_range(rng, UnicodeWidth.IS_2) return table -def read_from(source: TextIO) -> UnicodeTable: +def read_from(source: TextIO) -> UnicodeWidthTable: """ Read data from "EastAsianWidth.txt". The latest version can be found at: @@ -114,7 +102,7 @@ def read_from(source: TextIO) -> UnicodeTable: https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt To make this function work, the file should not contain a BOM. """ - table = UnicodeTable() + table = UnicodeWidthTable() # "The unassigned code points in the following blocks default to "W":" default_wide_ranges = [ @@ -129,6 +117,8 @@ def read_from(source: TextIO) -> UnicodeTable: # Read explicitly assigned ranges. # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment". + LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") + def get_width(str: str): if str == "F" or str == "W": return UnicodeWidth.IS_2 @@ -140,7 +130,7 @@ def get_width(str: str): line = line.strip() if line and not line.startswith("#"): match = LINE_REGEX.fullmatch(line) - assert match, "invalid line" + assert match, line # invalid line from_val = int(match.group(1), base=16) width = get_width(match.group(3)) if match.group(2): @@ -154,7 +144,7 @@ def get_width(str: str): return table -def get_table_cpp23(source: TextIO) -> UnicodeTable: +def get_table_cpp23(source: TextIO) -> UnicodeWidthTable: table = read_from(source) # Override with ranges specified by the C++ standard. @@ -173,18 +163,18 @@ def get_table_cpp23(source: TextIO) -> UnicodeTable: def main(): print("Old table:") old_table = get_table_cpp20() - old_table.print_intervals() + old_table.print_width_estimate_intervals() path = Path(__file__).absolute().with_name("EastAsianWidth.txt") with open(path, mode="rt", encoding="utf-8") as source: new_table = get_table_cpp23(source) print("New table:") - new_table.print_intervals() + new_table.print_width_estimate_intervals() print("\nWas 1, now 2:") - old_table.print_clusters_1_vs_2(new_table) + old_table.print_ranges_1_vs_2(new_table) print("\nWas 2, now 1:") - new_table.print_clusters_1_vs_2(old_table) + new_table.print_ranges_1_vs_2(old_table) if __name__ == "__main__": From 6e2c7603acb50b81c9f2e70a7daebf09f94efe42 Mon Sep 17 00:00:00 2001 From: achabense <60953653+achabense@users.noreply.github.com> Date: Sun, 24 Sep 2023 13:14:40 +0800 Subject: [PATCH 11/21] > simplify `print_ranges_1_vs_2` --- .../format_width_estimate_intervals.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index e5fa1c7250..3ecc6c602d 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -45,20 +45,18 @@ def print_width_estimate_intervals(self): # Print all ranges (right-closed), where self's width is 1 and other's width is 2. def print_ranges_1_vs_2(self, other): - cluster_table = [False] * (self.TABLE_SIZE) - for u in range(self.TABLE_SIZE): - if ( + def _1_vs_2(u: int): + return ( self.table[u] == UnicodeWidth.IS_1 and other.table[u] == UnicodeWidth.IS_2 - ): - cluster_table[u] = True + ) u = 0 while u < self.TABLE_SIZE: - if cluster_table[u]: + if _1_vs_2(u): from_ = u to_ = from_ - while to_ + 1 <= self.MAX_CODE_POINT and cluster_table[to_ + 1]: + while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1): to_ += 1 if from_ == to_: print(f"U+{from_:X}") From 77ce2320c11d13309b03275f895ea9e8a087b601 Mon Sep 17 00:00:00 2001 From: achabense <60953653+achabense@users.noreply.github.com> Date: Sun, 24 Sep 2023 14:13:33 +0800 Subject: [PATCH 12/21] > add documentation for `print_width_estimate_intervals` --- .../format_width_estimate_intervals.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 3ecc6c602d..ee195e5111 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -9,6 +9,7 @@ from pathlib import Path +# Width estimation. class UnicodeWidth(Enum): IS_1: int = 1 IS_2: int = 2 @@ -25,14 +26,24 @@ def __init__(self): # "rng" denotes a right-closed range. def fill_range(self, rng: tuple, width: int): from_, to_ = rng - assert from_ <= to_, "impl assertion failed" - assert to_ <= self.MAX_CODE_POINT, "impl assertion failed" + assert from_ <= to_, "invalid range" + assert to_ <= self.MAX_CODE_POINT, "invalid range" self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1) def print_width_estimate_intervals(self): + """ + Divide [0..MAX_CODE_POINT] into ranges with different width estimations. + Represent each range with their starting values. + The starting value of the first range is always 0 and omitted. + The width estimation should be 1 for the first range, then alternate between 2 and 1. + """ printed_elements_on_one_line = 0 - assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed" + assert self.table[0] == UnicodeWidth.IS_1 for u in range(1, self.TABLE_SIZE): + assert ( + self.table[u] == UnicodeWidth.IS_1 + or self.table[u] == UnicodeWidth.IS_2 + ) if self.table[u] != self.table[u - 1]: print(f"0x{u:X}u, ", end="") if printed_elements_on_one_line == 11: From 8e88bb8f9ea0ae62be75b1bc63826e6a48f60241 Mon Sep 17 00:00:00 2001 From: achabense <60953653+achabense@users.noreply.github.com> Date: Sun, 24 Sep 2023 21:07:26 +0800 Subject: [PATCH 13/21] > update regex to support both `15.0.0` and `15.1.0` --- .../format_width_estimate_intervals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index ee195e5111..5122428cf4 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -108,7 +108,7 @@ def read_from(source: TextIO) -> UnicodeWidthTable: The latest version can be found at: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt The current implementation works for: - https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt + https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt To make this function work, the file should not contain a BOM. """ table = UnicodeWidthTable() @@ -125,8 +125,8 @@ def read_from(source: TextIO) -> UnicodeWidthTable: table.fill_range(rng, UnicodeWidth.IS_2) # Read explicitly assigned ranges. - # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment". - LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*") + # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment". + LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*") def get_width(str: str): if str == "F" or str == "W": From e4f86656eac33e57a4ebcb188768bbd34929b3e6 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 13:12:42 -0700 Subject: [PATCH 14/21] Print the filename, timestamp, and C++ array. --- .../format_width_estimate_intervals.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 5122428cf4..59d6d2948d 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -176,9 +176,16 @@ def main(): path = Path(__file__).absolute().with_name("EastAsianWidth.txt") with open(path, mode="rt", encoding="utf-8") as source: + filename = source.readline().replace("#", "//").rstrip() + timestamp = source.readline().replace("#", "//").rstrip() new_table = get_table_cpp23(source) print("New table:") + print() + print(filename) + print(timestamp) + print("inline constexpr char32_t _Width_estimate_intervals_v2[] = { //") new_table.print_width_estimate_intervals() + print("};") print("\nWas 1, now 2:") old_table.print_ranges_1_vs_2(new_table) From 15320fa5cbf617b4b0869335d0316ed302f2af60 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 13:23:17 -0700 Subject: [PATCH 15/21] Don't bother carefully wrapping; we need to clang-format anyways. --- .../format_width_estimate_intervals.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 59d6d2948d..5bce0c2913 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -37,7 +37,6 @@ def print_width_estimate_intervals(self): The starting value of the first range is always 0 and omitted. The width estimation should be 1 for the first range, then alternate between 2 and 1. """ - printed_elements_on_one_line = 0 assert self.table[0] == UnicodeWidth.IS_1 for u in range(1, self.TABLE_SIZE): assert ( @@ -46,13 +45,6 @@ def print_width_estimate_intervals(self): ) if self.table[u] != self.table[u - 1]: print(f"0x{u:X}u, ", end="") - if printed_elements_on_one_line == 11: - printed_elements_on_one_line = 0 - print() - else: - printed_elements_on_one_line += 1 - - print() # Print all ranges (right-closed), where self's width is 1 and other's width is 2. def print_ranges_1_vs_2(self, other): @@ -179,7 +171,7 @@ def main(): filename = source.readline().replace("#", "//").rstrip() timestamp = source.readline().replace("#", "//").rstrip() new_table = get_table_cpp23(source) - print("New table:") + print("\n\nNew table:") print() print(filename) print(timestamp) From 9d7958c9fe5b0f5d8bf9087e2dcad909468a23ad Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 13:33:49 -0700 Subject: [PATCH 16/21] Use `join` to avoid a trailing comma interfering with clang-format. --- .../format_width_estimate_intervals.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 5bce0c2913..9f64bc7bde 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -30,13 +30,14 @@ def fill_range(self, rng: tuple, width: int): assert to_ <= self.MAX_CODE_POINT, "invalid range" self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1) - def print_width_estimate_intervals(self): + def width_estimate_intervals(self): """ Divide [0..MAX_CODE_POINT] into ranges with different width estimations. Represent each range with their starting values. The starting value of the first range is always 0 and omitted. The width estimation should be 1 for the first range, then alternate between 2 and 1. """ + values = [] assert self.table[0] == UnicodeWidth.IS_1 for u in range(1, self.TABLE_SIZE): assert ( @@ -44,7 +45,9 @@ def print_width_estimate_intervals(self): or self.table[u] == UnicodeWidth.IS_2 ) if self.table[u] != self.table[u - 1]: - print(f"0x{u:X}u, ", end="") + values.append(u) + + return ", ".join([f"0x{u:X}u" for u in values]) # Print all ranges (right-closed), where self's width is 1 and other's width is 2. def print_ranges_1_vs_2(self, other): @@ -164,7 +167,7 @@ def get_table_cpp23(source: TextIO) -> UnicodeWidthTable: def main(): print("Old table:") old_table = get_table_cpp20() - old_table.print_width_estimate_intervals() + print(old_table.width_estimate_intervals()) path = Path(__file__).absolute().with_name("EastAsianWidth.txt") with open(path, mode="rt", encoding="utf-8") as source: @@ -176,7 +179,7 @@ def main(): print(filename) print(timestamp) print("inline constexpr char32_t _Width_estimate_intervals_v2[] = { //") - new_table.print_width_estimate_intervals() + print(new_table.width_estimate_intervals()) print("};") print("\nWas 1, now 2:") From 8ebce4943cb540b4c3fb5db418cbba82b46b7a0b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 13:49:36 -0700 Subject: [PATCH 17/21] Use a template string to make the C++ output clearer, like the other generators. --- .../format_width_estimate_intervals.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 9f64bc7bde..58a9e32a3a 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -164,6 +164,14 @@ def get_table_cpp23(source: TextIO) -> UnicodeWidthTable: return table +WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """ +{filename} +{timestamp} +inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ // +{values} }}; +""" + + def main(): print("Old table:") old_table = get_table_cpp20() @@ -174,15 +182,15 @@ def main(): filename = source.readline().replace("#", "//").rstrip() timestamp = source.readline().replace("#", "//").rstrip() new_table = get_table_cpp23(source) - print("\n\nNew table:") - print() - print(filename) - print(timestamp) - print("inline constexpr char32_t _Width_estimate_intervals_v2[] = { //") - print(new_table.width_estimate_intervals()) - print("};") - - print("\nWas 1, now 2:") + print("\nNew table:") + print( + WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format( + filename=filename, + timestamp=timestamp, + values=new_table.width_estimate_intervals(), + ) + ) + print("Was 1, now 2:") old_table.print_ranges_1_vs_2(new_table) print("\nWas 2, now 1:") new_table.print_ranges_1_vs_2(old_table) From ab8a2759f211debce40f34de36b975ea30956667 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 14:00:00 -0700 Subject: [PATCH 18/21] Format with Prettier. --- .../format_width_estimate_intervals.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 58a9e32a3a..d6fb89a959 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -41,8 +41,7 @@ def width_estimate_intervals(self): assert self.table[0] == UnicodeWidth.IS_1 for u in range(1, self.TABLE_SIZE): assert ( - self.table[u] == UnicodeWidth.IS_1 - or self.table[u] == UnicodeWidth.IS_2 + self.table[u] == UnicodeWidth.IS_1 or self.table[u] == UnicodeWidth.IS_2 ) if self.table[u] != self.table[u - 1]: values.append(u) @@ -134,7 +133,7 @@ def get_width(str: str): line = line.strip() if line and not line.startswith("#"): match = LINE_REGEX.fullmatch(line) - assert match, line # invalid line + assert match, line # invalid line from_val = int(match.group(1), base=16) width = get_width(match.group(3)) if match.group(2): From 398e37d18df2cf9b48fee77985cd17e79087b33a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 14:01:28 -0700 Subject: [PATCH 19/21] Regenerate, capturing the filename and timestamp. --- stl/inc/format | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stl/inc/format b/stl/inc/format index cfee5e3079..190c29557a 100644 --- a/stl/inc/format +++ b/stl/inc/format @@ -989,6 +989,9 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() { // Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py // in the https://github.com/microsoft/stl repository. + +// EastAsianWidth-15.0.0.txt +// Date: 2022-05-24, 17:40:20 GMT [KW, LI] inline constexpr char32_t _Width_estimate_intervals_v2[] = { // 0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu, 0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu, From 13e319748e3fb8ef3701a832feaf8e3e5de58505 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sun, 24 Sep 2023 14:12:31 -0700 Subject: [PATCH 20/21] Cite the Standardese. --- .../unicode_properties_parse/format_width_estimate_intervals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index d6fb89a959..56da9c2ad4 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -150,7 +150,7 @@ def get_width(str: str): def get_table_cpp23(source: TextIO) -> UnicodeWidthTable: table = read_from(source) - # Override with ranges specified by the C++ standard. + # Override with ranges specified by N4958 [format.string.std]/13. std_wide_ranges_cpp23 = [ (0x4DC0, 0x4DFF), (0x1F300, 0x1F5FF), From fa0b783e0eb18f5852e9a17e3424e4ca2ef33a1d Mon Sep 17 00:00:00 2001 From: nicole mazzuca <83086508+strega-nil-ms@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:05:04 -0800 Subject: [PATCH 21/21] Update tools/unicode_properties_parse/format_width_estimate_intervals.py --- .../format_width_estimate_intervals.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py index 56da9c2ad4..b0c4bc8a79 100644 --- a/tools/unicode_properties_parse/format_width_estimate_intervals.py +++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py @@ -32,10 +32,12 @@ def fill_range(self, rng: tuple, width: int): def width_estimate_intervals(self): """ - Divide [0..MAX_CODE_POINT] into ranges with different width estimations. - Represent each range with their starting values. - The starting value of the first range is always 0 and omitted. - The width estimation should be 1 for the first range, then alternate between 2 and 1. + Creates a string representation of the map (in `self.table`) from + unicode code points to their width, using hexadecimal unsigned integer literals. + Since there are long runs of code points of one width or the other, + this representation is a list of code points where the width switches. + Additionally, the width is assumed to start at `1` from the beginning of the list. + For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`. """ values = [] assert self.table[0] == UnicodeWidth.IS_1