From 7d2a3267ecabd8b1c4c378cff7f56706319cb0b9 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Sat, 26 Aug 2023 21:13:00 +0700
Subject: [PATCH 01/21] P2675R1 <format> generator converted from C++ to Python

---
 .../format_width_estimate_intervals.cpp       | 209 ------------------
 .../format_width_estimate_intervals.py        | 184 +++++++++++++++
 2 files changed, 184 insertions(+), 209 deletions(-)
 delete mode 100644 tools/unicode_properties_parse/format_width_estimate_intervals.cpp
 create mode 100644 tools/unicode_properties_parse/format_width_estimate_intervals.py
diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.cpp b/tools/unicode_properties_parse/format_width_estimate_intervals.cpp
deleted file mode 100644
index 06946779b7..0000000000
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// The following code generates data for `_Width_estimate_intervals_v2` in <format>.
-
-#include <charconv>
-#include <cstdint>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <regex>
-#include <source_location>
-#include <string>
-#include <system_error>
-#include <vector>
-using namespace std;
-
-void verify(bool test, const char* msg, source_location loc = source_location::current()) {
-    if (!test) {
-        cerr << "Error at line " << loc.line() << ": " << msg << endl;
-        exit(EXIT_FAILURE);
-    }
-}
-constexpr const char* impl_assertion_failed = "impl assertion failed";
-
-struct range_u {
-    uint32_t from;
-    uint32_t to;
-    constexpr range_u(uint32_t f, uint32_t t) : from(f), to(t) {}
-    constexpr explicit range_u(uint32_t v) : from(v), to(v) {}
-};
-
-enum class width_u : bool { is_1 = false, is_2 = true };
-
-class table_u {
-private:
-    // A valid Unicode code point won't exceed `max_u`.
-    static constexpr uint32_t max_u = 0x10'ffff;
-    vector<width_u> table;
-
-public:
-    table_u() : table(max_u + 1, width_u::is_1) {}
-
-    void fill_range(const range_u rng, const width_u width) {
-        const auto [from, to] = rng;
-        verify(from <= to, impl_assertion_failed);
-        verify(to <= max_u, impl_assertion_failed);
-        for (uint32_t u = from; u <= to; ++u) {
-            table[u] = width;
-        }
-    }
-
-    void print_intervals() const {
-        // Print table for `_Width_estimate_intervals_v2`.
-        int c        = 0;
-        width_u last = table[0];
-        for (uint32_t u = 0; u <= max_u; ++u) {
-            if (table[u] != last) {
-                cout << "0x" << hex << uppercase << u << "u, ";
-                if (++c == 12) {
-                    c = 0;
-                    cout << endl;
-                }
-            }
-            last = table[u];
-        }
-        cout << endl;
-    }
-
-    void print_clusters_1_vs_2(const table_u& other) const {
-        vector<bool> cluster_table(max_u + 1, false);
-        for (uint32_t u = 0; u <= max_u; ++u) {
-            if (table[u] == width_u::is_1 && other.table[u] == width_u::is_2) {
-                cluster_table[u] = true;
-            }
-        }
-
-        for (uint32_t u = 0; u <= max_u; ++u) {
-            if (cluster_table[u]) {
-                const uint32_t from = u;
-                uint32_t to         = from;
-                while (to + 1 <= max_u && cluster_table[to + 1]) {
-                    ++to;
-                }
-                if (from == to) {
-                    cout << hex << uppercase << "U+" << from << endl;
-                } else {
-                    cout << hex << uppercase << "U+" << from << "..U+" << to << endl;
-                }
-                u = to;
-            }
-        }
-    }
-};
-
-table_u get_table_cpp20() {
-    static constexpr range_u std_wide_ranges_cpp20[]{
-        {0x1100, 0x115F},
-        {0x2329, 0x232A},
-        {0x2E80, 0x303E},
-        {0x3040, 0xA4CF},
-        {0xAC00, 0xD7A3},
-        {0xF900, 0xFAFF},
-        {0xFE10, 0xFE19},
-        {0xFE30, 0xFE6F},
-        {0xFF00, 0xFF60},
-        {0xFFE0, 0xFFE6},
-        {0x1F300, 0x1F64F},
-        {0x1F900, 0x1F9FF},
-        {0x20000, 0x2FFFD},
-        {0x30000, 0x3FFFD},
-    };
-
-    table_u table;
-    for (const range_u& rng : std_wide_ranges_cpp20) {
-        table.fill_range(rng, width_u::is_2);
-    }
-    return table;
-}
-
-// Read data from "EastAsianWidth.txt".
-// The latest version can be found at:
-// https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
-// The current implementation works for:
-// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
-// To make this function work, the file should not contain a BOM.
-table_u read_from(ifstream& source) {
-    table_u table;
-
-    // "The unassigned code points in the following blocks default to "W":"
-    static constexpr range_u default_wide_ranges[]{
-        {0x4E00, 0x9FFF}, {0x3400, 0x4DBF}, {0xF900, 0xFAFF}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}};
-    for (const range_u& rng : default_wide_ranges) {
-        table.fill_range(rng, width_u::is_2);
-    }
-
-    // Read explicitly assigned ranges.
-    // The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
-    auto get_width = [](const string& str) {
-        if (str == "F" || str == "W") {
-            return width_u::is_2;
-        } else {
-            verify(str == "A" || str == "H" || str == "N" || str == "Na", impl_assertion_failed);
-            return width_u::is_1;
-        }
-    };
-    auto get_value = [](const string& str) {
-        uint32_t value{};
-        const auto [end_ptr, ec] = from_chars(str.data(), str.data() + str.size(), value, 16);
-        verify(end_ptr == str.data() + str.size(), impl_assertion_failed);
-        verify(ec == errc{}, impl_assertion_failed);
-        return value;
-    };
-
-    verify(!!source, "invalid path");
-    string line;
-    const regex reg(R"(([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*)");
-    while (getline(source, line)) {
-        if (!line.empty() && !line.starts_with("#")) {
-            smatch match;
-            verify(regex_match(line, match, reg), "invalid line");
-            verify(match[1].matched, impl_assertion_failed);
-            verify(match[3].matched, impl_assertion_failed);
-            const width_u width = get_width(match[3].str());
-            const uint32_t from = get_value(match[1].str());
-            if (match[2].matched) {
-                // range (HEX..HEX)
-                const string match2 = match[2].str();
-                verify(match2.starts_with(".."), impl_assertion_failed);
-                table.fill_range({from, get_value(match2.substr(2))}, width);
-            } else {
-                // single character (HEX)
-                table.fill_range(range_u{from}, width);
-            }
-        }
-    }
-
-    return table;
-}
-
-table_u get_table_cpp23(ifstream& source) {
-    table_u table = read_from(source);
-
-    // Override with ranges specified by the C++ standard.
-    static constexpr range_u std_wide_ranges_cpp23[]{{0x4DC0, 0x4DFF}, {0x1F300, 0x1F5FF}, {0x1F900, 0x1F9FF}};
-    for (const range_u& rng : std_wide_ranges_cpp23) {
-        table.fill_range(rng, width_u::is_2);
-    }
-
-    return table;
-}
-
-int main() {
-    cout << "Old table:\n";
-    const table_u old_table = get_table_cpp20();
-    old_table.print_intervals();
-
-    cout << "\nNew table:\nInput path for EastAsianWidth.txt: ";
-    string path;
-    getline(cin, path);
-    ifstream source(path);
-    const table_u new_table = get_table_cpp23(source);
-    new_table.print_intervals();
-
-    cout << "\nWas 1, now 2:\n";
-    old_table.print_clusters_1_vs_2(new_table);
-    cout << "\nWas 2, now 1:\n";
-    new_table.print_clusters_1_vs_2(old_table);
-}
diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
new file mode 100644
index 0000000000..353ea6975e
--- /dev/null
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -0,0 +1,184 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# The following code generates data for _Width_estimate_intervals_v2 in <format>.
+
+import re
+from dataclasses import dataclass
+from typing import TextIO
+
+
+@dataclass
+class range_u:
+    from_: int
+    to_: int = None
+
+    def __post_init__(self):
+        if self.to_ is None:
+            self.to_ = self.from_
+
+
+class width_u:
+    is_1: bool = False
+    is_2: bool = True
+
+
+class table_u:
+    # A valid Unicode code point won't exceed max_u.
+    max_u = 0x10FFFF
+
+    def __init__(self):
+        self.table = [width_u.is_1] * (self.max_u + 1)
+
+    def fill_range(self, rng: range_u, width: bool):
+        from_, to_ = rng.from_, rng.to_
+        assert from_ <= to_, "impl assertion failed"
+        assert to_ <= self.max_u, "impl assertion failed"
+        for u in range(from_, to_ + 1):
+            self.table[u] = width
+
+    def print_intervals(self):
+        # Print table for _Width_estimate_intervals_v2.
+        c = 0
+        last = self.table[0]
+        for u in range(self.max_u + 1):
+            if self.table[u] != last:
+                print(f"0x{u:X}u, ", end="")
+                if c == 11:
+                    c = 0
+                    print()
+                else:
+                    c += 1
+            last = self.table[u]
+        print()
+
+    def print_clusters_1_vs_2(self, other):
+        cluster_table = [False] * (self.max_u + 1)
+        for u in range(self.max_u + 1):
+            if self.table[u] == width_u.is_1 and other.table[u] == width_u.is_2:
+                cluster_table[u] = True
+
+        u = 0
+        while u < self.max_u + 1:
+            if cluster_table[u]:
+                from_ = u
+                to_ = from_
+                while to_ + 1 <= self.max_u and cluster_table[to_ + 1]:
+                    to_ += 1
+                if from_ == to_:
+                    print(f"U+{from_:X}")
+                else:
+                    print(f"U+{from_:X}..U+{to_:X}")
+                u = to_
+            u += 1
+
+
+def get_table_cpp20() -> table_u:
+    std_wide_ranges_cpp20 = [
+        range_u(0x1100, 0x115F),
+        range_u(0x2329, 0x232A),
+        range_u(0x2E80, 0x303E),
+        range_u(0x3040, 0xA4CF),
+        range_u(0xAC00, 0xD7A3),
+        range_u(0xF900, 0xFAFF),
+        range_u(0xFE10, 0xFE19),
+        range_u(0xFE30, 0xFE6F),
+        range_u(0xFF00, 0xFF60),
+        range_u(0xFFE0, 0xFFE6),
+        range_u(0x1F300, 0x1F64F),
+        range_u(0x1F900, 0x1F9FF),
+        range_u(0x20000, 0x2FFFD),
+        range_u(0x30000, 0x3FFFD),
+    ]
+
+    table = table_u()
+    for rng in std_wide_ranges_cpp20:
+        table.fill_range(rng, width_u.is_2)
+    return table
+
+
+# Read data from "EastAsianWidth.txt".
+# The latest version can be found at:
+# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+# The current implementation works for:
+# https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
+# To make this function work, the file should not contain a BOM.
+def read_from(source: TextIO) -> table_u:
+    table = table_u()
+
+    # "The unassigned code points in the following blocks default to "W":"
+    default_wide_ranges = [
+        range_u(0x4E00, 0x9FFF),
+        range_u(0x3400, 0x4DBF),
+        range_u(0xF900, 0xFAFF),
+        range_u(0x20000, 0x2FFFD),
+        range_u(0x30000, 0x3FFFD),
+    ]
+    for rng in default_wide_ranges:
+        table.fill_range(rng, width_u.is_2)
+
+    # Read explicitly assigned ranges.
+    # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
+    def get_width(str: str):
+        if str == "F" or str == "W":
+            return width_u.is_2
+        else:
+            assert str == "A" or str == "H" or str == "N" or str == "Na"
+            return width_u.is_1
+
+    if not source:
+        raise ValueError("invalid path")
+    reg = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
+    for line in source:
+        line = line.strip()
+        if line and not line.startswith("#"):
+            match = reg.fullmatch(line)
+            assert match, "invalid line"
+            from_val = int(match.group(1), base=16)
+            width = get_width(match.group(3))
+            if match.group(2):
+                # range (HEX..HEX)
+                to_val = int(match.group(2)[2:], base=16)
+                table.fill_range(range_u(from_val, to_val), width)
+            else:
+                # single character (HEX)
+                table.fill_range(range_u(from_val), width)
+
+    return table
+
+
+def get_table_cpp23(source: TextIO) -> table_u:
+    table = read_from(source)
+
+    # Override with ranges specified by the C++ standard.
+    std_wide_ranges_cpp23 = [
+        range_u(0x4DC0, 0x4DFF),
+        range_u(0x1F300, 0x1F5FF),
+        range_u(0x1F900, 0x1F9FF),
+    ]
+
+    for rng in std_wide_ranges_cpp23:
+        table.fill_range(rng, width_u.is_2)
+
+    return table
+
+
+def main():
+    print("Old table:")
+    old_table = get_table_cpp20()
+    old_table.print_intervals()
+
+    print("\nNew table:\nInput path for EastAsianWidth.txt: ", end="")
+    path = input()
+    with open(path) as source:
+        new_table = get_table_cpp23(source)
+    new_table.print_intervals()
+
+    print("\nWas 1, now 2:")
+    old_table.print_clusters_1_vs_2(new_table)
+    print("\nWas 2, now 1:")
+    new_table.print_clusters_1_vs_2(old_table)
+
+
+if __name__ == "__main__":
+    main()

From 2ba4369af6b78d0735dddd0b72dfd276c8bfbc62 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Sat, 26 Aug 2023 22:16:15 +0700
Subject: [PATCH 02/21] change a comment

Co-authored-by: achabense <60953653+achabense@users.noreply.github.com>
---
 stl/inc/format | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stl/inc/format b/stl/inc/format
index 5b976f2f5d..8ac6721c4e 100644
--- a/stl/inc/format
+++ b/stl/inc/format
@@ -979,7 +979,7 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
 #endif // ^^^ EDG workaround ^^^
 }
 
-// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.cpp
+// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
 // in the https://github.com/microsoft/stl repository.
 inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
     0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,

From aec3166f4938b724fa7413e10e0ac75756265a91 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Sat, 26 Aug 2023 22:22:22 +0700
Subject: [PATCH 03/21] add some spaces

---
 .../format_width_estimate_intervals.py                        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 353ea6975e..be185cde8b 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -25,7 +25,7 @@ class width_u:
 
 class table_u:
     # A valid Unicode code point won't exceed max_u.
-    max_u = 0x10FFFF
+    max_u: int = 0x10FFFF
 
     def __init__(self):
         self.table = [width_u.is_1] * (self.max_u + 1)
@@ -50,6 +50,7 @@ def print_intervals(self):
                 else:
                     c += 1
             last = self.table[u]
+
         print()
 
     def print_clusters_1_vs_2(self, other):
@@ -94,6 +95,7 @@ def get_table_cpp20() -> table_u:
     table = table_u()
     for rng in std_wide_ranges_cpp20:
         table.fill_range(rng, width_u.is_2)
+
     return table
 
 

From 0956d67e5389b8a3608a7720d7a2e926f65a3eb4 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Sat, 26 Aug 2023 22:46:49 +0700
Subject: [PATCH 04/21] remove unneeded if

---
 .../unicode_properties_parse/format_width_estimate_intervals.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index be185cde8b..9f6515bf70 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -128,8 +128,6 @@ def get_width(str: str):
             assert str == "A" or str == "H" or str == "N" or str == "Na"
             return width_u.is_1
 
-    if not source:
-        raise ValueError("invalid path")
     reg = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
     for line in source:
         line = line.strip()

From 6daea61cc7e90eb276ffa9791dc8fe4b51f31bfd Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Sun, 17 Sep 2023 20:55:03 +0700
Subject: [PATCH 05/21] use a slice instead of a custom class, also remove
 input

---
 .../format_width_estimate_intervals.py        | 72 ++++++++-----------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 9f6515bf70..7ae08201da 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -6,16 +6,7 @@
 import re
 from dataclasses import dataclass
 from typing import TextIO
-
-
-@dataclass
-class range_u:
-    from_: int
-    to_: int = None
-
-    def __post_init__(self):
-        if self.to_ is None:
-            self.to_ = self.from_
+from pathlib import Path
 
 
 class width_u:
@@ -30,12 +21,11 @@ class table_u:
     def __init__(self):
         self.table = [width_u.is_1] * (self.max_u + 1)
 
-    def fill_range(self, rng: range_u, width: bool):
-        from_, to_ = rng.from_, rng.to_
+    def fill_range(self, rng: tuple, width: bool):
+        from_, to_ = rng[0], rng[1]
         assert from_ <= to_, "impl assertion failed"
         assert to_ <= self.max_u, "impl assertion failed"
-        for u in range(from_, to_ + 1):
-            self.table[u] = width
+        self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)
 
     def print_intervals(self):
         # Print table for _Width_estimate_intervals_v2.
@@ -76,20 +66,20 @@ def print_clusters_1_vs_2(self, other):
 
 def get_table_cpp20() -> table_u:
     std_wide_ranges_cpp20 = [
-        range_u(0x1100, 0x115F),
-        range_u(0x2329, 0x232A),
-        range_u(0x2E80, 0x303E),
-        range_u(0x3040, 0xA4CF),
-        range_u(0xAC00, 0xD7A3),
-        range_u(0xF900, 0xFAFF),
-        range_u(0xFE10, 0xFE19),
-        range_u(0xFE30, 0xFE6F),
-        range_u(0xFF00, 0xFF60),
-        range_u(0xFFE0, 0xFFE6),
-        range_u(0x1F300, 0x1F64F),
-        range_u(0x1F900, 0x1F9FF),
-        range_u(0x20000, 0x2FFFD),
-        range_u(0x30000, 0x3FFFD),
+        (0x1100, 0x115F),
+        (0x2329, 0x232A),
+        (0x2E80, 0x303E),
+        (0x3040, 0xA4CF),
+        (0xAC00, 0xD7A3),
+        (0xF900, 0xFAFF),
+        (0xFE10, 0xFE19),
+        (0xFE30, 0xFE6F),
+        (0xFF00, 0xFF60),
+        (0xFFE0, 0xFFE6),
+        (0x1F300, 0x1F64F),
+        (0x1F900, 0x1F9FF),
+        (0x20000, 0x2FFFD),
+        (0x30000, 0x3FFFD),
     ]
 
     table = table_u()
@@ -110,11 +100,11 @@ def read_from(source: TextIO) -> table_u:
 
     # "The unassigned code points in the following blocks default to "W":"
     default_wide_ranges = [
-        range_u(0x4E00, 0x9FFF),
-        range_u(0x3400, 0x4DBF),
-        range_u(0xF900, 0xFAFF),
-        range_u(0x20000, 0x2FFFD),
-        range_u(0x30000, 0x3FFFD),
+        (0x4E00, 0x9FFF),
+        (0x3400, 0x4DBF),
+        (0xF900, 0xFAFF),
+        (0x20000, 0x2FFFD),
+        (0x30000, 0x3FFFD),
     ]
     for rng in default_wide_ranges:
         table.fill_range(rng, width_u.is_2)
@@ -139,10 +129,10 @@ def get_width(str: str):
             if match.group(2):
                 # range (HEX..HEX)
                 to_val = int(match.group(2)[2:], base=16)
-                table.fill_range(range_u(from_val, to_val), width)
+                table.fill_range((from_val, to_val), width)
             else:
                 # single character (HEX)
-                table.fill_range(range_u(from_val), width)
+                table.fill_range((from_val, from_val), width)
 
     return table
 
@@ -152,9 +142,9 @@ def get_table_cpp23(source: TextIO) -> table_u:
 
     # Override with ranges specified by the C++ standard.
     std_wide_ranges_cpp23 = [
-        range_u(0x4DC0, 0x4DFF),
-        range_u(0x1F300, 0x1F5FF),
-        range_u(0x1F900, 0x1F9FF),
+        (0x4DC0, 0x4DFF),
+        (0x1F300, 0x1F5FF),
+        (0x1F900, 0x1F9FF),
     ]
 
     for rng in std_wide_ranges_cpp23:
@@ -168,10 +158,10 @@ def main():
     old_table = get_table_cpp20()
     old_table.print_intervals()
 
-    print("\nNew table:\nInput path for EastAsianWidth.txt: ", end="")
-    path = input()
-    with open(path) as source:
+    path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
+    with open(path, mode="rt", encoding="utf-8") as source:
         new_table = get_table_cpp23(source)
+    print("New table:")
     new_table.print_intervals()
 
     print("\nWas 1, now 2:")

From 83ef03abeff8500ed3fb79373f086ce036eccf7c Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Sun, 17 Sep 2023 21:02:18 +0700
Subject: [PATCH 06/21] use enumerate

---
 .../format_width_estimate_intervals.py                      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 7ae08201da..331a40f3d7 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -31,15 +31,15 @@ def print_intervals(self):
         # Print table for _Width_estimate_intervals_v2.
         c = 0
         last = self.table[0]
-        for u in range(self.max_u + 1):
-            if self.table[u] != last:
+        for u, el in enumerate(self.table):
+            if el != last:
                 print(f"0x{u:X}u, ", end="")
                 if c == 11:
                     c = 0
                     print()
                 else:
                     c += 1
-            last = self.table[u]
+            last = el
 
         print()
 

From 207e428979b1f6aff871c1ad3f45202e9e196380 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Thu, 21 Sep 2023 21:28:33 +0700
Subject: [PATCH 07/21] apply code review suggestions

---
 .../format_width_estimate_intervals.py        | 103 ++++++++++--------
 1 file changed, 60 insertions(+), 43 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 331a40f3d7..d14078e1a6 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -4,57 +4,73 @@
 # The following code generates data for _Width_estimate_intervals_v2 in <format>.
 
 import re
-from dataclasses import dataclass
+from enum import Enum
 from typing import TextIO
 from pathlib import Path
 
 
-class width_u:
-    is_1: bool = False
-    is_2: bool = True
+LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
 
 
-class table_u:
-    # A valid Unicode code point won't exceed max_u.
-    max_u: int = 0x10FFFF
+class UnicodeWidth(Enum):
+    IS_1: int = 1
+    IS_2: int = 2
+
+
+class UnicodeTable:
+    # A valid Unicode code point won't exceed MAX_UNICODE_POINT.
+    MAX_UNICODE_POINT: int = 0x10FFFF
+    UNICODE_TABLE_SIZE: int = MAX_UNICODE_POINT + 1
 
     def __init__(self):
-        self.table = [width_u.is_1] * (self.max_u + 1)
+        self.table = [UnicodeWidth.IS_1] * (self.UNICODE_TABLE_SIZE)
 
     def fill_range(self, rng: tuple, width: bool):
-        from_, to_ = rng[0], rng[1]
+        from_, to_ = rng
         assert from_ <= to_, "impl assertion failed"
-        assert to_ <= self.max_u, "impl assertion failed"
+        assert to_ <= self.MAX_UNICODE_POINT, "impl assertion failed"
         self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)
 
     def print_intervals(self):
-        # Print table for _Width_estimate_intervals_v2.
-        c = 0
-        last = self.table[0]
-        for u, el in enumerate(self.table):
-            if el != last:
+        """
+        Generates _Width_estimate_intervals_v2.
+        That is, starting from the second code point ([1])
+        until and including the last code point([MAX_UNICODE_POINT]),
+        for code point [U], if width[U]!=width[U-1], we print I to indicate the new range.
+        """
+        printed_elements_on_one_line = 0
+        assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed"
+        for u in range(1, self.UNICODE_TABLE_SIZE):
+            if self.table[u] != self.table[u - 1]:
                 print(f"0x{u:X}u, ", end="")
-                if c == 11:
-                    c = 0
+                if printed_elements_on_one_line == 11:
+                    printed_elements_on_one_line = 0
                     print()
                 else:
-                    c += 1
-            last = el
+                    printed_elements_on_one_line += 1
 
         print()
 
     def print_clusters_1_vs_2(self, other):
-        cluster_table = [False] * (self.max_u + 1)
-        for u in range(self.max_u + 1):
-            if self.table[u] == width_u.is_1 and other.table[u] == width_u.is_2:
+        """
+        Print all ranges, in closed-end form
+        (to match with the standard/data file/and the annex in the paper),
+        that self.width[range] are all 1 and other.width[range] are all 2.
+        """
+        cluster_table = [False] * (self.UNICODE_TABLE_SIZE)
+        for u in range(self.UNICODE_TABLE_SIZE):
+            if (
+                self.table[u] == UnicodeWidth.IS_1
+                and other.table[u] == UnicodeWidth.IS_2
+            ):
                 cluster_table[u] = True
 
         u = 0
-        while u < self.max_u + 1:
+        while u < self.UNICODE_TABLE_SIZE:
             if cluster_table[u]:
                 from_ = u
                 to_ = from_
-                while to_ + 1 <= self.max_u and cluster_table[to_ + 1]:
+                while to_ + 1 <= self.MAX_UNICODE_POINT and cluster_table[to_ + 1]:
                     to_ += 1
                 if from_ == to_:
                     print(f"U+{from_:X}")
@@ -64,7 +80,7 @@ def print_clusters_1_vs_2(self, other):
             u += 1
 
 
-def get_table_cpp20() -> table_u:
+def get_table_cpp20() -> UnicodeTable:
     std_wide_ranges_cpp20 = [
         (0x1100, 0x115F),
         (0x2329, 0x232A),
@@ -82,21 +98,23 @@ def get_table_cpp20() -> table_u:
         (0x30000, 0x3FFFD),
     ]
 
-    table = table_u()
+    table = UnicodeTable()
     for rng in std_wide_ranges_cpp20:
-        table.fill_range(rng, width_u.is_2)
+        table.fill_range(rng, UnicodeWidth.IS_2)
 
     return table
 
 
-# Read data from "EastAsianWidth.txt".
-# The latest version can be found at:
-# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
-# The current implementation works for:
-# https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
-# To make this function work, the file should not contain a BOM.
-def read_from(source: TextIO) -> table_u:
-    table = table_u()
+def read_from(source: TextIO) -> UnicodeTable:
+    """
+    Read data from "EastAsianWidth.txt".
+    The latest version can be found at:
+    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+    The current implementation works for:
+    https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
+    To make this function work, the file should not contain a BOM.
+    """
+    table = UnicodeTable()
 
     # "The unassigned code points in the following blocks default to "W":"
     default_wide_ranges = [
@@ -107,22 +125,21 @@ def read_from(source: TextIO) -> table_u:
         (0x30000, 0x3FFFD),
     ]
     for rng in default_wide_ranges:
-        table.fill_range(rng, width_u.is_2)
+        table.fill_range(rng, UnicodeWidth.IS_2)
 
     # Read explicitly assigned ranges.
     # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
     def get_width(str: str):
         if str == "F" or str == "W":
-            return width_u.is_2
+            return UnicodeWidth.IS_2
         else:
             assert str == "A" or str == "H" or str == "N" or str == "Na"
-            return width_u.is_1
+            return UnicodeWidth.IS_1
 
-    reg = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
     for line in source:
         line = line.strip()
         if line and not line.startswith("#"):
-            match = reg.fullmatch(line)
+            match = LINE_REGEX.fullmatch(line)
             assert match, "invalid line"
             from_val = int(match.group(1), base=16)
             width = get_width(match.group(3))
@@ -132,12 +149,12 @@ def get_width(str: str):
                 table.fill_range((from_val, to_val), width)
             else:
                 # single character (HEX)
-                table.fill_range((from_val, from_val), width)
+                table.table[from_val] = width
 
     return table
 
 
-def get_table_cpp23(source: TextIO) -> table_u:
+def get_table_cpp23(source: TextIO) -> UnicodeTable:
     table = read_from(source)
 
     # Override with ranges specified by the C++ standard.
@@ -148,7 +165,7 @@ def get_table_cpp23(source: TextIO) -> table_u:
     ]
 
     for rng in std_wide_ranges_cpp23:
-        table.fill_range(rng, width_u.is_2)
+        table.fill_range(rng, UnicodeWidth.IS_2)
 
     return table
 

From 7c75b614f674e68b2e4206b0cba50d3920b35497 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Thu, 21 Sep 2023 21:47:53 +0700
Subject: [PATCH 08/21] rephrase some comments

---
 .../format_width_estimate_intervals.py               | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index d14078e1a6..1caabd31d7 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -34,9 +34,9 @@ def fill_range(self, rng: tuple, width: bool):
     def print_intervals(self):
         """
         Generates _Width_estimate_intervals_v2.
-        That is, starting from the second code point ([1])
-        until and including the last code point([MAX_UNICODE_POINT]),
-        for code point [U], if width[U]!=width[U-1], we print I to indicate the new range.
+        It begins from the second code point and continues up to the last one, encompassing it as well.
+        Whenever a code point's width differs from the previous one,
+        the function displays the code point's index to indicate the start of a new range.
         """
         printed_elements_on_one_line = 0
         assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed"
@@ -53,9 +53,9 @@ def print_intervals(self):
 
     def print_clusters_1_vs_2(self, other):
         """
-        Print all ranges, in closed-end form
-        (to match with the standard/data file/and the annex in the paper),
-        that self.width[range] are all 1 and other.width[range] are all 2.
+        Print closed-end ranges for all code points
+        where the width is consistently 1 in the self.table range and 2 in the other.table range.
+        This output is consistent with the standard/data file and the annex in the paper
         """
         cluster_table = [False] * (self.UNICODE_TABLE_SIZE)
         for u in range(self.UNICODE_TABLE_SIZE):

From c8b9454e405e15847e9666a6dbc6aaacf9e46326 Mon Sep 17 00:00:00 2001
From: Igor Zhukov <fsb4000@yandex.ru>
Date: Thu, 21 Sep 2023 21:55:52 +0700
Subject: [PATCH 09/21] change the comments a bit

---
 .../format_width_estimate_intervals.py                      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 1caabd31d7..690f451041 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -34,9 +34,9 @@ def fill_range(self, rng: tuple, width: bool):
     def print_intervals(self):
         """
         Generates _Width_estimate_intervals_v2.
-        It begins from the second code point and continues up to the last one, encompassing it as well.
+        It begins from the second code point and continues up to the last one, including it as well.
         Whenever a code point's width differs from the previous one,
-        the function displays the code point's index to indicate the start of a new range.
+        the function print the code point to indicate the start of a new range.
         """
         printed_elements_on_one_line = 0
         assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed"
@@ -53,7 +53,7 @@ def print_intervals(self):
 
     def print_clusters_1_vs_2(self, other):
         """
-        Print closed-end ranges for all code points
+        Print all ranges, in closed-end form
         where the width is consistently 1 in the self.table range and 2 in the other.table range.
         This output is consistent with the standard/data file and the annex in the paper
         """

From 294f157981dcb9071adc08e190e3cdec5cbe85bb Mon Sep 17 00:00:00 2001
From: achabense <60953653+achabense@users.noreply.github.com>
Date: Fri, 22 Sep 2023 00:26:41 +0800
Subject: [PATCH 10/21] > restore the location of `LINE_REGEX` > add comment
 for `fill_range` > rename some variables > type: bool~>int > refine names for
 the two function; simplify comments > better message

---
 .../format_width_estimate_intervals.py        | 66 ++++++++-----------
 1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 690f451041..e5fa1c7250 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -9,38 +9,30 @@
 from pathlib import Path
 
 
-LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
-
-
 class UnicodeWidth(Enum):
     IS_1: int = 1
     IS_2: int = 2
 
 
-class UnicodeTable:
-    # A valid Unicode code point won't exceed MAX_UNICODE_POINT.
-    MAX_UNICODE_POINT: int = 0x10FFFF
-    UNICODE_TABLE_SIZE: int = MAX_UNICODE_POINT + 1
+class UnicodeWidthTable:
+    # A valid Unicode code point won't exceed MAX_CODE_POINT.
+    MAX_CODE_POINT: int = 0x10FFFF
+    TABLE_SIZE: int = MAX_CODE_POINT + 1
 
     def __init__(self):
-        self.table = [UnicodeWidth.IS_1] * (self.UNICODE_TABLE_SIZE)
+        self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE)
 
-    def fill_range(self, rng: tuple, width: bool):
+    # "rng" denotes a right-closed range.
+    def fill_range(self, rng: tuple, width: int):
         from_, to_ = rng
         assert from_ <= to_, "impl assertion failed"
-        assert to_ <= self.MAX_UNICODE_POINT, "impl assertion failed"
+        assert to_ <= self.MAX_CODE_POINT, "impl assertion failed"
         self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)
 
-    def print_intervals(self):
-        """
-        Generates _Width_estimate_intervals_v2.
-        It begins from the second code point and continues up to the last one, including it as well.
-        Whenever a code point's width differs from the previous one,
-        the function print the code point to indicate the start of a new range.
-        """
+    def print_width_estimate_intervals(self):
         printed_elements_on_one_line = 0
         assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed"
-        for u in range(1, self.UNICODE_TABLE_SIZE):
+        for u in range(1, self.TABLE_SIZE):
             if self.table[u] != self.table[u - 1]:
                 print(f"0x{u:X}u, ", end="")
                 if printed_elements_on_one_line == 11:
@@ -51,14 +43,10 @@ def print_intervals(self):
 
         print()
 
-    def print_clusters_1_vs_2(self, other):
-        """
-        Print all ranges, in closed-end form
-        where the width is consistently 1 in the self.table range and 2 in the other.table range.
-        This output is consistent with the standard/data file and the annex in the paper
-        """
-        cluster_table = [False] * (self.UNICODE_TABLE_SIZE)
-        for u in range(self.UNICODE_TABLE_SIZE):
+    # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
+    def print_ranges_1_vs_2(self, other):
+        cluster_table = [False] * (self.TABLE_SIZE)
+        for u in range(self.TABLE_SIZE):
             if (
                 self.table[u] == UnicodeWidth.IS_1
                 and other.table[u] == UnicodeWidth.IS_2
@@ -66,11 +54,11 @@ def print_clusters_1_vs_2(self, other):
                 cluster_table[u] = True
 
         u = 0
-        while u < self.UNICODE_TABLE_SIZE:
+        while u < self.TABLE_SIZE:
             if cluster_table[u]:
                 from_ = u
                 to_ = from_
-                while to_ + 1 <= self.MAX_UNICODE_POINT and cluster_table[to_ + 1]:
+                while to_ + 1 <= self.MAX_CODE_POINT and cluster_table[to_ + 1]:
                     to_ += 1
                 if from_ == to_:
                     print(f"U+{from_:X}")
@@ -80,7 +68,7 @@ def print_clusters_1_vs_2(self, other):
             u += 1
 
 
-def get_table_cpp20() -> UnicodeTable:
+def get_table_cpp20() -> UnicodeWidthTable:
     std_wide_ranges_cpp20 = [
         (0x1100, 0x115F),
         (0x2329, 0x232A),
@@ -98,14 +86,14 @@ def get_table_cpp20() -> UnicodeTable:
         (0x30000, 0x3FFFD),
     ]
 
-    table = UnicodeTable()
+    table = UnicodeWidthTable()
     for rng in std_wide_ranges_cpp20:
         table.fill_range(rng, UnicodeWidth.IS_2)
 
     return table
 
 
-def read_from(source: TextIO) -> UnicodeTable:
+def read_from(source: TextIO) -> UnicodeWidthTable:
     """
     Read data from "EastAsianWidth.txt".
     The latest version can be found at:
@@ -114,7 +102,7 @@ def read_from(source: TextIO) -> UnicodeTable:
     https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
     To make this function work, the file should not contain a BOM.
     """
-    table = UnicodeTable()
+    table = UnicodeWidthTable()
 
     # "The unassigned code points in the following blocks default to "W":"
     default_wide_ranges = [
@@ -129,6 +117,8 @@ def read_from(source: TextIO) -> UnicodeTable:
 
     # Read explicitly assigned ranges.
     # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
+    LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
+
     def get_width(str: str):
         if str == "F" or str == "W":
             return UnicodeWidth.IS_2
@@ -140,7 +130,7 @@ def get_width(str: str):
         line = line.strip()
         if line and not line.startswith("#"):
             match = LINE_REGEX.fullmatch(line)
-            assert match, "invalid line"
+            assert match, line # invalid line
             from_val = int(match.group(1), base=16)
             width = get_width(match.group(3))
             if match.group(2):
@@ -154,7 +144,7 @@ def get_width(str: str):
     return table
 
 
-def get_table_cpp23(source: TextIO) -> UnicodeTable:
+def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
     table = read_from(source)
 
     # Override with ranges specified by the C++ standard.
@@ -173,18 +163,18 @@ def get_table_cpp23(source: TextIO) -> UnicodeTable:
 def main():
     print("Old table:")
     old_table = get_table_cpp20()
-    old_table.print_intervals()
+    old_table.print_width_estimate_intervals()
 
     path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
     with open(path, mode="rt", encoding="utf-8") as source:
         new_table = get_table_cpp23(source)
     print("New table:")
-    new_table.print_intervals()
+    new_table.print_width_estimate_intervals()
 
     print("\nWas 1, now 2:")
-    old_table.print_clusters_1_vs_2(new_table)
+    old_table.print_ranges_1_vs_2(new_table)
     print("\nWas 2, now 1:")
-    new_table.print_clusters_1_vs_2(old_table)
+    new_table.print_ranges_1_vs_2(old_table)
 
 
 if __name__ == "__main__":

From 6e2c7603acb50b81c9f2e70a7daebf09f94efe42 Mon Sep 17 00:00:00 2001
From: achabense <60953653+achabense@users.noreply.github.com>
Date: Sun, 24 Sep 2023 13:14:40 +0800
Subject: [PATCH 11/21] > simplify `print_ranges_1_vs_2`

---
 .../format_width_estimate_intervals.py               | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index e5fa1c7250..3ecc6c602d 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -45,20 +45,18 @@ def print_width_estimate_intervals(self):
 
     # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
     def print_ranges_1_vs_2(self, other):
-        cluster_table = [False] * (self.TABLE_SIZE)
-        for u in range(self.TABLE_SIZE):
-            if (
+        def _1_vs_2(u: int):
+            return (
                 self.table[u] == UnicodeWidth.IS_1
                 and other.table[u] == UnicodeWidth.IS_2
-            ):
-                cluster_table[u] = True
+            )
 
         u = 0
         while u < self.TABLE_SIZE:
-            if cluster_table[u]:
+            if _1_vs_2(u):
                 from_ = u
                 to_ = from_
-                while to_ + 1 <= self.MAX_CODE_POINT and cluster_table[to_ + 1]:
+                while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1):
                     to_ += 1
                 if from_ == to_:
                     print(f"U+{from_:X}")

From 77ce2320c11d13309b03275f895ea9e8a087b601 Mon Sep 17 00:00:00 2001
From: achabense <60953653+achabense@users.noreply.github.com>
Date: Sun, 24 Sep 2023 14:13:33 +0800
Subject: [PATCH 12/21] > add documentation for
 `print_width_estimate_intervals`

---
 .../format_width_estimate_intervals.py          | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 3ecc6c602d..ee195e5111 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 
 
+# Width estimation.
 class UnicodeWidth(Enum):
     IS_1: int = 1
     IS_2: int = 2
@@ -25,14 +26,24 @@ def __init__(self):
     # "rng" denotes a right-closed range.
     def fill_range(self, rng: tuple, width: int):
         from_, to_ = rng
-        assert from_ <= to_, "impl assertion failed"
-        assert to_ <= self.MAX_CODE_POINT, "impl assertion failed"
+        assert from_ <= to_, "invalid range"
+        assert to_ <= self.MAX_CODE_POINT, "invalid range"
         self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)
 
     def print_width_estimate_intervals(self):
+        """
+        Divide [0..MAX_CODE_POINT] into ranges with different width estimations.
+        Represent each range with their starting values.
+        The starting value of the first range is always 0 and omitted.
+        The width estimation should be 1 for the first range, then alternate between 2 and 1.
+        """
         printed_elements_on_one_line = 0
-        assert self.table[0] == UnicodeWidth.IS_1, "impl assertion failed"
+        assert self.table[0] == UnicodeWidth.IS_1
         for u in range(1, self.TABLE_SIZE):
+            assert (
+                self.table[u] == UnicodeWidth.IS_1
+                or self.table[u] == UnicodeWidth.IS_2
+            )
             if self.table[u] != self.table[u - 1]:
                 print(f"0x{u:X}u, ", end="")
                 if printed_elements_on_one_line == 11:

From 8e88bb8f9ea0ae62be75b1bc63826e6a48f60241 Mon Sep 17 00:00:00 2001
From: achabense <60953653+achabense@users.noreply.github.com>
Date: Sun, 24 Sep 2023 21:07:26 +0800
Subject: [PATCH 13/21] > update regex to support both `15.0.0` and `15.1.0`

---
 .../format_width_estimate_intervals.py                      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index ee195e5111..5122428cf4 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -108,7 +108,7 @@ def read_from(source: TextIO) -> UnicodeWidthTable:
     The latest version can be found at:
     https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
     The current implementation works for:
-    https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
+    https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
     To make this function work, the file should not contain a BOM.
     """
     table = UnicodeWidthTable()
@@ -125,8 +125,8 @@ def read_from(source: TextIO) -> UnicodeWidthTable:
         table.fill_range(rng, UnicodeWidth.IS_2)
 
     # Read explicitly assigned ranges.
-    # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
-    LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*")
+    # Lines that are neither empty nor pure comments are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment".
+    LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*")
 
     def get_width(str: str):
         if str == "F" or str == "W":

From e4f86656eac33e57a4ebcb188768bbd34929b3e6 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 13:12:42 -0700
Subject: [PATCH 14/21] Print the filename, timestamp, and C++ array.

---
 .../format_width_estimate_intervals.py                     | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 5122428cf4..59d6d2948d 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -176,9 +176,16 @@ def main():
 
     path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
     with open(path, mode="rt", encoding="utf-8") as source:
+        filename = source.readline().replace("#", "//").rstrip()
+        timestamp = source.readline().replace("#", "//").rstrip()
         new_table = get_table_cpp23(source)
     print("New table:")
+    print()
+    print(filename)
+    print(timestamp)
+    print("inline constexpr char32_t _Width_estimate_intervals_v2[] = { //")
     new_table.print_width_estimate_intervals()
+    print("};")
 
     print("\nWas 1, now 2:")
     old_table.print_ranges_1_vs_2(new_table)

From 15320fa5cbf617b4b0869335d0316ed302f2af60 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 13:23:17 -0700
Subject: [PATCH 15/21] Don't bother carefully wrapping; we need to
 clang-format anyways.

---
 .../format_width_estimate_intervals.py                 | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 59d6d2948d..5bce0c2913 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -37,7 +37,6 @@ def print_width_estimate_intervals(self):
         The starting value of the first range is always 0 and omitted.
         The width estimation should be 1 for the first range, then alternate between 2 and 1.
         """
-        printed_elements_on_one_line = 0
         assert self.table[0] == UnicodeWidth.IS_1
         for u in range(1, self.TABLE_SIZE):
             assert (
@@ -46,13 +45,6 @@ def print_width_estimate_intervals(self):
             )
             if self.table[u] != self.table[u - 1]:
                 print(f"0x{u:X}u, ", end="")
-                if printed_elements_on_one_line == 11:
-                    printed_elements_on_one_line = 0
-                    print()
-                else:
-                    printed_elements_on_one_line += 1
-
-        print()
 
     # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
     def print_ranges_1_vs_2(self, other):
@@ -179,7 +171,7 @@ def main():
         filename = source.readline().replace("#", "//").rstrip()
         timestamp = source.readline().replace("#", "//").rstrip()
         new_table = get_table_cpp23(source)
-    print("New table:")
+    print("\n\nNew table:")
     print()
     print(filename)
     print(timestamp)

From 9d7958c9fe5b0f5d8bf9087e2dcad909468a23ad Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 13:33:49 -0700
Subject: [PATCH 16/21] Use `join` to avoid a trailing comma interfering with
 clang-format.

---
 .../format_width_estimate_intervals.py                | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 5bce0c2913..9f64bc7bde 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -30,13 +30,14 @@ def fill_range(self, rng: tuple, width: int):
         assert to_ <= self.MAX_CODE_POINT, "invalid range"
         self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)
 
-    def print_width_estimate_intervals(self):
+    def width_estimate_intervals(self):
         """
         Divide [0..MAX_CODE_POINT] into ranges with different width estimations.
         Represent each range with their starting values.
         The starting value of the first range is always 0 and omitted.
         The width estimation should be 1 for the first range, then alternate between 2 and 1.
         """
+        values = []
         assert self.table[0] == UnicodeWidth.IS_1
         for u in range(1, self.TABLE_SIZE):
             assert (
@@ -44,7 +45,9 @@ def print_width_estimate_intervals(self):
                 or self.table[u] == UnicodeWidth.IS_2
             )
             if self.table[u] != self.table[u - 1]:
-                print(f"0x{u:X}u, ", end="")
+                values.append(u)
+
+        return ", ".join([f"0x{u:X}u" for u in values])
 
     # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
     def print_ranges_1_vs_2(self, other):
@@ -164,7 +167,7 @@ def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
 def main():
     print("Old table:")
     old_table = get_table_cpp20()
-    old_table.print_width_estimate_intervals()
+    print(old_table.width_estimate_intervals())
 
     path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
     with open(path, mode="rt", encoding="utf-8") as source:
@@ -176,7 +179,7 @@ def main():
     print(filename)
     print(timestamp)
     print("inline constexpr char32_t _Width_estimate_intervals_v2[] = { //")
-    new_table.print_width_estimate_intervals()
+    print(new_table.width_estimate_intervals())
     print("};")
 
     print("\nWas 1, now 2:")

From 8ebce4943cb540b4c3fb5db418cbba82b46b7a0b Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 13:49:36 -0700
Subject: [PATCH 17/21] Use a template string to make the C++ output clearer,
 like the other generators.

---
 .../format_width_estimate_intervals.py        | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 9f64bc7bde..58a9e32a3a 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -164,6 +164,14 @@ def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
     return table
 
 
+WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
+{filename}
+{timestamp}
+inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ //
+{values} }};
+"""
+
+
 def main():
     print("Old table:")
     old_table = get_table_cpp20()
@@ -174,15 +182,15 @@ def main():
         filename = source.readline().replace("#", "//").rstrip()
         timestamp = source.readline().replace("#", "//").rstrip()
         new_table = get_table_cpp23(source)
-    print("\n\nNew table:")
-    print()
-    print(filename)
-    print(timestamp)
-    print("inline constexpr char32_t _Width_estimate_intervals_v2[] = { //")
-    print(new_table.width_estimate_intervals())
-    print("};")
-
-    print("\nWas 1, now 2:")
+    print("\nNew table:")
+    print(
+        WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
+            filename=filename,
+            timestamp=timestamp,
+            values=new_table.width_estimate_intervals(),
+        )
+    )
+    print("Was 1, now 2:")
     old_table.print_ranges_1_vs_2(new_table)
     print("\nWas 2, now 1:")
     new_table.print_ranges_1_vs_2(old_table)

From ab8a2759f211debce40f34de36b975ea30956667 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 14:00:00 -0700
Subject: [PATCH 18/21] Format with Prettier.

---
 .../format_width_estimate_intervals.py                       | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 58a9e32a3a..d6fb89a959 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -41,8 +41,7 @@ def width_estimate_intervals(self):
         assert self.table[0] == UnicodeWidth.IS_1
         for u in range(1, self.TABLE_SIZE):
             assert (
-                self.table[u] == UnicodeWidth.IS_1
-                or self.table[u] == UnicodeWidth.IS_2
+                self.table[u] == UnicodeWidth.IS_1 or self.table[u] == UnicodeWidth.IS_2
             )
             if self.table[u] != self.table[u - 1]:
                 values.append(u)
@@ -134,7 +133,7 @@ def get_width(str: str):
         line = line.strip()
         if line and not line.startswith("#"):
             match = LINE_REGEX.fullmatch(line)
-            assert match, line # invalid line
+            assert match, line  # invalid line
             from_val = int(match.group(1), base=16)
             width = get_width(match.group(3))
             if match.group(2):

From 398e37d18df2cf9b48fee77985cd17e79087b33a Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 14:01:28 -0700
Subject: [PATCH 19/21] Regenerate, capturing the filename and timestamp.

---
 stl/inc/format | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/stl/inc/format b/stl/inc/format
index cfee5e3079..190c29557a 100644
--- a/stl/inc/format
+++ b/stl/inc/format
@@ -989,6 +989,9 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
 
 // Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
 // in the https://github.com/microsoft/stl repository.
+
+// EastAsianWidth-15.0.0.txt
+// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
 inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
     0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
     0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,

From 13e319748e3fb8ef3701a832feaf8e3e5de58505 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Sun, 24 Sep 2023 14:12:31 -0700
Subject: [PATCH 20/21] Cite the Standardese.

---
 .../unicode_properties_parse/format_width_estimate_intervals.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index d6fb89a959..56da9c2ad4 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -150,7 +150,7 @@ def get_width(str: str):
 def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
     table = read_from(source)
 
-    # Override with ranges specified by the C++ standard.
+    # Override with ranges specified by N4958 [format.string.std]/13.
     std_wide_ranges_cpp23 = [
         (0x4DC0, 0x4DFF),
         (0x1F300, 0x1F5FF),

From fa0b783e0eb18f5852e9a17e3424e4ca2ef33a1d Mon Sep 17 00:00:00 2001
From: nicole mazzuca <83086508+strega-nil-ms@users.noreply.github.com>
Date: Thu, 5 Oct 2023 11:05:04 -0800
Subject: [PATCH 21/21] Update
 tools/unicode_properties_parse/format_width_estimate_intervals.py

---
 .../format_width_estimate_intervals.py                 | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/unicode_properties_parse/format_width_estimate_intervals.py b/tools/unicode_properties_parse/format_width_estimate_intervals.py
index 56da9c2ad4..b0c4bc8a79 100644
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@@ -32,10 +32,12 @@ def fill_range(self, rng: tuple, width: int):
 
     def width_estimate_intervals(self):
         """
-        Divide [0..MAX_CODE_POINT] into ranges with different width estimations.
-        Represent each range with their starting values.
-        The starting value of the first range is always 0 and omitted.
-        The width estimation should be 1 for the first range, then alternate between 2 and 1.
+        Creates a string representation of the map (in `self.table`) from
+        Unicode code points to their width, using hexadecimal unsigned integer literals.
+        Since there are long runs of code points of one width or the other,
+        this representation is a list of code points where the width switches.
+        Additionally, the width is assumed to start at `1` from the beginning of the list.
+        For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`.
         """
         values = []
         assert self.table[0] == UnicodeWidth.IS_1