diff --git a/stl/inc/__msvc_format_ucd_tables.hpp b/stl/inc/__msvc_format_ucd_tables.hpp index 5f0d7161b2..61159396c0 100644 --- a/stl/inc/__msvc_format_ucd_tables.hpp +++ b/stl/inc/__msvc_format_ucd_tables.hpp @@ -4,7 +4,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // WARNING, this entire header is generated by -// tools/unicode_properties_parse/grapheme_break_property_data_gen.py +// tools/unicode_properties_parse/unicode_properties_data_gen.py // DO NOT MODIFY! // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE @@ -112,7 +112,7 @@ struct _Unicode_property_data { // The enums containing the values for the properties are also generated, in order to ensure they match // up correctly with how we're parsing them. // -// All sets of data tables are generated by tools/unicode_properties_parse/grapheme_break_property_data_gen.py in the +// All sets of data tables are generated by tools/unicode_properties_parse/unicode_properties_data_gen.py in the // https://github.com/microsoft/stl repository. // // The data format is a set of arrays for each character property. The first is an array of uint32_t encoding diff --git a/tools/unicode_properties_parse/download_unicode_data_files.py b/tools/unicode_properties_parse/download_unicode_data_files.py new file mode 100644 index 0000000000..bf5c587fd1 --- /dev/null +++ b/tools/unicode_properties_parse/download_unicode_data_files.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from urllib.request import urlretrieve + + +Unicode_data_files = { + "DerivedCoreProperties.txt": "https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt", + "DerivedGeneralCategory.txt": "https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt", + "EastAsianWidth.txt": "https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt", + "GraphemeBreakProperty.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt", + "GraphemeBreakTest.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt", + "emoji-data.txt": "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt", +} + +def download_unicode_data_files(): + for filename, url in Unicode_data_files.items(): + print(f"downloading {filename} from {url}") + urlretrieve(url, filename) + + +if __name__ == "__main__": + download_unicode_data_files() diff --git a/tools/unicode_properties_parse/grapheme_break_test_data_gen.py b/tools/unicode_properties_parse/grapheme_break_test_data_gen.py index c7f85bac60..196f199c5b 100644 --- a/tools/unicode_properties_parse/grapheme_break_test_data_gen.py +++ b/tools/unicode_properties_parse/grapheme_break_test_data_gen.py @@ -91,7 +91,7 @@ def line_to_cpp_data_line_utf8(line: BreakTestItem) -> str: """ -Generate test data from "GraphemeBreakText.txt" +Generate test data from "GraphemeBreakTest.txt" This file can be downloaded from: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt This script looks for GraphemeBreakTest.txt in same directory as this script """ diff --git a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py b/tools/unicode_properties_parse/unicode_properties_data_gen.py similarity index 96% rename from tools/unicode_properties_parse/grapheme_break_property_data_gen.py rename to tools/unicode_properties_parse/unicode_properties_data_gen.py index d330700495..706471c7f0 100644 --- a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py +++ b/tools/unicode_properties_parse/unicode_properties_data_gen.py @@ -92,7 +92,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // WARNING, this entire header is generated by -// tools/unicode_properties_parse/grapheme_break_property_data_gen.py +// tools/unicode_properties_parse/unicode_properties_data_gen.py // DO NOT MODIFY! // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE @@ -201,7 +201,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: // The enums containing the values for the properties are also generated, in order to ensure they match // up correctly with how we're parsing them. // -// All sets of data tables are generated by tools/unicode_properties_parse/grapheme_break_property_data_gen.py in the +// All sets of data tables are generated by tools/unicode_properties_parse/unicode_properties_data_gen.py in the // https://github.com/microsoft/stl repository. // // The data format is a set of arrays for each character property. The first is an array of uint32_t encoding