From bf58343db1a57880ab882c37a279c519d2a9a0a1 Mon Sep 17 00:00:00 2001 From: Schuyler Martin Date: Wed, 4 Dec 2024 09:10:34 -0700 Subject: [PATCH 1/4] First pass at parallelizing multiple source fetches --- conda_recipe_manager/commands/bump_recipe.py | 50 +++++++++++++++----- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/conda_recipe_manager/commands/bump_recipe.py b/conda_recipe_manager/commands/bump_recipe.py index 98cf9562..80bd86e4 100644 --- a/conda_recipe_manager/commands/bump_recipe.py +++ b/conda_recipe_manager/commands/bump_recipe.py @@ -5,6 +5,7 @@ from __future__ import annotations import logging +import multiprocessing.pool as mp_pool import sys import time from pathlib import Path @@ -246,6 +247,25 @@ def _update_sha256_check_hash_var( return False +def _update_sha256_fetch_and_patch( + src_path: str, fetcher: HttpArtifactFetcher, retry_interval: float +) -> tuple[str, str]: + """ + Helper function that retrieves a single HTTP source artifact, so that we can parallelize network requests. + + :param src_path: Recipe key path to the applicable artifact source. + :param fetcher: Artifact fetching instance to use. + :param retry_interval: Scalable interval between fetch requests. + :returns: A tuple containing the path to and the actual SHA-256 value to be updated. + """ + try: + sha = _get_sha256(fetcher, retry_interval) + except FetchError: + # TODO how does `exit` handle thread in context manager? + _exit_on_failed_fetch(fetcher) + return (RecipeParser.append_to_path(src_path, "/sha256"), sha) + + def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: """ Attempts to update the SHA-256 hash(s) in the `/source` section of a recipe file, if applicable. Note that this is @@ -270,20 +290,24 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: # different hashes might imply there is a security threat. We will log some statistics so the user can best decide # what to do. unique_hashes: set[str] = set() - total_hash_cntr = 0 - # TODO parallelize requests for multiple sources - for src_path, fetcher in fetcher_tbl.items(): - if not isinstance(fetcher, HttpArtifactFetcher): - continue - try: - sha = _get_sha256(fetcher, retry_interval) - except FetchError: - _exit_on_failed_fetch(fetcher) - total_hash_cntr += 1 - unique_hashes.add(sha) - sha_path = RecipeParser.append_to_path(src_path, "/sha256") + # Filter-out artifacts that don't need a SHA-256 hash. + http_fetcher_tbl: Final[dict[str, HttpArtifactFetcher]] = { + k: v for k, v in fetcher_tbl.items() if isinstance(v, HttpArtifactFetcher) + } + # Parallelize on acquiring multiple source artifacts on the network. In testing, using the process Pool took ~9 + # seconds and used significantly more resources than using the ThreadPool took ~2 seconds. This test simulated a + # recipe with 15 sources. Because this process is so I/O bound, the ThreadPool class is the way to go. + with mp_pool.ThreadPool() as pool: + sha_path_to_sha_tbl = dict( + pool.starmap( + _update_sha256_fetch_and_patch, + [(src_path, fetcher, retry_interval) for src_path, fetcher in http_fetcher_tbl.items()], + ) + ) + for sha_path, sha in sha_path_to_sha_tbl.items(): + unique_hashes.add(sha) # Guard against the unlikely scenario that the `sha256` field is missing. patch_op = "replace" if recipe_parser.contains_value(sha_path) else "add" _exit_on_failed_patch(recipe_parser, {"op": patch_op, "path": sha_path, "value": sha}) @@ -291,7 +315,7 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: log.info( "Found %d unique SHA-256 hash(es) out of a total of %d hash(es) in %d sources.", len(unique_hashes), - total_hash_cntr, + len(sha_path_to_sha_tbl), len(fetcher_tbl), ) From 2fd17f8b6238f3ef221ab8bed12108f4ed637f04 Mon Sep 17 00:00:00 2001 From: Schuyler Martin Date: Wed, 4 Dec 2024 11:08:14 -0700 Subject: [PATCH 2/4] Moves to use ThreadPoolExecutor model over ThreadPool so that the script can be exited gracefully on a network failure --- conda_recipe_manager/commands/bump_recipe.py | 40 ++++++++++---------- tests/commands/test_bump_recipe.py | 5 +-- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/conda_recipe_manager/commands/bump_recipe.py b/conda_recipe_manager/commands/bump_recipe.py index 80bd86e4..1385dae7 100644 --- a/conda_recipe_manager/commands/bump_recipe.py +++ b/conda_recipe_manager/commands/bump_recipe.py @@ -4,8 +4,8 @@ from __future__ import annotations +import concurrent.futures as cf import logging -import multiprocessing.pool as mp_pool import sys import time from pathlib import Path @@ -247,22 +247,17 @@ def _update_sha256_check_hash_var( return False -def _update_sha256_fetch_and_patch( - src_path: str, fetcher: HttpArtifactFetcher, retry_interval: float -) -> tuple[str, str]: +def _update_sha256_fetch_one(src_path: str, fetcher: HttpArtifactFetcher, retry_interval: float) -> tuple[str, str]: """ Helper function that retrieves a single HTTP source artifact, so that we can parallelize network requests. :param src_path: Recipe key path to the applicable artifact source. :param fetcher: Artifact fetching instance to use. :param retry_interval: Scalable interval between fetch requests. + :raises FetchError: In the event that the retry mechanism failed to fetch a source artifact. :returns: A tuple containing the path to and the actual SHA-256 value to be updated. """ - try: - sha = _get_sha256(fetcher, retry_interval) - except FetchError: - # TODO how does `exit` handle thread in context manager? - _exit_on_failed_fetch(fetcher) + sha = _get_sha256(fetcher, retry_interval) return (RecipeParser.append_to_path(src_path, "/sha256"), sha) @@ -295,16 +290,23 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: http_fetcher_tbl: Final[dict[str, HttpArtifactFetcher]] = { k: v for k, v in fetcher_tbl.items() if isinstance(v, HttpArtifactFetcher) } - # Parallelize on acquiring multiple source artifacts on the network. In testing, using the process Pool took ~9 - # seconds and used significantly more resources than using the ThreadPool took ~2 seconds. This test simulated a - # recipe with 15 sources. Because this process is so I/O bound, the ThreadPool class is the way to go. - with mp_pool.ThreadPool() as pool: - sha_path_to_sha_tbl = dict( - pool.starmap( - _update_sha256_fetch_and_patch, - [(src_path, fetcher, retry_interval) for src_path, fetcher in http_fetcher_tbl.items()], - ) - ) + # Parallelize on acquiring multiple source artifacts on the network. In testing, using a process pool took + # significantly more time and resources. That aligns with how I/O bound this process is. We use the + # `ThreadPoolExecutor` class over a `ThreadPool` so the script may exit gracefully if we failed to acquire an + # artifact. + sha_path_to_sha_tbl: dict[str, str] = {} + with cf.ThreadPoolExecutor() as executor: + artifact_futures_tbl = { + executor.submit(_update_sha256_fetch_one, src_path, fetcher, retry_interval): fetcher + for src_path, fetcher in http_fetcher_tbl.items() + } + for future in cf.as_completed(artifact_futures_tbl): + fetcher = artifact_futures_tbl[future] + try: + resolved_tuple = future.result() + sha_path_to_sha_tbl[resolved_tuple[0]] = resolved_tuple[1] + except FetchError: + _exit_on_failed_fetch(fetcher) for sha_path, sha in sha_path_to_sha_tbl.items(): unique_hashes.add(sha) diff --git a/tests/commands/test_bump_recipe.py b/tests/commands/test_bump_recipe.py index 0ee3bebc..0f65f1d0 100644 --- a/tests/commands/test_bump_recipe.py +++ b/tests/commands/test_bump_recipe.py @@ -158,9 +158,8 @@ def test_bump_recipe_cli( [ ("bump_recipe/types-toml_bad_url.yaml", "0.10.8.20240310", 5), ("bump_recipe/types-toml_bad_url_hash_var.yaml", "0.10.8.20240310", 5), - # As of writing, the script will fail on the first URL that fails to be fetched, thus the count is half what - # one might expect. - ("bump_recipe/types-toml_bad_url_multi_source.yaml", "0.10.8.20240310", 5), + # Note that with futures, all 10 (5 by 2 sources) should occur by the time the futures are fully resolved. + ("bump_recipe/types-toml_bad_url_multi_source.yaml", "0.10.8.20240310", 10), # TODO validate V1 recipe files ], ) From e650b25fc5c096a0dc673cd80becc4929c46c50b Mon Sep 17 00:00:00 2001 From: Schuyler Martin Date: Thu, 5 Dec 2024 08:46:23 -0700 Subject: [PATCH 3/4] Adds notes on slow tests --- tests/commands/test_bump_recipe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/commands/test_bump_recipe.py b/tests/commands/test_bump_recipe.py index 0f65f1d0..7f4b6588 100644 --- a/tests/commands/test_bump_recipe.py +++ b/tests/commands/test_bump_recipe.py @@ -87,6 +87,7 @@ def test_usage() -> None: # Does not use `version` variable, has a non-zero build number. Note that the URL is not parameterized on the # version field. ("gsm-amzn2-aarch64.yaml", None, "bump_recipe/gsm-amzn2-aarch64_build_num_6.yaml"), + # TODO Fix this slow test tracked by Issue #265 ("gsm-amzn2-aarch64.yaml", "2.0.20210721.2", "bump_recipe/gsm-amzn2-aarch64_version_bump.yaml"), # Has a `sha256` variable ("pytest-pep8.yaml", None, "bump_recipe/pytest-pep8_build_num_2.yaml"), @@ -102,6 +103,7 @@ def test_usage() -> None: ("curl.yaml", "8.11.0", "bump_recipe/curl_version_bump.yaml"), # NOTE: libprotobuf has multiple sources, on top of being multi-output ("libprotobuf.yaml", None, "bump_recipe/libprotobuf_build_num_1.yaml"), + # TODO Fix this slow test tracked by Issue #265 ("libprotobuf.yaml", "25.3", "bump_recipe/libprotobuf_version_bump.yaml"), # Validates removal of `hash_type` variable that is sometimes used instead of the `/source/sha256` key ("types-toml_hash_type.yaml", None, "bump_recipe/types-toml_hash_type_build_num_1.yaml"), From 21f325e0c5aa32124a6fa28a6a0575e4add6e272 Mon Sep 17 00:00:00 2001 From: Schuyler Martin Date: Thu, 5 Dec 2024 08:49:17 -0700 Subject: [PATCH 4/4] Adds a --dry-run flag that can dump a modified recipe to the console instead of saving it to a file --- conda_recipe_manager/commands/bump_recipe.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/conda_recipe_manager/commands/bump_recipe.py b/conda_recipe_manager/commands/bump_recipe.py index 1385dae7..a92a2139 100644 --- a/conda_recipe_manager/commands/bump_recipe.py +++ b/conda_recipe_manager/commands/bump_recipe.py @@ -334,6 +334,12 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: is_flag=True, help="Bump the build number by 1.", ) +@click.option( + "-d", + "--dry-run", + is_flag=True, + help="Performs a dry-run operation that prints the recipe to STDOUT and does not save to the recipe file.", +) @click.option( "-t", "--target-version", @@ -353,7 +359,9 @@ def _update_sha256(recipe_parser: RecipeParser, retry_interval: float) -> None: f" Defaults to {_DEFAULT_RETRY_INTERVAL} seconds" ), ) -def bump_recipe(recipe_file_path: str, build_num: bool, target_version: Optional[str], retry_interval: float) -> None: +def bump_recipe( + recipe_file_path: str, build_num: bool, dry_run: bool, target_version: Optional[str], retry_interval: float +) -> None: """ Bumps a recipe to a new version. @@ -393,5 +401,8 @@ def bump_recipe(recipe_file_path: str, build_num: bool, target_version: Optional _update_version(recipe_parser, target_version) _update_sha256(recipe_parser, retry_interval) - Path(recipe_file_path).write_text(recipe_parser.render(), encoding="utf-8") + if dry_run: + print(recipe_parser.render()) + else: + Path(recipe_file_path).write_text(recipe_parser.render(), encoding="utf-8") sys.exit(ExitCode.SUCCESS)