From 6163699f9fe5b65c9b889c918ff5e955ef7affc0 Mon Sep 17 00:00:00 2001 From: fliiiix Date: Tue, 22 Oct 2024 20:08:59 +0200 Subject: [PATCH] Improve runtime of dir listing and add memoizing instead of files Messured results: $ python test_before.py n_projects=412 n_versions=4038 storage='43 GB' first run: 12.587313868105412 second run: 4.187226295471191e-06 $ python test_after.py n_projects=412 n_versions=4038 storage='43 GB' first run: 3.5831107571721077 second run: 5.334615707397461e-06 cash clear run: 3.1085935048758984 tests show small differences in size but this does not matter in the grand schema of things --- docat/docat/app.py | 19 +++++------ docat/docat/utils.py | 56 +++++++++++++++++++++++---------- docat/tests/test_hide_show.py | 4 +-- docat/tests/test_stats.py | 8 ++--- docat/tests/test_upload_icon.py | 2 +- 5 files changed, 56 insertions(+), 33 deletions(-) diff --git a/docat/docat/app.py b/docat/docat/app.py index bc4a84b5..7ec862e1 100644 --- a/docat/docat/app.py +++ b/docat/docat/app.py @@ -28,9 +28,9 @@ UPLOAD_FOLDER, calculate_token, create_symlink, - directory_size, extract_archive, get_all_projects, + get_dir_size, get_project_details, get_system_stats, is_forbidden_project_name, @@ -140,9 +140,9 @@ def upload_icon( with icon_path.open("wb") as buffer: shutil.copyfileobj(file.file, buffer) - # recalculate size cache - directory_size(project_base_path, recalculate=True) - directory_size(DOCAT_UPLOAD_FOLDER, recalculate=True) + # force cache revalidation + get_system_stats.cache_clear() + get_dir_size.cache_clear() return ApiResponse(message="Icon successfully uploaded") @@ -271,9 +271,9 @@ def upload( if not (base_path / "index.html").exists(): return ApiResponse(message="Documentation uploaded successfully, but no index.html found at root of archive.") - # recalculate size cache - directory_size(project_base_path, recalculate=True) - directory_size(DOCAT_UPLOAD_FOLDER, recalculate=True) + # force cache revalidation + get_system_stats.cache_clear() + get_dir_size.cache_clear() return ApiResponse(message="Documentation uploaded successfully") @@ -381,8 +381,9 @@ def delete( response.status_code = status.HTTP_404_NOT_FOUND return ApiResponse(message=message) - # recalculate size cache - directory_size(DOCAT_UPLOAD_FOLDER, recalculate=True) + # force cache revalidation + get_system_stats.cache_clear() + get_dir_size.cache_clear() return ApiResponse(message=f"Successfully deleted version '{version}'") diff --git a/docat/docat/utils.py b/docat/docat/utils.py index 6a4f45c1..26adf2ae 100644 --- a/docat/docat/utils.py +++ b/docat/docat/utils.py @@ -6,6 +6,7 @@ import os import shutil from datetime import datetime +from functools import cache from pathlib import Path from zipfile import ZipFile, ZipInfo @@ -161,29 +162,50 @@ def readable_size(bytes: int) -> str: return str(amount) + size_suffix -def directory_size(path: Path, recalculate: bool = False) -> int: +@cache +def get_dir_size(path: Path) -> int: """ - Returns the size of a directory and caches it's size unless - recalculate is set to true or the cache is missed - """ - size_info = path / ".size" - if size_info.exists() and not recalculate: - return int(size_info.read_text()) - - dir_size = sum(file.stat().st_size for file in path.rglob("*") if file.is_file()) - size_info.write_text(str(dir_size)) # cache directory size + Calculate the total size of a directory. - return dir_size + Results are cached (memoizing) by path. + """ + total = 0 + with os.scandir(path) as it: + for entry in it: + if entry.is_file(): + total += entry.stat().st_size + elif entry.is_dir(): + total += get_dir_size(entry.path) + return total +@cache def get_system_stats(upload_folder_path: Path) -> Stats: """ - Return all docat statistics + Return all docat statistics. + + Results are cached (memoizing) by path. """ + + dirs = 0 + versions = 0 + size = 0 + # Note: Not great nesting with the deep nesting + # but it needs to run fast, consider speed when refactoring! + with os.scandir(upload_folder_path) as root: + for f in root: + if f.is_dir(): + dirs += 1 + with os.scandir(f.path) as project: + for v in project: + if v.is_dir() and not v.is_symlink(): + size += get_dir_size(v.path) + versions += 1 + return Stats( - n_projects=len([p for p in upload_folder_path.iterdir() if p.is_dir()]), - n_versions=sum(len([p for p in d.iterdir() if p.is_dir() and not p.is_symlink()]) for d in upload_folder_path.glob("*/")), - storage=readable_size(directory_size(upload_folder_path)), + n_projects=dirs, + n_versions=versions, + storage=readable_size(size), ) @@ -212,7 +234,7 @@ def get_all_projects(upload_folder_path: Path, include_hidden: bool) -> Projects name=project_name, logo=project_has_logo, versions=details.versions, - storage=readable_size(directory_size(upload_folder_path / project)), + storage=readable_size(get_dir_size(upload_folder_path / project)), ) ) @@ -245,7 +267,7 @@ def should_include(name: str) -> bool: return ProjectDetail( name=project_name, - storage=readable_size(directory_size(docs_folder)), + storage=readable_size(get_dir_size(docs_folder)), versions=sorted( [ ProjectVersion( diff --git a/docat/tests/test_hide_show.py b/docat/tests/test_hide_show.py index 28b605f6..5c1457cb 100644 --- a/docat/tests/test_hide_show.py +++ b/docat/tests/test_hide_show.py @@ -394,7 +394,7 @@ def test_hide_and_show_with_tag(_, client_with_claimed_project): assert project_details_response.status_code == 200 assert project_details_response.json() == { "name": "some-project", - "storage": "20 bytes", + "storage": "40 bytes", "versions": [], } @@ -408,6 +408,6 @@ def test_hide_and_show_with_tag(_, client_with_claimed_project): assert project_details_response.status_code == 200 assert project_details_response.json() == { "name": "some-project", - "storage": "20 bytes", + "storage": "40 bytes", "versions": [{"name": "1.0.0", "timestamp": "2000-01-01T01:01:00", "tags": ["latest"], "hidden": False}], } diff --git a/docat/tests/test_stats.py b/docat/tests/test_stats.py index f1bfe7ba..166c4a9b 100644 --- a/docat/tests/test_stats.py +++ b/docat/tests/test_stats.py @@ -9,10 +9,10 @@ @pytest.mark.parametrize( ("project_config", "n_projects", "n_versions", "storage"), [ - ([("some-project", ["1.0.0"])], 1, 1, "22 bytes"), - ([("some-project", ["1.0.0", "2.0.0"])], 1, 2, "44 bytes"), - ([("some-project", ["1.0.0", "2.0.0"])], 1, 2, "44 bytes"), - ([("some-project", ["1.0.0", "2.0.0"]), ("another-project", ["1"])], 2, 3, "66 bytes"), + ([("some-project", ["1.0.0"])], 1, 1, "20 bytes"), + ([("some-project", ["1.0.0", "2.0.0"])], 1, 2, "40 bytes"), + ([("some-project", ["1.0.0", "2.0.0"])], 1, 2, "40 bytes"), + ([("some-project", ["1.0.0", "2.0.0"]), ("another-project", ["1"])], 2, 3, "60 bytes"), ], ) def test_get_stats(_, project_config, n_projects, n_versions, storage, client_with_claimed_project): diff --git a/docat/tests/test_upload_icon.py b/docat/tests/test_upload_icon.py index 03a04655..1a22e447 100644 --- a/docat/tests/test_upload_icon.py +++ b/docat/tests/test_upload_icon.py @@ -181,7 +181,7 @@ def test_get_project_recongizes_icon(_, client_with_claimed_project): { "name": "some-project", "logo": True, - "storage": "105 bytes", + "storage": "103 bytes", "versions": [{"name": "1.0.0", "timestamp": "2000-01-01T01:01:00", "tags": [], "hidden": False}], } ]