Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

draft: File ID Manager base implementation #921

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions jupyter_server/benchmarks/fileidmanager_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os
import timeit

from jupyter_core.paths import jupyter_data_dir

from jupyter_server.services.contents.fileidmanager import FileIdManager

db_path = os.path.join(jupyter_data_dir(), "file_id_manager_perftest.db")


def build_setup(n, insert=True):
def setup():
try:
os.remove(db_path)
except:
pass
fid_manager = FileIdManager(db_path=db_path)

if not insert:
return

for i in range(n):
fid_manager.con.execute(
"INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
)
fid_manager.con.commit()

return setup


BATCH_SIZE = 100_000


def build_test_index(n, single_transaction, batched=False):
def test_index():
fid_manager = FileIdManager(db_path=db_path)

if single_transaction:
if batched:
for batch_start in range(0, n, BATCH_SIZE):
batch_end = batch_start + BATCH_SIZE
fid_manager.con.execute(
"INSERT INTO FILES (path) VALUES "
+ ",".join(
[f'("abracadabra/{i}.txt")' for i in range(batch_start, batch_end)]
)
)
else:
for i in range(n):
fid_manager.con.execute(
"INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
)

fid_manager.con.commit()
else:
for i in range(n):
fid_manager.index(f"abracadabra/{i}.txt")

return test_index


def test_copy():
fid_manager = FileIdManager(db_path=db_path)
fid_manager.copy("abracadabra", "shazam", recursive=True)


def test_move():
fid_manager = FileIdManager(db_path=db_path)
fid_manager.move("abracadabra", "shazam", recursive=True)


def test_delete():
fid_manager = FileIdManager(db_path=db_path)
fid_manager.delete("abracadabra", recursive=True)


row_template = "{:<9,d} files | {:<8.4f} s"


# too slow for 1k+
print("Index benchmark (separate transactions)")
for i in [100, 1_000]:
print(
row_template.format(
i,
timeit.timeit(
build_test_index(i, single_transaction=False),
build_setup(i, insert=False),
number=1,
),
)
)

print("Index benchmark (single transaction, atomic INSERTs)")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(
row_template.format(
i,
timeit.timeit(
build_test_index(i, single_transaction=True, batched=False),
build_setup(i, insert=False),
number=1,
),
)
)

# suggested by https://stackoverflow.com/a/72527058/12548458
# asymptotically faster because it reduces work being done by the SQLite VDBE https://www.sqlite.org/opcode.html
# weird constant time factor that makes it sub-optimal for <1M records.
print("Index benchmark (single transaction, batched INSERTs)")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(
row_template.format(
i,
timeit.timeit(
build_test_index(i, single_transaction=True, batched=True),
build_setup(i, insert=False),
number=1,
),
)
)

print("Recursive move benchmark")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(row_template.format(i, timeit.timeit(test_move, build_setup(i), number=1)))

print("Recursive copy benchmark")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(row_template.format(i, timeit.timeit(test_copy, build_setup(i), number=1)))

print("Recursive delete benchmark")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(row_template.format(i, timeit.timeit(test_delete, build_setup(i), number=1)))
29 changes: 29 additions & 0 deletions jupyter_server/pytest_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from jupyter_server.extension import serverextension
from jupyter_server.serverapp import ServerApp
from jupyter_server.services.contents.fileidmanager import FileIdManager
from jupyter_server.services.contents.filemanager import FileContentsManager
from jupyter_server.services.contents.largefilemanager import LargeFileManager
from jupyter_server.utils import url_path_join
Expand Down Expand Up @@ -457,6 +458,34 @@ def jp_large_contents_manager(tmp_path):
return LargeFileManager(root_dir=str(tmp_path))


@pytest.fixture
def fid_db_path(jp_data_dir):
"""Fixture that returns the file ID DB path used for tests."""
return str(jp_data_dir / "fileidmanager_test.db")


@pytest.fixture(autouse=True)
def delete_fid_db(fid_db_path):
"""Fixture that automatically deletes the DB file after each test."""
yield
try:
os.remove(fid_db_path)
except OSError:
pass


@pytest.fixture
def fid_manager(fid_db_path, jp_root_dir):
"""Fixture returning a test-configured instance of `FileIdManager`."""
fid_manager = FileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir))
# disable journal so no temp journal file is created under `tmp_path`.
# reduces test flakiness since sometimes journal file has same ino and
# crtime as a deleted file, so FID manager detects it wrongly as a move
# also makes tests run faster :)
fid_manager.con.execute("PRAGMA journal_mode = OFF")
return fid_manager


@pytest.fixture
def jp_create_notebook(jp_root_dir):
"""Creates a notebook in the test's home directory."""
Expand Down
11 changes: 9 additions & 2 deletions jupyter_server/serverapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
)
from jupyter_server.log import log_request
from jupyter_server.services.config import ConfigManager
from jupyter_server.services.contents.fileidmanager import FileIdManager
from jupyter_server.services.contents.filemanager import (
AsyncFileContentsManager,
FileContentsManager,
Expand Down Expand Up @@ -1886,9 +1887,9 @@ def init_configurables(self):
connection_dir=self.runtime_dir,
kernel_spec_manager=self.kernel_spec_manager,
)
self.file_id_manager = FileIdManager(parent=self, log=self.log, root_dir=self.root_dir)
self.contents_manager = self.contents_manager_class(
parent=self,
log=self.log,
parent=self, log=self.log, file_id_manager=self.file_id_manager
)
self.session_manager = self.session_manager_class(
parent=self,
Expand Down Expand Up @@ -2508,6 +2509,11 @@ async def cleanup_extensions(self):
self.log.info(extension_msg % n_extensions)
await run_sync_in_loop(self.extension_manager.stop_all_extensions())

def cleanup_file_id_manager(self):
if not getattr(self, "file_id_manager", None):
return
self.file_id_manager._cleanup()
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

def running_server_info(self, kernel_count=True):
"Return the current working directory and the server url information"
info = self.contents_manager.info_string() + "\n"
Expand Down Expand Up @@ -2780,6 +2786,7 @@ async def _cleanup(self):
self.remove_browser_open_files()
await self.cleanup_extensions()
await self.cleanup_kernels()
self.cleanup_file_id_manager()
if getattr(self, "session_manager", None):
self.session_manager.close()
if getattr(self, "event_bus", None):
Expand Down
Loading