From 86e89fe9ac4350d9fc0e2548859fa3ca2ba24a4d Mon Sep 17 00:00:00 2001 From: jdorsch Date: Mon, 9 Sep 2024 17:54:40 +0200 Subject: [PATCH 1/2] added F7T_HOME_ENABLED variable --- CHANGELOG.md | 2 ++ deploy/demo/common/common.env | 1 + deploy/k8s/config/templates/cm.common.yaml | 1 + doc/configuration.md | 1 + src/common/cscs_api_common.py | 13 ++++++++----- src/compute/compute.py | 14 +++++++------- src/utilities/utilities.py | 2 +- 7 files changed, 21 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 110ad1e8..7afaa520 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for multiple JWT signature algorithms - Added option to follow symbolic links in the `POST /utilities/compress` and `POST /storage/xfer-internal/compress` endpoints - Added new "general" section to status/parameters describing `FIRECREST_VERSION` and `FIRECREST_BUILD` timestamp +- Environment variable `F7T_HOME_ENABLED`, to be set to `False` if `$HOME` is not mounted on the systems executing FirecREST commands ### Changed @@ -27,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix parsing in `GET /utilities/ls` endpoint. - The job fields `job_data_out` and `job_file_err` from `GET /compute/jobs` will be empty for jobs that are still pending (so that there is no confusion with older output/error files). 
- Added retry on task creation workflow +- Error message when `$HOME` is not mounted ## [1.16.0] diff --git a/deploy/demo/common/common.env b/deploy/demo/common/common.env index 06647000..3e207084 100644 --- a/deploy/demo/common/common.env +++ b/deploy/demo/common/common.env @@ -67,6 +67,7 @@ F7T_SYSTEMS_INTERNAL_ADDR='192.168.220.12:22;192.168.220.12:22' #F7T_SYSTEMS_INTERNAL_COMPUTE_ADDR='192.168.220.12:22;192.168.220.12:22' #F7T_SYSTEMS_INTERNAL_STORAGE_ADDR='192.168.220.12:22;192.168.220.12:22' #F7T_SYSTEMS_INTERNAL_UTILITIES_ADDR='192.168.220.12:22;192.168.220.12:22' +#F7T_HOME_ENABLED=True #------- # COMPUTE options # Base filesystem where job submission files will be stored. diff --git a/deploy/k8s/config/templates/cm.common.yaml b/deploy/k8s/config/templates/cm.common.yaml index 829ec0e6..49ed42f4 100644 --- a/deploy/k8s/config/templates/cm.common.yaml +++ b/deploy/k8s/config/templates/cm.common.yaml @@ -32,6 +32,7 @@ data: F7T_SSL_ENABLED: {{ .Values.F7T_SSL_ENABLED | default "true" | quote }} F7T_SSL_CRT: {{ .Values.F7T_SSL_CRT | default "" | quote }} F7T_SSL_KEY: {{ .Values.F7T_SSL_KEY | default "" | quote }} + F7T_HOME_ENABLED: {{ .Values.F7T_HOME_ENABLED | default "True" | quote }} F7T_SYSTEMS_INTERNAL_ADDR: {{ .Values.F7T_SYSTEMS_INTERNAL_ADDR | quote }} F7T_SYSTEMS_INTERNAL_STATUS_ADDR: {{ .Values.F7T_SYSTEMS_INTERNAL_STATUS_ADDR | default .Values.F7T_SYSTEMS_INTERNAL_ADDR | quote }} F7T_SYSTEMS_INTERNAL_COMPUTE_ADDR: {{ .Values.F7T_SYSTEMS_INTERNAL_COMPUTE_ADDR | default .Values.F7T_SYSTEMS_INTERNAL_ADDR | quote }} diff --git a/doc/configuration.md b/doc/configuration.md index b03afdc9..68dd3844 100644 --- a/doc/configuration.md +++ b/doc/configuration.md @@ -91,6 +91,7 @@ The most complete way of installing is to setup 3 hosts: |`F7T_UTILITIES_TIMEOUT` | NO | `5` | Value in **seconds** for timing out a login node command using `/utilities` | `Backend` | |`F7T_PERSIST_HOST` | NO | `'127.0.0.1'` | Hostname or IP of the redis database used in 
`taskpersistence` container | `Backend` | Replaces `F7T_PERSISTENCE_IP` | |`F7T_PERSIST_PORT` | NO | `'6379'` | Port number of the redis database used in `taskpersistence` container | `Backend` | +|`F7T_HOME_ENABLED` | NO | `True` | Set to `True` if the `$HOME` directory is mounted on the systems interfacing with FirecREST | `Backend` | |`F7T_SPANK_PLUGIN_ENABLED` | NO | `False` | Set to `True` if the system scheduler uses a [spank](https://slurm.schedmd.com/spank.html) when submitting jobs. If there is more than one system configured, there should be a semicolon separated list in relative order to `F7T_SYSTEMS_PUBLIC_NAME` values | `Backend`| Replaces `F7T_USE_SPANK_PLUGIN` | |`F7T_SPANK_PLUGIN_OPTION` | only if `F7T_SPANK_PLUGIN_ENABLED=True` | `--nohome`| Name of the option to use in the workload manager command. If there is more than one system configured, there should be a semicolon separated list in relative order to `F7T_SYSTEMS_PUBLIC_NAME` values | `Backend`| |`F7T_COMPUTE_SCHEDULER` | NO | `'Slurm'`| Set to the name of the of the Workload Manager scheduler adapter class. 
By default it can be found in `/src/common/schedulers` | `Backend`| diff --git a/src/common/cscs_api_common.py b/src/common/cscs_api_common.py index 9e622370..558c88f1 100644 --- a/src/common/cscs_api_common.py +++ b/src/common/cscs_api_common.py @@ -109,7 +109,8 @@ def get_null_var(var): OPA_URL = os.environ.get("F7T_OPA_URL","http://localhost:8181").strip('\'"') OPA_POLICY_PATH = os.environ.get("F7T_OPA_POLICY_PATH","v1/data/f7t/authz").strip('\'"') - +HOME_ENABLED = get_boolean_var( + os.environ.get("F7T_HOME_ENABLED", True)) ### SSH key paths PUB_USER_KEY_PATH = os.environ.get("F7T_PUB_USER_KEY_PATH", "/user-key.pub") @@ -363,7 +364,7 @@ def create_certificate(headers, cluster_name, cluster_addr, command=None, option # execute remote commands with Paramiko: -def exec_remote_command(headers, system_name, system_addr, action, file_transfer=None, file_content=None, no_home=False): +def exec_remote_command(headers, system_name, system_addr, action, file_transfer=None, file_content=None): import paramiko, socket @@ -530,14 +531,16 @@ def exec_remote_command(headers, system_name, system_addr, action, file_transfer else: result = {"error": 0, "msg": outlines} elif stderr_errno > 0: - # Solving when stderr_errno = 1 and no_home plugin used (F7T_SPANK_PLUGIN_ENABLED) + # Solving when stderr_errno = 1 and $HOME is not mounted: # stderr_errno = 1 # stderr_errda = "Could not chdir to home directory /users/eirinik: No such file or directory # ERROR: you must specify a project account (-A )sbatch: error: cli_filter plugin terminated with error" - if no_home and in_str(stderr_errda,"Could not chdir to home directory"): + if not HOME_ENABLED and in_str(stderr_errda, "Could not chdir to home directory"): # checking for 2nd 'directory' string (first is at index 33) # 2nd comes after username - idx = stderr_errda.index("directory",33) + logging.info(f"$HOME directory is not enabled" + f"(F7T_HOME_ENABLED={HOME_ENABLED})") + idx = stderr_errda.index("directory", 33) # 
len(directory) = 9 result = {"error": stderr_errno, "msg": stderr_errda[idx+9:]} diff --git a/src/compute/compute.py b/src/compute/compute.py index 16fe01f6..84f4dedc 100644 --- a/src/compute/compute.py +++ b/src/compute/compute.py @@ -148,7 +148,7 @@ def submit_job_task(headers, system_name, system_addr, job_file, job_dir, accoun ID = headers.get(TRACER_HEADER, '') # create tmpdir for sbatch file action = f"ID={ID} timeout {UTILITIES_TIMEOUT} mkdir -p -- '{job_dir}'" - retval = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin) + retval = exec_remote_command(headers, system_name, system_addr, action) if retval["error"] != 0: app.logger.error(f"(Error creating directory: {retval['msg']}") @@ -158,7 +158,7 @@ def submit_job_task(headers, system_name, system_addr, job_file, job_dir, accoun # save the sbatch file in the target cluster FS if job_file['content']: action = f"ID={ID} cat > '{job_dir}/{job_file['filename']}'" - retval = exec_remote_command(headers, system_name, system_addr, action, file_transfer="upload", file_content=job_file['content'], no_home=use_plugin) + retval = exec_remote_command(headers, system_name, system_addr, action, file_transfer="upload", file_content=job_file['content']) if retval["error"] != 0: app.logger.error(f"(Error uploading file: {retval['msg']}") update_task(task_id, headers, async_task.ERROR, "Failed to upload file") @@ -171,7 +171,7 @@ def submit_job_task(headers, system_name, system_addr, job_file, job_dir, accoun action = f"ID={ID} {scheduler_command}" app.logger.info(action) - retval = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin) + retval = exec_remote_command(headers, system_name, system_addr, action) if retval["error"] != 0: app.logger.error(f"(Error: {retval['msg']}") @@ -231,7 +231,7 @@ def get_job_files(headers, system_name, system_addr, job_info, output=False, use for n_try in range(n_tries): - resp = exec_remote_command(headers, system_name, 
system_addr, action, no_home=use_plugin) + resp = exec_remote_command(headers, system_name, system_addr, action) # if there was an error, the result will be SUCESS but not available outputs if resp["error"] == 0: @@ -265,12 +265,12 @@ def get_job_files(headers, system_name, system_addr, job_info, output=False, use # tail -c {number_of_bytes} --> 1000B = 1KB action = f"ID={ID} timeout {UTILITIES_TIMEOUT} tail -c {TAIL_BYTES} -- '{control_info['job_file_out']}'" - resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin) + resp = exec_remote_command(headers, system_name, system_addr, action) if resp["error"] == 0: control_info["job_data_out"] = resp["msg"] action = f"ID={ID} timeout {UTILITIES_TIMEOUT} tail -c {TAIL_BYTES} -- '{control_info['job_file_err']}'" - resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin) + resp = exec_remote_command(headers, system_name, system_addr, action) if resp["error"] == 0: control_info["job_data_err"] = resp["msg"] @@ -287,7 +287,7 @@ def submit_job_path_task(headers, system_name, system_addr, fileName, job_dir, a action=f"ID={ID} {scheduler_command}" app.logger.info(action) - resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin) + resp = exec_remote_command(headers, system_name, system_addr, action) # in case of error: if resp["error"] != 0: diff --git a/src/utilities/utilities.py b/src/utilities/utilities.py index c0063db2..3e1df561 100644 --- a/src/utilities/utilities.py +++ b/src/utilities/utilities.py @@ -591,7 +591,7 @@ def common_fs_operation(request, command): [headers, ID] = get_tracing_headers(request) action = f"ID={ID} timeout {UTILITIES_TIMEOUT} {action}" - retval = exec_remote_command(headers, system_name ,system_addr, action, file_transfer, file_content) + retval = exec_remote_command(headers, system_name ,system_addr, action, file_transfer, file_content, ) if retval["error"] != 0: error_str = retval["msg"] From 
2d529f27b28c7cca533c84977de72a347566c90d Mon Sep 17 00:00:00 2001 From: jdorsch Date: Tue, 10 Sep 2024 12:27:47 +0200 Subject: [PATCH 2/2] fixing some typing --- src/common/cscs_api_common.py | 2 +- src/utilities/utilities.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/cscs_api_common.py b/src/common/cscs_api_common.py index 558c88f1..39d42fa7 100644 --- a/src/common/cscs_api_common.py +++ b/src/common/cscs_api_common.py @@ -539,7 +539,7 @@ def exec_remote_command(headers, system_name, system_addr, action, file_transfer # checking for 2nd 'directory' string (first is at index 33) # 2nd comes after username logging.info(f"$HOME directory is not enabled" - f"(F7T_HOME_ENABLED={HOME_ENABLED})") + f" (F7T_HOME_ENABLED={HOME_ENABLED})") idx = stderr_errda.index("directory", 33) # len(directory) = 9 result = {"error": stderr_errno, "msg": stderr_errda[idx+9:]} diff --git a/src/utilities/utilities.py b/src/utilities/utilities.py index 3e1df561..c0063db2 100644 --- a/src/utilities/utilities.py +++ b/src/utilities/utilities.py @@ -591,7 +591,7 @@ def common_fs_operation(request, command): [headers, ID] = get_tracing_headers(request) action = f"ID={ID} timeout {UTILITIES_TIMEOUT} {action}" - retval = exec_remote_command(headers, system_name ,system_addr, action, file_transfer, file_content, ) + retval = exec_remote_command(headers, system_name ,system_addr, action, file_transfer, file_content) if retval["error"] != 0: error_str = retval["msg"]