Allow running a local kubernetes cluster using kind #4088

Merged 13 commits on May 14, 2022
6 changes: 6 additions & 0 deletions codalab/worker_manager/kubernetes_worker_manager.py
@@ -74,6 +74,12 @@ def __init__(self, args):
         configuration.api_key['authorization'] = args.auth_token
         configuration.host = args.cluster_host
         configuration.ssl_ca_cert = args.cert_path
+        if configuration.host == "https://codalab-control-plane:6443":
+            configuration.verify_ssl = False
+            configuration.ssl_ca_cert = None
+            del configuration.api_key_prefix['authorization']
+            del configuration.api_key['authorization']
+            configuration.debug = False
 
         self.k8_client: client.ApiClient = client.ApiClient(configuration)
         self.k8_api: client.CoreV1Api = client.CoreV1Api(self.k8_client)
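The branch above turns off TLS verification and drops the bearer token whenever the worker manager points at the local kind control plane. A minimal way to confirm the control plane accepts such anonymous, unverified requests is to curl it from a container on the same Docker network. This is a sketch only, assuming `scripts/local-k8s/setup.sh` has already connected the node to the `rest-server` network and applied `anonymous-users.yaml`; the `curlimages/curl` image is an arbitrary choice:

```bash
# /version is readable even without the anonymous RBAC grant.
docker run --rm --network rest-server curlimages/curl \
  -sk https://codalab-control-plane:6443/version

# Resource endpoints only work once anonymous-users.yaml has been applied.
docker run --rm --network rest-server curlimages/curl \
  -sk https://codalab-control-plane:6443/api/v1/nodes
```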
7 changes: 6 additions & 1 deletion codalab/worker_manager/slurm_batch_worker_manager.py
@@ -32,6 +32,11 @@ class SlurmBatchWorkerManager(WorkerManager):
 
     @staticmethod
     def add_arguments_to_subparser(subparser):
+        try:
+            user_id = getpass.getuser()
+        except Exception:
+            # Sometimes getpass.getuser() doesn't work.
+            user_id = ""
         subparser.add_argument(
             '--job-name',
             type=str,

Collaborator: Does it not work on the cluster? Should we error out instead?

@epicfaace (Member, Author) on May 12, 2022: This error always comes up when I try to run a worker manager locally. I had to make the same change to get cs324 worker managers to work (#3949) so I thought I might as well add it in this PR.

@@ -74,7 +79,7 @@ def add_arguments_to_subparser(subparser):
             help='Print out Slurm batch job definition without submitting to Slurm',
         )
         subparser.add_argument(
-            '--user', type=str, default=getpass.getuser(), help='User to run the Batch jobs as'
+            '--user', type=str, default=user_id, help='User to run the Batch jobs as'
         )
         subparser.add_argument(
             '--password-file',
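For context on the `getpass.getuser()` guard: the call falls back to a passwd lookup when none of the usual `LOGNAME`/`USER` environment variables are set, and that lookup raises for UIDs without a passwd entry, which is common inside containers. One way to reproduce the failure, purely as an illustration (the image and UID are arbitrary):

```bash
# No USER/LOGNAME in the environment and UID 12345 has no passwd entry,
# so getpass.getuser() raises instead of returning a name.
docker run --rm --user 12345 python:3.10-slim \
  python3 -c "import getpass; print(getpass.getuser())"
# -> KeyError: 'getpwuid(): uid not found: 12345' (or similar)
```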
6 changes: 3 additions & 3 deletions codalab_service.py
@@ -481,17 +481,17 @@ def has_callable_default(self):
         CodalabArg(
             name=f'worker_manager_{worker_manager_type}_kubernetes_cluster_host',
             type=str,
-            help=f'Host address of the Kubernetes cluster for the {worker_manager_type} Kubernetes worker manager',
+            help='Host address of the Kubernetes cluster for the Kubernetes worker manager',
         ),
         CodalabArg(
             name=f'worker_manager_{worker_manager_type}_kubernetes_auth_token',
             type=str,
-            help=f'Kubernetes cluster authorization token for the {worker_manager_type} Kubernetes worker manager',
+            help='Kubernetes cluster authorization token for the Kubernetes worker manager',
         ),
         CodalabArg(
             name=f'worker_manager_{worker_manager_type}_kubernetes_cert_path',
             type=str,
-            help=f'Path to the generated SSL cert for the {worker_manager_type} Kubernetes worker manager',
+            help='Path to the generated SSL cert for the Kubernetes worker manager',
         ),
     ]
 
2 changes: 2 additions & 0 deletions docker_config/compose_files/docker-compose.yml
@@ -288,6 +288,7 @@ services:
     <<: *codalab-base
     <<: *codalab-server
     volumes:
+      - "${CODALAB_HOME}:${CODALAB_HOME}"
       - ${CODALAB_WORKER_MANAGER_CPU_KUBERNETES_CERT_PATH}:${CODALAB_WORKER_MANAGER_CPU_KUBERNETES_CERT_PATH}:ro
     networks:
       - rest-server

@@ -320,6 +321,7 @@ services:
     <<: *codalab-base
     <<: *codalab-server
     volumes:
+      - "${CODALAB_HOME}:${CODALAB_HOME}"
       - ${CODALAB_WORKER_MANAGER_GPU_KUBERNETES_CERT_PATH}:${CODALAB_WORKER_MANAGER_GPU_KUBERNETES_CERT_PATH}:ro
     networks:
       - rest-server
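The added bind mount simply exposes the server's `CODALAB_HOME` directory inside the worker-manager containers at the same path. If you want to confirm the mount after bringing the stack up, something like the following works (container name taken from the worker-manager example later in these docs):

```bash
# List the bind mounts of the CPU Kubernetes worker-manager container.
docker inspect codalab_kubernetes-worker-manager-cpu_1 \
  --format '{{range .Mounts}}{{.Source}} -> {{.Destination}}{{println}}{{end}}'
```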
65 changes: 65 additions & 0 deletions docs/Server-Setup.md
@@ -327,3 +327,68 @@
If you need to send Slack notifications from the monitor.py service, you can configure a
Slack email address which will show up in a designated Slack channel.
* Note that this integration only works with workspaces on *the Slack Standard Plan and above*.


## Start a local Kubernetes Batch Worker Manager (with kind, for testing / development only)

If you want to test or develop with kubernetes locally, follow these steps to do so:
Collaborator: nit: capitalize Kubernetes


### Initial (one-time) setup

```bash
# First, start codalab without a worker:
codalab-service start -bds default no-worker

# Install initial dependencies
wget https://go.dev/dl/go1.18.1.linux-amd64.tar.gz && sudo rm -rf /usr/local/go && sudo tar -C /usr/local -xzf go1.18.1.linux-amd64.tar.gz && rm go1.18.1.linux-amd64.tar.gz # Install go: instructions from https://go.dev/doc/install
export PATH=$PATH:/usr/local/go/bin:~/go/bin # add to your bash profile
go version # go should be installed
go install sigs.k8s.io/kind@v0.12.0
go install github.com/cloudflare/cfssl/cmd/...@latest
kind version # kind should be installed
cfssl version # cfssl should be installed

# Set up the local kind cluster. Follow the instructions that are displayed to view the web dashboard.
./scripts/local-k8s/setup.sh
```
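Before moving on, it can be worth sanity-checking that the cluster actually came up. A few read-only commands that should succeed at this point (nothing here is CodaLab-specific):

```bash
kind get clusters                                # should list: codalab
kubectl cluster-info --context kind-codalab
kubectl get nodes                                # codalab-control-plane should be Ready
docker ps --filter name=codalab-control-plane    # the kind node runs as a Docker container
```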

If everything succeeds, you should be able to log in to your dashboard, with one node running (codalab-control-plane). After you follow the steps below, you should also be able to view each pod (each pod corresponds to a worker) and check its logs by clicking on the icon in the top right.

![Local Kubernetes Dashboard](./images/local-k8s-dashboard.png)
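To reach the dashboard, the proxy step that `setup.sh` prints at the end applies: run `kubectl proxy` and open the dashboard URL, then paste in the admin-user token from the setup output.

```bash
kubectl proxy   # leave this running in a separate terminal
# then open:
# http://localhost:8001/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:/proxy/#/workloads?namespace=default
```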

### Build the worker Docker image

Repeat this step each time you change the worker Docker image and want the local kind cluster to pick it up:

```bash
codalab-service build -s worker
```
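Building only places the image in the local Docker daemon; if the kind cluster does not pick it up on its own, one way to push it into the cluster by hand is kind's image-loading command (the tag below is an assumption; use whatever tag `codalab-service build` produces):

```bash
# Load a locally built worker image into the kind cluster (image tag assumed).
kind load docker-image codalab/worker:latest --name codalab
```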

### Run codalab and worker managers

Run:

```bash
export CODALAB_SERVER=http://nginx
export CODALAB_WORKER_MANAGER_CPU_KUBERNETES_CLUSTER_HOST=https://codalab-control-plane:6443
export CODALAB_WORKER_MANAGER_TYPE=kubernetes
export CODALAB_WORKER_MANAGER_CPU_KUBERNETES_CERT_PATH=/dev/null
export CODALAB_WORKER_MANAGER_CPU_KUBERNETES_AUTH_TOKEN=/dev/null
export CODALAB_WORKER_MANAGER_CPU_DEFAULT_CPUS=1
export CODALAB_WORKER_MANAGER_CPU_DEFAULT_MEMORY_MB=100
export CODALAB_WORKER_MANAGER_MIN_CPU_WORKERS=0
codalab-service start -bds default no-worker worker-manager-cpu
```

Or if you just want to run the worker manager and check its logs, run:
```bash
codalab-service start -bds worker-manager-cpu && docker logs codalab_kubernetes-worker-manager-cpu_1 --follow
```
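Once the worker manager is up and a run is staged, worker pods should start appearing in the kind cluster. A few commands for poking at them (pod names vary from run to run):

```bash
kubectl get pods --context kind-codalab                   # worker pods show up here
kubectl describe pod <pod-name> --context kind-codalab
kubectl logs <pod-name> --context kind-codalab --follow
```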

### Teardown

You can remove the kind cluster by running:

```bash
kind delete cluster --name codalab
```

Binary file added docs/images/local-k8s-dashboard.png
21 changes: 21 additions & 0 deletions scripts/local-k8s/anonymous-users.yaml
@@ -0,0 +1,21 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: anonymous-role
rules:
- apiGroups: [""]
  resources: ["*"]
  verbs: ["*"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: anonymous-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: anonymous-role
subjects:
- apiGroup: rbac.authorization.k8s.io
  kind: User
  name: system:anonymous
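This grants the unauthenticated `system:anonymous` user full access to every resource, which is only acceptable on a throwaway local cluster. A quick way to verify the binding took effect, as a sanity check:

```bash
# Should print "yes" once anonymous-users.yaml has been applied.
kubectl auth can-i create pods --as=system:anonymous --context kind-codalab
```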
18 changes: 18 additions & 0 deletions scripts/local-k8s/dashboard-user.yaml
@@ -0,0 +1,18 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: admin-user
  namespace: kubernetes-dashboard
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: admin-user
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: admin-user
  namespace: kubernetes-dashboard
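The `admin-user` service account is what the setup script mints the dashboard login token for. On the pinned 1.21 cluster that token lives in an auto-created secret (which is the lookup `setup.sh` performs); on Kubernetes 1.24+ those secrets are no longer auto-created, so you would request a token explicitly instead:

```bash
# Kubernetes <= 1.23 (what setup.sh does): read the auto-created secret.
kubectl -n kubernetes-dashboard get secret \
  "$(kubectl -n kubernetes-dashboard get sa/admin-user -o jsonpath='{.secrets[0].name}')" \
  -o go-template='{{.data.token | base64decode}}'

# Kubernetes >= 1.24: service-account token secrets are not auto-created, so mint one.
kubectl -n kubernetes-dashboard create token admin-user
```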
10 changes: 10 additions & 0 deletions scripts/local-k8s/kind-config.yaml
@@ -0,0 +1,10 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
name: codalab
networking:
  apiServerAddress: "127.0.0.1"
  apiServerPort: 6443
nodes:
- role: control-plane
  # TODO: upgrade this version to 1.22 / 1.23 / 1.24 once kind versions are released. We pinned it to 1.21 to avoid: https://github.com/kubernetes-sigs/kind/issues/2723.
  image: kindest/node:v1.21.10@sha256:84709f09756ba4f863769bdcabe5edafc2ada72d3c8c44d6515fc581b66b029c
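Since the config pins the API server to 127.0.0.1:6443 on the host, a quick reachability check from the host is possible; `/version` should be readable without credentials on a default kind cluster:

```bash
# -k skips verification of kind's self-signed certificate.
curl -k https://127.0.0.1:6443/version
```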
18 changes: 18 additions & 0 deletions scripts/local-k8s/setup.sh
@@ -0,0 +1,18 @@
# Sets up a local kubernetes cluster using kind,
# along with a web dashboard.

set -e

docker container prune -f # remove all stopped containers
kind create cluster --wait 30s --config scripts/local-k8s/kind-config.yaml # create cluster
kubectl config use-context kind-codalab # makes sure kubectl is connected to local cluster
kubectl get nodes -o=name | sed "s/^node\///" | xargs -L1 docker network connect rest-server # connects all kind nodes (which are Docker containers) to codalab docker network, so they can communicate.
kubectl apply -f scripts/local-k8s/anonymous-users.yaml # gives anonymous users access to the local k8s cluster. Worker managers currently use anonymous authentication to access local k8s clusters.
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.5.0/aio/deploy/recommended.yaml # create web ui dashboard. full instructions from tutorial here: https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/
kubectl apply -f scripts/local-k8s/dashboard-user.yaml # create dashboard user
kubectl -n kubernetes-dashboard get secret $(kubectl -n kubernetes-dashboard get sa/admin-user -o jsonpath="{.secrets[0].name}") -o go-template="{{.data.token | base64decode}}" # copy this token and use it for web ui auth in the next step

echo ""
echo ""
echo "^^Copy this token and use it for web ui auth in the next step."
echo "# to view the dashboard, run \"kubectl proxy\" in a terminal and open up: http://localhost:8001/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:/proxy/#/workloads?namespace=default"