Skip to content

Commit

Permalink
Do not create the launcher job if the job starts suspended
Browse files Browse the repository at this point in the history
  • Loading branch information
GonzaloSaez committed Jan 10, 2025
1 parent c50eb45 commit c1ea13d
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
4 changes: 3 additions & 1 deletion pkg/controller/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,9 @@ func (c *MPIJobController) syncHandler(key string) error {
return err
}
}
if launcher == nil {
// If the job is suspended, the list of worker pods will be incorrect. We also do
// not want to start the launcher job if the MPIJob starts suspended.
if launcher == nil && !isMPIJobSuspended(mpiJob) {
if mpiJob.Spec.LauncherCreationPolicy == kubeflow.LauncherCreationPolicyAtStartup || c.countReadyWorkerPods(worker) == len(worker) {
launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
if err != nil {
Expand Down
54 changes: 52 additions & 2 deletions test/integration/mpi_job_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,27 @@ func TestMPIJobSuccess(t *testing.T) {
}

// TestMPIJobWaitWorkers exercises the wait-for-workers launcher creation
// policy in two scenarios: the MPIJob starting normally, and the MPIJob
// starting in a suspended state.
func TestMPIJobWaitWorkers(t *testing.T) {
	t.Run("don't start suspended", func(t *testing.T) {
		testMpiJobWaitWorkers(t, false)
	})
	t.Run("start suspended", func(t *testing.T) {
		testMpiJobWaitWorkers(t, true)
	})
}

func testMpiJobWaitWorkers(t *testing.T, startSuspended bool) {
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
s := newTestSetup(ctx, t)
Expand All @@ -187,6 +208,7 @@ func TestMPIJobWaitWorkers(t *testing.T) {
SlotsPerWorker: ptr.To[int32](1),
LauncherCreationPolicy: "WaitForWorkersReady",
RunPolicy: kubeflow.RunPolicy{
Suspend: ptr.To(startSuspended),
CleanPodPolicy: ptr.To(kubeflow.CleanPodPolicyRunning),
},
MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{
Expand Down Expand Up @@ -237,9 +259,37 @@ func TestMPIJobWaitWorkers(t *testing.T) {
}
s.events.verify(t)

workerPods, err := getPodsForJob(ctx, s.kClient, mpiJob)
// The launcher job should not be created until all workers are ready even when we start in suspended mode.
job, err := getLauncherJobForMPIJob(ctx, s.kClient, mpiJob)
if err != nil {
t.Fatalf("Cannot get worker pods from job: %v", err)
t.Fatalf("Cannot get launcher job from job: %v", err)
}
if job != nil {
t.Fatalf("Launcher is created before workers")
}

if startSuspended {
// Resume the MPIJob so that the test can follow the normal path.
mpiJob.Spec.RunPolicy.Suspend = ptr.To(false)
mpiJob, err = s.mpiClient.KubeflowV2beta1().MPIJobs(mpiJob.Namespace).Update(ctx, mpiJob, metav1.UpdateOptions{})
if err != nil {
t.Fatalf("Error Updating MPIJob: %v", err)
}
}

var workerPods []corev1.Pod
if err = wait.PollUntilContextTimeout(ctx, util.WaitInterval, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
var err error
workerPods, err = getPodsForJob(ctx, s.kClient, mpiJob)
if err != nil {
return false, err
}
if len(workerPods) != 2 {
return false, nil
}
return true, nil
}); err != nil {
t.Errorf("Failed updating scheduler-plugins PodGroup: %v", err)
}

err = updatePodsToPhase(ctx, s.kClient, workerPods, corev1.PodRunning)
Expand Down

0 comments on commit c1ea13d

Please sign in to comment.