Skip to content

Commit

Permalink
Updating control nodes one by one (#864)
Browse files Browse the repository at this point in the history
* Updating control nodes one by one.

Signed-off-by: Alexey Makhov <[email protected]>

* Updating control nodes one by one. Test and fix

Signed-off-by: Alexey Makhov <[email protected]>

---------

Signed-off-by: Alexey Makhov <[email protected]>
  • Loading branch information
makhov authored Jan 8, 2025
1 parent 90c13e3 commit c7d9069
Show file tree
Hide file tree
Showing 5 changed files with 232 additions and 138 deletions.
145 changes: 77 additions & 68 deletions internal/controller/controlplane/k0s_controlplane_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,15 +352,18 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv
desiredMachineNames := make(map[string]bool)

var clusterIsUpdating bool
var clusterIsMutating bool
for _, m := range machines.SortedByCreationTimestamp() {
if m.Spec.Version == nil || (!versionMatches(m, kcp.Spec.Version)) {
clusterIsUpdating = true
clusterIsMutating = true
if kcp.Spec.UpdateStrategy == cpv1beta1.UpdateInPlace {
desiredMachineNames[m.Name] = true
} else {
machineNamesToDelete[m.Name] = true
}
} else if !matchesTemplateClonedFrom(infraMachines, kcp, m) {
clusterIsMutating = true
machineNamesToDelete[m.Name] = true
} else if machines.Len() > int(kcp.Spec.Replicas)+len(machineNamesToDelete) {
machineNamesToDelete[m.Name] = true
Expand Down Expand Up @@ -394,75 +397,15 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv
}
}

i := 0
for len(desiredMachineNames) < int(kcp.Spec.Replicas) {
name := machineName(kcp.Name, i)
log.Log.Info("desire machine", "name", len(desiredMachineNames))
_, ok := machineNamesToDelete[name]
if !ok {
_, exists := machines[name]
desiredMachineNames[name] = exists
}
i++
}
log.Log.Info("Desired machines", "count", len(desiredMachineNames))

for name, exists := range desiredMachineNames {
if !exists || kcp.Spec.UpdateStrategy == cpv1beta1.UpdateInPlace {

// Wait for the previous machine to be created to avoid etcd issues if cluster if updating
// OR
// Wait for the first controller to start before creating the next one
// Some providers don't publish failure domains immediately, so wait for the first machine to be ready
// It's not slowing down the process overall, as we wait to the first machine anyway to create join tokens
if clusterIsUpdating || (machines.Len() == 1 && kcp.Spec.Replicas > 1) {
err := c.checkMachineIsReady(ctx, machines.Newest().Name, cluster)
if err != nil {
return err
}
}

machineFromTemplate, err := c.createMachineFromTemplate(ctx, name, cluster, kcp)
if err != nil {
return fmt.Errorf("error creating machine from template: %w", err)
}

infraRef := corev1.ObjectReference{
APIVersion: machineFromTemplate.GetAPIVersion(),
Kind: machineFromTemplate.GetKind(),
Name: machineFromTemplate.GetName(),
Namespace: kcp.Namespace,
}
if len(machineNamesToDelete)+len(desiredMachineNames) > int(kcp.Spec.Replicas) {

selectedFailureDomain := failuredomains.PickFewest(ctx, cluster.Status.FailureDomains.FilterControlPlane(), machines)
machine, err := c.createMachine(ctx, name, cluster, kcp, infraRef, selectedFailureDomain)
if err != nil {
return fmt.Errorf("error creating machine: %w", err)
}
machines[machine.Name] = machine
}

err = c.createBootstrapConfig(ctx, name, cluster, kcp, machines[name])
m := machines.Newest().Name
err := c.checkMachineIsReady(ctx, m, cluster)
if err != nil {
return fmt.Errorf("error creating bootstrap config: %w", err)
}
}

if len(machineNamesToDelete) > 0 {
for m := range machines {
if machineNamesToDelete[m] {
continue
}

err := c.checkMachineIsReady(ctx, m, cluster)
if err != nil {
logger.Error(err, "Error checking machine left", "machine", m)
return err
}
logger.Error(err, "Error checking machine left", "machine", m)
return err
}
}

if len(machineNamesToDelete) > 0 {
logger.Info("Found machines to delete", "count", len(machineNamesToDelete))

// Remove the oldest machine and wait for the machine to be deleted to avoid etcd issues
Expand All @@ -475,13 +418,61 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv
return fmt.Errorf("waiting for previous machine to be deleted")
}

err := c.runMachineDeletionSequence(ctx, logger, cluster, kcp, machineToDelete)
err = c.runMachineDeletionSequence(ctx, logger, cluster, kcp, machineToDelete)
if err != nil {
return err
}

logger.Info("Deleted machine", "machine", machineToDelete.Name)
}

if len(desiredMachineNames) < int(kcp.Spec.Replicas) {

name := machineName(kcp, machineNamesToDelete, desiredMachineNames)
log.Log.Info("desire machine", "name", len(desiredMachineNames))

// Wait for the previous machine to be created to avoid etcd issues if the cluster is updating
// OR
// Wait for the first controller to start before creating the next one
// Some providers don't publish failure domains immediately, so wait for the first machine to be ready
// It's not slowing down the process overall, as we wait for the first machine anyway to create join tokens
if clusterIsMutating || (machines.Len() == 1 && kcp.Spec.Replicas > 1) {
err := c.checkMachineIsReady(ctx, machines.Newest().Name, cluster)
if err != nil {
return err
}
}

machineFromTemplate, err := c.createMachineFromTemplate(ctx, name, cluster, kcp)
if err != nil {
return fmt.Errorf("error creating machine from template: %w", err)
}

infraRef := corev1.ObjectReference{
APIVersion: machineFromTemplate.GetAPIVersion(),
Kind: machineFromTemplate.GetKind(),
Name: machineFromTemplate.GetName(),
Namespace: kcp.Namespace,
}

selectedFailureDomain := failuredomains.PickFewest(ctx, cluster.Status.FailureDomains.FilterControlPlane(), machines)
machine, err := c.createMachine(ctx, name, cluster, kcp, infraRef, selectedFailureDomain)
if err != nil {
return fmt.Errorf("error creating machine: %w", err)
}
machines[machine.Name] = machine
desiredMachineNames[machine.Name] = true

err = c.createBootstrapConfig(ctx, name, cluster, kcp, machines[name])
if err != nil {
return fmt.Errorf("error creating bootstrap config: %w", err)
}
}

if len(desiredMachineNames) < int(kcp.Spec.Replicas) {
return ErrNewMachinesNotReady
}

return nil
}

Expand Down Expand Up @@ -871,8 +862,26 @@ func (c *K0sController) createFRPToken(ctx context.Context, cluster *clusterv1.C
})
}

func machineName(base string, i int) string {
return fmt.Sprintf("%s-%d", base, i)
// machineName returns the name to use for the next control plane machine.
// Names have the form "<kcp.Name>-<index>".
//
// Membership in machineToDelete and desiredMachines is decided by key
// presence only; the boolean values stored in the maps are ignored.
//
// Index selection:
//   - When machineToDelete is empty, indices are tried downwards starting at
//     len(desiredMachines), for at most kcp.Spec.Replicas candidates, and the
//     first name that is not a key of desiredMachines is returned.
//   - Otherwise (or if no free candidate was found), if any name with an index
//     in [0, kcp.Spec.Replicas) is a key of machineToDelete, the index is
//     len(desiredMachines) + kcp.Spec.Replicas.
//   - In all remaining cases the index is len(desiredMachines).
func machineName(kcp *cpv1beta1.K0sControlPlane, machineToDelete, desiredMachines map[string]bool) string {
	replicas := int(kcp.Spec.Replicas)
	desiredCount := len(desiredMachines)

	if len(machineToDelete) == 0 {
		// Walk from desiredCount down through `replicas` candidate indices.
		for idx := desiredCount; idx > desiredCount-replicas; idx-- {
			candidate := fmt.Sprintf("%s-%d", kcp.Name, idx)
			if _, taken := desiredMachines[candidate]; !taken {
				return candidate
			}
		}
	}

	for idx := 0; idx < replicas; idx++ {
		if _, deleting := machineToDelete[fmt.Sprintf("%s-%d", kcp.Name, idx)]; deleting {
			// A low-index machine is going away: jump past the replica range.
			return fmt.Sprintf("%s-%d", kcp.Name, desiredCount+replicas)
		}
	}

	return fmt.Sprintf("%s-%d", kcp.Name, desiredCount)
}

// SetupWithManager sets up the controller with the Manager.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package controlplane

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"testing"

"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -119,3 +120,95 @@ func TestK0sConfigEnrichment(t *testing.T) {
})
}
}

// Test_machineName checks the name machineName picks for the next control
// plane machine across combinations of machines scheduled for deletion and
// machines already desired. Every case uses a control plane named "test".
func Test_machineName(t *testing.T) {
	var testCases = []struct {
		// name identifies the subtest in `go test` output.
		name            string
		replicas        int32
		machineToDelete map[string]bool
		desiredMachines map[string]bool
		want            string
	}{
		{
			name:            "no deletions, no desired machines",
			replicas:        3,
			machineToDelete: nil,
			desiredMachines: map[string]bool{},
			want:            "test-0",
		},
		{
			name:            "no deletions, only test-1 desired",
			replicas:        3,
			machineToDelete: nil,
			desiredMachines: map[string]bool{
				"test-1": true,
			},
			want: "test-0",
		},
		{
			name:     "deleting test-0..2, test-3 desired",
			replicas: 3,
			machineToDelete: map[string]bool{
				"test-0": true,
				"test-1": true,
				"test-2": true,
			},
			desiredMachines: map[string]bool{
				"test-3": true,
			},
			want: "test-4",
		},
		{
			name:     "deleting test-3..5, no desired machines",
			replicas: 3,
			machineToDelete: map[string]bool{
				"test-3": true,
				"test-4": true,
				"test-5": true,
			},
			desiredMachines: map[string]bool{},
			want:            "test-0",
		},
		{
			name:     "deleting test-4..5, test-0 desired",
			replicas: 3,
			machineToDelete: map[string]bool{
				"test-4": true,
				"test-5": true,
			},
			desiredMachines: map[string]bool{
				"test-0": true,
			},
			want: "test-1",
		},
		{
			name:     "deleting test-5, test-0..1 desired",
			replicas: 3,
			machineToDelete: map[string]bool{
				"test-5": true,
			},
			desiredMachines: map[string]bool{
				"test-0": true,
				"test-1": true,
			},
			want: "test-2",
		},
		{
			name:            "no deletions, test-1..2 desired",
			replicas:        3,
			machineToDelete: nil,
			desiredMachines: map[string]bool{
				"test-1": true,
				"test-2": true,
			},
			want: "test-0",
		},
	}

	for _, tc := range testCases {
		kcp := &v1beta1.K0sControlPlane{
			ObjectMeta: metav1.ObjectMeta{
				Name: "test",
			},
			Spec: v1beta1.K0sControlPlaneSpec{
				Replicas: tc.replicas,
			},
		}
		// Named subtests make a failing case identifiable and selectable
		// with `go test -run`.
		t.Run(tc.name, func(t *testing.T) {
			actual := machineName(kcp, tc.machineToDelete, tc.desiredMachines)
			require.Equal(t, tc.want, actual)
		})
	}
}
4 changes: 2 additions & 2 deletions inttest/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ check-capi-remote-machine: TIMEOUT=12m
check-capi-remote-machine-template: TIMEOUT=12m
check-capi-remote-machine-template-update: TIMEOUT=10m
check-capi-docker-machine-template-update: TIMEOUT=15m
check-capi-docker-machine-template-update-recreate: TIMEOUT=15m
check-capi-docker-machine-change-template: TIMEOUT=15m
check-capi-docker-machine-template-update-recreate: TIMEOUT=25m
check-capi-docker-machine-change-template: TIMEOUT=25m
check-capi-remote-machine-job-provision: TIMEOUT=10m
check-upgrade: TIMEOUT=20m
Loading

0 comments on commit c7d9069

Please sign in to comment.