From 1957e5d2707f5779b469e3ab4e161934611a82c0 Mon Sep 17 00:00:00 2001 From: Sascha Schwarze Date: Tue, 30 Apr 2024 16:03:08 +0200 Subject: [PATCH] Add StepOutOfMemory as reason for BuildRun failure --- docs/buildrun.md | 1 + pkg/apis/build/v1beta1/buildrun_types.go | 3 + .../buildrun/resources/conditions.go | 28 ++++++--- .../buildrun/resources/conditions_test.go | 63 +++++++++++++++++++ .../buildrun/resources/failure_details.go | 19 +++--- .../integration/buildruns_to_taskruns_test.go | 2 +- 6 files changed, 97 insertions(+), 19 deletions(-) diff --git a/docs/buildrun.md b/docs/buildrun.md index 96f4ece9a6..9b420133ad 100644 --- a/docs/buildrun.md +++ b/docs/buildrun.md @@ -391,6 +391,7 @@ The following table illustrates the different states a BuildRun can have under i | False | BuildRunAmbiguousBuild | Yes | The defined `BuildRun` uses both `spec.build.name` and `spec.build.spec`. Only one of them is allowed at the same time. | | False | BuildRunBuildFieldOverrideForbidden | Yes | The defined `BuildRun` uses an override (e.g. `timeout`, `paramValues`, `output`, or `env`) in combination with `spec.build.spec`, which is not allowed. Use the `spec.build.spec` to directly specify the respective value. | | False | PodEvicted | Yes | The BuildRun Pod was evicted from the node it was running on. See [API-initiated Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/) and [Node-pressure Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/) for more information. | +| False | StepOutOfMemory | Yes | The BuildRun Pod failed because a step went out of memory. | **Note**: We heavily rely on the Tekton TaskRun [Conditions](https://github.com/tektoncd/pipeline/blob/main/docs/taskruns.md#monitoring-execution-status) for populating the BuildRun ones, with some exceptions. diff --git a/pkg/apis/build/v1beta1/buildrun_types.go b/pkg/apis/build/v1beta1/buildrun_types.go index 428da6c392..d3a2dc361f 100644 --- a/pkg/apis/build/v1beta1/buildrun_types.go +++ b/pkg/apis/build/v1beta1/buildrun_types.go @@ -106,6 +106,9 @@ const ( // BuildRunStatePodEvicted indicates that if the pods got evicted // due to some reason. (Probably ran out of ephemeral storage) BuildRunStatePodEvicted = "PodEvicted" + + // BuildRunStateStepOutOfMemory indicates that a step failed because it went out of memory. + BuildRunStateStepOutOfMemory = "StepOutOfMemory" ) // SourceResult holds the results emitted from the different sources diff --git a/pkg/reconciler/buildrun/resources/conditions.go b/pkg/reconciler/buildrun/resources/conditions.go index 0d3ef8380a..e48b30a431 100644 --- a/pkg/reconciler/buildrun/resources/conditions.go +++ b/pkg/reconciler/buildrun/resources/conditions.go @@ -81,7 +81,7 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie case pipelineapi.TaskRunReasonFailed: if taskRun.Status.CompletionTime != nil { - pod, failedContainer, err := extractFailedPodAndContainer(ctx, client, taskRun) + pod, failedContainer, failedContainerStatus, err := extractFailedPodAndContainer(ctx, client, taskRun) if err != nil { // when trying to customize the Condition Message field, ensure the Message cover the case // when a Pod is deleted. @@ -105,19 +105,27 @@ func UpdateBuildRunUsingTaskRunCondition(ctx context.Context, client client.Clie message = pod.Status.Message reason = buildv1beta1.BuildRunStatePodEvicted if failedContainer != nil { - //nolint:staticcheck // SA1019 we want to give users some time to adopt to failureDetails buildRun.Status.FailureDetails.Location.Container = failedContainer.Name } } else if failedContainer != nil { - //nolint:staticcheck // SA1019 we want to give users some time to adopt to failureDetails buildRun.Status.FailureDetails.Location.Container = failedContainer.Name - message = fmt.Sprintf("buildrun step %s failed in pod %s, for detailed information: kubectl --namespace %s logs %s --container=%s", - failedContainer.Name, - pod.Name, - pod.Namespace, - pod.Name, - failedContainer.Name, - ) + + if failedContainerStatus != nil && failedContainerStatus.State.Terminated != nil && failedContainerStatus.State.Terminated.Reason == "OOMKilled" { + reason = buildv1beta1.BuildRunStateStepOutOfMemory + message = fmt.Sprintf("buildrun step %s failed due to out-of-memory, for detailed information: kubectl --namespace %s logs %s --container=%s", + failedContainer.Name, + pod.Namespace, + pod.Name, + failedContainer.Name, + ) + } else { + message = fmt.Sprintf("buildrun step %s failed, for detailed information: kubectl --namespace %s logs %s --container=%s", + failedContainer.Name, + pod.Namespace, + pod.Name, + failedContainer.Name, + ) + } } else { message = fmt.Sprintf("buildrun failed due to an unexpected error in pod %s: for detailed information: kubectl --namespace %s logs %s --all-containers", pod.Name, diff --git a/pkg/reconciler/buildrun/resources/conditions_test.go b/pkg/reconciler/buildrun/resources/conditions_test.go index 06dbe2fbcc..5662eb6aba 100644 --- a/pkg/reconciler/buildrun/resources/conditions_test.go +++ b/pkg/reconciler/buildrun/resources/conditions_test.go @@ -242,6 +242,7 @@ var _ = Describe("Conditions", func() { }, }, Status: corev1.PodStatus{ + Phase: corev1.PodFailed, Reason: "Evicted", }, } @@ -281,6 +282,68 @@ var _ = Describe("Conditions", func() { ).To(Equal(build.BuildRunStatePodEvicted)) }) + It("updates BuildRun condition when TaskRun fails and container is out of memory", func() { + // Generate a pod with the status to out of memory + failedTaskRunOOMContainer := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "evilpod", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "evilpod-container", + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "evilpod-container", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: "OOMKilled", + }, + }, + }}, + }, + } + + // stub a GET API call with to pass the created pod + getClientStub := func(_ context.Context, nn types.NamespacedName, object crc.Object, _ ...crc.GetOption) error { + switch object := object.(type) { + case *corev1.Pod: + failedTaskRunOOMContainer.DeepCopyInto(object) + return nil + } + return k8serrors.NewNotFound(schema.GroupResource{}, nn.Name) + } + + // fake the calls with the above stub + client.GetCalls(getClientStub) + + // Now we need to create a fake failed taskrun so that it hits the code + fakeTRCondition := &apis.Condition{ + Type: apis.ConditionSucceeded, + Reason: "Failed", + Message: "not relevant", + } + + // We call the function with all the info + Expect(resources.UpdateBuildRunUsingTaskRunCondition( + context.TODO(), + client, + br, + tr, + fakeTRCondition, + )).To(BeNil()) + + // Finally, check the output of the buildRun + Expect(br.Status.GetCondition( + build.Succeeded).Reason, + ).To(Equal(build.BuildRunStateStepOutOfMemory)) + }) + It("updates a BuildRun condition when the related TaskRun fails and pod containers are not available", func() { taskRunGeneratedPod := corev1.Pod{ diff --git a/pkg/reconciler/buildrun/resources/failure_details.go b/pkg/reconciler/buildrun/resources/failure_details.go index 3be2dc272e..df9a3aca9e 100644 --- a/pkg/reconciler/buildrun/resources/failure_details.go +++ b/pkg/reconciler/buildrun/resources/failure_details.go @@ -64,31 +64,34 @@ func extractFailureReasonAndMessage(taskRun *pipelineapi.TaskRun) (errorReason s return errorReason, errorMessage } -func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, error) { +func extractFailedPodAndContainer(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (*corev1.Pod, *corev1.Container, *corev1.ContainerStatus, error) { var pod corev1.Pod if err := client.Get(ctx, types.NamespacedName{Namespace: taskRun.Namespace, Name: taskRun.Status.PodName}, &pod); err != nil { ctxlog.Error(ctx, err, "failed to get pod for failure extraction", namespace, taskRun.Namespace, name, taskRun.Status.PodName) - return nil, nil, err + return nil, nil, nil, err } - failures := make(map[string]struct{}) + failures := make(map[string]*corev1.ContainerStatus) // Find the names of all containers with failure status - for _, containerStatus := range pod.Status.ContainerStatuses { + for i := range pod.Status.ContainerStatuses { + containerStatus := pod.Status.ContainerStatuses[i] if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 { - failures[containerStatus.Name] = struct{}{} + failures[containerStatus.Name] = &containerStatus } } // Find the first container that has a failure status var failedContainer *corev1.Container + var failedContainerStatus *corev1.ContainerStatus for i, container := range pod.Spec.Containers { - if _, has := failures[container.Name]; has { + if containerStatus, has := failures[container.Name]; has { failedContainer = &pod.Spec.Containers[i] + failedContainerStatus = containerStatus break } } - return &pod, failedContainer, nil + return &pod, failedContainer, failedContainerStatus, nil } func extractFailureDetails(ctx context.Context, client client.Client, taskRun *pipelineapi.TaskRun) (failure *buildv1beta1.FailureDetails) { @@ -97,7 +100,7 @@ func extractFailureDetails(ctx context.Context, client client.Client, taskRun *p failure.Reason, failure.Message = extractFailureReasonAndMessage(taskRun) failure.Location = &buildv1beta1.Location{Pod: taskRun.Status.PodName} - pod, container, _ := extractFailedPodAndContainer(ctx, client, taskRun) + pod, container, _, _ := extractFailedPodAndContainer(ctx, client, taskRun) if pod != nil && container != nil { failure.Location.Pod = pod.Name diff --git a/test/integration/buildruns_to_taskruns_test.go b/test/integration/buildruns_to_taskruns_test.go index 2284e9ccfa..ecae76947b 100644 --- a/test/integration/buildruns_to_taskruns_test.go +++ b/test/integration/buildruns_to_taskruns_test.go @@ -187,7 +187,7 @@ var _ = Describe("Integration tests BuildRuns and TaskRuns", func() { condition := buildRun.Status.GetCondition(v1beta1.Succeeded) Expect(condition.Status).To(Equal(corev1.ConditionFalse)) Expect(condition.Reason).To(Equal("Failed")) - Expect(condition.Message).To(ContainSubstring("buildrun step %s failed in pod %s", "step-step-build-and-push", taskRun.Status.PodName)) + Expect(condition.Message).To(ContainSubstring("buildrun step %s failed", "step-step-build-and-push")) }) }) })