feat: wait for run in resource spacelift_run #535

Merged · 14 commits · Apr 18, 2024
19 changes: 19 additions & 0 deletions docs/resources/run.md
@@ -40,7 +40,26 @@ resource "spacelift_run" "this" {
- `commit_sha` (String) The commit SHA for which to trigger a run.
- `keepers` (Map of String) Arbitrary map of values that, when changed, will trigger recreation of the resource.
- `proposed` (Boolean) Whether the run is a proposed run. Defaults to `false`.
- `timeouts` (Block, Optional) (see [below for nested schema](#nestedblock--timeouts))
- `wait` (Block List, Max: 1) Wait for the run to finish (see [below for nested schema](#nestedblock--wait))

### Read-Only

- `id` (String) The ID of the triggered run.

<a id="nestedblock--timeouts"></a>
### Nested Schema for `timeouts`

Optional:

- `create` (String)


<a id="nestedblock--wait"></a>
### Nested Schema for `wait`

Optional:

- `continue_on_state` (Set of String) Continue on the specified states of a finished run. If not specified, the default is `[ 'finished' ]`. You can use the following states: `applying`, `canceled`, `confirmed`, `destroying`, `discarded`, `failed`, `finished`, `initializing`, `pending_review`, `performing`, `planning`, `preparing_apply`, `preparing_replan`, `preparing`, `queued`, `ready`, `replan_requested`, `skipped`, `stopped`, `unconfirmed`.
- `continue_on_timeout` (Boolean) Continue if the run timed out, i.e. it did not reach any defined end state in time. Default: `false`
- `disabled` (Boolean) Whether waiting for a job is disabled or not. Default: `false`
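
For illustration, a minimal configuration exercising the `wait` and `timeouts` blocks might look like the sketch below; the stack ID and keeper value are placeholders, not values taken from this PR.

```terraform
resource "spacelift_run" "this" {
  # Hypothetical stack ID, used only for illustration.
  stack_id = "my-stack-id"

  # Changing any keeper value re-triggers the run.
  keepers = {
    release = "v1.2.3"
  }

  # Overall budget for the create operation (defaults to 30 minutes).
  timeouts {
    create = "45m"
  }

  wait {
    # Also accept a run that stops at manual confirmation.
    continue_on_state   = ["finished", "unconfirmed"]
    continue_on_timeout = false
  }
}
```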
2 changes: 1 addition & 1 deletion go.mod
@@ -6,6 +6,7 @@ require (
github.com/dgrijalva/jwt-go/v4 v4.0.0-preview1
github.com/hashicorp/go-cty v1.4.1-0.20200414143053-d3edf31b6320
github.com/hashicorp/go-retryablehttp v0.7.4
github.com/hashicorp/terraform-plugin-log v0.9.0
github.com/hashicorp/terraform-plugin-sdk/v2 v2.29.0
github.com/kelseyhightower/envconfig v1.4.0
github.com/pkg/errors v0.9.1
@@ -36,7 +37,6 @@ require (
github.com/hashicorp/terraform-exec v0.19.0 // indirect
github.com/hashicorp/terraform-json v0.17.1 // indirect
github.com/hashicorp/terraform-plugin-go v0.19.0 // indirect
github.com/hashicorp/terraform-plugin-log v0.9.0 // indirect
github.com/hashicorp/terraform-registry-address v0.2.2 // indirect
github.com/hashicorp/terraform-svchost v0.1.1 // indirect
github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d // indirect
14 changes: 14 additions & 0 deletions spacelift/internal/error.go
@@ -46,3 +46,17 @@ func parseExtensions(ext map[string]interface{}) string {

return strings.Join(errorParts, ", ")
}

// AsError is an inline form of errors.As.
func AsError[TError error](err error) (TError, bool) {
var as TError
ok := errors.As(err, &as)
return as, ok
}

// IsErrorType reports whether the type of any error in err's chain matches
// the TError type.
func IsErrorType[TError error](err error) bool {
_, ok := AsError[TError](err)
return ok
}
207 changes: 204 additions & 3 deletions spacelift/resource_run.go
@@ -2,9 +2,16 @@ package spacelift

import (
"context"
"fmt"
"slices"
"strings"
"time"

"github.com/hashicorp/terraform-plugin-log/tflog"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/pkg/errors"
"github.com/shurcooL/graphql"

"github.com/spacelift-io/terraform-provider-spacelift/spacelift/internal"
@@ -20,6 +27,11 @@ func resourceRun() *schema.Resource {
CreateContext: resourceRunCreate,
ReadContext: schema.NoopContext,
Delete: schema.RemoveFromState,
UpdateContext: schema.NoopContext,

Timeouts: &schema.ResourceTimeout{
Create: schema.DefaultTimeout(30 * time.Minute),
},

Schema: map[string]*schema.Schema{
"stack_id": {
@@ -55,16 +67,144 @@ func resourceRun() *schema.Resource {
Type: schema.TypeString,
Computed: true,
},
"wait": {
Type: schema.TypeList,
Optional: true,
Description: "Wait for the run to finish",
MaxItems: 1,
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"disabled": {
Type: schema.TypeBool,
Description: "Whether waiting for a job is disabled or not. Default: `false`",
Optional: true,
Default: false,
},
"continue_on_state": {
Type: schema.TypeSet,
Elem: &schema.Schema{
Type: schema.TypeString,
},
Description: "Continue on the specified states of a finished run. If not specified, the default is `[ 'finished' ]`. You can use following states: `applying`, `canceled`, `confirmed`, `destroying`, `discarded`, `failed`, `finished`, `initializing`, `pending_review`, `performing`, `planning`, `preparing_apply`, `preparing_replan`, `preparing`, `queued`, `ready`, `replan_requested`, `skipped`, `stopped`, `unconfirmed`.",
Optional: true,
},
"continue_on_timeout": {
Type: schema.TypeBool,
Description: "Continue if run timed out, i.e. did not reach any defined end state in time. Default: `false`",
Optional: true,
Default: false,
},
},
},
},
},
}
}

type waitConfiguration struct {
disabled bool
continueOnState []string
continueOnTimeout bool
}

func expandWaitConfiguration(input []interface{}) *waitConfiguration {
if len(input) == 0 {
return nil
}
v := input[0].(map[string]interface{})
cfg := &waitConfiguration{
disabled: v["disabled"].(bool),
continueOnState: []string{},
continueOnTimeout: v["continue_on_timeout"].(bool),
}

if v, ok := v["continue_on_state"]; ok {
for _, item := range v.(*schema.Set).List() {
str, ok := item.(string)
if !ok {
panic(fmt.Sprintf("continue_on_state contains a non-string element %+v", str))
}
cfg.continueOnState = append(cfg.continueOnState, str)
}
}
if len(cfg.continueOnState) == 0 {
cfg.continueOnState = append(cfg.continueOnState, "finished")
}
return cfg
}

func (wait *waitConfiguration) Wait(ctx context.Context, d *schema.ResourceData, client *internal.Client, stackID, mutationID string) diag.Diagnostics {
if wait.disabled {
return nil
}

stateConf := &retry.StateChangeConf{
ContinuousTargetOccurence: 1,
Delay: 10 * time.Second,
MinTimeout: 10 * time.Second,
Pending: []string{
"running",
},
Target: []string{
"finished",
"unconfirmed", // Let's treat unconfirmed as the target state.
// It's not finished, but we don't want to wait for it because it requires confirmation from someone.
},
Refresh: checkStackStatusFunc(ctx, client, stackID, mutationID),
Timeout: d.Timeout(schema.TimeoutCreate),
}

finalState, err := stateConf.WaitForStateContext(ctx)
if err != nil {
if timeoutErr, ok := internal.AsError[*retry.TimeoutError](err); ok {
tflog.Debug(ctx, "received retry.TimeoutError from WaitForStateContext", map[string]any{
"stackID": stackID,
"runID": mutationID,
"lastState": timeoutErr.LastState,
"expectedState": timeoutErr.ExpectedState,
})
finalState = "__timeout__"
} else if err == context.DeadlineExceeded {
tflog.Debug(ctx, "received context.DeadlineExceeded from WaitForStateContext", map[string]any{
"stackID": stackID,
"runID": mutationID,
})
finalState = "__timeout__"
} else {
return diag.Errorf("failed waiting for run %s on stack %s to finish. error(%T): %+v ", mutationID, stackID, err, err)
}
}

switch finalState.(string) {
case "__timeout__":
if !wait.continueOnTimeout {
return diag.Errorf("run %s on stack %s has timed out", mutationID, stackID)
}
tflog.Info(ctx, "run timed out but continue_on_timeout=true",
map[string]any{
"stackID": stackID,
"runID": mutationID,
})
default:
if !slices.Contains[[]string](wait.continueOnState, finalState.(string)) {
return diag.Errorf("run %s on stack %s has ended with status %s. expected %v", mutationID, stackID, finalState, wait.continueOnState)
}
tflog.Debug(ctx, "run finished", map[string]any{
"stackID": stackID,
"runID": mutationID,
"finalState": finalState,
})
}

return nil
}

func resourceRunCreate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
var mutation struct {
ID string `graphql:"runResourceCreate(stack: $stack, commitSha: $sha, proposed: $proposed)"`
}

stackID := d.Get("stack_id")
stackID := d.Get("stack_id").(string)

variables := map[string]interface{}{
"stack": toID(stackID),
@@ -80,11 +220,72 @@ func resourceRunCreate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
variables["proposed"] = graphql.NewBoolean(graphql.Boolean(proposed.(bool)))
}

if err := meta.(*internal.Client).Mutate(ctx, "ResourceRunCreate", &mutation, variables); err != nil {
client := meta.(*internal.Client)
if err := client.Mutate(ctx, "ResourceRunCreate", &mutation, variables); err != nil {
return diag.Errorf("could not trigger run for stack %s: %v", stackID, internal.FromSpaceliftError(err))
}

d.SetId(mutation.ID)
if waitRaw, ok := d.GetOk("wait"); ok {
wait := expandWaitConfiguration(waitRaw.([]interface{}))
if diag := wait.Wait(ctx, d, client, stackID, mutation.ID); len(diag) > 0 {
return diag
}
}

d.SetId(mutation.ID)
return nil
}

func checkStackStatusFunc(ctx context.Context, client *internal.Client, stackID string, runID string) retry.StateRefreshFunc {
return func() (result any, state string, err error) {
// Instead of a resource handle, we return the current state as the result.
// This makes it easier to detect which end state has been reached;
// otherwise we would need another GraphQL query.
result, finished, err := getStackRunStateByID(ctx, client, stackID, runID)
if err != nil {
return
}
state = "running"
if finished {
state = "finished"
}
// Let's treat unconfirmed as the target state.
// It's not finished, but we don't want to wait for it because it requires confirmation from someone.
if result == "unconfirmed" {
state = "unconfirmed"
}
return
}
}

func getStackRunStateByID(ctx context.Context, client *internal.Client, stackID string, runID string) (string, bool, error) {
var query struct {
Stack struct {
RunResourceState struct {
ID graphql.String
State graphql.String
Finished graphql.Boolean
} `graphql:"runResourceState(id: $runId)"`
} `graphql:"stack(id: $stackId)"`
}

variables := map[string]interface{}{
"stackId": graphql.ID(stackID),
"runId": graphql.ID(runID),
}

if err := client.Query(ctx, "StackRunRead", &query, variables); err != nil {
return "", false, errors.Wrap(err, fmt.Sprintf("could not query for run %s of stack %s", runID, stackID))
}

rrs := query.Stack.RunResourceState

currentState := strings.ToLower(string(rrs.State))
tflog.Debug(ctx, "current state of run", map[string]interface{}{
"stackID": stackID,
"runID": runID,
"currentState": currentState,
"finished": rrs.Finished,
})
return currentState, bool(rrs.Finished), nil
}