From a601e834f1e34c3b34e0e9a987974ca7a46ccff6 Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Mon, 25 Nov 2024 11:56:33 +0100 Subject: [PATCH 1/9] Add rollback detection --- ecs_deploy/cli.py | 76 +++++++++++++++++++++++++++++++++-------------- ecs_deploy/ecs.py | 55 ++++++++++++++++++++++++++++++---- 2 files changed, 102 insertions(+), 29 deletions(-) diff --git a/ecs_deploy/cli.py b/ecs_deploy/cli.py index 679fe93..dd0b7da 100644 --- a/ecs_deploy/cli.py +++ b/ecs_deploy/cli.py @@ -155,13 +155,15 @@ def deploy(cluster, service, tag, image, command, health_check, cpu, memory, mem ignore_warnings=ignore_warnings, sleep_time=sleep_time ) - + if deployment.rollback: + slack.notify_failure(cluster, "Rollback", service=service) + exit(100) except TaskPlacementError as e: slack.notify_failure(cluster, str(e), service=service) if rollback: click.secho('%s\n' % str(e), fg='red', err=True) rollback_task_definition(deployment, td, new_td, sleep_time=sleep_time) - exit(1) + exit(100) else: raise @@ -532,7 +534,6 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, waiting = True while waiting and datetime.now() < waiting_timeout: - click.secho('.', nl=False) service = action.get_service() inspected_until = inspect_errors( service=service, @@ -541,7 +542,10 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, since=inspected_until, timeout=False ) + waiting = not action.is_deployed(service) + if action.primary_deployment_updated(service): + click.secho(f"{action.primary_deployment.status_message}", nl=True) if waiting: sleep(sleep_time) @@ -556,7 +560,8 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, click.secho('\n%s' % success_message, fg='green') click.secho('Duration: %s sec\n' % (datetime.now() - start_timestamp).seconds) - + if action.rollback: + click.secho('Rollback complete', fg='green') def deploy_task_definition(deployment, task_definition, title, success_message, failure_message, timeout, deregister, @@ -669,28 +674,53 @@ def print_diff(task_definition, title='Updating task definition'): click.secho('') +# def inspect_events(service, since): +# last_event_timestamp = since +# events = service.get_events(since) +# for timestamp in events: +# message = events[timestamp] +# last_event_timestamp = timestamp +# click.secho('%s\INFO: %s' % (timestamp, message)) + +# return last_event_timestamp + + def inspect_errors(service, failure_message, ignore_warnings, since, timeout): error = False - last_error_timestamp = since - warnings = service.get_warnings(since) - for timestamp in warnings: - message = warnings[timestamp] + last_event_timestamp = since + events = service.get_events(since) + for timestamp in events: + message = events[timestamp] + last_event_timestamp = timestamp click.secho('') - if ignore_warnings: - last_error_timestamp = timestamp - click.secho( - '%s\nWARNING: %s' % (timestamp, message), - fg='yellow', - err=False - ) - click.secho('Continuing.', nl=False) + if 'unable' in message.lower(): + if ignore_warnings: + click.secho( + '%s\nWARNING: %s' % (timestamp, message), + fg='yellow', + err=False + ) + click.secho('Continuing.', nl=False) + else: + click.secho( + '%s\nERROR: %s\n' % (timestamp, message), + fg='red', + err=True + ) + error = True else: - click.secho( - '%s\nERROR: %s\n' % (timestamp, message), - fg='red', - err=True - ) - error = True + if 'rolling back' in message.lower(): + click.secho( + '%s\WARNING: %s' % (timestamp, message), + fg='yellow', + err=False + ) + else: + click.secho( + '%s\INFO: %s' % (timestamp, message), + fg='green', + err=False + ) if service.older_errors: click.secho('') @@ -711,7 +741,7 @@ def inspect_errors(service, failure_message, ignore_warnings, since, timeout): if error: raise TaskPlacementError(failure_message) - return last_error_timestamp + return last_event_timestamp ecs.add_command(deploy) diff --git a/ecs_deploy/ecs.py b/ecs_deploy/ecs.py index 2fb4824..87f5723 100644 --- a/ecs_deploy/ecs.py +++ b/ecs_deploy/ecs.py @@ -205,6 +205,21 @@ class EcsDeployment(dict): ROLLOUT_STATE_FAILED = u'FAILED' ROLLOUT_STATE_COMPLETED = u'COMPLETED' + def __init__(self, deployment_definition): + super(EcsDeployment, self).__init__(deployment_definition) + + def __eq__(self, other): + return self.get(u'status') == other.get(u'status') and \ + other.get('runningCount') == self.get('runningCount') and \ + other.get('pendingCount') == self.get('pendingCount') and \ + other.get('failedTasks') == self.get('failedTasks') and \ + other.get('rolloutState') == self.get('rolloutState') and \ + other.get('rolloutStateReason') == self.get('rolloutStateReason') + + @property + def status_message(self): + return f"{self.get('rolloutState')} : {self.rollout_state_reason} Running: {self.get('runningCount')} / Pending: {self.get('pendingCount')} / Failed: {self.get('failedTasks')}" + @property def is_primary(self): return self.get(u'status') == self.STATUS_PRIMARY @@ -217,6 +232,10 @@ def is_active(self): def has_failed(self): return self.get(u'rolloutState') == self.ROLLOUT_STATE_FAILED + @property + def is_rollback(self): + return "circuit breaker: rolling back" in self.get(u'rolloutStateReason') + @property def has_completed(self): return self.get(u'rolloutState') == self.ROLLOUT_STATE_COMPLETED @@ -298,15 +317,21 @@ def older_errors(self): ) def get_warnings(self, since=None, until=None): + warning = {} + events = self.get_events(since, until) + for k in events: + if u'unable' in events[k]: + warning[k] = events[k] + return warning + + def get_events(self, since=None, until=None): since = since or self.deployment_created_at until = until or datetime.now(tz=tzlocal()) - errors = {} + events = {} for event in self.get(u'events'): - if u'unable' not in event[u'message']: - continue if since < event[u'createdAt'] < until: - errors[event[u'createdAt']] = event[u'message'] - return errors + events[event[u'createdAt']] = event[u'message'] + return events class EcsTaskDefinition(object): @@ -1308,6 +1333,9 @@ def __init__(self, client, cluster_name, service_name): try: if service_name: self._service = self.get_service() + self.primary_deployment = self._service.primary_deployment + self.active_deployment = self._service.active_deployment + self.rollback = False except IndexError: raise EcsConnectionError( u'An error occurred when calling the DescribeServices ' @@ -1372,8 +1400,23 @@ def update_service(self, service, desired_count=None): task_definition=service.task_definition ) return EcsService(self._cluster_name, response[u'service']) + + def primary_deployment_updated(self, service): + if service.primary_deployment != self.primary_deployment: + self.primary_deployment = service.primary_deployment + return True + + def active_deployment_updated(self, service): + if service.active_deployment != self.active_deployment: + self.active_deployment = service.active_deployment + return True + def deployment_status_updated(self, service): + return self.primary_deployment_updated(service) or self.active_deployment_updated(service) + def is_deployed(self, service): + if service.primary_deployment.is_rollback: + self.rollback = True if service.primary_deployment.has_failed: raise EcsDeploymentError(u'Deployment Failed! ' + service.primary_deployment.rollout_state_reason) if service.primary_deployment.failed_tasks > 0 and service.primary_deployment.failed_tasks != self.FAILED_TASKS: @@ -1392,7 +1435,7 @@ def is_deployed(self, service): service=service, task_arns=running_tasks[u'taskArns'] ) - return service.desired_count == running_count + return service.desired_count == running_count and service.primary_deployment.has_completed def get_running_tasks_count(self, service, task_arns): running_count = 0 From da4acb67b881fa09f7adbb14753c0abbce038d10 Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Mon, 25 Nov 2024 15:35:02 +0100 Subject: [PATCH 2/9] add rollback --- Makefile | 11 +++++++ ecs_deploy/cli.py | 80 ++++++++++++++++++----------------------------- ecs_deploy/ecs.py | 9 +----- 3 files changed, 43 insertions(+), 57 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f6b6618 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +IMAGE_NAME ?= sre/ecs-deploy +BUILD_TAG ?= build-local +REGISTRY ?= 404977151305.dkr.ecr.eu-central-1.amazonaws.com +IMAGE_URI ?= ${REGISTRY}/${IMAGE_NAME}:${BUILD_TAG} + +TEST_FILE ?= ... + +.PHONY: build + +build: + docker build --pull -t ${IMAGE_URI} . diff --git a/ecs_deploy/cli.py b/ecs_deploy/cli.py index dd0b7da..4d7d232 100644 --- a/ecs_deploy/cli.py +++ b/ecs_deploy/cli.py @@ -527,6 +527,7 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, waiting_timeout = datetime.now() + timedelta(seconds=timeout) service = action.get_service() inspected_until = None + # inspected_events_until = None if timeout == -1: waiting = False @@ -535,7 +536,7 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, while waiting and datetime.now() < waiting_timeout: service = action.get_service() - inspected_until = inspect_errors( + inspected_until = inspect_events( service=service, failure_message=failure_message, ignore_warnings=ignore_warnings, @@ -550,7 +551,7 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, if waiting: sleep(sleep_time) - inspect_errors( + inspect_events( service=service, failure_message=failure_message, ignore_warnings=ignore_warnings, @@ -674,63 +675,28 @@ def print_diff(task_definition, title='Updating task definition'): click.secho('') -# def inspect_events(service, since): -# last_event_timestamp = since -# events = service.get_events(since) -# for timestamp in events: -# message = events[timestamp] -# last_event_timestamp = timestamp -# click.secho('%s\INFO: %s' % (timestamp, message)) - -# return last_event_timestamp - - -def inspect_errors(service, failure_message, ignore_warnings, since, timeout): +def inspect_events(service, failure_message, ignore_warnings, since, timeout): error = False last_event_timestamp = since events = service.get_events(since) for timestamp in events: message = events[timestamp] last_event_timestamp = timestamp - click.secho('') - if 'unable' in message.lower(): - if ignore_warnings: - click.secho( - '%s\nWARNING: %s' % (timestamp, message), - fg='yellow', - err=False - ) - click.secho('Continuing.', nl=False) - else: - click.secho( - '%s\nERROR: %s\n' % (timestamp, message), - fg='red', - err=True - ) - error = True + message_lower = message.lower() + + if 'unable' in message_lower: + error = False if ignore_warnings else True + level = 'ERROR' if error else 'WARNING' + event_log(timestamp, message, level) + elif 'rolling back' in message_lower: + event_log(timestamp, message, 'WARNING') else: - if 'rolling back' in message.lower(): - click.secho( - '%s\WARNING: %s' % (timestamp, message), - fg='yellow', - err=False - ) - else: - click.secho( - '%s\INFO: %s' % (timestamp, message), - fg='green', - err=False - ) + event_log(timestamp, message, 'INFO') if service.older_errors: - click.secho('') - click.secho('Older errors', fg='yellow', err=True) + event_log(timestamp, 'Older errors', 'WARNING') for timestamp in service.older_errors: - click.secho( - text='%s\n%s\n' % (timestamp, service.older_errors[timestamp]), - fg='yellow', - err=True - ) + event_log(timestamp, service.older_errors[timestamp], 'WARNING') if timeout: error = True @@ -743,6 +709,22 @@ def inspect_errors(service, failure_message, ignore_warnings, since, timeout): return last_event_timestamp +def event_log(timestamp, message, level): + """ + Helper function to display a message with consistent formatting and color coding. + """ + color_map = { + 'INFO': 'green', + 'WARNING': 'yellow', + 'ERROR': 'red' + } + color = color_map.get(level, 'white') # Default to white if the level is unknown + + click.secho( + f'{timestamp}\n{level}: {message}', + fg=color, + err=(level == 'ERROR') + ) ecs.add_command(deploy) ecs.add_command(scale) diff --git a/ecs_deploy/ecs.py b/ecs_deploy/ecs.py index 87f5723..026b5cc 100644 --- a/ecs_deploy/ecs.py +++ b/ecs_deploy/ecs.py @@ -205,9 +205,6 @@ class EcsDeployment(dict): ROLLOUT_STATE_FAILED = u'FAILED' ROLLOUT_STATE_COMPLETED = u'COMPLETED' - def __init__(self, deployment_definition): - super(EcsDeployment, self).__init__(deployment_definition) - def __eq__(self, other): return self.get(u'status') == other.get(u'status') and \ other.get('runningCount') == self.get('runningCount') and \ @@ -317,12 +314,8 @@ def older_errors(self): ) def get_warnings(self, since=None, until=None): - warning = {} events = self.get_events(since, until) - for k in events: - if u'unable' in events[k]: - warning[k] = events[k] - return warning + return {k: v for k, v in events.items() if 'unable' in v} def get_events(self, since=None, until=None): since = since or self.deployment_created_at From 22f7ee9f6f5206aaaeb1f96cb968d2840354e4a8 Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Mon, 25 Nov 2024 15:59:29 +0100 Subject: [PATCH 3/9] change test --- Dockerfile | 3 +++ Makefile | 6 ++++++ ecs_deploy/cli.py | 1 + tests/test_cli.py | 3 +-- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index fa3d669..5787ff5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,8 @@ FROM python:3.10-alpine +COPY requirements-test.txt . +RUN pip install -r requirements-test.txt + ADD . /usr/src/app WORKDIR /usr/src/app diff --git a/Makefile b/Makefile index f6b6618..ed3d1c0 100644 --- a/Makefile +++ b/Makefile @@ -9,3 +9,9 @@ TEST_FILE ?= ... build: docker build --pull -t ${IMAGE_URI} . + +test: build + docker run -t --rm $(IMAGE_URI) pytest -p no:cacheprovider -x -vv /usr/src/app + +dev: build + docker run -it --rm -v $(PWD):/usr/src/app $(IMAGE_URI) bash \ No newline at end of file diff --git a/ecs_deploy/cli.py b/ecs_deploy/cli.py index 4d7d232..8bc8b4e 100644 --- a/ecs_deploy/cli.py +++ b/ecs_deploy/cli.py @@ -687,6 +687,7 @@ def inspect_events(service, failure_message, ignore_warnings, since, timeout): if 'unable' in message_lower: error = False if ignore_warnings else True level = 'ERROR' if error else 'WARNING' + click.secho('Continuing.', nl=False) event_log(timestamp, message, level) elif 'rolling back' in message_lower: event_log(timestamp, message, 'WARNING') diff --git a/tests/test_cli.py b/tests/test_cli.py index 96bff7b..3d04021 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -81,7 +81,7 @@ def test_deploy_with_rollback(get_client, runner): get_client.return_value = EcsTestClient('acces_key', 'secret_key', wait=2) result = runner.invoke(cli.deploy, (CLUSTER_NAME, SERVICE_NAME, '--timeout=1', '--rollback')) - assert result.exit_code == 1 + assert result.exit_code == 100 assert result.exception assert u"Deploying based on task definition: test-task:1" in result.output @@ -781,7 +781,6 @@ def test_deploy_with_wait_within_timeout(get_client, runner): result = runner.invoke(cli.deploy, (CLUSTER_NAME, SERVICE_NAME, '--timeout', '10')) assert result.exit_code == 0 assert u'Deploying new task definition' in result.output - assert u'...' in result.output @patch('ecs_deploy.cli.get_client') From 990289018f8ae3ef5056441e028b2deace6d2311 Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Mon, 25 Nov 2024 16:19:55 +0100 Subject: [PATCH 4/9] change --- Makefile | 2 -- ecs_deploy/cli.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Makefile b/Makefile index ed3d1c0..ad6caaa 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,6 @@ BUILD_TAG ?= build-local REGISTRY ?= 404977151305.dkr.ecr.eu-central-1.amazonaws.com IMAGE_URI ?= ${REGISTRY}/${IMAGE_NAME}:${BUILD_TAG} -TEST_FILE ?= ... - .PHONY: build build: diff --git a/ecs_deploy/cli.py b/ecs_deploy/cli.py index 8bc8b4e..64dcef7 100644 --- a/ecs_deploy/cli.py +++ b/ecs_deploy/cli.py @@ -722,7 +722,7 @@ def event_log(timestamp, message, level): color = color_map.get(level, 'white') # Default to white if the level is unknown click.secho( - f'{timestamp}\n{level}: {message}', + f'{timestamp} {level}: {message}', fg=color, err=(level == 'ERROR') ) From 657a56d2d25cb287c51316b17aabb5f9dbc1d0ac Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Tue, 26 Nov 2024 10:14:39 +0100 Subject: [PATCH 5/9] remove files --- Dockerfile | 11 ----------- Makefile | 15 --------------- 2 files changed, 26 deletions(-) delete mode 100644 Dockerfile delete mode 100644 Makefile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 5787ff5..0000000 --- a/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM python:3.10-alpine - -COPY requirements-test.txt . -RUN pip install -r requirements-test.txt - -ADD . /usr/src/app -WORKDIR /usr/src/app - -RUN ["python", "setup.py", "install"] - -CMD ["ecs"] diff --git a/Makefile b/Makefile deleted file mode 100644 index ad6caaa..0000000 --- a/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -IMAGE_NAME ?= sre/ecs-deploy -BUILD_TAG ?= build-local -REGISTRY ?= 404977151305.dkr.ecr.eu-central-1.amazonaws.com -IMAGE_URI ?= ${REGISTRY}/${IMAGE_NAME}:${BUILD_TAG} - -.PHONY: build - -build: - docker build --pull -t ${IMAGE_URI} . - -test: build - docker run -t --rm $(IMAGE_URI) pytest -p no:cacheprovider -x -vv /usr/src/app - -dev: build - docker run -it --rm -v $(PWD):/usr/src/app $(IMAGE_URI) bash \ No newline at end of file From b4772a542b7c4f624824ec23ed0dafa732f37119 Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Tue, 26 Nov 2024 10:16:51 +0100 Subject: [PATCH 6/9] bring deleted dockerfile --- Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fa3d669 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10-alpine + +ADD . /usr/src/app +WORKDIR /usr/src/app + +RUN ["python", "setup.py", "install"] + +CMD ["ecs"] From c41cae6a9cb2582c3d75c75b6b4c2378b7d74a8d Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Tue, 26 Nov 2024 12:58:11 +0100 Subject: [PATCH 7/9] remove unused line --- ecs_deploy/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ecs_deploy/cli.py b/ecs_deploy/cli.py index 64dcef7..d1f8653 100644 --- a/ecs_deploy/cli.py +++ b/ecs_deploy/cli.py @@ -527,7 +527,6 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, waiting_timeout = datetime.now() + timedelta(seconds=timeout) service = action.get_service() inspected_until = None - # inspected_events_until = None if timeout == -1: waiting = False From 9948eeddd586c0091fba963fc20c8f1cdfdc3653 Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Thu, 28 Nov 2024 12:17:33 +0100 Subject: [PATCH 8/9] update rollback --- ecs_deploy/ecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecs_deploy/ecs.py b/ecs_deploy/ecs.py index 026b5cc..e36b2f6 100644 --- a/ecs_deploy/ecs.py +++ b/ecs_deploy/ecs.py @@ -231,7 +231,7 @@ def has_failed(self): @property def is_rollback(self): - return "circuit breaker: rolling back" in self.get(u'rolloutStateReason') + return "rolling back" in self.get(u'rolloutStateReason') @property def has_completed(self): From 5fe60ebd51e8c5e9e727b864f0493ec8aeac9c7d Mon Sep 17 00:00:00 2001 From: Dipesh Gautam Date: Fri, 29 Nov 2024 11:54:07 +0100 Subject: [PATCH 9/9] better error output --- ecs_deploy/cli.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ecs_deploy/cli.py b/ecs_deploy/cli.py index d1f8653..94cae82 100644 --- a/ecs_deploy/cli.py +++ b/ecs_deploy/cli.py @@ -545,7 +545,9 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, waiting = not action.is_deployed(service) if action.primary_deployment_updated(service): - click.secho(f"{action.primary_deployment.status_message}", nl=True) + event_log(datetime.now().isoformat(), + action.primary_deployment.status_message, + 'WARNING' if 'rolling back' in action.primary_deployment.status_message else 'INFO') if waiting: sleep(sleep_time) @@ -558,10 +560,12 @@ def wait_for_finish(action, timeout, title, success_message, failure_message, timeout=waiting ) - click.secho('\n%s' % success_message, fg='green') click.secho('Duration: %s sec\n' % (datetime.now() - start_timestamp).seconds) + if action.rollback: - click.secho('Rollback complete', fg='green') + click.secho('Deployment failed, but service has been rolled back to previous task definition', fg='red') + else: + click.secho('\n%s' % success_message, fg='green') def deploy_task_definition(deployment, task_definition, title, success_message, failure_message, timeout, deregister,