diff --git a/README.md b/README.md
index 41613859e7..2c2283f711 100644
--- a/README.md
+++ b/README.md
@@ -165,7 +165,7 @@ Talk to the forestkeepers in the `runners-channel` on Slack.
| [instance\_max\_spot\_price](#input\_instance\_max\_spot\_price) | Max price for spot instances per hour. This variable will be passed to the create fleet as max spot price for the fleet. | `string` | `null` | no |
| [instance\_profile\_path](#input\_instance\_profile\_path) | The path that will be added to the instance\_profile, if not set the environment name will be used. | `string` | `null` | no |
| [instance\_target\_capacity\_type](#input\_instance\_target\_capacity\_type) | Default lifecycle used for runner instances, can be either `spot` or `on-demand`. | `string` | `"spot"` | no |
-| [instance\_termination\_watcher](#input\_instance\_termination\_watcher) | Configuration for the instance termination watcher. This feature is Beta, changes will not trigger a major release as long as it is in beta.
`enable`: Enable or disable the spot termination watcher.
`memory_size`: Memory size limit in MB of the lambda.
`s3_key`: S3 key for syncer lambda function. Required if using S3 bucket to specify lambdas.
`s3_object_version`: S3 object version for syncer lambda function. Useful if S3 versioning is enabled on source bucket.
`timeout`: Time out of the lambda in seconds.
`zip`: File location of the lambda zip file. |
object({| `{}` | no | +| [instance\_termination\_watcher](#input\_instance\_termination\_watcher) | Configuration for the instance termination watcher. This feature is Beta, changes will not trigger a major release as long as it is in beta.
enable = optional(bool, false)
enable_metric = optional(string, null) # deprecated
memory_size = optional(number, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
timeout = optional(number, null)
zip = optional(string, null)
})
object({| `{}` | no | | [instance\_types](#input\_instance\_types) | List of instance types for the action runner. Defaults are based on runner\_os (al2023 for linux and Windows Server Core for win). | `list(string)` |
enable = optional(bool, false)
enable_metric = optional(string, null) # deprecated
features = optional(object({
enable_spot_termination_handler = optional(bool, true)
enable_spot_termination_notification_watcher = optional(bool, true)
}), {})
memory_size = optional(number, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
timeout = optional(number, null)
zip = optional(string, null)
})
[| no | | [job\_queue\_retention\_in\_seconds](#input\_job\_queue\_retention\_in\_seconds) | The number of seconds the job is held in the queue before it is purged. | `number` | `86400` | no | | [job\_retry](#input\_job\_retry) | Experimental! Can be removed / changed without triggering a major release. Configure job retries. The configuration enables job retries (for ephemeral runners). After creating the instances a message will be published to a job retry queue. The job retry check lambda is checking after a delay if the job is queued. If not the message will be published again on the scale-up (build queue). Using this feature can impact the rate limit of the GitHub app.
"m5.large",
"c5.large"
]
object({| `{}` | no | @@ -257,6 +257,7 @@ Talk to the forestkeepers in the `runners-channel` on Slack. | Name | Description | |------|-------------| | [binaries\_syncer](#output\_binaries\_syncer) | n/a | +| [instance\_termination\_handler](#output\_instance\_termination\_handler) | n/a | | [instance\_termination\_watcher](#output\_instance\_termination\_watcher) | n/a | | [queues](#output\_queues) | SQS queues. | | [runners](#output\_runners) | n/a | diff --git a/docs/configuration.md b/docs/configuration.md index 0eae2195ec..4b5f33507e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -215,21 +215,35 @@ In case the setup does not work as intended, trace the events through this seque ### Termination watcher -This feature is in early stage and therefore disabled by default. +This feature is in early stage and therefore disabled by default. To enable the watcher, set `instance_termination_watcher.enable = true`. -The termination watcher is currently watching for spot termination notifications. The module is only taken events into account for instances tagged with `ghr:environment` by default when deployment the module as part of one of the main modules (root or multi-runner). The module can also be deployed stand-alone, in that case the tag filter needs to be tunned. +The termination watcher is currently watching for spot terminations. The module only takes events into account for instances tagged with `ghr:environment` by default when deploying the module as part of one of the main modules (root or multi-runner). The module can also be deployed stand-alone; in this case, the tag filter needs to be tuned. + +### Termination notification + +The watcher is listening for spot termination warnings and creates a log message and optionally a metric. The watcher is disabled by default. 
The feature is enabled once the watcher is enabled, the feature can be disabled explicitly by setting `instance_termination_watcher.features.enable_spot_termination_notification_watcher = false`. - Logs: The module will log all termination notifications. For each warning it will look up instance details and log the environment, instance type and time the instance is running. As well some other details. - Metrics: Metrics are disabled by default, this to avoid costs. Once enabled a metric will be created for each warning with at least dimensions for the environment and instance type. The metric name space can be configured via the variables. The metric name used is `SpotInterruptionWarning`. -#### Log example +### Termination handler + +!!! warning + This feature will only work once the CloudTrail is enabled. + +The termination handler is listening for spot terminations by capturing the `BidEvictedEvent` via CloudTrail. The handler will log and optionally create a metric for each termination. The intent is to enhance the logic to inform the user about the termination via the GitHub Job or Workflow run. The feature is disabled by default. The feature is enabled once the watcher is enabled, the feature can be disabled explicitly by setting `instance_termination_watcher.features.enable_spot_termination_handler = false`. + +- Logs: The module will log all termination notifications. For each warning it will look up instance details and log the environment, instance type and time the instance is running. As well some other details. +- Metrics: Metrics are disabled by default, this to avoid costs. Once enabled a metric will be created for each termination with at least dimensions for the environment and instance type. The metric name space can be configured via the variables. The metric name used is `SpotTermination`. + +### Log example (both warnings and terminations) Below an example of the log messages created. 
``` { "level": "INFO", - "message": "Received spot notification warning:", + "message": "Received spot notification for ${metricName}", "environment": "default", "instanceId": "i-0039b8826b3dcea55", "instanceType": "c5.large", diff --git a/examples/default/main.tf b/examples/default/main.tf index a775872137..90c889319b 100644 --- a/examples/default/main.tf +++ b/examples/default/main.tf @@ -78,7 +78,7 @@ module "runners" { # Let the module manage the service linked role # create_service_linked_role_spot = true - instance_types = ["m5.large", "c5.large"] + instance_types = ["m7a.large", "m5.large"] # override delay of events in seconds delay_webhook_event = 5 @@ -122,7 +122,7 @@ module "runners" { # metric = { # enable_spot_termination_warning = true # enable_job_retry = false - # enable_github_app_rate_limit = true + # enable_github_app_rate_limit = false # } # } diff --git a/lambdas/functions/termination-watcher/src/ConfigResolver.ts b/lambdas/functions/termination-watcher/src/ConfigResolver.ts index 477eb613c9..9e98b2a20a 100644 --- a/lambdas/functions/termination-watcher/src/ConfigResolver.ts +++ b/lambdas/functions/termination-watcher/src/ConfigResolver.ts @@ -2,6 +2,7 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; export class Config { createSpotWarningMetric: boolean; + createSpotTerminationMetric: boolean; tagFilters: Record
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
})
object({| n/a | yes | | [instance\_profile\_path](#input\_instance\_profile\_path) | The path that will be added to the instance\_profile, if not set the environment name will be used. | `string` | `null` | no | -| [instance\_termination\_watcher](#input\_instance\_termination\_watcher) | Configuration for the spot termination watcher lambda function. This feature is Beta, changes will not trigger a major release as long in beta.
key_base64 = string
id = string
webhook_secret = string
})
object({| `{}` | no | +| [instance\_termination\_watcher](#input\_instance\_termination\_watcher) | Configuration for the spot termination watcher lambda function. This feature is Beta, changes will not trigger a major release as long in beta.
enable = optional(bool, false)
enable_metrics = optional(string, null) # deprecated
memory_size = optional(number, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
timeout = optional(number, null)
zip = optional(string, null)
})
object({| `{}` | no | | [key\_name](#input\_key\_name) | Key pair name | `string` | `null` | no | | [kms\_key\_arn](#input\_kms\_key\_arn) | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | | [lambda\_architecture](#input\_lambda\_architecture) | AWS Lambda architecture. Lambda functions using Graviton processors ('arm64') tend to have better price/performance than 'x86\_64' functions. | `string` | `"arm64"` | no | diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index 13b6f838a0..553cc04594 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -634,8 +634,12 @@ variable "instance_termination_watcher" { EOF type = object({ - enable = optional(bool, false) - enable_metrics = optional(string, null) # deprecated + enable = optional(bool, false) + enable_metrics = optional(string, null) # deprecated + features = optional(object({ + enable_spot_termination_handler = optional(bool, true) + enable_spot_termination_notification_watcher = optional(bool, true) + }), {}) memory_size = optional(number, null) s3_key = optional(string, null) s3_object_version = optional(string, null) diff --git a/modules/termination-watcher/README.md b/modules/termination-watcher/README.md index 849380777f..c3ab80ff33 100644 --- a/modules/termination-watcher/README.md +++ b/modules/termination-watcher/README.md @@ -65,34 +65,29 @@ yarn run dist ## Providers -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | ~> 5.27 | +No providers. 
## Modules | Name | Source | Version | |------|--------|---------| -| [termination\_warning\_watcher](#module\_termination\_warning\_watcher) | ../lambda | n/a | +| [termination\_handler](#module\_termination\_handler) | ./termination | n/a | +| [termination\_notification](#module\_termination\_notification) | ./notification | n/a | ## Resources -| Name | Type | -|------|------| -| [aws_cloudwatch_event_rule.spot_instance_termination_warning](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | -| [aws_cloudwatch_event_target.main](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | -| [aws_iam_role_policy.lambda_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | -| [aws_lambda_permission.main](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +No resources. ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [config](#input\_config) | Configuration for the spot termination watcher lambda function.
enable = optional(bool, false)
enable_metrics = optional(string, null) # deprecated
features = optional(object({
enable_spot_termination_handler = optional(bool, true)
enable_spot_termination_notification_watcher = optional(bool, true)
}), {})
memory_size = optional(number, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
timeout = optional(number, null)
zip = optional(string, null)
})
object({| n/a | yes | +| [config](#input\_config) | Configuration for the spot termination watcher.
aws_partition = optional(string, null)
architecture = optional(string, null)
enable_metric = optional(string, null)
environment_variables = optional(map(string), {})
lambda_tags = optional(map(string), {})
log_level = optional(string, null)
logging_kms_key_id = optional(string, null)
logging_retention_in_days = optional(number, null)
memory_size = optional(number, null)
metrics = optional(object({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_spot_termination_warning = optional(bool, true)
}), {})
}), {})
prefix = optional(string, null)
principals = optional(list(object({
type = string
identifiers = list(string)
})), [])
role_path = optional(string, null)
role_permissions_boundary = optional(string, null)
runtime = optional(string, null)
s3_bucket = optional(string, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
security_group_ids = optional(list(string), [])
subnet_ids = optional(list(string), [])
tag_filters = optional(map(string), null)
tags = optional(map(string), {})
timeout = optional(number, null)
tracing_config = optional(object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
}), {})
zip = optional(string, null)
})
object({| n/a | yes | ## Outputs | Name | Description | |------|-------------| -| [lambda](#output\_lambda) | n/a | +| [spot\_termination\_handler](#output\_spot\_termination\_handler) | n/a | +| [spot\_termination\_notification](#output\_spot\_termination\_notification) | n/a | diff --git a/modules/termination-watcher/main.tf b/modules/termination-watcher/main.tf index acf41f83be..1cf8ccb275 100644 --- a/modules/termination-watcher/main.tf +++ b/modules/termination-watcher/main.tf @@ -15,41 +15,3 @@ locals { metrics_namespace = var.config.metrics.namespace }) } - -module "termination_warning_watcher" { - source = "../lambda" - lambda = local.config -} - - -resource "aws_cloudwatch_event_rule" "spot_instance_termination_warning" { - name = "${var.config.prefix != null ? format("%s-", var.config.prefix) : ""}spot-instance-termination" - description = "Spot Instance Termination Warning" - - event_pattern = <
aws_partition = optional(string, null)
architecture = optional(string, null)
enable_metric = optional(string, null)
environment_variables = optional(map(string), {})
features = optional(object({
enable_spot_termination_handler = optional(bool, true)
enable_spot_termination_notification_watcher = optional(bool, true)
}), {})
lambda_tags = optional(map(string), {})
log_level = optional(string, null)
logging_kms_key_id = optional(string, null)
logging_retention_in_days = optional(number, null)
memory_size = optional(number, null)
metrics = optional(object({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_spot_termination = optional(bool, true)
enable_spot_termination_warning = optional(bool, true)
}), {})
}), {})
prefix = optional(string, null)
principals = optional(list(object({
type = string
identifiers = list(string)
})), [])
role_path = optional(string, null)
role_permissions_boundary = optional(string, null)
runtime = optional(string, null)
s3_bucket = optional(string, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
security_group_ids = optional(list(string), [])
subnet_ids = optional(list(string), [])
tag_filters = optional(map(string), null)
tags = optional(map(string), {})
timeout = optional(number, null)
tracing_config = optional(object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
}), {})
zip = optional(string, null)
})