diff --git a/lambda/monitor_service_quotas/lambda.py b/lambda/monitor_service_quotas/lambda.py index fd8026e..77617bc 100644 --- a/lambda/monitor_service_quotas/lambda.py +++ b/lambda/monitor_service_quotas/lambda.py @@ -1,7 +1,9 @@ import os import boto3 import json +import time from datetime import datetime, timedelta +from botocore.config import Config import checker import cloudformation_checks import ebs_checks @@ -247,6 +249,18 @@ route53_registry.register(r'^Hosted zones$', route53_checks.HostedZonesChecker) route53_registry.register(r'^Health checks$', route53_checks.HealthChecksChecker) +# Configure boto3 client with adaptive retries for throttling +BOTO_CONFIG = Config( + retries={ + 'max_attempts': 5, + 'mode': 'adaptive' # Automatically handles throttling with exponential backoff + } +) + +# Rate limiting delay between API calls (in seconds) +# ~6-7 requests/second, safely under AWS rate limits +API_RATE_LIMIT_DELAY = 0.15 + def handler(event, context): regions = os.environ['SERVICE_QUOTA_REGION_LIST'].split(',') @@ -267,8 +281,8 @@ def handler(event, context): return def check_quotas_in_region(region): - quota_client = boto3.client('service-quotas', region_name=region) - ec2_client = boto3.client('ec2', region_name=region) + quota_client = boto3.client('service-quotas', region_name=region, config=BOTO_CONFIG) + ec2_client = boto3.client('ec2', region_name=region, config=BOTO_CONFIG) threshold = float(os.environ['SERVICE_QUOTA_THRESHOLD']) / 100 @@ -314,6 +328,7 @@ def list_all_services(client): for service in page['Services']: if service['ServiceCode'] not in skip_services: services.append(service) + time.sleep(API_RATE_LIMIT_DELAY) return services def list_all_service_quotas(client, service_code): @@ -321,6 +336,7 @@ def list_all_service_quotas(client, service_code): paginator = client.get_paginator('list_service_quotas') for page in paginator.paginate(ServiceCode=service_code): quotas.extend(page['Quotas']) + time.sleep(API_RATE_LIMIT_DELAY) return quotas def get_quota_usage(quota, service_code, region): @@ -362,8 +378,8 @@ def get_service_specific_usage(service_code, quota, region): if service_code == 'cloudformation': logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}") - return get_service_usage(ebs_registry, quota['QuotaName'], region) - if service_code == 'cloudformation': + return get_service_usage(cloudformation_registry, quota['QuotaName'], region) + elif service_code == 'ebs': logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}") return get_service_usage(ebs_registry, quota['QuotaName'], region) elif service_code == 'ec2': @@ -375,9 +391,9 @@ def get_service_specific_usage(service_code, quota, region): elif service_code == 'eks': logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}") return get_service_usage(eks_registry, quota['QuotaName'], region) - elif service_code == 'efs': + elif service_code == 'es': logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}") - return get_service_usage(efs_registry, quota['QuotaName'], region) + return get_service_usage(es_registry, quota['QuotaName'], region) elif service_code == 'rds': logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}") return get_service_usage(rds_registry, quota['QuotaName'], region) diff --git a/monitor_ami_usage.tf b/monitor_ami_usage.tf index c144b30..bc77e84 100644 --- a/monitor_ami_usage.tf +++ b/monitor_ami_usage.tf @@ -40,7 +40,7 @@ data "aws_iam_policy_document" "monitor_ami_usage_execution" { statement { effect = "Allow" - resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/monitor_ami_usage_execution:*"] + resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/${var.name_prefix}monitor_ami_usage_execution:*"] actions = [ "logs:CreateLogGroup", diff --git a/monitor_service_quotas.tf b/monitor_service_quotas.tf index e8660c0..0cc8bad 100644 --- a/monitor_service_quotas.tf +++ b/monitor_service_quotas.tf @@ -26,7 +26,7 @@ data "aws_iam_policy_document" "monitor_service_quotas_execution" { statement { effect = "Allow" - resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/monitor_service_quotas_execution:*"] + resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/${var.name_prefix}monitor_service_quotas_execution:*"] actions = [ "logs:CreateLogGroup", @@ -85,7 +85,7 @@ resource "aws_lambda_function" "monitor_service_quotas" { filename = data.archive_file.monitor_service_quotas.output_path source_code_hash = data.archive_file.monitor_service_quotas.output_base64sha256 tags = local.tags - timeout = 300 + timeout = 600 environment { variables = {