Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions lambda/monitor_service_quotas/lambda.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import boto3
import json
import time
from datetime import datetime, timedelta
from botocore.config import Config
import checker
import cloudformation_checks
import ebs_checks
Expand Down Expand Up @@ -247,6 +249,18 @@
route53_registry.register(r'^Hosted zones$', route53_checks.HostedZonesChecker)
route53_registry.register(r'^Health checks$', route53_checks.HealthChecksChecker)

# Retry settings shared by the boto3 clients built in this module
# (handed to boto3.client(...) through its `config=` argument).
# Uses botocore's 'adaptive' retry mode so throttled calls are retried
# rather than surfacing immediately as errors.
BOTO_CONFIG = Config(
retries={
'max_attempts': 5,
'mode': 'adaptive' # Automatically handles throttling with exponential backoff
}
)

# Pause, in seconds, taken after each page of a paginated listing call.
# With 0.15 s per page the request rate cannot exceed ~6-7 requests/second.
# NOTE(review): whether this is safely under the AWS rate limits depends on
# the per-API throttling limits of the account — confirm.
API_RATE_LIMIT_DELAY = 0.15

def handler(event, context):
regions = os.environ['SERVICE_QUOTA_REGION_LIST'].split(',')

Expand All @@ -267,8 +281,8 @@ def handler(event, context):
return

def check_quotas_in_region(region):
quota_client = boto3.client('service-quotas', region_name=region)
ec2_client = boto3.client('ec2', region_name=region)
quota_client = boto3.client('service-quotas', region_name=region, config=BOTO_CONFIG)
ec2_client = boto3.client('ec2', region_name=region, config=BOTO_CONFIG)

threshold = float(os.environ['SERVICE_QUOTA_THRESHOLD']) / 100

Expand Down Expand Up @@ -314,13 +328,15 @@ def list_all_services(client):
for service in page['Services']:
if service['ServiceCode'] not in skip_services:
services.append(service)
time.sleep(API_RATE_LIMIT_DELAY)
return services

def list_all_service_quotas(client, service_code):
    """Collect every quota entry reported for a single service.

    Walks all pages of the ``list_service_quotas`` paginator for
    ``service_code`` and pauses ``API_RATE_LIMIT_DELAY`` seconds after each
    page has been consumed.

    Args:
        client: Client object exposing ``get_paginator``.
        service_code: Value passed to the paginator as ``ServiceCode``.

    Returns:
        A list holding the ``Quotas`` entries of all pages, in page order.
    """
    collected = []
    page_iterator = client.get_paginator('list_service_quotas').paginate(
        ServiceCode=service_code
    )
    for response in page_iterator:
        collected += response['Quotas']
        # Space out page fetches to stay below the API request rate.
        time.sleep(API_RATE_LIMIT_DELAY)
    return collected

def get_quota_usage(quota, service_code, region):
Expand Down Expand Up @@ -362,8 +378,8 @@ def get_service_specific_usage(service_code, quota, region):

if service_code == 'cloudformation':
logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}")
return get_service_usage(ebs_registry, quota['QuotaName'], region)
if service_code == 'cloudformation':
return get_service_usage(cloudformation_registry, quota['QuotaName'], region)
elif service_code == 'ebs':
logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}")
return get_service_usage(ebs_registry, quota['QuotaName'], region)
elif service_code == 'ec2':
Expand All @@ -375,9 +391,9 @@ def get_service_specific_usage(service_code, quota, region):
elif service_code == 'eks':
logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}")
return get_service_usage(eks_registry, quota['QuotaName'], region)
elif service_code == 'efs':
elif service_code == 'es':
logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}")
return get_service_usage(efs_registry, quota['QuotaName'], region)
return get_service_usage(es_registry, quota['QuotaName'], region)
elif service_code == 'rds':
logger.debug(f"Getting service specific usage for {quota['QuotaName']} - {service_code}")
return get_service_usage(rds_registry, quota['QuotaName'], region)
Expand Down
2 changes: 1 addition & 1 deletion monitor_ami_usage.tf
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ data "aws_iam_policy_document" "monitor_ami_usage_execution" {

statement {
effect = "Allow"
resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/monitor_ami_usage_execution:*"]
resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/${var.name_prefix}monitor_ami_usage_execution:*"]

actions = [
"logs:CreateLogGroup",
Expand Down
4 changes: 2 additions & 2 deletions monitor_service_quotas.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ data "aws_iam_policy_document" "monitor_service_quotas_execution" {

statement {
effect = "Allow"
resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/monitor_service_quotas_execution:*"]
resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:/aws/lambda/${var.name_prefix}monitor_service_quotas_execution:*"]

actions = [
"logs:CreateLogGroup",
Expand Down Expand Up @@ -85,7 +85,7 @@ resource "aws_lambda_function" "monitor_service_quotas" {
filename = data.archive_file.monitor_service_quotas.output_path
source_code_hash = data.archive_file.monitor_service_quotas.output_base64sha256
tags = local.tags
timeout = 300
timeout = 600

environment {
variables = {
Expand Down
Loading