Skip to content

Commit 25663e7

Browse files
committed
improve auto Monitor
1 parent ddd9200 commit 25663e7

File tree

1 file changed

+43
-22
lines changed

1 file changed

+43
-22
lines changed

Diff for: lambda_function.py

+43 -22
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
bucket = "BUCKET_NAME"
1313

1414

15-
def killdeadAlarms(fleetId, monitorapp, project):
15+
def killdeadAlarms(fleetId, project):
1616
checkdates = [
1717
datetime.datetime.now().strftime("%Y-%m-%d"),
1818
(datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"),
@@ -27,7 +27,12 @@ def killdeadAlarms(fleetId, monitorapp, project):
2727
if eachevent["EventInformation"]["EventSubType"] == "terminated":
2828
todel.append(eachevent["EventInformation"]["InstanceId"])
2929
todel = [f"{project}_{x}" for x in todel]
30-
cloudwatch.delete_alarms(AlarmNames=todel)
30+
while len(todel) > 100:
31+
dellist = todel[:100]
32+
cloudwatch.delete_alarms(AlarmNames=dellist)
33+
todel = todel[100:]
34+
if len(todel) <= 100:
35+
cloudwatch.delete_alarms(AlarmNames=todel)
3136
print("Old alarms deleted")
3237

3338

@@ -41,8 +46,18 @@ def seeIfLogExportIsDone(logExportId):
4146
time.sleep(30)
4247

4348

44-
def downscaleSpotFleet(queue, spotFleetID):
45-
response = sqs.get_queue_url(QueueName=queue)
49+
def downscaleSpotFleet(nonvisible, spotFleetID):
50+
status = ec2.describe_spot_fleet_instances(SpotFleetRequestId=spotFleetID)
51+
if nonvisible < len(status["ActiveInstances"]):
52+
ec2.modify_spot_fleet_request(
53+
ExcessCapacityTerminationPolicy="noTermination",
54+
TargetCapacity=str(nonvisible),
55+
SpotFleetRequestId=spotFleetID,
56+
)
57+
58+
59+
def check_sqs_queue(queueName):
60+
response = sqs.get_queue_url(QueueName=queueName)
4661
queueUrl = response["QueueUrl"]
4762
response = sqs.get_queue_attributes(
4863
QueueUrl=queueUrl,
@@ -53,25 +68,22 @@ def downscaleSpotFleet(queue, spotFleetID):
5368
)
5469
visible = int(response["Attributes"]["ApproximateNumberOfMessages"])
5570
nonvisible = int(response["Attributes"]["ApproximateNumberOfMessagesNotVisible"])
56-
status = ec2.describe_spot_fleet_instances(SpotFleetRequestId=spotFleetID)
57-
if nonvisible < len(status["ActiveInstances"]):
58-
result = ec2.modify_spot_fleet_request(
59-
ExcessCapacityTerminationPolicy="noTermination",
60-
TargetCapacity=str(nonvisible),
61-
SpotFleetRequestId=spotFleetID,
62-
)
71+
print(
72+
f"Found {visible} visible messages and {nonvisible} nonvisible messages in queue."
73+
)
74+
return visible, nonvisible
6375

6476

6577
def lambda_handler(event, lambda_context):
6678
# Triggered any time SQS queue ApproximateNumberOfMessagesVisible = 0
6779
# OR ApproximateNumberOfMessagesNotVisible = 0
6880
messagestring = event["Records"][0]["Sns"]["Message"]
6981
messagedict = json.loads(messagestring)
70-
queueId = messagedict["Trigger"]["Dimensions"][0]["value"]
71-
project = queueId.rsplit("_", 1)[0]
82+
queueName = messagedict["Trigger"]["Dimensions"][0]["value"]
83+
project = queueName.rsplit("_", 1)[0]
7284

7385
# Download monitor file
74-
monitor_file_name = f"{queueId.split('Queue')[0]}SpotFleetRequestId.json"
86+
monitor_file_name = f"{queueName.split('Queue')[0]}SpotFleetRequestId.json"
7587
monitor_local_name = f"/tmp/{monitor_file_name}"
7688
monitor_on_bucket_name = f"monitors/{monitor_file_name}"
7789

@@ -88,18 +100,19 @@ def lambda_handler(event, lambda_context):
88100
monitorapp = monitorInfo["MONITOR_APP_NAME"]
89101
fleetId = monitorInfo["MONITOR_FLEET_ID"]
90102
loggroupId = monitorInfo["MONITOR_LOG_GROUP_NAME"]
91-
starttime = monitorInfo["MONITOR_START_TIME"]
92103
CLEAN_DASHBOARD = monitorInfo["CLEAN_DASHBOARD"]
93104
print(f"Monitor triggered for {monitorcluster} {monitorapp} {fleetId} {loggroupId}")
94105

106+
visible, nonvisible = check_sqs_queue(queueName)
107+
95108
# If no visible messages, downscale machines
96-
if "ApproximateNumberOfMessagesVisible" in event["Records"][0]["Sns"]["Message"]:
109+
if visible == 0 and nonvisible > 0:
97110
print("No visible messages. Tidying as we go.")
98-
killdeadAlarms(fleetId, monitorapp, project)
99-
downscaleSpotFleet(queueId, fleetId)
111+
killdeadAlarms(fleetId, project)
112+
downscaleSpotFleet(nonvisible, fleetId)
100113

101114
# If no messages in progress, cleanup
102-
if "ApproximateNumberOfMessagesNotVisible" in event["Records"][0]["Sns"]["Message"]:
115+
if visible == 0 and nonvisible == 0:
103116
print("No messages in progress. Cleaning up.")
104117
ecs.update_service(
105118
cluster=monitorcluster,
@@ -115,7 +128,12 @@ def lambda_handler(event, lambda_context):
115128
active_instances = []
116129
for instance in active_dictionary["ActiveInstances"]:
117130
active_instances.append(instance["InstanceId"])
118-
cloudwatch.delete_alarms(AlarmNames=active_instances)
131+
while len(active_instances) > 100:
132+
dellist = active_instances[:100]
133+
cloudwatch.delete_alarms(AlarmNames=dellist)
134+
active_instances = active_instances[100:]
135+
if len(active_instances) <= 100:
136+
cloudwatch.delete_alarms(AlarmNames=active_instances)
119137
killdeadAlarms(fleetId, monitorapp, project)
120138

121139
# Read spot fleet id and terminate all EC2 instances
@@ -129,7 +147,7 @@ def lambda_handler(event, lambda_context):
129147
ECS_SERVICE_NAME = monitorapp + "Service"
130148

131149
print("Deleting existing queue.")
132-
queueoutput = sqs.list_queues(QueueNamePrefix=queueId)
150+
queueoutput = sqs.list_queues(QueueNamePrefix=queueName)
133151
try:
134152
if len(queueoutput["QueueUrls"]) == 1:
135153
queueUrl = queueoutput["QueueUrls"][0]
@@ -148,7 +166,7 @@ def lambda_handler(event, lambda_context):
148166
print("Couldn't delete service.")
149167

150168
print("De-registering task")
151-
taskArns = ecs.list_task_definitions()
169+
taskArns = ecs.list_task_definitions(familyPrefix=ECS_TASK_NAME)
152170
for eachtask in taskArns["taskDefinitionArns"]:
153171
fulltaskname = eachtask.split("/")[-1]
154172
ecs.deregister_task_definition(taskDefinition=fulltaskname)
@@ -185,3 +203,6 @@ def lambda_handler(event, lambda_context):
185203
cloudwatch.delete_dashboards(
186204
DashboardNames=[entry["DashboardName"]]
187205
)
206+
207+
# Delete monitor file
208+
s3.delete_object(Bucket=bucket, Key=monitor_on_bucket_name)

0 commit comments

Comments
 (0)