游客发表

AWS CloudWatch监控之钉钉告警

发帖时间:2025-11-05 04:59:29

一、CloudWatch服务安装

在Amazon Linux 2系统上安装CloudWatch Agent。

复制Bash

#!/bin/bash
# Install the CloudWatch agent on Amazon Linux 2 and write its configuration.

rpm -ivh https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm

# The heredoc delimiter is quoted so the shell leaves the ${aws:...}
# placeholders untouched; unquoted, bash would expand them to empty strings.
# `tee` (rather than `tee -a`) overwrites the file, so re-running this script
# cannot append a second JSON document and corrupt the configuration.
sudo tee /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json <<'EOF'
{
  "logs": {
    "logs_collected": {
      "files": {
        "collect_list": [
          {
            "file_path": "/logArchive/hcaextension/info*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "info.logs"
          },
          {
            "file_path": "/logArchive/hcaextension/http*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "http.logs"
          }
        ]
      }
    }
  },
  "metrics": {
    "aggregation_dimensions": [
      [
        "InstanceId"
      ]
    ],
    "append_dimensions": {
      "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
      "ImageId": "${aws:ImageId}",
      "InstanceId": "${aws:InstanceId}",
      "InstanceType": "${aws:InstanceType}"
    },
    "metrics_collected": {
      "cpu": {
        "measurement": [
          "cpu_usage_idle",
          "cpu_usage_iowait",
          "cpu_usage_user",
          "cpu_usage_system"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "*"
        ],
        "totalcpu": false
      },
      "disk": {
        "measurement": [
          "used_percent"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "/"
        ]
      },
      "diskio": {
        "measurement": [
          "io_time",
          "write_bytes",
          "read_bytes",
          "writes",
          "reads"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "/"
        ]
      },
      "mem": {
        "measurement": [
          "mem_used_percent"
        ],
        "metrics_collection_interval": 180
      },
      "netstat": {
        "measurement": [
          "tcp_established",
          "tcp_time_wait"
        ],
        "metrics_collection_interval": 180
      },
      "statsd": {
        "metrics_aggregation_interval": 60,
        "metrics_collection_interval": 180,
        "service_address": ":8125"
      },
      "swap": {
        "measurement": [
          "swap_used_percent"
        ],
        "metrics_collection_interval": 180
      }
    }
  }
}
EOF

sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json

systemctl restart amazon-cloudwatch-agent.service

systemctl enable amazon-cloudwatch-agent.service

二、AWS-CLI批量下发监控

前提条件:本机已安装awscli工具。

需要修改的是区域信息、ip_list(实例id列表)、sns_arn信息。

通过脚本自动在CloudWatch上为EC2添加监控告警配置。

复制Python

#!/usr/bin/python

# -*- coding: utf-8 -*-

import os

import json

import subprocess

# 1. Configure the AWS CLI command line and the target regions.
Contants = {
    # The executable path contains a space, so it carries its own double
    # quotes for the shell; every generated command requests JSON output.
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe" --output json',
    # NOTE(review): the original comment labelled this region "Singapore",
    # but eu-central-1 is Frankfurt (Singapore is ap-southeast-1) - confirm
    # which region is actually intended.
    "AWSREGION": ["eu-central-1"],
}

# Auto-vivifying dictionary
class CreateDict(dict):
    """A dict that creates nested ``CreateDict`` values for missing keys.

    Reading a key that does not exist stores a fresh, empty instance of the
    same class under that key and returns it, so nested assignments such as
    ``d["a"]["b"] = 1`` work without creating the intermediate levels first.
    """

    def __getitem__(self, item):
        """Return ``self[item]``, creating an empty nested dict when absent."""
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            # type(self) keeps subclasses nesting instances of themselves.
            value = self[item] = type(self)()
            return value

#########################################################################################################
# Alarm configuration

# CPUUtilization: alarm when the average is >= 80% in 3 out of 3 one-minute periods.
def getCPUUtilizationComm(name, action, instance_id):
    """Build the AWS CLI command that creates the CPUUtilization alarm.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN passed to both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance id used as the ``InstanceId`` dimension.

    Returns:
        The complete ``cloudwatch put-metric-alarm`` command as one string.
    """
    mertic = "CPUUtilization"
    print("#####开始配置 %s#####" % mertic)
    template = (
        "{cli} cloudwatch put-metric-alarm "
        '--alarm-name "AWS_EC2_{name}_{mertic}" '
        '--alarm-description "aws ec2 {mertic}" '
        "--metric-name {mertic} "
        "--namespace AWS/EC2 "
        "--statistic Average "
        "--period 60 "
        "--threshold 80 "
        "--evaluation-periods 3 "
        "--datapoints-to-alarm 3 "
        "--comparison-operator GreaterThanOrEqualToThreshold "
        "--treat-missing-data notBreaching "
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        "--unit Percent "
        '--dimensions "Name=InstanceId,Value={id}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders;
    # any other keyword name makes str.format raise KeyError.
    return template.format(
        cli=Contants["AWSCLI"],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
    )


# mem_used_percent: alarm when the average is >= 80% in 3 out of 3 one-minute periods.

def getmem_used_percentComm(name, action, instance_id, instancetype, imageid):
    """Build the AWS CLI command that creates the memory-usage alarm.

    The metric lives in the ``CWAgent`` namespace and is identified by the
    ``InstanceId``, ``ImageId`` and ``InstanceType`` dimensions.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN passed to both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance id.
        instancetype: EC2 instance type, e.g. the ``InstanceType`` field
            returned by ``describe-instances``.
        imageid: AMI id of the instance.

    Returns:
        The complete ``cloudwatch put-metric-alarm`` command as one string.
    """
    mertic = "mem_used_percent"
    print("#####开始配置 %s#####" % mertic)
    template = (
        "{cli} cloudwatch put-metric-alarm "
        '--alarm-name "AWS_EC2_{name}_{mertic}" '
        '--alarm-description "aws ec2 {mertic}" '
        "--metric-name {mertic} "
        "--namespace CWAgent "
        "--statistic Average "
        "--period 60 "
        "--threshold 80 "
        "--evaluation-periods 3 "
        "--datapoints-to-alarm 3 "
        "--comparison-operator GreaterThanOrEqualToThreshold "
        "--treat-missing-data missing "
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        "--dimensions Name=InstanceId,Value={id} "
        "Name=ImageId,Value={imageid} "
        "Name=InstanceType,Value={instancetype}"
    )
    # The keyword must be ``action`` to match the {action} placeholders.
    return template.format(
        cli=Contants["AWSCLI"],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
        instancetype=instancetype,
        imageid=imageid,
    )


# disk_used_percent: alarm when the average is >= 80% in 3 out of 3 one-minute periods.

def getdisk_used_percentComm(name, action, instance_id, instancetype, imageid,
                             device="nvme0n1p1", fstype="ext4"):
    """Build the AWS CLI command that creates the root-disk usage alarm.

    Note: the ``device`` and ``fstype`` dimension values cannot be discovered
    by this script; look up the values CloudWatch actually shows for the
    instance's ``disk_used_percent`` metric and pass them in.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN passed to both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance id.
        instancetype: EC2 instance type.
        imageid: AMI id of the instance.
        device: Value of the ``device`` dimension (defaults to the value the
            script previously hard-coded).
        fstype: Value of the ``fstype`` dimension (defaults to the value the
            script previously hard-coded).

    Returns:
        The complete ``cloudwatch put-metric-alarm`` command as one string.
    """
    mertic = "disk_used_percent"
    print("#####开始配置 %s#####" % mertic)
    # ``--dimensions`` is given exactly once, with the full dimension set;
    # an earlier duplicate that only named the path was redundant.
    template = (
        "{cli} cloudwatch put-metric-alarm "
        '--alarm-name "AWS_EC2_{name}_{mertic}" '
        '--alarm-description "aws ec2 {mertic}" '
        "--metric-name {mertic} "
        "--namespace CWAgent "
        "--statistic Average "
        "--period 60 "
        "--threshold 80 "
        "--evaluation-periods 3 "
        "--datapoints-to-alarm 3 "
        "--comparison-operator GreaterThanOrEqualToThreshold "
        "--treat-missing-data missing "
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        "--dimensions Name=InstanceId,Value={id} "
        "Name=ImageId,Value={imageid} "
        "Name=InstanceType,Value={instancetype} "
        "Name=device,Value={device} "
        "Name=fstype,Value={fstype} "
        '"Name=path,Value=/"'
    )
    # The keyword must be ``action`` to match the {action} placeholders.
    return template.format(
        cli=Contants["AWSCLI"],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
        instancetype=instancetype,
        imageid=imageid,
        device=device,
        fstype=fstype,
    )


# NetworkIn: alarm when the average is >= 5,000,000 bytes in 3 out of 3 one-minute periods.

def getNetworkInComm(name, action, instance_id):
    """Build the AWS CLI command that creates the NetworkIn alarm.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN passed to both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance id used as the ``InstanceId`` dimension.

    Returns:
        The complete ``cloudwatch put-metric-alarm`` command as one string.
    """
    mertic = "NetworkIn"
    print("#####开始配置 %s#####" % mertic)
    template = (
        "{cli} cloudwatch put-metric-alarm "
        '--alarm-name "AWS_EC2_{name}_{mertic}" '
        '--alarm-description "aws ec2 {mertic}" '
        "--metric-name {mertic} "
        "--namespace AWS/EC2 "
        "--statistic Average "
        "--period 60 "
        "--threshold 5000000 "
        "--evaluation-periods 3 "
        "--datapoints-to-alarm 3 "
        "--comparison-operator GreaterThanOrEqualToThreshold "
        "--treat-missing-data notBreaching "
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        # Uses the {id} placeholder: a "%s" here is never substituted by
        # str.format and would create an alarm on the literal id "%s".
        '--dimensions "Name=InstanceId,Value={id}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders.
    return template.format(
        cli=Contants["AWSCLI"],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
    )


# NetworkOut: alarm when the average is >= 5,000,000 bytes in 3 out of 3 one-minute periods.

def getNetworkOutComm(name, action, instance_id):
    """Build the AWS CLI command that creates the NetworkOut alarm.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN passed to both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance id used as the ``InstanceId`` dimension.

    Returns:
        The complete ``cloudwatch put-metric-alarm`` command as one string.
    """
    mertic = "NetworkOut"
    print("#####开始配置 %s#####" % mertic)
    template = (
        "{cli} cloudwatch put-metric-alarm "
        '--alarm-name "AWS_EC2_{name}_{mertic}" '
        '--alarm-description "aws ec2 {mertic}" '
        "--metric-name {mertic} "
        "--namespace AWS/EC2 "
        "--statistic Average "
        "--period 60 "
        "--threshold 5000000 "
        "--evaluation-periods 3 "
        "--datapoints-to-alarm 3 "
        "--comparison-operator GreaterThanOrEqualToThreshold "
        "--treat-missing-data notBreaching "
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        '--dimensions "Name=InstanceId,Value={id}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders.
    return template.format(
        cli=Contants["AWSCLI"],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
    )

# Command execution helper
def execCommand(comm):
    """Run *comm* through the shell and return its output.

    The command and its exit status are printed for visibility. Output is
    whatever ``subprocess.getstatusoutput`` returns, i.e. stdout and stderr
    combined, with one trailing newline stripped.

    Args:
        comm: The full command line to execute.

    Returns:
        The command output as a string, or ``None`` when the command could
        not be run at all (the exception is printed, not raised).
    """
    print(comm)
    try:
        status, stdout = subprocess.getstatusoutput(comm)
    except Exception as e:
        # Best effort: report the failure and let the caller see None.
        print(e)
        return None
    print(status)
    return stdout

# Fetch basic information for every EC2 instance the configured CLI command can list.
def getAll(get_server_id_list):
    """Describe all instances and keep those whose id is in the given list.

    Args:
        get_server_id_list: Collection of instance ids to keep.

    Returns:
        A list of dicts with the keys ``id``, ``imageid``, ``instancetype``
        and ``name``. ``name`` is the value of the ``Name`` tag, falling back
        to the instance id when the tag is missing or empty.
    """
    comm1 = "%s ec2 describe-instances" % Contants["AWSCLI"]
    all_data = json.loads(execCommand(comm1))

    instance_list = []
    for r in all_data["Reservations"]:
        for i in r["Instances"]:
            # A fresh dict per instance: sharing one dict per reservation
            # would make every entry of that reservation identical.
            data = {
                "id": i["InstanceId"],
                "imageid": i["ImageId"],
                "instancetype": i["InstanceType"],
            }
            name = ""
            # Untagged instances have no "Tags" key at all.
            for t in i.get("Tags", []):
                if t["Key"] == "Name":
                    name = t["Value"]
            data["name"] = name or i["InstanceId"]
            instance_list.append(data)

    instance_list_modify = []
    for instance in instance_list:
        print(instance)
        if instance.get("id") in get_server_id_list:
            instance_list_modify.append(instance)
    print(instance_list_modify)
    return instance_list_modify

# Create the alarms
def add_alert(data, action):
    """Create CloudWatch alarms for every instance described in *data*.

    Args:
        data: Iterable of dicts with the keys ``id``, ``name``, ``imageid``
            and ``instancetype`` (the shape ``get_server_info`` returns).
        action: ARN notified when an alarm fires or recovers.
    """
    for i in data:
        instance_id = i["id"]
        name = i["name"]
        imageid = i["imageid"]
        instancetype = i["instancetype"]
        print(instance_id, name, imageid, instancetype)
        execCommand(getCPUUtilizationComm(name, action, instance_id))
        # The remaining alarms are switched off; uncomment to enable them.
        # execCommand(getNetworkInComm(name, action, instance_id))
        # execCommand(getNetworkOutComm(name, action, instance_id))
        # execCommand(getmem_used_percentComm(name, action, instance_id, instancetype, imageid))
        # execCommand(getdisk_used_percentComm(name, action, instance_id, instancetype, imageid))


def get_server_info(instance_list):
    """Look up each instance id and return its basic attributes.

    Args:
        instance_list: Iterable of EC2 instance ids.

    Returns:
        A list with one dict per instance that was found, holding the keys
        ``id``, ``imageid``, ``instancetype`` and ``name``. ``name`` is the
        value of the ``Name`` tag, or the instance id when there is none.
        Ids whose lookup fails or returns nothing are skipped.
    """
    server_info = []
    for i in instance_list:
        # Uses the configured CLI command so that the "--region" the caller
        # appends to Contants["AWSCLI"] also applies to this lookup.
        comm = "%s ec2 describe-instances --instance-ids %s" % (Contants["AWSCLI"], i)
        # Show the command being executed.
        print(comm)
        with os.popen(comm) as pipe:
            data = pipe.read()
        try:
            json_str = json.loads(data)
        except ValueError as e:
            # Empty or non-JSON output (e.g. the CLI call failed).
            print("describe-instances failed for %s: %s" % (i, e))
            continue

        server_dict = {}
        for Reservations_list in json_str.get("Reservations", []):
            for Instances_list in Reservations_list["Instances"]:
                server_dict["id"] = Instances_list["InstanceId"]
                server_dict["imageid"] = Instances_list["ImageId"]
                server_dict["instancetype"] = Instances_list["InstanceType"]
                # Default to the instance id so "name" always exists.
                server_dict["name"] = Instances_list["InstanceId"]
                for tag_item in Instances_list.get("Tags", []):
                    if tag_item["Key"] == "Name":
                        if tag_item["Value"]:
                            server_dict["name"] = tag_item["Value"]
                        break

        if i == server_dict.get("id"):
            print(server_dict)
            server_info.append(server_dict)
    return server_info

if __name__ == "__main__":
    # 2. Configure the SNS topic ARN that the alarms notify.
    sns_arn = "arn:aws:sns:eu-central-1:643xxxxx:xxxx-CloudWatch-Lambda-DingTalk"
    # Despite the name, this is the list of EC2 instance ids to monitor.
    ip_list = ["i-010bxxxx", "i-00xxxxx"]
    # Keep the base command so each region is appended to it, not accumulated.
    cli = Contants["AWSCLI"]
    for i in Contants["AWSREGION"]:
        print("[Region]", i)
        Contants["AWSCLI"] = cli + " --region " + i

        add_alert(get_server_info(ip_list), sns_arn)

三、Amazon SNS创建主题

创建sns主题关联LAMBDA 钉钉程序。

四、Lambda钉钉函数通知脚本

上传如下脚本,通过CloudWatch调试EC2设定的规则来触发告警测试。

复制Prolog

# -*- coding: utf-8 -*-
# python 3.8
# Creation time: 2021/11/18

import base64
import hashlib
import hmac
import json
import os
import time
import urllib.parse

import requests

import datetime


def _dingtalk_signature(secret, timestamp):
    """Return the URL-quoted, base64 HMAC-SHA256 of ``timestamp\\nsecret``."""
    string_to_sign = "{}\n{}".format(timestamp, secret)
    hmac_code = hmac.new(
        secret.encode("utf-8"),
        string_to_sign.encode("utf-8"),
        digestmod=hashlib.sha256,
    ).digest()
    return urllib.parse.quote_plus(base64.b64encode(hmac_code))


def _instance_value(dimensions):
    """Pick the dimension value to display as the instance id."""
    for dimension in dimensions:
        if dimension.get("name") == "InstanceId":
            return dimension.get("value", "")
    # No InstanceId dimension: fall back to the first value, skipping a
    # leading path-like value (e.g. "/") in favour of the second one.
    if not dimensions:
        return ""
    value = dimensions[0].get("value", "")
    if value.startswith("/") and len(dimensions) > 1:
        value = dimensions[1].get("value", "")
    return value


def lambda_handler(event, context):
    """Forward a CloudWatch alarm delivered through SNS to a DingTalk robot.

    Args:
        event: Lambda event; the alarm is read from
            ``event["Records"][0]["Sns"]``, whose ``Message`` field is the
            alarm as a JSON string.
        context: Lambda context object (unused).

    Returns:
        The DingTalk API response body decoded as UTF-8 text.
    """
    headers = {"Content-Type": "application/json;charset=utf-8"}
    # Credentials can be supplied through environment variables; the
    # literals are kept only as a backward-compatible fallback.
    # NOTE(review): prefer not to keep real secrets in source code.
    token = os.environ.get("DINGTALK_TOKEN", "ca5533c8cb976c21")
    secret = os.environ.get("DINGTALK_SECRET", "SEC8d1a31ec5e8e91")
    timestamp = str(round(time.time() * 1000))
    sign = _dingtalk_signature(secret, timestamp)

    # get url
    api_url = "https://oapi.dingtalk.com/robot/send?access_token={}&timestamp={}&sign={}".format(
        token, timestamp, sign
    )

    # msg setting
    message = event["Records"][0]["Sns"]
    Subject = message["Subject"] or ""
    sns_message = json.loads(message["Message"])
    # Fixed UTC+8 offset, independent of the host's local time zone.
    utc8 = datetime.timezone(datetime.timedelta(hours=8))
    current_time = datetime.datetime.now(utc8).strftime("%Y-%m-%d %H:%M:%S")

    if "ALARM" in Subject:
        title = "![1.png](https://xxx.oss-cn.aliyuncs.com/dingding-image/1.png)"
    elif "OK" in Subject:
        title = "![2.png](https://xxx.oss-cn-shanghai.aliyuncs.com/dingding-image/2.png)"
    else:
        title = "![3.png](https://xxx.oss-cn-shanghai.aliyuncs.com/dingding-image/3.png)"

    trigger = sns_message["Trigger"]
    _value = _instance_value(trigger.get("Dimensions") or [])

    def text(value):
        # Optional alarm fields can be JSON null; render them as empty text
        # instead of failing on str + None.
        return "" if value is None else str(value)

    content = "\n".join([
        "### {title}".format(title=title),
        "> #### **时间**: " + current_time,
        "> #### **状态**: " + text(sns_message["OldStateValue"]) + " => " + text(sns_message["NewStateValue"]),
        "> #### **告警名称**: " + text(sns_message["AlarmName"]),
        "> #### **账户ID**: " + text(sns_message["AWSAccountId"]),
        "> #### **AWS区域**: " + text(sns_message["Region"]),
        "> #### **描述**: " + text(sns_message["AlarmDescription"]),
        "> #### **产品资源**: " + text(trigger["Namespace"]),
        "> #### **实例ID**: " + _value,
        "> #### **指标名称**: " + text(trigger["MetricName"]),
        "> #### **报警详情**: " + text(sns_message["NewStateReason"]),
    ])

    msg = {
        "msgtype": "markdown",
        "markdown": {
            "title": title,
            "text": content,
        },
        "at": {
            "isAtAll": "true",
        },
    }

    # request
    # A timeout keeps the function from hanging until the Lambda limit.
    request = requests.post(
        url=api_url, data=json.dumps(msg), headers=headers, timeout=10
    ).content.decode("utf8")
    return request

AWS子账户权限调试工具:

https://policysim.aws.amazon.com/

服务器租用

    热门排行

    友情链接