第267集:云监控与告警
教学目标
- 理解云监控的重要性和架构
- 掌握指标监控和日志管理
- 熟悉性能分析和故障排查
- 学习告警配置和通知机制
- 能够构建可视化监控仪表盘
核心知识点
1. 云监控概述
1.1 监控层次结构
+---------------------------------------------------+
| 业务监控层 |
| 用户满意度、业务指标、收入、转化率 |
+---------------------------------------------------+
+---------------------------------------------------+
| 应用监控层 |
| 响应时间、错误率、吞吐量、APM |
+---------------------------------------------------+
+---------------------------------------------------+
| 基础设施监控层 |
| CPU、内存、磁盘、网络、容器、Kubernetes |
+---------------------------------------------------+
+---------------------------------------------------+
| 日志监控层 |
| 应用日志、系统日志、审计日志、安全日志 |
+---------------------------------------------------+
1.2 监控指标类型
| 指标类型 | 描述 | 示例 |
|---|---|---|
| 基础设施指标 | 系统资源使用情况 | CPU使用率、内存使用率、磁盘I/O |
| 应用指标 | 应用性能和健康状态 | 响应时间、错误率、请求量 |
| 业务指标 | 业务关键指标 | 订单量、用户数、收入 |
| 自定义指标 | 特定业务指标 | 队列长度、缓存命中率 |
2. AWS监控服务
2.1 CloudWatch基础
# 发布自定义指标
aws cloudwatch put-metric-data \
--namespace MyApplication \
--metric-name RequestCount \
--value 100 \
--timestamp $(date -u +"%Y-%m-%dT%H:%M:%SZ")
# 发布多个指标
aws cloudwatch put-metric-data \
--namespace MyApplication \
--metric-data '[{"MetricName":"RequestCount","Value":100},{"MetricName":"ErrorCount","Value":5}]'
# 获取指标统计
aws cloudwatch get-metric-statistics \
--namespace MyApplication \
--metric-name RequestCount \
--start-time 2024-01-01T00:00:00Z \
--end-time 2024-01-02T00:00:00Z \
--period 3600 \
--statistics Average,Sum,Maximum,Minimum
# 列出指标
aws cloudwatch list-metrics \
--namespace MyApplication
2.2 CloudWatch告警
# 创建CPU使用率告警
aws cloudwatch put-metric-alarm \
--alarm-name cpu-alarm \
--alarm-description "Alert when CPU exceeds 80%" \
--metric-name CPUUtilization \
--namespace AWS/EC2 \
--statistic Average \
--period 300 \
--evaluation-periods 2 \
--threshold 80 \
--comparison-operator GreaterThanThreshold \
--dimensions Name=InstanceId,Value=i-1234567890abcdef0
# 创建复合告警
aws cloudwatch put-metric-alarm \
--alarm-name composite-alarm \
--alarm-description "Alert on high CPU or memory" \
--alarm-actions arn:aws:sns:us-east-1:123456789012:my-topic \
--ok-actions arn:aws:sns:us-east-1:123456789012:my-topic \
--metrics '[{"Id":"m1","MetricStat":{"Metric":{"Namespace":"AWS/EC2","MetricName":"CPUUtilization","Dimensions":[{"Name":"InstanceId","Value":"i-1234567890abcdef0"}]},"Period":300,"Stat":"Average"}},{"Id":"m2","MetricStat":{"Metric":{"Namespace":"System/Linux","MetricName":"MemoryUtilization","Dimensions":[{"Name":"InstanceId","Value":"i-1234567890abcdef0"}]},"Period":300,"Stat":"Average"}},{"Id":"e1","Expression":"m1 > 80"},{"Id":"e2","Expression":"m2 > 80"},{"Id":"c1","Expression":"e1 OR e2","Label":"High CPU or Memory"}]' \
--evaluation-periods 1 \
--threshold 1 \
--comparison-operator GreaterThanOrEqualToThreshold
# 查看告警状态
aws cloudwatch describe-alarms --alarm-names cpu-alarm
# 禁用告警
aws cloudwatch disable-alarm-actions --alarm-name cpu-alarm
# 启用告警
aws cloudwatch enable-alarm-actions --alarm-name cpu-alarm
# 删除告警
aws cloudwatch delete-alarms --alarm-names cpu-alarm
2.3 CloudWatch日志
# 创建日志组
aws logs create-log-group \
--log-group-name /aws/ec2/my-instance
# 创建日志流
aws logs create-log-stream \
--log-group-name /aws/ec2/my-instance \
--log-stream-name my-log-stream
# 上传日志事件
aws logs put-log-events \
--log-group-name /aws/ec2/my-instance \
--log-stream-name my-log-stream \
--log-events timestamp=$(date +%s)000,message="Application started"
# 批量上传日志事件
aws logs put-log-events \
--log-group-name /aws/ec2/my-instance \
--log-stream-name my-log-stream \
--log-events file://events.json
# 获取日志事件
aws logs get-log-events \
--log-group-name /aws/ec2/my-instance \
--log-stream-name my-log-stream
# 创建日志订阅过滤器
aws logs put-subscription-filter \
--log-group-name /aws/ec2/my-instance \
--filter-name error-filter \
--filter-pattern "[timestamp, request_id, level, message]" \
--destination-arn arn:aws:lambda:us-east-1:123456789012:function:my-function
# 创建指标过滤器
aws logs put-metric-filter \
--log-group-name /aws/ec2/my-instance \
--filter-name error-metric \
--filter-pattern "[timestamp, request_id, level, message]" \
--metric-transformations metricName=ErrorCount,metricNamespace=MyApplication,metricValue=1
# 查询日志
aws logs start-query \
--log-group-name /aws/ec2/my-instance \
--start-time $(date -d '1 hour ago' +%s) \
--end-time $(date +%s) \
--query-string 'fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20'
# 导出日志到S3
# Export logs to S3.
# NB: create-export-task expects --from/--to as epoch *milliseconds*,
# and --destination is the bare S3 bucket name (no s3:// prefix).
aws logs create-export-task \
--task-name my-export-task \
--log-group-name /aws/ec2/my-instance \
--from $(($(date -d '1 day ago' +%s) * 1000)) \
--to $(($(date +%s) * 1000)) \
--destination my-log-bucket
3. Azure监控服务
3.1 Azure Monitor
# 创建诊断设置
az monitor diagnostic-settings create \
--name my-diagnostic-settings \
--resource /subscriptions/{subscription-id}/resourceGroups/myResourceGroup/providers/Microsoft.Compute/virtualMachines/myVM \
--logs '[{"category": "VMInstanceAgent","enabled": true}]' \
--metrics '[{"category": "AllMetrics","enabled": true}]' \
--storage-account mystorageaccount
# 查询指标
az monitor metrics list \
--resource /subscriptions/{subscription-id}/resourceGroups/myResourceGroup/providers/Microsoft.Compute/virtualMachines/myVM \
--metric "Percentage CPU"
# 创建指标告警
az monitor metrics alert create \
--name cpu-alert \
--resource /subscriptions/{subscription-id}/resourceGroups/myResourceGroup/providers/Microsoft.Compute/virtualMachines/myVM \
--scopes /subscriptions/{subscription-id}/resourceGroups/myResourceGroup/providers/Microsoft.Compute/virtualMachines/myVM \
--condition "avg Percentage CPU > 80" \
--window-size 5m \
--evaluation-frequency 1m \
--description "Alert when CPU exceeds 80%"
# 查看告警规则
az monitor metrics alert list
3.2 Azure Log Analytics
# 创建Log Analytics工作区
WORKSPACE_ID=$(az monitor log-analytics workspace create \
--resource-group myResourceGroup \
--workspace-name my-workspace \
--location eastus \
--query customerId \
-o tsv)
echo "Workspace ID: $WORKSPACE_ID"
# 获取工作区密钥
WORKSPACE_KEY=$(az monitor log-analytics workspace get-shared-keys \
--resource-group myResourceGroup \
--workspace-name my-workspace \
--query primarySharedKey \
-o tsv)
echo "Workspace Key: $WORKSPACE_KEY"
# 查询日志
az monitor log-analytics query \
--workspace $WORKSPACE_ID \
--analytics-query 'Syslog | where TimeGenerated > ago(1h) | project TimeGenerated, Computer, SeverityLevel, SyslogMessage | order by TimeGenerated desc | take 20'
# 创建保存的查询
az monitor log-analytics saved-query create \
--resource-group myResourceGroup \
--workspace-name my-workspace \
--name high-cpu-query \
--category "Performance" \
--display-name "High CPU Query" \
--query 'Perf | where CounterName == "% Processor Time" and CounterValue > 80 | project TimeGenerated, Computer, CounterValue | order by TimeGenerated desc'
4. GCP监控服务
4.1 Cloud Monitoring
# 创建自定义指标
gcloud monitoring projects metric-descriptors create \
custom.googleapis.com/myapp/request_count \
--display-name="Request Count" \
--metric-kind=GAUGE \
--value-type=INT64 \
--unit="1"
# 写入时间序列数据
cat > metric-data.json << 'EOF'
{
"timeSeries": [
{
"metric": {
"type": "custom.googleapis.com/myapp/request_count",
"labels": {
"instance_id": "my-instance"
}
},
"resource": {
"type": "gce_instance",
"labels": {
"instance_id": "my-instance",
"zone": "us-central1-a"
}
},
"points": [
{
"interval": {
"endTime": "2024-01-01T00:00:00Z"
},
"value": {
"int64Value": 100
}
}
]
}
]
}
EOF
gcloud monitoring timeseries write \
--project my-project \
metric-data.json
# 查询指标
gcloud monitoring time-series-list \
--filter='metric.type="custom.googleapis.com/myapp/request_count"' \
--aligner=align_mean \
--aggregation-alignment-period=300s
# 创建告警策略
# Write the alert policy definition.
# NB: this must be strictly valid JSON — string values use escaped double
# quotes; single quotes (as in the original "filter" line) are not valid JSON
# and would make `gcloud ... policies create` fail to parse the file.
cat > alert-policy.json << 'EOF'
{
  "displayName": "High CPU Alert",
  "conditions": [
    {
      "displayName": "CPU Condition",
      "conditionThreshold": {
        "filter": "resource.type=\"gce_instance\" AND metric.type=\"compute.googleapis.com/instance/cpu/utilization\"",
        "aggregations": [
          {
            "alignmentPeriod": "300s",
            "perSeriesAligner": "ALIGN_MEAN"
          }
        ],
        "comparison": "COMPARISON_GT",
        "thresholdValue": 0.8,
        "duration": "300s"
      }
    }
  ],
  "enabled": true,
  "combiner": "OR"
}
EOF
gcloud alpha monitoring policies create \
--policy-from-file=alert-policy.json
5. 监控可视化
5.1 CloudWatch仪表盘
# 创建仪表盘
DASHBOARD_NAME=$(aws cloudwatch put-dashboard \
--dashboard-name my-dashboard \
--dashboard-body file://dashboard.json \
--query 'DashboardArn' \
--output text)
echo "Dashboard ARN: $DASHBOARD_NAME"
# dashboard.json
cat > dashboard.json << 'EOF'
{
"widgets": [
{
"type": "metric",
"x": 0,
"y": 0,
"width": 12,
"height": 6,
"properties": {
"metrics": [
["AWS/EC2", "CPUUtilization", "InstanceId", "i-1234567890abcdef0"]
],
"period": 300,
"stat": "Average",
"region": "us-east-1",
"title": "EC2 CPU Utilization"
}
},
{
"type": "log",
"x": 0,
"y": 6,
"width": 24,
"height": 6,
"properties": {
"logGroupName": "/aws/ec2/my-instance",
"region": "us-east-1",
"title": "Application Logs",
"view": "table"
}
}
]
}
EOF
# 获取仪表盘
aws cloudwatch get-dashboard --dashboard-name my-dashboard
# 删除仪表盘
aws cloudwatch delete-dashboards --dashboard-names my-dashboard
5.2 Grafana集成
# 安装Grafana
sudo apt-get install -y software-properties-common
sudo add-apt-repository "deb https://packages.grafana.com/oss/deb stable main"
wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add -
sudo apt-get update
sudo apt-get install -y grafana
# 启动Grafana
sudo systemctl start grafana-server
sudo systemctl enable grafana-server
# 安装CloudWatch插件
sudo grafana-cli plugins install grafana-cloudwatch-app
# 重启Grafana
sudo systemctl restart grafana-server
# 配置CloudWatch数据源
# 在Grafana UI中添加CloudWatch数据源
# 使用IAM凭证或角色进行认证
6. 性能分析
6.1 应用性能监控
# Install the X-Ray daemon.
# NB: the daemon is distributed by AWS as a standalone binary / .deb package,
# not as a pip package — `pip install aws-xray-daemon` does not exist.
wget https://s3.us-east-2.amazonaws.com/aws-xray-assets.us-east-2/xray-daemon/aws-xray-daemon-3.x.deb
sudo dpkg -i aws-xray-daemon-3.x.deb
# 配置X-Ray
cat > /etc/xray/cfg.yaml << 'EOF'
Version: 2
Role: arn:aws:iam::123456789012:role/XRayDaemonRole
SamplingRules:
- RuleName: MyApplication
Priority: 9000
ReservoirSize: 1
FixedRate: 1
URLPath: "*"
EOF
# 启动X-Ray守护进程
sudo systemctl start xray
sudo systemctl enable xray
# 安装Python X-Ray SDK
pip install aws-xray-sdk
# 在应用中集成X-Ray
cat > app.py << 'EOF'
from aws_xray_sdk.core import xray_recorder
from aws_xray_sdk.ext.flask.middleware import XRayMiddleware
from flask import Flask
app = Flask(__name__)
xray_recorder.configure(service='My Application')
XRayMiddleware(app, xray_recorder)
@app.route('/')
def home():
return 'Hello World!'
if __name__ == '__main__':
app.run()
EOF
6.2 分布式追踪
# Install/run Jaeger.
# NOTE(review): the original curl|bash line pointed at a non-existent install
# script (https://www.jaegertracing.io/jaegertracing.io.sh) and passed the
# version as "--1.48.0" (the version must follow "--" with a space). Piping
# remote scripts into bash is also risky; the supported quick-start is the
# all-in-one Docker image below — verify against jaegertracing.io/download.
docker run -d --name jaeger \
  -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
  -p 5775:5775/udp \
  -p 6831:6831/udp \
  -p 6832:6832/udp \
  -p 5778:5778 \
  -p 16686:16686 \
  -p 14268:14268 \
  -p 14250:14250 \
  -p 9411:9411 \
  jaegertracing/all-in-one:1.48
# 安装OpenTelemetry
pip install opentelemetry-api opentelemetry-sdk opentelemetry-instrumentation-flask
# 在应用中集成OpenTelemetry
cat > app.py << 'EOF'
from flask import Flask
from opentelemetry import trace
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter
from opentelemetry.sdk.trace.export import BatchSpanProcessor
app = Flask(__name__)
trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer(__name__)
FlaskInstrumentor().instrument_app(app)
@app.route('/')
def home():
with tracer.start_as_current_span("home"):
return 'Hello World!'
if __name__ == '__main__':
app.run()
EOF
实用案例分析
案例1:构建全栈监控体系
场景描述
为三层Web应用构建完整的监控体系,包括基础设施、应用和业务监控。
实施步骤
- 基础设施监控
# 安装CloudWatch Agent
wget https://s3.amazonaws.com/amazoncloudwatch-agent/ubuntu/amd64/latest/amazon-cloudwatch-agent.deb
sudo dpkg -i -E ./amazon-cloudwatch-agent.deb
# 配置CloudWatch Agent
cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << 'EOF'
{
"agent": {
"metrics_collection_interval": 60,
"run_as_user": "root"
},
"metrics": {
"namespace": "MyApplication",
"metrics_collected": {
"mem": {
"measurement": ["mem_used_percent"]
},
"disk": {
"measurement": ["disk_used_percent"]
},
"cpu": {
"measurement": ["cpu_usage_active"]
}
}
},
"logs": {
"logs_collected": {
"files": {
"collect_list": [
{
"file_path": "/var/log/myapp/*.log",
"log_group_name": "/aws/ec2/my-instance",
"log_stream_name": "{instance_id}"
}
]
}
}
}
}
EOF
# 启动CloudWatch Agent
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl \
-a fetch-config \
-m ec2 \
-s \
-c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
sudo systemctl start amazon-cloudwatch-agent
sudo systemctl enable amazon-cloudwatch-agent
- 应用监控
# 安装Prometheus
sudo useradd --no-create-home --shell /bin/false prometheus
sudo mkdir -p /etc/prometheus /var/lib/prometheus
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
tar -xvf prometheus-2.45.0.linux-amd64.tar.gz
sudo cp prometheus-2.45.0.linux-amd64/{prometheus,promtool} /usr/local/bin/
sudo chown prometheus:prometheus /usr/local/bin/{prometheus,promtool}
sudo chown -R prometheus:prometheus /etc/prometheus /var/lib/prometheus
# 配置Prometheus
cat > /etc/prometheus/prometheus.yml << 'EOF'
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'myapp'
static_configs:
- targets: ['localhost:9090']
EOF
# 启动Prometheus
# Create the systemd unit for Prometheus.
# NB: `sudo cat > file` does NOT work — the output redirection is performed by
# the unprivileged shell, not by sudo, so writing to /etc/systemd/system/
# fails with permission denied. Use `sudo tee` so the write runs as root.
sudo tee /etc/systemd/system/prometheus.service > /dev/null << 'EOF'
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/bin/prometheus \
--config.file /etc/prometheus/prometheus.yml \
--storage.tsdb.path /var/lib/prometheus/ \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl start prometheus
sudo systemctl enable prometheus
- 业务监控
# 创建自定义业务指标
# Generate a script that continuously publishes simulated business metrics.
cat > business-metrics.py << 'EOF'
"""Publish simulated business metrics (orders, active users) to CloudWatch.

Runs forever; one batched put_metric_data call per minute.
"""
import random
import time

import boto3

cloudwatch = boto3.client('cloudwatch')

while True:
    # Simulated business values
    order_count = random.randint(50, 100)
    active_users = random.randint(1000, 2000)
    # Batch both metrics into a single API call (fewer requests, lower cost
    # than one put_metric_data call per metric as before)
    cloudwatch.put_metric_data(
        Namespace='MyBusiness',
        MetricData=[
            {'MetricName': 'OrderCount', 'Value': order_count, 'Unit': 'Count'},
            {'MetricName': 'ActiveUsers', 'Value': active_users, 'Unit': 'Count'},
        ]
    )
    time.sleep(60)
EOF
# 启动业务指标收集
nohup python3 business-metrics.py > /dev/null 2>&1 &
案例2:实施智能告警系统
场景描述
实施智能告警系统,包括多条件告警、告警聚合和智能通知。
实施步骤
- 配置多条件告警
# 创建复合告警
aws cloudwatch put-metric-alarm \
--alarm-name application-health-alarm \
--alarm-description "Alert on application health issues" \
--alarm-actions arn:aws:sns:us-east-1:123456789012:alerts-topic \
--ok-actions arn:aws:sns:us-east-1:123456789012:alerts-topic \
--insufficient-data-actions arn:aws:sns:us-east-1:123456789012:alerts-topic \
--metrics '[{"Id":"m1","MetricStat":{"Metric":{"Namespace":"AWS/ApplicationELB","MetricName":"TargetResponseTime","Dimensions":[{"Name":"LoadBalancer","Value":"my-load-balancer"}]},"Period":300,"Stat":"Average"}},{"Id":"m2","MetricStat":{"Metric":{"Namespace":"AWS/ApplicationELB","MetricName":"HTTPCode_Target_5XX_Count","Dimensions":[{"Name":"LoadBalancer","Value":"my-load-balancer"}]},"Period":300,"Stat":"Sum"}},{"Id":"m3","MetricStat":{"Metric":{"Namespace":"AWS/ApplicationELB","MetricName":"UnHealthyHostCount","Dimensions":[{"Name":"LoadBalancer","Value":"my-load-balancer"}]},"Period":300,"Stat":"Average"}},{"Id":"e1","Expression":"m1 > 5"},{"Id":"e2","Expression":"m2 > 10"},{"Id":"e3","Expression":"m3 > 0"},{"Id":"c1","Expression":"e1 OR e2 OR e3","Label":"Application Health Issue"}]' \
--evaluation-periods 1 \
--threshold 1 \
--comparison-operator GreaterThanOrEqualToThreshold
- 配置告警聚合
# 创建SNS主题
TOPIC_ARN=$(aws sns create-topic \
--name alerts-topic \
--query 'TopicArn' \
--output text)
echo "Topic ARN: $TOPIC_ARN"
# 订阅多个通知渠道
aws sns subscribe \
--topic-arn $TOPIC_ARN \
--protocol email \
--notification-endpoint ops@example.com
aws sns subscribe \
--topic-arn $TOPIC_ARN \
--protocol sms \
--notification-endpoint +1234567890
aws sns subscribe \
--topic-arn $TOPIC_ARN \
--protocol https \
--notification-endpoint https://my-slack-webhook.com
# 创建Lambda函数处理告警
cat > lambda_function.py << 'EOF'
import json
import boto3
import requests
def lambda_handler(event, context):
message = event['Records'][0]['Sns']['Message']
# 发送到Slack
slack_webhook_url = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
slack_message = {
"text": f"🚨 CloudWatch Alert: {message}"
}
requests.post(slack_webhook_url, json=slack_message)
# 发送到PagerDuty
pagerduty_api_key = "YOUR_PAGERDUTY_API_KEY"
pagerduty_url = "https://events.pagerduty.com/v2/enqueue"
pagerduty_event = {
"routing_key": pagerduty_api_key,
"event_action": "trigger",
"payload": {
"summary": "CloudWatch Alert",
"severity": "critical",
"source": "cloudwatch",
"custom_details": {
"message": message
}
}
}
requests.post(pagerduty_url, json=pagerduty_event)
return {
'statusCode': 200,
'body': json.dumps('Alert processed successfully')
}
EOF
# 部署Lambda函数
aws lambda create-function \
--function-name alert-processor \
--runtime python3.9 \
--role arn:aws:iam::123456789012:role/lambda-role \
--handler lambda_function.lambda_handler \
--zip-file fileb://function.zip \
--environment Variables={SLACK_WEBHOOK_URL="https://hooks.slack.com/services/YOUR/WEBHOOK/URL",PAGERDUTY_API_KEY="YOUR_PAGERDUTY_API_KEY"}
- 配置智能告警
# 创建异常检测告警
aws cloudwatch put-anomaly-detector \
--metric-name RequestCount \
--namespace MyApplication \
--stat Sum
# 创建异常检测告警
aws cloudwatch put-metric-alarm \
--alarm-name anomaly-alarm \
--alarm-description "Alert on anomaly detection" \
--metric-name RequestCount \
--namespace MyApplication \
--stat Sum \
--evaluation-periods 3 \
--datapoints-to-alarm 2 \
--threshold-metric-id anomaly_detection_metric \
--comparison-operator LessThanLowerOrGreaterThanUpperThreshold
# 创建预测告警
aws cloudwatch put-metric-alarm \
--alarm-name prediction-alarm \
--alarm-description "Alert based on prediction" \
--metrics '[{"Id":"m1","MetricStat":{"Metric":{"Namespace":"MyApplication","MetricName":"RequestCount"},"Period":300,"Stat":"Sum"}},{"Id":"p1","Label":"Prediction","Expression":"FILL(m1, 300)"},{"Id":"a1","Expression":"ANOMALY_DETECTION_BAND(m1, 3)"}]' \
--evaluation-periods 1 \
--threshold 1 \
--comparison-operator GreaterThanOrEqualToThreshold
课后练习
基础练习
- 创建CloudWatch指标和告警
- 配置日志收集和查询
- 创建监控仪表盘
进阶练习
- 配置应用性能监控
- 实施分布式追踪
- 配置智能告警
挑战练习
- 构建全栈监控体系
- 实施智能告警系统
- 集成多种监控工具
思考问题
- 如何设计有效的监控策略?
- 如何减少告警疲劳?
- 如何优化监控性能?
总结
本集详细介绍了Linux系统中云监控与告警的实现方法,包括指标监控、日志管理、性能分析、告警配置以及可视化仪表盘等内容。通过本集的学习,您应该能够:
- 理解云监控的重要性和架构
- 掌握指标监控和日志管理
- 熟悉性能分析和故障排查
- 学习告警配置和通知机制
- 能够构建可视化监控仪表盘
云监控是云基础设施运维的重要组成部分,它提供了实时监控、故障预警和性能优化的能力。在实际项目中,应根据应用特点和业务需求建立完善的监控体系,并持续优化监控策略和告警规则,以确保系统的稳定性和可靠性。