香港VPS全面监控方案:Prometheus + Grafana + Alertmanager 从零搭建运维大盘
服务器没有监控,等于在黑暗中飞行。CPU 爆了、磁盘满了、网站挂了——没有监控系统,往往是用户报障才知道。本文在香港 VPS 上从零搭建 Prometheus + Grafana + Alertmanager 完整监控体系,覆盖系统指标、应用指标和智能告警,实现故障发生前主动感知。
一、架构总览
┌────────────────────────────────────────────────┐
│              监控目标(被监控端)              │
│  Node Exporter  Nginx Exporter  MySQL Exporter │
│        ↑              ↑               ↑        │
└────────┼──────────────┼───────────────┼────────┘
         │ scrape (拉取)│               │
┌────────▼──────────────▼───────────────▼────────┐
│          Prometheus(数据采集与存储)          │
│          时序数据库 + PromQL 查询引擎          │
└──────────────┬────────────────────┬────────────┘
               │                    │
      ┌────────▼───────┐  ┌─────────▼─────────┐
      │    Grafana     │  │   Alertmanager    │
      │   可视化大盘   │  │  告警路由 + 通知  │
      └────────────────┘  └─────────┬─────────┘
                                    │
                        ┌───────────┼───────────┐
                        ▼           ▼           ▼
                       邮件      企业微信    Telegram

二、使用 Docker Compose 一键部署监控栈
mkdir -p /opt/monitoring/{prometheus,grafana,alertmanager}
cd /opt/monitoring

创建 docker-compose.yml:
# Compose file for the monitoring stack: Prometheus, Grafana, Alertmanager and
# Node Exporter. The three web UIs are published on 127.0.0.1 only and are
# meant to be reached through a reverse proxy on the host.
version: '3.8'

networks:
  monitoring:
    driver: bridge

volumes:
  prometheus_data: {}
  grafana_data: {}

services:
  # -- Metrics collection and storage --
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'  # keep 30 days of data
      - '--web.enable-lifecycle'
    ports:
      - '127.0.0.1:9090:9090'  # bind to localhost only; expose via Nginx
    extra_hosts:
      # node-exporter runs with host networking (see below), so it is not
      # attached to the "monitoring" network and its service name is not
      # resolvable through Docker DNS. Map the name to the Docker host so a
      # scrape target of node-exporter:9100 resolves from this container.
      # "host-gateway" needs Docker Engine 20.10+; the host firewall must
      # allow traffic from the Docker bridge to port 9100.
      - 'node-exporter:host-gateway'
    networks:
      - monitoring

  # -- Dashboards --
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # The password and SMTP credentials below are placeholders: replace them
      # before deploying and keep real values out of version control.
      - GF_SECURITY_ADMIN_PASSWORD=StrongPassword123
      - GF_SERVER_ROOT_URL=https://monitor.yourdomain.com
      - GF_SMTP_ENABLED=true
      - GF_SMTP_HOST=smtp.gmail.com:587
      - GF_SMTP_USER=your@gmail.com
      - GF_SMTP_PASSWORD=app_password
    ports:
      - '127.0.0.1:3000:3000'
    networks:
      - monitoring

  # -- Alert routing --
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - '127.0.0.1:9093:9093'
    networks:
      - monitoring

  # -- Host metrics collector --
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    network_mode: host  # host network namespace, so host interfaces are seen
    pid: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the filesystem collector at the host root mounted above;
      # without this flag the /rootfs mount is unused and filesystem metrics
      # describe the container's own view.
      - '--path.rootfs=/rootfs'
      # "$$" is Compose's escape for a literal "$" in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

三、Prometheus 配置文件
# /opt/monitoring/prometheus/prometheus.yml
# Prometheus server configuration: global timing, the Alertmanager endpoint,
# rule files, and the scrape targets.
global:
  scrape_interval: 15s      # pull metrics every 15 seconds
  evaluation_interval: 15s  # evaluate alerting rules every 15 seconds

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Relative paths are resolved against the directory of this config file.
rule_files:
  - 'rules/*.yml'

scrape_configs:
  # Prometheus scraping itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host metrics (Node Exporter)
  # NOTE(review): in docker-compose.yml node-exporter uses host networking,
  # so the name "node-exporter" must resolve to the Docker host from inside
  # the Prometheus container for this target to be reachable.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          instance: 'hk-vps-01'
          region: 'hongkong'

  # Nginx metrics (requires nginx-prometheus-exporter to be installed)
  # NOTE(review): Prometheus runs in a container, so "localhost" here is the
  # Prometheus container itself, not the VPS. Point this at an address where
  # the exporter is actually reachable from that container.
  - job_name: 'nginx'
    static_configs:
      - targets: ['localhost:9113']

  # MySQL metrics
  # NOTE(review): same "localhost" caveat as the nginx job above.
  - job_name: 'mysql'
    static_configs:
      - targets: ['localhost:9104']

四、告警规则配置
# /opt/monitoring/prometheus/rules/vps-alerts.yml
# Alerting rules for a single VPS: CPU, memory, root filesystem, target
# availability and inbound traffic. The "severity" label is what the
# Alertmanager routes match on.
# Templates are single-quoted so the inner "%.1f" needs no escaping.
groups:
  - name: vps_critical
    interval: 30s
    rules:
      # Sustained high CPU load: non-idle share averaged per instance
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '⚠️ CPU 使用率过高: {{ $labels.instance }}'
          description: 'CPU 使用率已超过 85%,持续 5 分钟。当前值: {{ $value | printf "%.1f" }}%'

      # Low available memory
      - alert: LowMemoryAvailable
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: '🔴 内存严重不足: {{ $labels.instance }}'
          description: '可用内存低于 10%,当前可用: {{ $value | printf "%.1f" }}%'

      # Low free space on the root filesystem
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '💾 磁盘空间不足: {{ $labels.instance }}'
          description: '根分区可用空间低于 15%,剩余: {{ $value | printf "%.1f" }}%'

      # Scrape target not responding
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '🚨 服务器无响应: {{ $labels.instance }}'
          description: '目标 {{ $labels.instance }} 已超过 2 分钟无法采集指标'

      # Abnormal inbound traffic (possible attack); threshold is bytes/second.
      # The interface name is hard-coded to eth0: adjust it if the host's
      # primary interface has a different name.
      - alert: HighNetworkTraffic
        expr: rate(node_network_receive_bytes_total{device="eth0"}[5m]) > 100000000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: '🌐 入站流量异常: {{ $labels.instance }}'
          description: '入站流量超过 100MB/s,可能遭受攻击。当前: {{ $value | humanize }}B/s'

五、Alertmanager 告警通知配置
# /opt/monitoring/alertmanager/alertmanager.yml
# Alertmanager configuration: SMTP defaults, a routing tree keyed on the
# "severity" label, and the receivers those routes point to.
global:
  # Placeholder SMTP credentials: replace before deploying and keep the real
  # password out of version control.
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alert@yourdomain.com'
  smtp_auth_username: 'alert@yourdomain.com'
  smtp_auth_password: 'app_password'

route:
  group_by: ['alertname', 'instance']
  group_wait: 30s      # wait before the first notification for a new group
  group_interval: 5m   # wait before notifying about changes to a group
  repeat_interval: 4h  # re-send interval while alerts stay unresolved
  receiver: 'default'
  routes:
    # Critical alerts: notify the WeCom receiver without the grouping delay.
    # "matchers" replaces the deprecated "match" key.
    - matchers:
        - severity="critical"
      receiver: 'wechat-critical'
      group_wait: 0s
    # Warnings: email only
    - matchers:
        - severity="warning"
      receiver: 'email-warning'

receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@yourdomain.com'
        # email_configs has no "subject"/"body" fields: the subject is set
        # through "headers" and the plain-text body through "text". The HTML
        # part keeps Alertmanager's default template.
        headers:
          Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'email-warning'
    email_configs:
      - to: 'ops@yourdomain.com'
        require_tls: true

  - name: 'wechat-critical'
    webhook_configs:
      # NOTE(review): webhook_configs POSTs Alertmanager's own JSON payload.
      # The WeCom group-bot endpoint presumably expects its own message
      # format ("msgtype" plus content), so a small adapter between the two
      # is likely required. Confirm before relying on this for critical
      # alerts. YOUR_WECHAT_BOT_KEY is a placeholder.
      - url: 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=YOUR_WECHAT_BOT_KEY'
        send_resolved: true

六、Grafana 预置 Dashboard
启动后导入以下官方 Dashboard ID(Grafana → Import → 填入 ID):
| Dashboard | ID | 覆盖指标 |
|---|---|---|
| Node Exporter Full | 1860 | CPU/内存/磁盘/网络全面系统指标 |
| Nginx 监控 | 12708 | 请求数、连接数(基于 nginx-prometheus-exporter / stub_status) |
| MySQL Overview | 7362 | 查询 QPS、慢查询、连接池 |
| Alertmanager | 9578 | 告警状态总览 |
七、Nginx 对外暴露 Grafana(HTTPS)
# Reverse proxy that publishes Grafana (listening on 127.0.0.1:3000) over HTTPS.
server {
    listen 443 ssl;
    server_name monitor.yourdomain.com;

    ssl_certificate     /etc/letsencrypt/live/monitor.yourdomain.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/monitor.yourdomain.com/privkey.pem;

    # Restrict access by source address: only the listed IPs may reach the
    # dashboards. The two values below are placeholders to be replaced.
    allow 你的办公室IP;
    allow 你的家庭IP;
    deny all;

    location / {
        proxy_pass http://127.0.0.1:3000;
        proxy_set_header Host $host;
        # Pass the original client address and scheme to the upstream.
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # Grafana Live uses WebSockets, which need HTTP/1.1 and the upgrade
    # headers to pass through the proxy.
    location /api/live/ {
        proxy_pass http://127.0.0.1:3000;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
    }
}

cd /opt/monitoring
docker compose up -d
docker compose ps  # confirm that all services are running
八、总结
完整的 Prometheus + Grafana + Alertmanager 监控体系部署完成后,你的香港 VPS 将具备:15 秒级指标采集、可视化大盘、分级智能告警(企业微信即时推送严重告警 + 邮件通知警告)。建议在所有生产服务器上标配此监控栈,它是服务稳定运营的基础保障。