Create the directories

## Prometheus
mkdir -pv /data/prometheus/{conf,data}
mkdir -pv /data/prometheus/conf/{rules,targets}
mkdir -pv /data/prometheus/conf/targets/{servers,nodes,blackbox}

## Alertmanager
mkdir -pv /data/alertmanager/{conf,data,tmpl}

## Grafana
mkdir -pv /data/grafana/{conf,data,logs,plugins}

## docker-compose
mkdir -pv /data/docker-compose/{prometheus,alertmanager,grafana}
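
To confirm the layout before moving on, the directories that were just created can be listed (a quick sanity check; adjust the depth if you nest things differently):

find /data -maxdepth 3 -type d | sort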

Configuration files

Prometheus configuration files

Main configuration file

  • File-based service discovery for the scrape targets

prometheus.yml

cat > /data/prometheus/conf/prometheus.yml << 'EOF'
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
alertmanagers:
- file_sd_configs:
- files:
- targets/servers/alertmanagers.yaml

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rules/*.yaml"

# File-based service discovery
scrape_configs:
- job_name: 'prometheus'
file_sd_configs:
- files:
- targets/servers/prometheus.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'alertmanagers'
file_sd_configs:
- files:
- targets/servers/alertmanagers.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'grafana'
file_sd_configs:
- files:
- targets/servers/grafana.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'nodes'
file_sd_configs:
- files:
- targets/nodes/node-exporter.yaml
refresh_interval: 2m
# relabel configs
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'mysql'
file_sd_configs:
- files:
- targets/servers/mysql-exporter.yaml
refresh_interval: 2m

- job_name: 'redis-cluster'
file_sd_configs:
- files:
- targets/servers/redis-exporter.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'es-cluster'
file_sd_configs:
- files:
- targets/servers/es-exporter.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'kafka-cluster'
file_sd_configs:
- files:
- targets/servers/kafka-exporter.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'nacos-cluster'
metrics_path: '/nacos/actuator/prometheus'
file_sd_configs:
- files:
- targets/servers/nacos-exporter.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

- job_name: 'docker-engines'
file_sd_configs:
- files:
- targets/nodes/docker-nodes.yaml
refresh_interval: 2m
relabel_configs:
- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'instance'
replacement: $1
action: replace

# Website (HTTP) checks
- job_name: 'http_status'
metrics_path: /probe
params:
module: [http_2xx]
file_sd_configs:
- files:
- targets/blackbox/http-status.yaml
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 3.1.101.40:9115

# Ping (ICMP) checks
- job_name: 'ping_status'
metrics_path: /probe
params:
module: [icmp]
file_sd_configs:
- files:
- targets/blackbox/ping-status.yaml
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 3.1.101.40:9115

# Port (TCP) checks
- job_name: 'port_status'
metrics_path: /probe
params:
module: [tcp_connect]
file_sd_configs:
- files:
- targets/blackbox/port-status.yaml
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 3.1.101.40:9115
EOF
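
Once the Prometheus container from the deployment section below is running, file-based discovery can be verified through the HTTP API; the address assumes the Prometheus host used throughout this article:

curl -s http://3.1.101.39:9090/api/v1/targets | grep -o '"health":"[a-z]*"' | sort | uniq -c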

Relabeling example:

# relabel configs
relabel_configs:
- source_labels:
- __scheme__
- __address__
- __metrics_path__
regex: "(http|https)(.*)"
separator: ""
target_label: "endpoint"
replacement: "${1}://${2}"
action: replace

- source_labels: [ '__address__' ]
regex: "(.*):(.*)"
target_label: 'ipaddr'
replacement: $1
action: replace

- regex: "(job|app)"
replacement: ${1}_name
action: labelmap

# metric relabel configs
metric_relabel_configs:
- source_labels:
- __name__
regex: "go_.*"
action: drop
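
For reference, a rough walk-through of what the first rule above produces for one of the targets in this article (label values assumed for illustration):

# __scheme__=http, __address__=3.1.101.39:9090, __metrics_path__=/metrics
# joined with separator ""  ->  http3.1.101.39:9090/metrics
# regex (http|https)(.*)    ->  $1=http, $2=3.1.101.39:9090/metrics
# replacement ${1}://${2}   ->  endpoint="http://3.1.101.39:9090/metrics"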

Target configuration files

targets/servers/prometheus.yaml

cat > /data/prometheus/conf/targets/servers/prometheus.yaml << 'EOF'
- targets:
- 3.1.101.39:9090
labels:
job: prometheus
server: prometheus
env: dev
EOF

targets/servers/alertmanagers.yaml

cat > /data/prometheus/conf/targets/servers/alertmanagers.yaml << 'EOF'
- targets:
- 3.1.101.39:9093
labels:
job: alertmanager
server: alertmanager
env: dev
EOF

targets/servers/grafana.yaml

cat > /data/prometheus/conf/targets/servers/grafana.yaml << 'EOF'
- targets:
- 3.1.101.40:3000
labels:
job: grafana
server: grafana
env: dev
EOF

targets/nodes/node-exporter.yaml

cat > /data/prometheus/conf/targets/nodes/node-exporter.yaml << 'EOF'
- targets:
- 3.1.101.33:9100
- 3.1.101.34:9100
- 3.1.101.35:9100
labels:
job: node-exporter
server: middleware
env: dev
- targets:
- 3.1.101.36:9100
- 3.1.101.38:9100
- 3.1.101.39:9100
- 3.1.101.40:9100
labels:
job: node-exporter
server: devops
env: dev
- targets:
- 3.1.101.41:9100
- 3.1.101.42:9100
labels:
job: node-exporter
server: weblogic
env: dev
- targets:
- 6.1.14.86:9100
- 6.1.14.87:9100
labels:
job: node-exporter
server: weblogic
env: old-env
EOF

targets/servers/mysql-exporter.yaml

cat > /data/prometheus/conf/targets/servers/mysql-exporter.yaml << 'EOF'
- targets:
- 3.1.101.36:9104
labels:
instance: 6.1.14.87
job: mysql-exporter
server: mysql
env: dev
EOF

targets/servers/redis-exporter.yaml

cat > /data/prometheus/conf/targets/servers/redis-exporter.yaml << 'EOF'
- targets:
- 3.1.101.33:9121
- 3.1.101.34:9121
- 3.1.101.35:9121
labels:
job: redis-exporter
server: redis
env: dev
cluster: redis
EOF

targets/servers/es-exporter.yaml

cat > /data/prometheus/conf/targets/servers/es-exporter.yaml << 'EOF'
- targets:
- 3.1.101.33:9115
- 3.1.101.34:9115
- 3.1.101.35:9115
labels:
job: es-exporter
server: elasticsearch
env: dev
cluster: elk
EOF

targets/servers/kafka-exporter.yaml

cat > /data/prometheus/conf/targets/servers/kafka-exporter.yaml << 'EOF'
- targets:
- 3.1.101.33:9308
- 3.1.101.34:9308
- 3.1.101.35:9308
labels:
job: kafka-exporter
server: kafka
env: dev
cluster: kafka
EOF

targets/servers/nacos-exporter.yaml

cat > /data/prometheus/conf/targets/servers/nacos-exporter.yaml << 'EOF'
- targets:
- 3.1.101.33:8848
- 3.1.101.34:8848
- 3.1.101.35:8848
labels:
job: nacos-exporter
server: nacos
env: dev
cluster: nacos
EOF

targets/nodes/docker-nodes.yaml

cat > /data/prometheus/conf/targets/nodes/docker-nodes.yaml << 'EOF'
- targets:
- 3.1.101.33:9180
- 3.1.101.34:9180
- 3.1.101.35:9180
- 3.1.101.36:9180
- 3.1.101.38:9180
- 3.1.101.39:9180
- 3.1.101.40:9180
- 3.1.101.41:9180
- 3.1.101.42:9180
labels:
job: cAdvisor
server: docker
env: dev
EOF

targets/blackbox/http-status.yaml

cat > /data/prometheus/conf/targets/blackbox/http-status.yaml << 'EOF'
- targets:
- http://3.1.101.39:9090/
labels:
job: http-status
server: prometheus
env: dev
- targets:
- http://3.1.101.40:3000/
labels:
job: http-status
server: grafana
env: dev
- targets:
- http://3.1.101.40:5601/
labels:
job: http-status
server: kibana
env: dev
- targets:
- http://3.1.101.36:8088/
labels:
job: http-status
server: jumpserver
env: dev
- targets:
- http://3.1.101.35:8095/
labels:
job: http-status
server: jira
env: dev
- targets:
- http://3.1.101.45:8848/nacos/
labels:
job: http-status
server: nacos
env: dev
- targets:
- http://3.1.101.33:8858/
- http://3.1.101.34:8858/
labels:
job: http-status
server: sentinel
env: dev
- targets:
- http://3.1.101.41:7001/console
- http://3.1.101.42:7001/console
labels:
job: http-status
server: weblogic
env: dev
- targets:
- http://6.1.14.86:7001/console
- http://6.1.14.87:7001/console
labels:
job: http-status
server: weblogic
env: old-env
EOF

targets/blackbox/ping-status.yaml

cat > /data/prometheus/conf/targets/blackbox/ping-status.yaml << 'EOF'
- targets:
- 3.1.101.33
- 3.1.101.34
- 3.1.101.35
labels:
job: ping-status
server: middleware
env: dev
- targets:
- 3.1.101.36
- 3.1.101.38
- 3.1.101.39
- 3.1.101.40
labels:
job: ping-status
server: devops
env: dev
- targets:
- 3.1.101.41
- 3.1.101.42
labels:
job: ping-status
server: weblogic
env: dev
- targets:
- 6.1.14.86
- 6.1.14.87
labels:
job: ping-status
server: weblogic
env: old-env
EOF

targets/blackbox/port-status.yaml

cat > /data/prometheus/conf/targets/blackbox/port-status.yaml << 'EOF'
# Monitoring platform
- targets:
- 3.1.101.39:9090
labels:
job: port-status
server: prometheus
env: dev
- targets:
- 3.1.101.40:3000
labels:
job: port-status
server: grafana
env: dev
- targets:
- 3.1.101.39:9093
labels:
job: port-status
server: alertmanager
env: dev
# Redis cluster
- targets:
- 3.1.101.33:6379
- 3.1.101.34:6379
- 3.1.101.35:6379
labels:
job: port-status
server: redis-master
env: dev
- targets:
- 3.1.101.33:26379
- 3.1.101.34:26379
- 3.1.101.35:26379
labels:
job: port-status
server: redis-slave
env: dev
# Kafka cluster
- targets:
- 3.1.101.33:9092
- 3.1.101.34:9092
- 3.1.101.35:9092
labels:
job: port-status
server: kafka
env: dev
- targets:
- 3.1.101.33:2181
- 3.1.101.34:2181
- 3.1.101.35:2181
labels:
job: port-status
server: zookeeper
env: dev
# Alibaba microservice middleware
- targets:
- 3.1.101.33:8848
- 3.1.101.34:8848
- 3.1.101.35:8848
labels:
job: port-status
server: nacos
env: dev
- targets:
- 3.1.101.33:8091
- 3.1.101.34:8091
- 3.1.101.35:8091
labels:
job: port-status
server: seata
env: dev
- targets:
- 3.1.101.33:8858
- 3.1.101.34:8858
labels:
job: port-status
server: sentinel
env: dev
# ELK logging platform
- targets:
- 3.1.101.33:9200
- 3.1.101.34:9200
- 3.1.101.35:9200
labels:
job: port-status
server: elasticsearch
env: dev
- targets:
- 3.1.101.33:5044
- 3.1.101.34:5044
labels:
job: port-status
server: logstash
env: dev
- targets:
- 3.1.101.40:5601
labels:
job: port-status
server: kibana
env: dev
# MySQL database
- targets:
- 6.1.14.87:3306
labels:
job: port-status
server: mysql
env: dev

# WebLogic services
- targets:
- 3.1.101.41:7001
- 3.1.101.42:7001
labels:
job: port-status
server: weblogic
env: dev
- targets:
- 6.1.14.86:7001
- 6.1.14.87:7001
labels:
job: port-status
server: weblogic
env: old-env
# DevOps platform
- targets:
- 3.1.101.38:389
labels:
job: port-status
server: openldap
env: dev
- targets:
- 3.1.101.36:8088
labels:
job: port-status
server: jumpserver
env: dev
- targets:
- 3.1.101.38:8098
labels:
job: port-status
server: gitlab
env: dev
- targets:
- 3.1.101.36:8080
labels:
job: port-status
server: jenkins
env: dev
- targets:
- 3.1.101.36:9000
labels:
job: port-status
server: sonarqube
env: dev
- targets:
- 3.1.101.35:8095
labels:
job: port-status
server: jira
env: dev
EOF

Alertmanager configuration files

Main configuration file

alertmanager.yml example

global:
resolve_timeout: 5m
# SMTP settings
smtp_from: 'wangshui898@sina.com'
smtp_smarthost: 'smtp.sina.com:465'
smtp_auth_username: 'wangshui898@sina.com'
smtp_auth_password: 'Authorization code'
smtp_require_tls: false

# Alert templates
templates:
- 'tmpl/*.tmpl'

route:
receiver: default-receiver # alerts matching none of the child routes below stay at the root node and are sent to "default-receiver"
group_wait: 30s # initial wait before sending a notification for a new group, default 30s
group_interval: 2m # wait before sending notifications about alerts added to an existing group, usually 5m or more
repeat_interval: 1h # how long to wait before re-sending a notification that has already been sent
group_by: ['alertname','server'] # grouping rule: alerts that share these labels are merged into one notification to the receiver

routes: # child routes; they inherit all properties of the parent route and can be nested to several levels
- match: # this route matches on alert labels to catch alerts for the listed services
severity: critical
receiver: 'default-receiver' # send to the specified receiver

- match: # this route matches on alert labels to catch alerts for the listed services
server: mysql
receiver: 'dba' # send to the specified receiver

- match_re: # this route does a regular-expression match on alert labels and sends to the matching receiver
service: ccms-.*
receiver: 'dev'

# Alert inhibition
#inhibit_rules:
#- source_match:
# severity: 'critical'
# target_match:
# severity: 'warning'
# equal: ['alertname']

receivers: # define the receivers, i.e. who the alerts are sent to
- name: 'default-receiver'
email_configs:
- to: 'wangshui898@126.com'
html: '{{ template "email.to.html" .}}'
send_resolved: true

- name: 'dev'
email_configs:
- to: 'wangshui898@163.com'
send_resolved: true

- name: 'dba'
email_configs:
- to: 'wangshui898@sina.com'
send_resolved: true

alertname is the value of the alert field in the Prometheus alerting rules (groups[].rules[].alert).
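
The routing tree above can be exercised offline with amtool (shipped in the prom/alertmanager image, or installed locally), assuming the configuration above is saved as /data/alertmanager/conf/alertmanager.yml; the label sets below are only examples:

amtool config routes test --config.file=/data/alertmanager/conf/alertmanager.yml alertname=HostDown severity=critical
amtool config routes test --config.file=/data/alertmanager/conf/alertmanager.yml alertname=MysqlDown server=mysql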

Alert template files

Official template reference: https://raw.githubusercontent.com/prometheus/alertmanager/master/template/default.tmpl

Email alert template

mkdir -pv /data/alertmanager/conf/tmpl
cat > /data/alertmanager/conf/tmpl/e-mail.tmpl << 'EOF'
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
=========告警通知==========<br>
告警类型: {{ .Labels.alertname }} <br>
告警级别: {{ .Labels.severity }} 级<br>
{{- end }}
----------------------------<br>
告警主题: {{ .Annotations.title }} <br>
故障详情: {{ .Annotations.description }} <br>
故障时间: {{ .StartsAt.Local }} <br>
{{ if gt (len .Labels.instance) 0 -}}故障主机: {{ .Labels.instance }} <br>{{- end -}}
=========请勿回复==========<br>
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
==========告警恢复通知========<br>
告警类型: {{ .Labels.alertname }} <br>
告警级别: {{ .Labels.severity }} 级<br>
{{- end }}
----------------------------<br>
告警主题: {{ .Annotations.title }} <br>
触发详情: {{ .Annotations.description }}, 已恢复 <br>
故障时间: {{ .StartsAt.Local }} <br>
恢复时间: {{ .EndsAt.Local }} <br>
{{ if gt (len .Labels.instance) 0 -}}故障主机: {{ .Labels.instance }} <br>{{- end -}}
===========请勿回复===========<br>
{{- end }}
{{- end }}
{{- end }}
EOF

The variables referenced in the template must correspond to the annotations defined in the Prometheus alerting rules, e.g. title and description.

{{- if gt (len .Alerts.Firing) 0 -}}                # check whether the firing-alert list is non-empty; greater than 0 means there are firing alerts
{{- range $index, $alert := .Alerts -}} # iterate over the alert list; $index is the index, $alert is the current alert
{{ if gt (len .Labels.instance) 0 -}}故障主机: {{ .Labels.instance }} <br>{{- end -}} # only print the host line when the instance label exists
.Receiver: name of the receiver
.Status: firing while alerting, resolved once the alert has recovered
.Alerts: list of all alert objects
.Alerts.Firing: list of currently firing alerts
.Alerts.Resolved: list of resolved alerts
.GroupLabels: the labels the alerts are grouped by
.CommonLabels: labels common to all alerts
.CommonAnnotations: annotations common to all alerts
.ExternalURL: URL of the Alertmanager that sent the notification
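
To check that the template renders as expected, a test alert can be pushed into a running Alertmanager through its v2 API; the alert below is made up purely for testing:

curl -XPOST http://3.1.101.39:9093/api/v2/alerts -H 'Content-Type: application/json' -d '[{
  "labels": {"alertname": "TemplateTest", "severity": "warning", "instance": "3.1.101.39:9100", "server": "devops"},
  "annotations": {"title": "Template test", "description": "Manual alert used to verify the e-mail template"}
}]'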

Service deployment

  • Docker-compose

Prometheus

version: "3"
services:
prometheus:
container_name: prometheus
image: prom/prometheus:v2.25.1
user: root
ports:
- 9090:9090
restart: always
volumes:
- /etc/localtime:/etc/localtime
- /data/prometheus/conf:/etc/prometheus
- /data/prometheus/data:/prometheus
environment:
TZ: Asia/Shanghai
command: --config.file=/etc/prometheus/prometheus.yml --web.enable-lifecycle --storage.tsdb.retention=30d
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G
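
A minimal way to bring it up, assuming the file above is saved as /data/docker-compose/prometheus/docker-compose.yml (note that the deploy.resources limits in a version "3" file are only applied by docker-compose when the --compatibility flag is given, or when deploying to Swarm):

cd /data/docker-compose/prometheus
docker-compose --compatibility up -d
docker-compose ps

The Alertmanager and Grafana services below can be started the same way from their respective directories.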

Alertmanager

version: "3"
services:
alertmanager:
container_name: alertmanager
image: prom/alertmanager:v0.23.0
user: root
ports:
- 9093:9093
restart: always
volumes:
- /etc/localtime:/etc/localtime
- /data/alertmanager/conf:/etc/alertmanager
- /data/alertmanager/data:/alertmanager
environment:
TZ: Asia/Shanghai
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G

Grafana

version: "3"
services:
grafana:
container_name: grafana
image: grafana/grafana:7.4.1
user: root
ports:
- 3000:3000
restart: always
environment:
TZ: Asia/Shanghai
GF_PATHS_DATA: /data/grafana/data
GF_PATHS_LOGS: /data/grafana/logs
volumes:
- /etc/localtime:/etc/localtime
- /data/grafana:/data/grafana
- /data/grafana/plugins:/var/lib/grafana/plugins
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G

Exporter deployment

  • docker-compose

node_exporter

version: "3"
services:
node_exporter:
container_name: node_exporter
image: prom/node-exporter:v1.1.2
restart: always
network_mode: host
command:
- '--web.listen-address=:9100'
- '--path.rootfs=/rootfs'
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
# - '--collector.textfile.directory=/node_exporter/prom'
volumes:
- /proc:/host/proc
- /sys:/host/sys
- /:/rootfs

Commonly used Grafana dashboards for Linux hosts: 8919, 1860
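
A quick way to confirm an exporter is actually serving metrics, taking one of the node_exporter hosts from the target files above as an example:

curl -s http://3.1.101.33:9100/metrics | head -n 5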

mysql-exporter

version: "3"
services:
mysql_exporter:
container_name: mysql_exporter
image: prom/mysqld-exporter:v0.13.0
restart: always
ports:
- "9104:9104"
environment:
# On the MySQL server side, create a user with the required query privileges:
# CREATE USER 'exporter'@'%' IDENTIFIED WITH mysql_native_password BY 'Aa@123456';
# GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
# Format: dbuser:dbpasswd@(mysqlip:port)/dbname; in practice, point this at the specific database instance to be monitored
DATA_SOURCE_NAME: "exporter:Aa@123456@(172.16.20.111:3306)/"

Commonly used MySQL dashboard: 7362

oracle-exporter

version: "3"
services:
oracle-exporter:
container_name: oracle_exporter
image: iamseth/oracledb_exporter:latest
hostname: ccms-odb-sit1
restart: always
ports:
- 9161:9161
volumes:
- /etc/localtime:/etc/localtime
environment:
- TZ=Asia/Shanghai
- DATA_SOURCE_NAME=system/System123@3.1.101.43:1521/loandb

Commonly used dashboard: 11121

redis-exporter

version: "3"
services:
redis_exporter:
container_name: redis_exporter
image: bitnami/redis-exporter:1.20.0
restart: always
ports:
- "9121:9121"
command: "-redis.addr 3.1.101.33:6379 -redis-only-metrics -web.listen-address 0.0.0.0:9121 -redis.password g1tredis2o2l"

Commonly used Redis dashboard: 11835

elasticsearch_exporter

version: "3"
services:
es_exporter:
container_name: es_exporter
image: justwatch/elasticsearch_exporter:1.1.0
restart: always
ports:
- "9115:9115"
command: "--es.all --es.indices --es.cluster_settings --es.indices_settings --es.shards --es.snapshots --es.timeout=10s --web.listen-address=0.0.0.0:9115 --web.telemetry-path=/metrics --es.uri http://3.1.101.33:9200"

Commonly used Elasticsearch dashboard: 2322

kafka_exporter

version: "3"
services:
kafka_exporter:
container_name: kafka_exporter
image: danielqsj/kafka-exporter:v1.3.0
restart: always
ports:
- "9308:9308"
command: "--kafka.server=3.1.101.33:9092"

Commonly used Kafka dashboards: 13572, 7589

cAdvisor

version: "3"
services:
cadvisor:
container_name: cadvisor
image: google/cadvisor:v0.33.0
user: root
privileged: true
ports:
- 9180:8080
restart: always
volumes:
- /:/rootfs
- /var/run:/var/run
- /sys:/sys
- /var/lib/docker/:/var/lib/docker
- /dev/disk/:/dev/disk

Commonly used Docker dashboards: 13584, 13946

blackbox-exporter

Create the directory

mkdir -pv /data/exporter/blackbox_exporter

Create the configuration file

cat > /data/exporter/blackbox_exporter/config.yml << 'EOF'
modules:
http_2xx:
prober: http
http_post_2xx:
prober: http
http:
method: POST
tcp_connect:
prober: tcp
pop3s_banner:
prober: tcp
tcp:
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
ssh_banner:
prober: tcp
tcp:
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
irc_banner:
prober: tcp
tcp:
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1}"
- expect: "^:[^ ]+ 001"
icmp:
prober: icmp
EOF

docker-compose file

version: "3"
services:
blackbox-exporter:
container_name: blackbox-exporter
image: prom/blackbox-exporter:v0.19.0
restart: always
ports:
- 9115:9115
volumes:
- /etc/localtime:/etc/localtime
- /data/exporter/blackbox_exporter:/etc/blackbox_exporter
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G

Commonly used Grafana dashboard: 9965
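
A probe can also be run by hand to confirm a module works before wiring it into Prometheus; the target below is just one of the endpoints listed in http-status.yaml:

curl -s 'http://3.1.101.40:9115/probe?module=http_2xx&target=http://3.1.101.39:9090/' | grep probe_success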

Common Prometheus alerting rules

Reference: https://awesome-prometheus-alerts.grep.to/rules

Hot-reloading alerting rules

Add the --web.enable-lifecycle flag to the startup parameters (already included in the compose file above), then issue the following POST request from a terminal
curl -X POST http://IP:port/-/reload
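
It is worth validating the configuration and rule files before triggering a reload; promtool ships inside the prom/prometheus image, so with the container name used in this article:

docker exec prometheus promtool check config /etc/prometheus/prometheus.yml
docker exec prometheus sh -c 'promtool check rules /etc/prometheus/rules/*.yaml'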

Prometheus.rules

cat > /data/prometheus/conf/rules/Prometheus.yaml << 'EOF'
groups:
- name: Prometheus.rules
rules:
- alert: PrometheusAllTargetsMissing
expr: count by (job) (up) == 0
for: 2m
labels:
severity: critical
annotations:
title: 'Prometheus all targets missing'
description: "A Prometheus job does not have living target anymore."
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
title: 'Prometheus configuration reload failure'
description: "Prometheus: 【{{ $labels.instance }}】 configuration reload error."
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
title: 'Prometheus too many restarts'
description: "Prometheus: 【{{ $labels.instance }}】 has restarted more than twice in the last 15 minutes. It might be crashlooping."
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
title: 'Prometheus AlertManager configuration reload failure'
description: "AlertManager: 【{{ $labels.instance }}】 configuration reload error"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 1m
labels:
severity: warning
annotations:
title: 'Prometheus notifications backlog'
description: "Prometheus: 【{{ $labels.instance }}】 The notification queue has not been empty for 10 minutes"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
title: 'Prometheus AlertManager notification failing'
description: "AlertManager: 【{{ $labels.instance }}】 is failing sending notifications"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
title: 'Prometheus TSDB checkpoint creation failures'
description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} checkpoint creation failures"
- alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
title: 'Prometheus TSDB checkpoint deletion failures'
description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} checkpoint deletion failures"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
title: 'Prometheus TSDB compactions failed'
description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB compactions failures"
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
title: 'Prometheus TSDB head truncations failed'
description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB head truncation failures"
- alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
title: 'Prometheus TSDB reload failures'
description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB reload failures"
EOF

Hosts.rules

cat > /data/prometheus/conf/rules/Hosts.yaml << 'EOF'
groups:
- name: Hosts.rules
rules:
## Custom By wangshui
- alert: HostDown
expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
for: 0m
labels:
severity: critical
annotations:
title: 'Instance down'
description: "主机: 【{{ $labels.instance }}】has been down for more than 1 minute"

- alert: HostCpuLoadAverage
expr: sum(node_load5) by (instance) > 10
for: 1m
annotations:
title: "5分钟内CPU负载过高"
description: "主机: 【{{ $labels.instance }}】 5五分钟内CPU负载超过10 (当前值:{{ $value }})"
labels:
severity: 'warning'

- alert: HostCpuUsage
expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
for: 1m
annotations:
title: "CPU使用率过高"
description: "主机: 【{{ $labels.instance }}】 5五分钟内CPU使用率超过80% (当前值:{{ $value }})"
labels:
severity: 'warning'

- alert: HostMemoryUsage
expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
for: 1m
annotations:
title: "主机内存使用率超过80%"
description: "主机: 【{{ $labels.instance }}】 内存使用率超过80% (当前使用率:{{ $value }}%)"
labels:
severity: 'warning'

- alert: HostIOWait
expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
for: 1m
annotations:
title: "磁盘负载过高"
description: "主机: 【{{ $labels.instance }}】 5五分钟内磁盘负载过高 (当前负载值:{{ $value }})"
labels:
severity: 'warning'

- alert: HostFileSystemUsage
expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 70
for: 1m
annotations:
title: "磁盘空间剩余不足"
description: "主机: 【{{ $labels.instance }}{{ $labels.mountpoint }}分区使用率超过70%, 当前值使用率:{{ $value }}%"
labels:
severity: 'warning'

- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: 'warning'
annotations:
title: "主机swap分区不足"
description: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%"

- alert: HostNetworkConnectionEstablished
expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
for: 5m
labels:
severity: 'warning'
annotations:
title: "主机ESTABLISHED连接数过高"
description: "主机: 【{{ $labels.instance }}】 ESTABLISHED连接数超过1000, 当前ESTABLISHED连接数: {{ $value }}"

- alert: HostNetworkConnectionTimeWait
expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
for: 5m
labels:
severity: 'warning'
annotations:
title: "主机TIME_WAIT连接数过高"
description: "主机: 【{{ $labels.instance }}】 TIME_WAIT连接数超过1000, 当前TIME_WAIT连接数: {{ $value }}"

- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: 'warning'
annotations:
title: "主机网卡入口流量过高"
description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 入口流量超过 (> 100 MB/s), 当前值: {{ $value }}"

- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: 'warning'
annotations:
title: "主机网卡出口流量过高"
description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 出口流量超过 (> 100 MB/s), 当前值: {{ $value }}"

- alert: HostUnusualDiskReadRate
expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: 'warning'
annotations:
title: "主机磁盘读取速率过高"
description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 读取速度超过(50 MB/s), 当前值: {{ $value }}"

- alert: HostUnusualDiskWriteRate
expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
for: 2m
labels:
severity: 'warning'
annotations:
title: "主机磁盘写入速率过高"
description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 写入速度超过(50 MB/s), 当前值: {{ $value }}"

- alert: HostOutOfInodes
expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
for: 2m
labels:
severity: 'warning'
annotations:
title: "主机分区Inode节点不足"
description: "主机: 【{{ $labels.instance }}{{ $labels.mountpoint }}分区inode节点不足 (可用值小于{{ $value }}%)"

- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
for: 2m
labels:
severity: 'warning'
annotations:
title: "主机磁盘Read延迟过高"
description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Read延迟过高 (read operations > 100ms), 当前延迟值: {{ $value }}ms"

- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
for: 2m
labels:
severity: 'warning'
annotations:
title: "主机磁盘Write延迟过高"
description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Write延迟过高 (write operations > 100ms), 当前延迟值: {{ $value }}ms"
EOF
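
Individual alert expressions can be tried out against a running Prometheus before being relied on, e.g. the CPU load expression from above via the query API (or interactively in the web UI):

curl -s 'http://3.1.101.39:9090/api/v1/query' --data-urlencode 'query=sum(node_load5) by (instance)'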

Blackbox.rules

cat > /data/prometheus/conf/rules/Blackbox.yaml << 'EOF'
groups:
- name: Blackbox.rules
rules:
- alert: HostConnectionFailure
expr: probe_success{job="ping-status"} == 0
for: 0m
labels:
severity: critical
annotations:
title: Host Connection Failure
description: "主机 【{{ $labels.instance }}】 cannot be connected"

- alert: ServiceConnectionFailure
expr: probe_success{job="port-status"} == 0
for: 0m
labels:
severity: critical
annotations:
title: Service Connection Failure
description: "服务 【{{ $labels.server }}】 on 主机 【{{ $labels.instance }}】 cannot be connected"

- alert: BlackboxSlowProbeOnServer
expr: avg_over_time(probe_duration_seconds{job="port-status"}[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
title: Service probe timeout
description: "服务 【{{ $labels.server }}】 on 主机 【{{ $labels.instance }}】Blackbox probe took more than 1s to complete, Current Value: {{ $value }}s"

- alert: BlackboxSlowProbeOnWebsite
expr: avg_over_time(probe_duration_seconds{job="http-status"}[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
title: Service probe timeout
description: "网站 【{{ $labels.instance }}】 Blackbox probe took more than 1s to complete, Current Value: {{ $value }}s"

- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 0m
labels:
severity: critical
service: web
annotations:
title: Blackbox probe HTTP failure
description: "网站: 【{{ $labels.instance }}】HTTP status code is exception, Current status code: {{ $value }}"

- alert: BlackboxSslCertificateWillExpireSoonIn30days
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 0m
labels:
severity: warning
annotations:
title: Blackbox SSL certificate will expire soon
description: "网站: 【{{ $labels.instance }}】 SSL certificate expires in 30 days"
- alert: BlackboxSslCertificateWillExpireSoonIn3days
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
for: 0m
labels:
severity: critical
annotations:
title: Blackbox SSL certificate will expire soon
description: "网站: 【{{ $labels.instance }}】 SSL certificate expires in 3 days"
- alert: BlackboxSslCertificateExpired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 0m
labels:
severity: critical
annotations:
title: Blackbox SSL certificate expired
description: "网站: 【{{ $labels.instance }}】 SSL certificate has expired already"
- alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
title: Blackbox probe slow HTTP
description: "网站: 【{{ $labels.instance }}】HTTP request took more than 1s, Current Value: {{ $value }}s"
- alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
title: Blackbox probe slow ping
description: "主机: 【{{ $labels.instance }}】Blackbox ping took more than 1s, Current Value: {{ $value }}s"
EOF

Mysql.rules

cat > /data/prometheus/conf/rules/Mysql.yaml << 'EOF'
groups:
- name: Mysql.rules
rules:
## Mysql Alarm Rules
- alert: MysqlDown
expr: mysql_up == 0
for: 0m
labels:
severity: critical
annotations:
title: 'MySQL down'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL instance is down"

- alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 0m
labels:
severity: info
annotations:
title: 'MySQL Restarted'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL has just been restarted, less than one minute ago"

- alert: MysqlTooManyConnections
expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL too many connections (> 80%)'
description: "Mysql实例: 【{{ $labels.instance }}】, More than 80% of MySQL connections are in use, Current Value: {{ $value }}%"

- alert: MysqlThreadsRunningHigh
expr: mysql_global_status_threads_running > 40
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL Threads_Running High'
description: "Mysql实例: 【{{ $labels.instance }}】, Threads_Running above the threshold(40), Current Value: {{ $value }}"

- alert: MysqlQpsHigh
expr: sum by (instance) (rate(mysql_global_status_queries[2m])) > 500
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL QPS High'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL QPS above 500"

- alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL slow queries'
description: "Mysql实例: 【{{ $labels.instance }}】, has some new slow query."

- alert: MysqlTooManyAbortedConnections
expr: round(increase(mysql_global_status_aborted_connects[5m])) > 20
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL too many aborted connections in 5 minutes'
description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} aborted connections within 5 minutes"

- alert: MysqlTooManyAbortedClients
expr: round(increase(mysql_global_status_aborted_clients[120m])) > 10
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL too many Aborted connections in 2 hours'
description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} Aborted Clients within 2 hours"

- alert: MysqlSlaveIoThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
for: 0m
labels:
severity: critical
annotations:
title: 'MySQL Slave IO thread not running'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave IO thread not running"

- alert: MysqlSlaveSqlThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
for: 0m
labels:
severity: critical
annotations:
title: 'MySQL Slave SQL thread not running'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave SQL thread not running"

- alert: MysqlSlaveReplicationLag
expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
for: 1m
labels:
severity: critical
annotations:
title: 'MySQL Slave replication lag'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL replication lag"

- alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 0m
labels:
severity: warning
annotations:
title: 'MySQL InnoDB log waits'
description: "Mysql实例: 【{{ $labels.instance }}】, innodb log writes stalling"
EOF

Redis.rules

cat > /data/prometheus/conf/rules/redis.yaml << 'EOF'
groups:
- name: Redis.rules
rules:
## Redis Alarm Rules
- alert: RedisDown
expr: redis_up == 0
for: 1m
labels:
severity: critical
annotations:
title: 'Redis down'
description: "Redis实例: 【{{ $labels.instance }}】, Redis instance is down"

- alert: RedisMissingMaster
expr: count(redis_instance_info{role="master"}) < 1
for: 2m
labels:
severity: critical
annotations:
title: 'Redis missing master'
description: "Redis cluster has no node marked as master."

- alert: RedisTooManyMasters
expr: count(redis_instance_info{role="master"}) > 1
for: 2m
labels:
severity: critical
annotations:
title: 'Redis too many masters'
description: "Redis cluster has too many nodes marked as master."

- alert: RedisDisconnectedSlaves
expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
for: 2m
labels:
severity: critical
annotations:
title: 'Redis disconnected slaves'
description: "Redis not replicating for all slaves. Consider reviewing the redis replication status."

- alert: RedisReplicationBroken
expr: delta(redis_connected_slaves[1m]) < 0
for: 0m
labels:
severity: critical
annotations:
title: 'Redis replication broken'
description: "Redis实例: 【{{ $labels.instance }}】,Redis instance lost a slave"

- alert: RedisClusterFlapping
expr: changes(redis_connected_slaves[1m]) > 1
for: 2m
labels:
severity: critical
annotations:
title: 'Redis cluster flapping'
description: "Redis实例: 【{{ $labels.instance }}】,Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)."

- alert: RedisMissingBackup
expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
for: 0m
labels:
severity: critical
annotations:
title: 'Redis missing backup'
description: "Redis实例: 【{{ $labels.instance }}】,Redis has not been backuped for 24 hours"

- alert: RedisOutOfConfiguredMaxmemory
expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
for: 2m
labels:
severity: warning
annotations:
title: 'Redis out of configured maxmemory'
description: "Redis实例: 【{{ $labels.instance }}】,Redis is running out of configured maxmemory (> 90%), Current Value: {{ $value }}"

- alert: RedisTooManyConnections
expr: redis_connected_clients > 100
for: 2m
labels:
severity: warning
annotations:
title: 'Redis too many connections'
description: "Redis实例: 【{{ $labels.instance }}】, Redis instance has too many connections, Current Value: {{ $value }}"

- alert: RedisNotEnoughConnections
expr: redis_connected_clients < 5
for: 2m
labels:
severity: warning
annotations:
title: 'Redis not enough connections'
description: "Redis实例: 【{{ $labels.instance }}】, Redis instance should have more connections (> 5), Current Value: {{ $value }}"

- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
title: 'Redis rejected connections'
description: "Redis实例: 【{{ $labels.instance }}】, Some connections to Redis has been rejected, Current Value: {{ $value }}"
EOF

Elasticsearch.rules

cat > /data/prometheus/conf/rules/elasticsearch.yaml << 'EOF'
groups:
- name: Elasticsearch.rules
rules:
## ES Alarm Rules
- alert: ElasticsearchHeapUsageTooHigh
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
for: 2m
labels:
severity: critical
annotations:
title: "Elasticsearch Heap Usage Too High"
description: "主机: 【{{ $labels.instance }}】, The heap usage is over 90%, Current Value: {{ $value }}"

- alert: ElasticsearchHeapUsageWarning
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
title: 'Elasticsearch Heap Usage warning'
description: "主机: 【{{ $labels.instance }}】, The heap usage is over 80%, Current Value: {{ $value }}"

- alert: ElasticsearchDiskOutOfSpace
expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
for: 0m
labels:
severity: critical
annotations:
title: 'Elasticsearch disk out of space'
description: "主机: 【{{ $labels.instance }}】, The disk usage is over 90%, Current Value: {{ $value }}"

- alert: ElasticsearchDiskSpaceLow
expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20
for: 2m
labels:
severity: warning
annotations:
title: 'Elasticsearch disk space low'
description: "主机: 【{{ $labels.instance }}】, The disk usage is over 80%, Current Value: {{ $value }}"

- alert: ElasticsearchClusterRed
expr: elasticsearch_cluster_health_status{color="red"} == 1
for: 0m
labels:
severity: critical
annotations:
title: 'Elasticsearch Cluster Red'
description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Red status"

- alert: ElasticsearchClusterYellow
expr: elasticsearch_cluster_health_status{color="yellow"} == 1
for: 0m
labels:
severity: warning
annotations:
title: 'Elasticsearch Cluster Yellow'
description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Yellow status"

- alert: ElasticsearchHealthyNodes
expr: elasticsearch_cluster_health_number_of_nodes < 3
for: 0m
labels:
severity: critical
annotations:
title: 'Elasticsearch Healthy Nodes'
description: "Missing node in Elasticsearch cluster"

- alert: ElasticsearchHealthyDataNodes
expr: elasticsearch_cluster_health_number_of_data_nodes < 3
for: 0m
labels:
severity: critical
annotations:
title: 'Elasticsearch Healthy Data Nodes'
description: "Missing data node in Elasticsearch cluster"

- alert: ElasticsearchRelocatingShards
expr: elasticsearch_cluster_health_relocating_shards > 0
for: 0m
labels:
severity: info
annotations:
title: 'Elasticsearch relocating shards'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch is relocating shards"

- alert: ElasticsearchRelocatingShardsTooLong
expr: elasticsearch_cluster_health_relocating_shards > 0
for: 15m
labels:
severity: warning
annotations:
title: 'Elasticsearch relocating shards too long'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been relocating shards for 15min"

- alert: ElasticsearchInitializingShards
expr: elasticsearch_cluster_health_initializing_shards > 0
for: 0m
labels:
severity: info
annotations:
title: 'Elasticsearch initializing shards'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch is initializing shards"

- alert: ElasticsearchInitializingShardsTooLong
expr: elasticsearch_cluster_health_initializing_shards > 0
for: 15m
labels:
severity: warning
annotations:
title: 'Elasticsearch initializing shards too long'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been initializing shards for 15 min"

- alert: ElasticsearchUnassignedShards
expr: elasticsearch_cluster_health_unassigned_shards > 0
for: 0m
labels:
severity: critical
annotations:
title: 'Elasticsearch unassigned shards'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch has unassigned shards"

- alert: ElasticsearchPendingTasks
expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
for: 15m
labels:
severity: warning
annotations:
title: 'Elasticsearch pending tasks'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch has pending tasks. Cluster works slowly, Current Value: {{ $value }}"

- alert: ElasticsearchNoNewDocuments
expr: increase(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
for: 0m
labels:
severity: warning
annotations:
title: 'Elasticsearch no new documents'
description: "主机: 【{{ $labels.instance }}】, Elasticsearch No new documents for 10 min!"
EOF

kafka.rules

cat > /data/prometheus/conf/rules/kafka.yaml << 'EOF'
groups:
- name: kafka.rules
rules:
## KAFKA Alarm Rules
- alert: KafkaTopicsReplicas
expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
for: 0m
labels:
severity: critical
annotations:
title: 'Kafka topics replicas less than 3'
description: "Topic: {{ $labels.topic }} partition less than 3, Current Value: {{ $value }}"

- alert: KafkaConsumersGroupLag
expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
for: 1m
labels:
severity: critical
annotations:
title: 'Kafka consumers group 消费滞后'
description: "Kafka consumers group 消费滞后 (Lag > 50), Lag值: {{ $value }}"

- alert: KafkaConsumersTopicLag
expr: sum(kafka_consumergroup_lag) by (topic) > 50
for: 1m
labels:
severity: critical
annotations:
title: 'Kafka Topic 消费滞后'
description: "Kafka Topic 消费滞后 (Lag > 50), Lag值: {{ $value }}"
EOF

Docker.rules

cat > /data/prometheus/conf/rules/Docker.yaml << 'EOF'
groups:
- name: Docker.rules
rules:
- alert: DockerInstanceDown
expr: up{job="cAdvisor"} == 0
for: 0m
labels:
severity: critical
annotations:
title: 'Docker Instance down'
description: "容器实例: 【{{ $labels.instance }}】has been down for more than 1 minute"

- alert: ContainerKilled
expr: time() - container_last_seen{name!=""} > 60
for: 1m
labels:
severity: critical
annotations:
title: "A Container has disappeared"
description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 has disappeared"

- alert: ContainerCpuUsage
expr: (sum by(instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[3m])) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
title: "Container CPU usaged above 80%"
description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 CPU usage is above 80%, Current Value: {{ $value }}"

- alert: ContainerMemoryUsage
expr: (sum by(instance, name) (container_memory_working_set_bytes{name!=""}) / sum by(instance, name) (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
title: "Container CPU usaged above 80%"
description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 Memory usage is above 80%, Current Value: {{ $value }}"
- alert: ContainerVolumeUsage
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
title: "Container Volume usaged above 80%"
description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 Volume usage is above 80%, Current Value: {{ $value }}"
EOF