Create directories

mkdir -pv /data/prometheus/{conf,data}
mkdir -pv /data/prometheus/conf/{rules,targets}
mkdir -pv /data/prometheus/conf/targets/{servers,nodes,blackbox}
mkdir -pv /data/alertmanager/{conf,data,tmpl}
mkdir -pv /data/grafana/{conf,data,logs,plugins}
mkdir -pv /data/docker-compose/{prometheus,alertmanager,grafana}
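A quick way to verify the layout after running the commands above (assumes the tree package is installed; find works as a fallback):

tree -d /data          # show only the directories just created
find /data -type d     # equivalent listing without tree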
Configuration files
Prometheus configuration
Main configuration file
prometheus.yml
cat > /data/prometheus/conf/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
  - file_sd_configs:
    - files:
      - targets/servers/alertmanagers.yaml

rule_files:
  - "rules/*.yaml"

scrape_configs:
  - job_name: 'prometheus'
    file_sd_configs:
    - files:
      - targets/servers/prometheus.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'alertmanagers'
    file_sd_configs:
    - files:
      - targets/servers/alertmanagers.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'grafana'
    file_sd_configs:
    - files:
      - targets/servers/grafana.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'nodes'
    file_sd_configs:
    - files:
      - targets/nodes/node-exporter.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'mysql'
    file_sd_configs:
    - files:
      - targets/servers/mysql-exporter.yaml
      refresh_interval: 2m

  - job_name: 'redis-cluster'
    file_sd_configs:
    - files:
      - targets/servers/redis-exporter.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'es-cluster'
    file_sd_configs:
    - files:
      - targets/servers/es-exporter.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'kafka-cluster'
    file_sd_configs:
    - files:
      - targets/servers/kafka-exporter.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'nacos-cluster'
    metrics_path: '/nacos/actuator/prometheus'
    file_sd_configs:
    - files:
      - targets/servers/nacos-exporter.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'docker-engines'
    file_sd_configs:
    - files:
      - targets/nodes/docker-nodes.yaml
      refresh_interval: 2m
    relabel_configs:
    - source_labels: [ '__address__' ]
      regex: "(.*):(.*)"
      target_label: 'instance'
      replacement: $1
      action: replace

  - job_name: 'http_status'
    metrics_path: /probe
    params:
      module: [http_2xx]
    file_sd_configs:
    - files:
      - targets/blackbox/http-status.yaml
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 3.1.101.40:9115

  - job_name: 'ping_status'
    metrics_path: /probe
    params:
      module: [icmp]
    file_sd_configs:
    - files:
      - targets/blackbox/ping-status.yaml
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 3.1.101.40:9115

  - job_name: 'port_status'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    file_sd_configs:
    - files:
      - targets/blackbox/port-status.yaml
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 3.1.101.40:9115
EOF
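Before (re)loading the configuration it is worth validating it with promtool. A minimal sketch, assuming the config lives under /data/prometheus/conf as created above and the same image version as deployed later is used:

docker run --rm --entrypoint promtool \
  -v /data/prometheus/conf:/etc/prometheus \
  prom/prometheus:v2.25.1 check config /etc/prometheus/prometheus.yml
# "SUCCESS" is printed for the main config and every rule file it references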
Relabeling example:
relabel_configs:
- source_labels:
  - __scheme__
  - __address__
  - __metrics_path__
  regex: "(http|https)(.*)"
  separator: ""
  target_label: "endpoint"
  replacement: "${1}://${2}"
  action: replace
- source_labels: [ '__address__' ]
  regex: "(.*):(.*)"
  target_label: 'ipaddr'
  replacement: $1
  action: replace
- regex: "(job|app)"
  replacement: ${1}_name
  action: labelmap
metric_relabel_configs:
- source_labels:
  - __name__
  regex: "go_.*"
  action: drop
Target files
targets/servers/prometheus.yaml
cat > /data/prometheus/conf/targets/servers/prometheus.yaml << 'EOF'
- targets:
  - 3.1.101.39:9090
  labels:
    job: prometheus
    server: prometheus
    env: dev
EOF
targets/servers/alertmanagers.yaml
cat > /data/prometheus/conf/targets/servers/alertmanagers.yaml << 'EOF'
- targets:
  - 3.1.101.39:9093
  labels:
    job: alertmanager
    server: alertmanager
    env: dev
EOF
targets/servers/grafana.yaml
cat > /data/prometheus/conf/targets/servers/grafana.yaml << 'EOF'
- targets:
  - 3.1.101.40:3000
  labels:
    job: grafana
    server: grafana
    env: dev
EOF
targets/nodes/node-exporter.yaml
cat > /data/prometheus/conf/targets/nodes/node-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9100
  - 3.1.101.34:9100
  - 3.1.101.35:9100
  labels:
    job: node-exporter
    server: middleware
    env: dev
- targets:
  - 3.1.101.36:9100
  - 3.1.101.38:9100
  - 3.1.101.39:9100
  - 3.1.101.40:9100
  labels:
    job: node-exporter
    server: devops
    env: dev
- targets:
  - 3.1.101.41:9100
  - 3.1.101.42:9100
  labels:
    job: node-exporter
    server: weblogic
    env: dev
- targets:
  - 6.1.14.86:9100
  - 6.1.14.87:9100
  labels:
    job: node-exporter
    server: weblogic
    env: old-env
EOF
targets/servers/mysql-exporter.yaml
cat > /data/prometheus/conf/targets/servers/mysql-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.36:9104
  labels:
    instance: 6.1.14.87
    job: mysql-exporter
    server: mysql
    env: dev
EOF
targets/servers/redis-exporter.yaml
cat > /data/prometheus/conf/targets/servers/redis-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9121
  - 3.1.101.34:9121
  - 3.1.101.35:9121
  labels:
    job: redis-exporter
    server: redis
    env: dev
    cluster: redis
EOF
targets/servers/es-exporter.yaml
cat > /data/prometheus/conf/targets/servers/es-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9115
  - 3.1.101.34:9115
  - 3.1.101.35:9115
  labels:
    job: es-exporter
    server: elasticsearch
    env: dev
    cluster: elk
EOF
targets/servers/kafka-exporter.yaml
cat > /data/prometheus/conf/targets/servers/kafka-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9308
  - 3.1.101.34:9308
  - 3.1.101.35:9308
  labels:
    job: kafka-exporter
    server: kafka
    env: dev
    cluster: kafka
EOF
targets/servers/nacos-exporter.yaml
cat > /data/prometheus/conf/targets/servers/nacos-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:8848
  - 3.1.101.34:8848
  - 3.1.101.35:8848
  labels:
    job: nacos-exporter
    server: nacos
    env: dev
    cluster: nacos
EOF
targets/nodes/docker-nodes.yaml
cat > /data/prometheus/conf/targets/nodes/docker-nodes.yaml << 'EOF'
- targets:
  - 3.1.101.33:9180
  - 3.1.101.34:9180
  - 3.1.101.35:9180
  - 3.1.101.36:9180
  - 3.1.101.38:9180
  - 3.1.101.39:9180
  - 3.1.101.40:9180
  - 3.1.101.41:9180
  - 3.1.101.42:9180
  labels:
    job: cAdvisor
    server: docker
    env: dev
EOF
targets/blackbox/http-status.yaml
cat > /data/prometheus/conf/targets/blackbox/http-status.yaml << 'EOF'
- targets:
  - http://3.1.101.39:9090/
  labels:
    job: http-status
    server: prometheus
    env: dev
- targets:
  - http://3.1.101.40:3030/
  labels:
    job: http-status
    server: grafana
    env: dev
- targets:
  - http://3.1.101.40:5601/
  labels:
    job: http-status
    server: kibana
    env: dev
- targets:
  - http://3.1.101.36:8088/
  labels:
    job: http-status
    server: jumpserver
    env: dev
- targets:
  - http://3.1.101.35:8095/
  labels:
    job: http-status
    server: jira
    env: dev
- targets:
  - http://3.1.101.45:8848/nacos/
  labels:
    job: http-status
    server: nacos
    env: dev
- targets:
  - http://3.1.101.33:8858/
  - http://3.1.101.34:8858/
  labels:
    job: http-status
    server: sentinel
    env: dev
- targets:
  - http://3.1.101.41:7001/console
  - http://3.1.101.42:7001/console
  labels:
    job: http-status
    server: weblogic
    env: dev
- targets:
  - http://6.1.14.86:7001/console
  - http://6.1.14.87:7001/console
  labels:
    job: http-status
    server: weblogic
    env: dev
EOF
targets/blackbox/ping-status.yaml
cat > /data/prometheus/conf/targets/blackbox/ping-status.yaml << 'EOF'
- targets:
  - 3.1.101.33
  - 3.1.101.34
  - 3.1.101.35
  labels:
    job: ping-status
    server: middleware
    env: dev
- targets:
  - 3.1.101.36
  - 3.1.101.38
  - 3.1.101.39
  - 3.1.101.40
  labels:
    job: ping-status
    server: devops
    env: dev
- targets:
  - 3.1.101.41
  - 3.1.101.42
  labels:
    job: ping-status
    server: weblogic
    env: dev
- targets:
  - 6.1.14.86
  - 6.1.14.87
  labels:
    job: ping-status
    server: weblogic
    env: old-env
EOF
targets/blackbox/port-status.yaml
cat > /data/prometheus/conf/targets/blackbox/port-status.yaml << 'EOF'
- targets:
  - 3.1.101.39:9090
  labels:
    job: port-status
    server: prometheus
    env: dev
- targets:
  - 3.1.101.40:3030
  labels:
    job: port-status
    server: grafana
    env: dev
- targets:
  - 3.1.101.39:9093
  labels:
    job: port-status
    server: alertmanager
    env: dev
- targets:
  - 3.1.101.33:6379
  - 3.1.101.34:6379
  - 3.1.101.35:6379
  labels:
    job: port-status
    server: redis-master
    env: dev
- targets:
  - 3.1.101.33:26379
  - 3.1.101.34:26379
  - 3.1.101.35:26379
  labels:
    job: port-status
    server: redis-slave
    env: dev
- targets:
  - 3.1.101.33:9092
  - 3.1.101.34:9092
  - 3.1.101.35:9092
  labels:
    job: port-status
    server: kafka
    env: dev
- targets:
  - 3.1.101.33:2181
  - 3.1.101.34:2181
  - 3.1.101.35:2181
  labels:
    job: port-status
    server: zookeeper
    env: dev
- targets:
  - 3.1.101.33:8848
  - 3.1.101.34:8848
  - 3.1.101.35:8848
  labels:
    job: port-status
    server: nacos
    env: dev
- targets:
  - 3.1.101.33:8091
  - 3.1.101.34:8091
  - 3.1.101.35:8091
  labels:
    job: port-status
    server: seata
    env: dev
- targets:
  - 3.1.101.33:8858
  - 3.1.101.34:8858
  labels:
    job: port-status
    server: sentinel
    env: dev
- targets:
  - 3.1.101.33:9200
  - 3.1.101.34:9200
  - 3.1.101.35:9200
  labels:
    job: port-status
    server: elasticsearch
    env: dev
- targets:
  - 3.1.101.33:5044
  - 3.1.101.34:5044
  labels:
    job: port-status
    server: logstash
    env: dev
- targets:
  - 3.1.101.40:5601
  labels:
    job: port-status
    server: kibana
    env: dev
- targets:
  - 6.1.14.87:3306
  labels:
    job: port-status
    server: mysql
    env: dev
- targets:
  - 3.1.101.41:7001
  - 3.1.101.42:7001
  labels:
    job: port-status
    server: weblogic
    env: dev
- targets:
  - 6.1.14.86:7001
  - 6.1.14.87:7001
  labels:
    job: port-status
    server: weblogic
    env: old-env
- targets:
  - 3.1.101.38:389
  labels:
    job: port-status
    server: openldap
    env: dev
- targets:
  - 3.1.101.36:8088
  labels:
    job: port-status
    server: jumpserver
    env: dev
- targets:
  - 3.1.101.38:8098
  labels:
    job: port-status
    server: gitlab
    env: dev
- targets:
  - 3.1.101.36:8080
  labels:
    job: port-status
    server: jenkins
    env: dev
- targets:
  - 3.1.101.36:9000
  labels:
    job: port-status
    server: sonarqube
    env: dev
- targets:
  - 3.1.101.35:8095
  labels:
    job: port-status
    server: jira
    env: dev
EOF
Alertmanager configuration
Main configuration file: alertmanager.yml example
global:
  resolve_timeout: 5m
  smtp_from: 'wangshui898@sina.com'
  smtp_smarthost: 'smtp.sina.com:465'
  smtp_auth_username: 'wangshui898@sina.com'
  smtp_auth_password: 'Authorization code'
  smtp_require_tls: false

templates:
  - 'tmpl/*.tmpl'

route:
  receiver: default-receiver
  group_wait: 30s
  group_interval: 2m
  repeat_interval: 1h
  group_by: ['alertname', 'server']
  routes:
  - match:
      severity: critical
    receiver: 'default-receiver'
  - match:
      server: mysql
    receiver: 'dba'
  - match_re:
      service: ccms-.*
    receiver: 'dev'

receivers:
- name: 'default-receiver'
  email_configs:
  - to: 'wangshui898@126.com'
    html: '{{ template "email.to.html" . }}'
    send_resolved: true
- name: 'dev'
  email_configs:
  - to: 'wangshui898@163.com'
    send_resolved: true
- name: 'dba'
  email_configs:
  - to: 'wangshui898@sina.com'
    send_resolved: true
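The file can be validated with amtool before starting Alertmanager. A minimal sketch, assuming it is saved as /data/alertmanager/conf/alertmanager.yml and the same image version as deployed below is used:

docker run --rm --entrypoint amtool \
  -v /data/alertmanager/conf:/etc/alertmanager \
  prom/alertmanager:v0.23.0 check-config /etc/alertmanager/alertmanager.yml
# Reports the number of receivers and templates found, or a parse error with its location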
alertname corresponds to groups[].rules[].alert in the Prometheus alerting rules.
Alert template files
Official template reference: https://raw.githubusercontent.com/prometheus/alertmanager/master/template/default.tmpl
Email alert template
mkdir -pv /data/alertmanager/conf/tmpl
cat > /data/alertmanager/conf/tmpl/e-mail.tmpl << 'EOF'
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
=========告警通知==========<br>
告警类型: {{ .Labels.alertname }} <br>
告警级别: {{ .Labels.severity }} 级<br>
{{- end }}
----------------------------<br>
告警主题: {{ .Annotations.title }} <br>
故障详情: {{ .Annotations.description }} <br>
故障时间: {{ .StartsAt.Local }} <br>
{{ if gt (len .Labels.instance) 0 -}}故障主机: {{ .Labels.instance }} <br>{{- end -}}
=========请勿回复==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
==========告警恢复通知========<br>
告警类型: {{ .Labels.alertname }} <br>
告警级别: {{ .Labels.severity }} 级<br>
{{- end }}
----------------------------<br>
告警主题: {{ .Annotations.title }} <br>
触发详情: {{ .Annotations.description }}, 已恢复 <br>
故障时间: {{ .StartsAt.Local }} <br>
恢复时间: {{ .EndsAt.Local }} <br>
{{ if gt (len .Labels.instance) 0 -}}故障主机: {{ .Labels.instance }} <br>{{- end -}}
===========请勿回复===========<br>
{{- end }}
{{- end }}
{{- end }}
EOF
Variables referenced in the template must match the annotations defined in the Prometheus alerting rules, e.g. title and description.
{{- if gt (len .Alerts.Firing) 0 -}}    # fires only if the list of firing alerts is non-empty
{{- range $index, $alert := .Alerts -}}    # iterate over all alerts; $index is the index, $alert the alert element
{{ if gt (len .Labels.instance) 0 -}}故障主机: {{ .Labels.instance }} <br>{{- end -}}    # print the host line only if the instance label exists
.Receiver: name of the receiver
.Status: firing while the alert is active, resolved once it has recovered
.Alerts: list of all alert objects
.Alerts.Firing: list of firing alerts
.Alerts.Resolved: list of resolved alerts
.GroupLabels: labels the alerts are grouped by
.CommonLabels: labels shared by all alerts
.CommonAnnotations: annotations shared by all alerts
.ExternalURL: link back to the Alertmanager that sent the notification
Service deployment
Prometheus

version: "3"
services:
  prometheus:
    container_name: prometheus
    image: prom/prometheus:v2.25.1
    user: root
    ports:
      - 9090:9090
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime
      - /data/prometheus/conf:/etc/prometheus
      - /data/prometheus/data:/prometheus
    environment:
      TZ: Asia/Shanghai
    command: --config.file=/etc/prometheus/prometheus.yml --web.enable-lifecycle --storage.tsdb.retention=30d
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 1G
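To bring the service up, a minimal sketch assuming the compose file above is saved as /data/docker-compose/prometheus/docker-compose.yml:

cd /data/docker-compose/prometheus
docker-compose up -d
docker-compose ps                          # the prometheus container should show "Up"
curl -s http://3.1.101.39:9090/-/healthy   # built-in health endpoint returns "Prometheus is Healthy."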
Alertmanager

version: "3"
services:
  alertmanager:
    container_name: alertmanager
    image: prom/alertmanager:v0.23.0
    user: root
    ports:
      - 9093:9093
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime
      - /data/alertmanager/conf:/etc/alertmanager
      - /data/alertmanager/data:/alertmanager
    environment:
      TZ: Asia/Shanghai
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 1G
Grafana

version: "3"
services:
  grafana:
    container_name: grafana
    image: grafana/grafana:7.4.1
    user: root
    ports:
      - 3000:3000
    restart: always
    environment:
      TZ: Asia/Shanghai
      GF_PATHS_DATA: /data/grafana/data
      GF_PATHS_LOGS: /data/grafana/logs
    volumes:
      - /etc/localtime:/etc/localtime
      - /data/grafana:/data/grafana
      - /data/grafana/plugins:/var/lib/grafana/plugins
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 1G
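Once the container is running, the Grafana API health endpoint makes a quick smoke test (the address matches targets/servers/grafana.yaml above):

curl -s http://3.1.101.40:3000/api/health
# Expected: a small JSON document containing "database": "ok"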
Exporter deployment
node_exporter

version: "3"
services:
  node_exporter:
    container_name: node_exporter
    image: prom/node-exporter:v1.1.2
    restart: always
    network_mode: host
    command:
      - '--web.listen-address=:9100'
      - '--path.rootfs=/rootfs'
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc
      - /sys:/host/sys
      - /:/rootfs
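A quick check that the exporter is serving metrics on a node (any host from targets/nodes/node-exporter.yaml works, e.g. 3.1.101.33):

curl -s http://3.1.101.33:9100/metrics | grep -m 3 '^node_'
# A few node_* series printed means the exporter is up and scrapable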
Commonly used Linux host dashboards (Grafana IDs): 8919, 1860
mysql-exporter

version: "3"
services:
  mysql_exporter:
    container_name: mysql_exporter
    image: prom/mysqld-exporter:v0.13.0
    restart: always
    ports:
      - "9104:9104"
    environment:
      DATA_SOURCE_NAME: "exporter:Aa@123456@(172.16.20.111:3306)/"
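mysqld_exporter needs a dedicated MySQL account with limited privileges. A minimal sketch run on the monitored MySQL server; the credentials match the DATA_SOURCE_NAME above, while the '%' host mask is an assumption to adjust for your environment:

# Run on the MySQL server being monitored (host mask '%' is an assumption)
mysql -uroot -p -e "CREATE USER 'exporter'@'%' IDENTIFIED BY 'Aa@123456';
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
FLUSH PRIVILEGES;"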
Commonly used MySQL dashboard: 7362
oracle-exporter

version: "3"
services:
  oracle-exporter:
    container_name: oracle_exporter
    image: iamseth/oracledb_exporter:latest
    hostname: ccms-odb-sit1
    restart: always
    ports:
      - 9161:9161
    volumes:
      - /etc/localtime:/etc/localtime
    environment:
      - TZ=Asia/Shanghai
      - DATA_SOURCE_NAME=system/System123@3.1.101.43:1521/loandb
Commonly used dashboard: 11121
redis-exporter

version: "3"
services:
  redis_exporter:
    container_name: redis_exporter
    image: bitnami/redis-exporter:1.20.0
    restart: always
    ports:
      - "9121:9121"
    command: "-redis.addr 3.1.101.33:6379 -redis-only-metrics -web.listen-address 0.0.0.0:9121 -redis.password g1tredis2o2l"
Commonly used Redis dashboard: 11835
elasticsearch_exporter

version: "3"
services:
  es_exporter:
    container_name: es_exporter
    image: justwatch/elasticsearch_exporter:1.1.0
    restart: always
    ports:
      - "9115:9115"
    command: "--es.all --es.indices --es.cluster_settings --es.indices_settings --es.shards --es.snapshots --es.timeout=10s --web.listen-address=0.0.0.0:9115 --web.telemetry-path=/metrics --es.uri http://3.1.101.33:9200"
Commonly used Elasticsearch dashboard: 2322
kafka_exporter

version: "3"
services:
  kafka_exporter:
    container_name: kafka_exporter
    image: danielqsj/kafka-exporter:v1.3.0
    restart: always
    ports:
      - "9308:9308"
    command: "--kafka.server=3.1.101.33:9092"
Commonly used Kafka dashboards: 13572, 7589
cAdvisor

version: "3"
services:
  cadvisor:
    container_name: cadvisor
    image: google/cadvisor:v0.33.0
    user: root
    privileged: true
    ports:
      - 9180:8080
    restart: always
    volumes:
      - /:/rootfs
      - /var/run:/var/run
      - /sys:/sys
      - /var/lib/docker/:/var/lib/docker
      - /dev/disk/:/dev/disk
Commonly used Docker dashboards: 13584, 13946
blackbox-exporter
Create the directory
mkdir -pv /data/exporter/blackbox_exporter
Create the configuration file
cat > /data/exporter/blackbox_exporter/config.yml << 'EOF'
modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
      - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
EOF
docker-compose file
version: "3"
services:
  blackbox-exporter:
    container_name: blackbox-exporter
    image: prom/blackbox-exporter:v0.19.0
    restart: always
    ports:
      - 9115:9115
    volumes:
      - /etc/localtime:/etc/localtime
      - /data/exporter/blackbox_exporter:/etc/blackbox_exporter
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 1G
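With the exporter running, a probe can be exercised directly to confirm a module works. The exporter address matches the replacement used in prometheus.yml and the target is the Prometheus web UI from the blackbox target files:

curl -s 'http://3.1.101.40:9115/probe?module=http_2xx&target=http://3.1.101.39:9090/' | grep probe_success
# probe_success 1 means the probe passed; 0 means it failed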
Commonly used Grafana dashboard for blackbox-exporter: 9965
Commonly used Prometheus alerting rules
Reference: https://awesome-prometheus-alerts.grep.to/rules
Hot-reloading alerting rules
Add the --web.enable-lifecycle flag to the startup parameters, then trigger a reload with the following POST request:

curl -X POST http://IP:port/-/reload
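It is a good habit to validate rule files with promtool before reloading. A minimal sketch, assuming the rule files created in the following sections exist under /data/prometheus/conf/rules:

docker run --rm --entrypoint promtool \
  -v /data/prometheus/conf:/etc/prometheus \
  prom/prometheus:v2.25.1 check rules /etc/prometheus/rules/Prometheus.yaml
# Then reload the running instance
curl -X POST http://3.1.101.39:9090/-/reload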
Prometheus.rules

cat > /data/prometheus/conf/rules/Prometheus.yaml << 'EOF'
groups:
- name: Prometheus.rules
  rules:
  - alert: PrometheusAllTargetsMissing
    expr: count by (job) (up) == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus all targets missing'
      description: "A Prometheus job does not have living target anymore."
  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus configuration reload failure'
      description: "Prometheus: 【{{ $labels.instance }}】 configuration reload error."
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus too many restarts'
      description: "Prometheus: 【{{ $labels.instance }}】 has restarted more than twice in the last 15 minutes. It might be crashlooping."
  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus AlertManager configuration reload failure'
      description: "AlertManager: 【{{ $labels.instance }}】 configuration reload error"
  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus notifications backlog'
      description: "Prometheus: 【{{ $labels.instance }}】 The notification queue has not been empty for 10 minutes"
  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus AlertManager notification failing'
      description: "AlertManager: 【{{ $labels.instance }}】 is failing sending notifications"
  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB checkpoint creation failures'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} checkpoint creation failures"
  - alert: PrometheusTsdbCheckpointDeletionFailures
    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB checkpoint deletion failures'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} checkpoint deletion failures"
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB compactions failed'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB compactions failures"
  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB head truncations failed'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB head truncation failures"
  - alert: PrometheusTsdbReloadFailures
    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB reload failures'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB reload failures"
EOF
Host.rules

cat > /data/prometheus/conf/rules/Hosts.yaml << 'EOF'
groups:
- name: Hosts.rules
  rules:
  - alert: HostDown
    expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Instance down'
      description: "主机: 【{{ $labels.instance }}】has been down for more than 1 minute"
  - alert: HostCpuLoadAvage
    expr: sum(node_load5) by (instance) > 10
    for: 1m
    annotations:
      title: "5分钟内CPU负载过高"
      description: "主机: 【{{ $labels.instance }}】 5分钟内CPU负载超过10 (当前值: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostCpuUsage
    expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
    for: 1m
    annotations:
      title: "CPU使用率过高"
      description: "主机: 【{{ $labels.instance }}】 5分钟内CPU使用率超过80% (当前值: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostMemoryUsage
    expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
    for: 1m
    annotations:
      title: "主机内存使用率超过80%"
      description: "主机: 【{{ $labels.instance }}】 内存使用率超过80% (当前使用率: {{ $value }}%)"
    labels:
      severity: 'warning'
  - alert: HostIOWait
    expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
    for: 1m
    annotations:
      title: "磁盘负载过高"
      description: "主机: 【{{ $labels.instance }}】 5分钟内磁盘负载过高 (当前负载值: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostFileSystemUsage
    expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 70
    for: 1m
    annotations:
      title: "磁盘空间剩余不足"
      description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }} 分区使用率超过70%, 当前使用率: {{ $value }}%"
    labels:
      severity: 'warning'
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机swap分区不足"
      description: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前使用率: {{ $value }}%"
  - alert: HostNetworkConnection_ESTABLISHED
    expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机ESTABLISHED连接数过高"
      description: "主机: 【{{ $labels.instance }}】 ESTABLISHED连接数超过1000, 当前ESTABLISHED连接数: {{ $value }}"
  - alert: HostNetworkConnection_TIME_WAIT
    expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机TIME_WAIT连接数过高"
      description: "主机: 【{{ $labels.instance }}】 TIME_WAIT连接数超过1000, 当前TIME_WAIT连接数: {{ $value }}"
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机网卡入口流量过高"
      description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 入口流量超过 (> 100 MB/s), 当前值: {{ $value }}"
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机网卡出口流量过高"
      description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 出口流量超过 (> 100 MB/s), 当前值: {{ $value }}"
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘读取速率过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 读取速度超过(50 MB/s), 当前值: {{ $value }}"
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘写入速率过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 写入速度超过(50 MB/s), 当前值: {{ $value }}"
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机分区Inode节点不足"
      description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }} 分区inode节点不足 (可用值小于{{ $value }}%)"
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘Read延迟过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Read延迟过高 (read operations > 100ms), 当前延迟值: {{ $value }}ms"
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘Write延迟过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Write延迟过高 (write operations > 100ms), 当前延迟值: {{ $value }}ms"
EOF
Blackbox.rules

cat > /data/prometheus/conf/rules/Blackbox.yaml << 'EOF'
groups:
- name: Blackbox.rules
  rules:
  - alert: HostConnectionFailure
    expr: probe_success{job="ping-status"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Host Connection Failure
      description: "主机 【{{ $labels.instance }}】 cannot be connected"
  - alert: ServiceConnectionFailure
    expr: probe_success{job="port-status"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Service Connection Failure
      description: "服务 【{{ $labels.server }}】 on 主机 【{{ $labels.instance }}】 cannot be connected"
  - alert: BlackboxSlowProbeOnServer
    expr: avg_over_time(probe_duration_seconds{job="port-status"}[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Service probe timeout
      description: "服务 【{{ $labels.server }}】 on 主机 【{{ $labels.instance }}】Blackbox probe took more than 1s to complete, Current Value: {{ $value }}s"
  - alert: BlackboxSlowProbeOnWebsite
    expr: avg_over_time(probe_duration_seconds{job="http-status"}[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Service probe timeout
      description: "网站 【{{ $labels.instance }}】 Blackbox probe took more than 1s to complete, Current Value: {{ $value }}s"
  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
    for: 0m
    labels:
      severity: critical
      service: web
    annotations:
      title: Blackbox probe HTTP failure
      description: "网站: 【{{ $labels.instance }}】HTTP status code is exception, Current status code: {{ $value }}"
  - alert: BlackboxSslCertificateWillExpireSoonIn30days
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
    for: 0m
    labels:
      severity: warning
    annotations:
      title: Blackbox SSL certificate will expire soon
      description: "网站: 【{{ $labels.instance }}】 SSL certificate expires in 30 days"
  - alert: BlackboxSslCertificateWillExpireSoonIn3days
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Blackbox SSL certificate will expire soon
      description: "网站: 【{{ $labels.instance }}】 SSL certificate expires in 3 days"
  - alert: BlackboxSslCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Blackbox SSL certificate expired
      description: "网站: 【{{ $labels.instance }}】 SSL certificate has expired already"
  - alert: BlackboxProbeSlowHttp
    expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Blackbox probe slow HTTP
      description: "网站: 【{{ $labels.instance }}】HTTP request took more than 1s, Current Value: {{ $value }}s"
  - alert: BlackboxProbeSlowPing
    expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Blackbox probe slow ping
      description: "主机: 【{{ $labels.instance }}】Blackbox ping took more than 1s, Current Value: {{ $value }}s"
EOF
Mysql.rules

cat > /data/prometheus/conf/rules/Mysql.yaml << 'EOF'
groups:
- name: Mysql.rules
  rules:
  - alert: MysqlDown
    expr: mysql_up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'MySQL down'
      description: "Mysql实例: 【{{ $labels.instance }}】, MySQL instance is down"
  - alert: MysqlRestarted
    expr: mysql_global_status_uptime < 60
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'MySQL Restarted'
      description: "Mysql实例: 【{{ $labels.instance }}】, MySQL has just been restarted, less than one minute ago"
  - alert: MysqlTooManyConnections
    expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL too many connections (> 80%)'
      description: "Mysql实例: 【{{ $labels.instance }}】, More than 80% of MySQL connections are in use, Current Value: {{ $value }}%"
  - alert: MysqlThreadsRunningHigh
    expr: mysql_global_status_threads_running > 40
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL Threads_Running High'
      description: "Mysql实例: 【{{ $labels.instance }}】, Threads_Running above the threshold(40), Current Value: {{ $value }}"
  - alert: MysqlQpsHigh
    expr: sum by (instance) (rate(mysql_global_status_queries[2m])) > 500
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL QPS High'
      description: "Mysql实例: 【{{ $labels.instance }}】, MySQL QPS above 500"
  - alert: MysqlSlowQueries
    expr: increase(mysql_global_status_slow_queries[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL slow queries'
      description: "Mysql实例: 【{{ $labels.instance }}】, has some new slow query."
  - alert: MysqlTooManyAbortedConnections
    expr: round(increase(mysql_global_status_aborted_connects[5m])) > 20
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL too many Aborted connections in 5 minutes'
      description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} Aborted connections within 5 minutes"
  - alert: MysqlTooManyAbortedClients
    expr: round(increase(mysql_global_status_aborted_clients[120m])) > 10
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL too many Aborted connections in 2 hours'
      description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} Aborted Clients within 2 hours"
  - alert: MysqlSlaveIoThreadNotRunning
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'MySQL Slave IO thread not running'
      description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave IO thread not running"
  - alert: MysqlSlaveSqlThreadNotRunning
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'MySQL Slave SQL thread not running'
      description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave SQL thread not running"
  - alert: MysqlSlaveReplicationLag
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'MySQL Slave replication lag'
      description: "Mysql实例: 【{{ $labels.instance }}】, MySQL replication lag"
  - alert: MysqlInnodbLogWaits
    expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'MySQL InnoDB log waits'
      description: "Mysql实例: 【{{ $labels.instance }}】, innodb log writes stalling"
EOF
Redis.rules

cat > /data/prometheus/conf/rules/redis.yaml << 'EOF'
groups:
- name: Redis.rules
  rules:
  - alert: RedisDown
    expr: redis_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Redis down'
      description: "Redis实例: 【{{ $labels.instance }}】, Redis instance is down"
  - alert: RedisMissingMaster
    expr: count(redis_instance_info{role="master"}) < 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis missing master'
      description: "Redis cluster has no node marked as master."
  - alert: RedisTooManyMasters
    expr: count(redis_instance_info{role="master"}) > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis too many masters'
      description: "Redis cluster has too many nodes marked as master."
  - alert: RedisDisconnectedSlaves
    expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis disconnected slaves'
      description: "Redis not replicating for all slaves. Consider reviewing the redis replication status."
  - alert: RedisReplicationBroken
    expr: delta(redis_connected_slaves[1m]) < 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Redis replication broken'
      description: "Redis实例: 【{{ $labels.instance }}】,Redis instance lost a slave"
  - alert: RedisClusterFlapping
    expr: changes(redis_connected_slaves[1m]) > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis cluster flapping'
      description: "Redis实例: 【{{ $labels.instance }}】,Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)."
  - alert: RedisMissingBackup
    expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Redis missing backup'
      description: "Redis实例: 【{{ $labels.instance }}】,Redis has not been backuped for 24 hours"
  - alert: RedisOutOfConfiguredMaxmemory
    expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Redis out of configured maxmemory'
      description: "Redis实例: 【{{ $labels.instance }}】,Redis is running out of configured maxmemory (> 90%), Current Value: {{ $value }}"
  - alert: RedisTooManyConnections
    expr: redis_connected_clients > 100
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Redis too many connections'
      description: "Redis实例: 【{{ $labels.instance }}】, Redis instance has too many connections, Current Value: {{ $value }}"
  - alert: RedisNotEnoughConnections
    expr: redis_connected_clients < 5
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Redis not enough connections'
      description: "Redis实例: 【{{ $labels.instance }}】, Redis instance should have more connections (> 5), Current Value: {{ $value }}"
  - alert: RedisRejectedConnections
    expr: increase(redis_rejected_connections_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Redis rejected connections'
      description: "Redis实例: 【{{ $labels.instance }}】, Some connections to Redis has been rejected, Current Value: {{ $value }}"
EOF
Elasticsearch.rules

cat > /data/prometheus/conf/rules/elasticsearch.yaml << 'EOF'
groups:
- name: Elasticsearch.rules
  rules:
  - alert: ElasticsearchHeapUsageTooHigh
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      title: "Elasticsearch Heap Usage Too High"
      description: "主机: 【{{ $labels.instance }}】, The heap usage is over 90%, Current Value: {{ $value }}"
  - alert: ElasticsearchHeapUsageWarning
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Heap Usage warning'
      description: "主机: 【{{ $labels.instance }}】, The heap usage is over 80%, Current Value: {{ $value }}"
  - alert: ElasticsearchDiskOutOfSpace
    expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch disk out of space'
      description: "主机: 【{{ $labels.instance }}】, The disk usage is over 90%, Current Value: {{ $value }}"
  - alert: ElasticsearchDiskSpaceLow
    expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch disk space low'
      description: "主机: 【{{ $labels.instance }}】, The disk usage is over 80%, Current Value: {{ $value }}"
  - alert: ElasticsearchClusterRed
    expr: elasticsearch_cluster_health_status{color="red"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Cluster Red'
      description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Red status"
  - alert: ElasticsearchClusterYellow
    expr: elasticsearch_cluster_health_status{color="yellow"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Cluster Yellow'
      description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Yellow status"
  - alert: ElasticsearchHealthyNodes
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Nodes'
      description: "Missing node in Elasticsearch cluster"
  - alert: ElasticsearchHealthyDataNodes
    expr: elasticsearch_cluster_health_number_of_data_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Data Nodes'
      description: "Missing data node in Elasticsearch cluster"
  - alert: ElasticsearchRelocatingShards
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch relocating shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch is relocating shards"
  - alert: ElasticsearchRelocatingShardsTooLong
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch relocating shards too long'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been relocating shards for 15min"
  - alert: ElasticsearchInitializingShards
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch initializing shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch is initializing shards"
  - alert: ElasticsearchInitializingShardsTooLong
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch initializing shards too long'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been initializing shards for 15 min"
  - alert: ElasticsearchUnassignedShards
    expr: elasticsearch_cluster_health_unassigned_shards > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch unassigned shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has unassigned shards"
  - alert: ElasticsearchPendingTasks
    expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch pending tasks'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has pending tasks. Cluster works slowly, Current Value: {{ $value }}"
  - alert: ElasticsearchNoNewDocuments
    expr: increase(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch no new documents'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch No new documents for 10 min!"
EOF
kafka.rules

cat > /data/prometheus/conf/rules/kafka.yaml << 'EOF'
groups:
- name: kafka.rules
  rules:
  - alert: KafkaTopicsReplicas
    expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Kafka topics replicas less than 3'
      description: "Topic: {{ $labels.topic }} partition less than 3, Current Value: {{ $value }}"
  - alert: KafkaConsumersGroupLag
    expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Kafka consumers group 消费滞后'
      description: "Kafka consumers group 消费滞后 (Lag > 50), Lag值: {{ $value }}"
  - alert: KafkaConsumersTopicLag
    expr: sum(kafka_consumergroup_lag) by (topic) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Kafka Topic 消费滞后'
      description: "Kafka Topic 消费滞后 (Lag > 50), Lag值: {{ $value }}"
EOF
Docker.rules

cat > /data/prometheus/conf/rules/Docker.yaml << 'EOF'
groups:
- name: Docker.rules
  rules:
  - alert: DockerInstanceDown
    expr: up{job="cAdvisor"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Docker Instance down'
      description: "容器实例: 【{{ $labels.instance }}】has been down for more than 1 minute"
  - alert: ContainerKilled
    expr: time() - container_last_seen{name!=""} > 60
    for: 1m
    labels:
      severity: critical
    annotations:
      title: "A Container has disappeared"
      description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 has disappeared"
  - alert: ContainerCpuUsage
    expr: (sum by(instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[3m])) * 100 ) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: "Container CPU usage above 80%"
      description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 CPU usage is above 80%, Current Value: {{ $value }}"
  - alert: ContainerMemoryUsage
    expr: (sum by(instance, name) (container_memory_working_set_bytes{name!=""}) / sum by(instance, name) (container_spec_memory_limit_bytes{name!=""} > 0 ) * 100 ) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: "Container Memory usage above 80%"
      description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 Memory usage is above 80%, Current Value: {{ $value }}"
  - alert: ContainerVolumeUsage
    expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      title: "Container Volume usage above 80%"
      description: "Container Name 【{{ $labels.name }}】 on 主机【{{ $labels.instance }}】 Volume usage is above 80%, Current Value: {{ $value }}"
EOF
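After the rule files are loaded (see the hot-reload section above), the loaded groups and any firing alerts can be inspected through the Prometheus HTTP API. A quick check; jq is assumed to be installed and is optional:

# List loaded rule group names
curl -s http://3.1.101.39:9090/api/v1/rules | jq '.data.groups[].name'
# List currently active alerts with their state (pending/firing)
curl -s http://3.1.101.39:9090/api/v1/alerts | jq '.data.alerts[] | {alert: .labels.alertname, state: .state}'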