Prometheus数据采集-exporter

来自Linux78|wiki

exporter介绍

exporter是prometheus监控中重要的组成部分,负责数据指标的采集。官方提供的插件有node_exporter、blackbox_exporter、mysqld_exporter、snmp_exporter等,第三方的插件有redis_exporter、cadvisor等。

node_exporter

node_exporter主要用来采集机器的性能指标数据,包括cpu,内存,磁盘,io等基本信息。上边文章介绍prometheus server时已详细介绍了node_exporter,这里就不再赘述。

mysqld_exporter

mysqld_exporter主要用于监控采集mysql数据库服务器相关指标。

cd /data/
wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.10.0/mysqld_exporter-0.10.0.linux-amd64.tar.gz
tar -xf mysqld_exporter-0.10.0.linux-amd64.tar.gz
mv mysqld_exporter-0.10.0.linux-amd64 mysqld_exporter-0.10.0
cd mysqld_exporter-0.10.0/
ls
LICENSE  mysqld_exporter  NOTICE
mkdir log

mysqld_exporter需要连接到数据库,创建一个登录数据库的用户

mysql> GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost' identified by 'Cds20180808!';
mysql> flush privileges;

创建用于连接数据库的配置文件

vim .my.cnf 
[client]
user=exporter
password=Cds20180808!

创建supervisor启动mysqld_exporter

vim /etc/supervisor/conf.d/mysqld_exporter.conf
[program:mysqld_exporter]
command = /data/mysqld_exporter-0.10.0/mysqld_exporter -config.my-cnf="/data/mysqld_exporter-0.10.0/.my.cnf"
autostart = true
autorestart = true
startsecs = 5
startretries = 3
redirect_stderr = true
stdout_logfile=/data/mysqld_exporter-0.10.0/log/out-mysqld_exporter.log
stderr_logfile=/data/mysqld_exporter-0.10.0/log/err-mysqld_exporter.log
stdout_logfile_maxbytes = 20MB
stdout_logfile_backups = 20

启动mysqld_exporter

supervisorctl update mysqld_exporter
supervisorctl status mysqld_exporter

redis_exporter

redis_exporter主要用于监控采集redis数据库服务器相关指标。

cd /data/
wget https://github.com/oliver006/redis_exporter/releases/download/v0.24.0/redis_exporter-v0.24.0.linux-amd64.tar.gz
tar -xf redis_exporter-v0.24.0.linux-amd64.tar.gz
mkdir /data/redis_exporter-v0.24.0
mv redis_exporter redis_exporter-v0.24.0
cd redis_exporter-v0.24.0/
mkdir log

配置supervisor启动redis_exporter

vim /etc/supervisor/conf.d/redis_exporter.conf
[program:redis_exporter]
command = /data/redis_exporter-v0.24.0/redis_exporter -redis.addr 10.13.225.112:6379 -redis.password cds-china
autostart = true
startsecs = 5
startretries = 3
redirect_stderr = true
stdout_logfile = /data/redis_exporter-v0.24.0/log/out-redis_exporter.log
stderr_logfile = /data/redis_exporter-v0.24.0/log/err-redis_exporter.log
stdout_logfile_maxbytes = 20MB
stdout_logfile_backups = 20

启动redis_exporter

supervisorctl update redis_exporter
supervisorctl status redis_exporter

blackbox_exporter

blackbox_exporter是prometheus社区提供的官方黑盒监控解决方案,其允许用户通过http、https、dns、tcp以及icmp的方式对网络进行探测。我们利用icmp探针可以检查网络是否通畅,利用http、https可以检查网页是否可以正常访问,利用tcp检测服务端口判断服务是否正常。

cd /data/
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.13.0/blackbox_exporter-0.13.0.linux-amd64.tar.gz
tar -xf blackbox_exporter-0.13.0.linux-amd64.tar.gz
mv blackbox_exporter-0.13.0.linux-amd64 blackbox_exporter-0.13.0

修改blackbox_exporter配置文件

cd blackbox_exporter-0.13.0/

vim blackbox.yml 
modules:
  http_2xx:
    prober: http
    timeout: 5s
    http:
      preferred_ip_protocol: "ip4"
      no_follow_redirects: true
      valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: [200,302]  # Defaults to 2xx
      method: GET
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
    timeout: 5s
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    timeout: 5s
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
    timeout: 5s

配置systemd启动blackbox_exporter

vim /usr/lib/systemd/system/blackbox_exporter.service 
 
[Unit]
Description=Prometheus blackbox exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target
 
[Service]
User=root
Type=simple
WorkingDirectory=/data/blackbox_exporter-0.13.0/
ExecStart=/data/blackbox_exporter-0.13.0/blackbox_exporter --config.file=/data/blackbox_exporter-0.13.0/blackbox.yml
Restart=on-failure
 
[Install]
WantedBy=multi-user.target

systemctl daemon-reload
systemctl start blackbox_exporter
systemctl enable blackbox_exporter
systemctl status blackbox_exporter

cadvisor

cadvisor是google开源的用于监控容器运行的工具。

  1. 下载二进制文件
cd /data/
wget https://github.com/google/cadvisor/releases/download/v0.33.0/cadvisor
mkdir cadvisor-v0.33.0
mv cadvisor cadvisor-v0.33.0/
cd cadvisor-v0.33.0/
mkdir log

配置supervisor启动cadvisor

vim /etc/supervisor/conf.d/cadvisor-server.conf 
[program:cadvisor-server]
command = /data/cadvisor-v0.33.0/cadvisor
autostart = true
autorestart = true
startsecs = 5
startretries = 3
redirect_stderr = true
stdout_logfile=/data/cadvisor-v0.33.0/log/out-cadvisor.log
stderr_logfile=/data/cadvisor-v0.33.0/log/err-cadvisor.log
stdout_logfile_maxbytes = 20MB
stdout_logfile_backups = 20

supervisorctl update cadvisor-server
supervisorctl status cadvisor-server

配置prometheus服务端

上面布置了exporter,需要配置prometheus server来拿到exporter采集到数据。

  1. 修改prometheus配置文件,配置文件上篇文章已经介绍了,这里就不再解释


vim prometheus.yml 
#my global config
global:
  scrape_interval:     30s # Scrape targets every 30 seconds. The default is every 1 minute.
  evaluation_interval: 25s # Evaluate rules every 25 seconds. The default is every 1 minute.
  scrape_timeout: 25s # Per-scrape timeout of 25 seconds. The global default is 10s.
 
#Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 10.13.103.152:9093
      - 10.128.120.218:9093
 
#Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/data/prometheus-2.4.3/rules/node_down.yml"
  - "/data/prometheus-2.4.3/rules/memory_over.yml"
  - "/data/prometheus-2.4.3/rules/disk_over.yml"
  - "/data/prometheus-2.4.3/rules/cpu_over.yml"
  - "/data/prometheus-2.4.3/rules/http_check.yml"
  - "/data/prometheus-2.4.3/rules/tcp_check.yml"
  - "/data/prometheus-2.4.3/rules/mysql_check.yml"
  - "/data/prometheus-2.4.3/rules/redis_down_check.yml"
  - "/data/prometheus-2.4.3/rules/redis_rule_check.yml"
  - "/data/prometheus-2.4.3/rules/container_down.yml"
  - "/data/prometheus-2.4.3/rules/ping_check.yml"
 
#A scrape configuration containing exactly one endpoint to scrape:
#Here it's Prometheus itself.
scrape_configs:
  #The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
 
    #metrics_path defaults to '/metrics'
    #scheme defaults to 'http'.
 
    static_configs:
    - targets: ['localhost:9090']
  
  - job_name: 'GICHOST'
    file_sd_configs:
    - files: ['./node_exporter/host.json']
 
  - job_name: 'federate'
    scrape_interval: 30s
    scrape_timeout: 25s
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        - '{job=~"kubernetes-.*"}'
    static_configs:
      - targets:
        - '10.13.103.12:9090'
 
  - job_name: 'blackbox-http'
    scrape_interval: 5s
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    file_sd_configs:
    - files: ['./blackbox_exporter/http.json']
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 10.13.103.151:9115
 
  - job_name: 'blackbox-tcp'
    scrape_interval: 5s
    metrics_path: /probe
    params:
      module: [tcp_connect]
    file_sd_configs:
    - files: ['./blackbox_exporter/tcp.json']
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 10.13.103.151:9115
 
  - job_name: 'mysqld-exporter'
    file_sd_configs:
    - files: ['./mysqld_exporter/mysqld.json']
 
  - job_name: 'blackbox-ping'
    scrape_interval: 5s
    metrics_path: /probe
    params:
      module: [icmp]
    file_sd_configs:
    - files: ['./blackbox_exporter/ping.json']
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 10.13.103.151:9115
 
  - job_name: 'redis-exporter'
    file_sd_configs:
    - files: ['./redis_exporter/redis.json']
 
  - job_name: 'container-exporter'
    file_sd_configs:
    - files: ['./container_exporter/container.json']

配置prometheus 主机监控文件和告警规则

8.1 主机监控文件

  1. mysqld_exporter主机监控文件

root@prometheus:/data/prometheus-2.4.3# mkdir mysqld_exporter
root@prometheus:/data/prometheus-2.4.3# cd mysqld_exporter/
root@prometheus:/data/prometheus-2.4.3/mysqld_exporter# vim mysqld.json
[
  {
    "targets": [
      "10.13.100.51:9104"
    ],
    "labels": {
      "dbinstance": "db100.51"
    }
  }
]

  1. redis_exporter主机监控文件

root@prometheus:/data/prometheus-2.4.3# mkdir redis_exporter
root@prometheus:/data/prometheus-2.4.3# cd redis_exporter/
root@prometheus:/data/prometheus-2.4.3/redis_exporter# vim redis.json
[
  {
    "targets": [
      "10.13.0.235:9121",
      "10.13.0.236:9121"
    ],
    "labels": {
      "service": "redis"
    }
  }
]

  1. blackbox_exporter主机监控文件

root@prometheus:/data/prometheus-2.4.3# mkdir blackbox_exporter
root@prometheus:/data/prometheus-2.4.3# cd blackbox_exporter
root@prometheus:/data/prometheus-2.4.3/blackbox_exporter# vim http.json
[
  {
    "targets": [
      "http://10.13.227.134:6011/health",
      "http://10.13.102.134:6011/health"
    ],
    "labels": {
      "service": "WSREP"
    }
  }
]
root@prometheus:/data/prometheus-2.4.3/blackbox_exporter# vim tcp.json
[
  {
    "targets": [
      "10.128.107.53:13371",
      "10.128.107.57:13371"
    ],
    "labels": {
      "service": "vspc"
    }
  }
]
root@prometheus:/data/prometheus-2.4.3/blackbox_exporter# vim ping.json
[
  {
    "targets": [
      "10.13.101.131",
      "10.13.101.132",
      "10.13.101.141"
    ],
    "labels": {
      "service": "mysql"
    }
  }
]

  1. 配置cadvisor主机监控文件

root@prometheus:/data/prometheus-2.4.3# mkdir container_exporter
root@prometheus:/data/prometheus-2.4.3# cd container_exporter/
root@prometheus:/data/prometheus-2.4.3/container_exporter# vim container.json
[
  {
    "targets": [
      "10.13.103.153:8080",
      "10.128.87.5:8080"
    ],
    "labels": {
      "service": "docker-monitor"
    }
  }
]

8.2 告警规则

node_exporter监控获取的cpu,磁盘,实例存活规则上篇文章已经介绍,这里不再介绍

root@prometheus:/data/prometheus-2.4.3# cd rules
root@prometheus:/data/prometheus-2.4.3/rules# ls
container_down.yml  disk_over.yml  mysql_check.yml  node_down.yml  redis_down_check.yml  tcp_check.yml
cpu_over.yml  http_check.yml  memory_over.yml  ping_check.yml  redis_rule_check.yml

  1. mysqld_exporter监控mysql数据告警规则

root@prometheus:/data/prometheus-2.4.3/rules# vim mysql_check.yml
groups:
- name: MySQLStatsAlert
  rules:
  - alert: MySQL is down
    expr: mysql_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} MySQL is down"
      description: "MySQL database is down. This requires immediate action!(current value is: {{ $value }})"
  - alert: Mysql_High_QPS
    expr: rate(mysql_global_status_questions[5m]) > 8000
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: Mysql_High_QPS detected"
      description: "{{ $labels.instance }}: Mysql operation is more than 8000 per second ,(current value is: {{ $value }})"
  - alert: Mysql_Too_Many_Slow_Query
    expr: rate(mysql_global_status_slow_queries[30m]) > 3
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: Mysql_Too_Many_Slow_Query detected"
      description: "{{ $labels.instance }}: Mysql current Slow_Query Sql is more than 3 ,(current value is: {{ $value }})"
  - alert: Mysql_Deadlock
    expr: mysql_global_status_innodb_deadlocks > 300
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }}: Mysql_Deadlock detected"
      description: "{{ $labels.instance }}: Mysql Deadlock was found ,(current value is: {{ $value }})"
  - alert: open files high
    expr: mysql_global_status_innodb_num_open_files > (mysql_global_variables_open_files_limit) * 0.75
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} open files high"
      description: "Open files is high. Please consider increasing open_files_limit.(current value is: {{ $value }})"
  - alert: Used more than 80% of max connections limited
    expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections * 0.8
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Used more than 80% of max connections limited"
      description: "Used more than 80% of max connections limited.(current value is: {{ $value }})"
  - alert: InnoDB Log File size is too small
    expr: mysql_global_variables_innodb_log_file_size < 16777216
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} InnoDB Log File size is too small"
      description: "The InnoDB Log File size is possibly too small. Choosing a small InnoDB Log File size can have significant performance impacts.(current value is: {{ $value }})"
  - alert: Binary Log is disabled
    expr: mysql_global_variables_log_bin != 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Binary Log is disabled"
      description: "Binary Log is disabled. This prohibits you to do Point in Time Recovery (PiTR).(current value is: {{ $value }})"
  - alert: IO thread stopped
    expr: mysql_slave_status_slave_io_running != 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} IO thread stopped"
      description: "IO thread has stopped. This is usually because it cannot connect to the Master any more.(current value is: {{ $value }})"
  - alert: SQL thread stopped
    expr: mysql_slave_status_slave_sql_running != 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} SQL thread stopped"
      description: "SQL thread has stopped. This is usually because it cannot apply a SQL statement received from the master.(current value is: {{ $value }})"
  - alert: Slave lagging behind Master
    expr: rate(mysql_slave_status_seconds_behind_master[1m]) >30
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Slave lagging behind Master"
      description: "Slave is lagging behind Master. Please check if Slave threads are running and if there are some performance issues!(current value is: {{ $value }})"
  1. redis_exporter监控redis指标告警规则

root@prometheus:/data/prometheus-2.4.3/rules# vim redis_down_check.yml
groups:
- name: redis检测规则
  rules:
  - alert: redis存活检测
    expr: redis_up{job="redis-exporter"} == 0
    for: 1m
    annotations:
      description: "机器:{{ $labels.instance }} 所属 job:{{ $labels.job }} redis宕机,请检查!"
      summary: "redis服务"

root@prometheus:/data/prometheus-2.4.3/rules# vim redis_rule_check.yml
groups:
- name: RedisStatsAlert
  rules:
  - alert: last create rdb failed
    expr: redis_rdb_last_bgsave_status != 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: " Instance {{ $labels.instance }} rdb_last_bgsave_status  "
      description: "last create rdb failed"
  - alert: Redis linked too many clients
    expr: redis_connected_clients / redis_config_maxclients * 100 > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Redis linked clients too many"
      description: "Redis linked clients too many. This requires immediate action!"
  - alert: master link status failed
    expr: redis_master_link_up == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} link failed"
      description: "redis_master_link=0 link failed"
  - alert: last AOF failed
    expr: redis_aof_last_bgrewrite_status != 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} redis aof last rewrite duration sec"
      description: "last AOF failed"
  - alert: Redis Cluster State Wrong
    expr: redis_cluster_state != 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} redis cluster status wrong"
      description: "Redis Cluster State Wrong"
  1. blackbox_exporter监控指标告警规则

root@prometheus:/data/prometheus-2.4.3/rules# vim ping_check.yml
groups:
- name: 机器网络存活检测
  rules:
  - alert: 网络检测
    expr: probe_success{job="blackbox-ping"} == 0
    for: 1m
    annotations:
      description: "机器:{{ $labels.instance }} 所属 job:{{ $labels.job }} 网络不通或者宕机超过1分钟,请检查!"
      summary: "网络检测"

root@prometheus:/data/prometheus-2.4.3/rules# vim http_check.yml
groups:
- name: 服务检测规则
  rules:
  - alert: http服务检测
    expr: probe_success{job="blackbox-http"} == 0
    for: 1m
    annotations:
      description: "机器:{{ $labels.instance }} 所属 job:{{ $labels.job }} http状态码: {{ printf `probe_http_status_code{instance='%s'}` $labels.instance | query | first | value }} http检测失败,请检查!"
      summary: "http检测"

root@prometheus:/data/prometheus-2.4.3/rules# vim tcp_check.yml
groups:
- name: 服务检测规则
  rules:
  - alert: tcp服务检测
    expr: probe_success{job="blackbox-tcp"} == 0
    for: 1m
    annotations:
      description: "机器:{{ $labels.instance }} 所属 job:{{ $labels.job }} tcp检测失败,请检查!"
      summary: "tcp检测"
  1. cadvisor监控指标告警规则

root@prometheus:/data/prometheus-2.4.3/rules# vim container_down.yml
groups:
- name: 容器存活报警规则
  rules:
  - alert: DockerInstanceDown
    expr: absent(container_last_seen{name="core_vspc"}) == 1
    for: 1m
    annotations:
      description: "vspc client容器:{{ $labels.name }} (所属主机{{ $labels.instance }}) 已经异常退出超过1分钟,请检查!"
      summary: "容器:Instance {{ $labels.name }} 存活检测"