PROMETHEUS

使用prometheus

安装启动

下载地址: https://prometheus.io/download/

[root@node1 webcode]# tar xf prometheus-2.12.0.linux-amd64.tar.gz
[root@node1 prometheus-2.12.0.linux-amd64]# ll
总用量 132252
drwxr-xr-x 2 3434 3434     4096 8月  18 23:37 console_libraries
drwxr-xr-x 2 3434 3434     4096 8月  18 23:37 consoles
-rw-r--r-- 1 3434 3434    11357 8月  18 23:37 LICENSE
-rw-r--r-- 1 3434 3434     2770 8月  18 23:37 NOTICE
-rwxr-xr-x 1 3434 3434 84771664 8月  18 21:55 prometheus
-rw-r--r-- 1 3434 3434      926 8月  18 23:37 prometheus.yml
-rwxr-xr-x 1 3434 3434 50620988 8月  18 21:56 promtool
# 复制命令到binpath
[root@node1 prometheus-2.12.0.linux-amd64]# cp prometheus promtool /usr/local/bin/
[root@node1 prometheus-2.12.0.linux-amd64]# prometheus --version
prometheus, version 2.12.0 (branch: HEAD, revision: 43acd0e2e93f9f70c49b2267efa0124f1e759e86)
  build user:       root@7a9dbdbe0cc7
  build date:       20190818-13:53:16
  go version:       go1.12.8
[root@node1 prometheus-2.12.0.linux-amd64]# mkdir /etc/prometheus
[root@node1 prometheus-2.12.0.linux-amd64]# cp prometheus.yml /etc/prometheus/
# 检查语法
[root@node1 prometheus-2.12.0.linux-amd64]# promtool check config /etc/prometheus/prometheus.yml
Checking /etc/prometheus/prometheus.yml
  SUCCESS: 0 rule files found
# 启动服务
[root@node1 prometheus-2.12.0.linux-amd64]# prometheus --config.file /etc/prometheus/prometheus.yml
# 访问 http://localhost:9090

# 制作systemd启动服务
[root@es1 alertmanager-0.19.0.linux-amd64]# cat /usr/lib/systemd/system/prometheus.service
[Unit]
Description=prometheus start

[Service]
#Type=notify
ExecStart=/usr/local/bin/prometheus --config.file /etc/prometheus/prometheus.yml
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=42s

[Install]
WantedBy=multi-user.target

热重载

1、curl -XPOST http://localhost:9090/-/reload
# 需在启动 prometheus 时开启 --web.enable-lifecycle(Enable shutdown and reload via HTTP request.)

2、kill -HUP pid

持久化查询

[root@es1 rules]# cat /etc/prometheus/prometheus.yml
rule_files:
  - "/etc/prometheus/rules/node_rules.yml"
[root@es1 rules]# cat /etc/prometheus/rules/node_rules.yml
groups:
- name: node_rules
  interval: 10s
  rules:
  - record: instance:node_cpu:avg_rate5m
    expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100
    labels:
      metric_type: aggregation
  - record: instance:node_mem_use:percentage
    expr: 100 - (node_memory_Cached_bytes+ node_memory_Buffers_bytes+node_memory_MemFree_bytes)/node_memory_MemTotal_bytes * 100
  - record: instance:node_disk_use:percentage
    # 使用率 = 100 - 可用空间百分比
    expr: round(100 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs"} / node_filesystem_size_bytes * 100, 0.02)

服务发现

· 通过配置管理工具填充文件接收目标列表
· 查询api获取目标列表
· 使用dns记录返回目标记录

# 基于文件
[root@es1 prometheus]# cat /etc/prometheus/prometheus.yml
scrape_configs:
  - job_name: 'node'
    file_sd_configs:
    - files:
      - /etc/prometheus/targets/nodes/*.json
      refresh_interval: 1m
[root@es1 nodes]# cat /etc/prometheus/targets/nodes/nodes.json
[{
"targets":["10.24.190.167:9100","10.25.143.113:9100","10.27.73.234:9100"],
"labels": {"region": "cn-beijing"}
}]

使用node_exporter

安装

[root@node1 webcode]# tar xf node_exporter-0.18.1.linux-amd64.tar.gz
[root@node1 webcode]# cd node_exporter-0.18.1.linux-amd64
[root@node1 node_exporter-0.18.1.linux-amd64]# ll
总用量 16500
-rw-r--r-- 1 3434 3434    11357 6月   5 00:50 LICENSE
-rwxr-xr-x 1 3434 3434 16878582 6月   5 00:41 node_exporter
-rw-r--r-- 1 3434 3434      463 6月   5 00:50 NOTICE
[root@node1 node_exporter-0.18.1.linux-amd64]# cp node_exporter /usr/local/bin/
[root@node1 node_exporter-0.18.1.linux-amd64]# node_exporter --version
node_exporter, version 0.18.1 (branch: HEAD, revision: 3db77732e925c08f675d7404a8c46466b2ece83e)
  build user:       root@b50852a1acba
  build date:       20190604-16:41:18
  go version:       go1.12.5

使用

[root@node1 node_exporter]# mkdir -p /var/lib/node_exporter/textfile_collector
# 配置文本文件 (textfile collector)
[root@node1 node_exporter]# echo 'metadata{role="dabing", datacenter="BJ"} 1' > /var/lib/node_exporter/textfile_collector/metadata.prom
[root@node1 node_exporter]# cat /var/lib/node_exporter/textfile_collector/metadata.prom
metadata{role="dabing", datacenter="BJ"} 1

# 收集系统服务的信息
# --collector.systemd 开启
# --collector.systemd.unit-whitelist 正则
[root@node1 node_exporter]# node_exporter --collector.textfile.directory="/var/lib/node_exporter/textfile_collector" --collector.systemd --collector.systemd.unit-whitelist=".*" --web.listen-address="0.0.0.0:9600" --web.telemetry-path="/node_metrics"

#配置prometheus
[root@node1 node_exporter]# cat /etc/prometheus/prometheus.yml
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['172.20.0.23:9090']
  # 新增收集的node信息
  - job_name: "node"
    metrics_path: "/node_metrics"
    static_configs:
    - targets: ['172.20.0.23:9600']
    # 可配置过滤
    params:
      collect[]:
      - cpu
      - meminfo
      - diskstats
# reload prometheus
[root@node1 node_exporter]# curl -XPOST http://localhost:9090/-/reload

使用grafana

[root@es1 prometheus]# wget https://dl.grafana.com/oss/release/grafana-6.3.5-1.x86_64.rpm
[root@es1 prometheus]# yum install grafana-6.3.5-1.x86_64.rpm  -y
[root@es1 prometheus]# systemctl start grafana-server
[root@es1 prometheus]# netstat -lntp |grep gra
Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name
tcp6       0      0 :::3000                 :::*                    LISTEN      14250/grafana-serve
1、初始用户名和密码都是admin。
2、新增数据源prometheus,填入的地址本地需要可以访问。

使用alertManager

[root@es1 prometheus]# tar xf alertmanager-0.19.0.linux-amd64.tar.gz
[root@es1 prometheus]# cd alertmanager-0.19.0.linux-amd64
[root@es1 alertmanager-0.19.0.linux-amd64]# cp alertmanager /usr/local/bin/
[root@es1 alertmanager-0.19.0.linux-amd64]# alertmanager --version
alertmanager, version 0.19.0 (branch: HEAD, revision: 7aa5d19fea3f58e3d27dbdeb0f2883037168914a)
  build user:       root@587d0268f963
  build date:       20190903-15:01:40
  go version:       go1.12.8
# systemd 启动配置
[root@es1 alertmanager-0.19.0.linux-amd64]# cat /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager start

[Service]
#Type=notify
ExecStart=/usr/local/bin/alertmanager --config.file /etc/alertmanager/alertmanager.yml
Restart=on-failure
RestartSec=42s

[Install]
WantedBy=multi-user.target

# alertmanager配置文件
[root@es1 alertmanager-0.19.0.linux-amd64]# cat /etc/alertmanager/alertmanager.yml
global:
  smtp_smarthost: "smtp.exmail.qq.com:465"   # 必须带端口 (host:port)
  smtp_from: "itd@9fbank.com.cn"
  smtp_auth_username: "itd@9fbank.com.cn"
  smtp_auth_password: "<your-smtp-password>"   # 请勿在文档中保留真实密码
  smtp_require_tls: true
  resolve_timeout: 5m

route:
  group_by: ['alertname']		# 根据标签分组
  group_wait: 10s 					# 等待该组,一起报警
  group_interval: 10s				# 报警间隔时间
  repeat_interval: 1h				# 重复报警时间
  receiver: 'email'  # 报警接收者

receivers:
- name: "email"
  email_configs:
  - to: "yangbing@9fbank.com.cn"
  
# prometheus接入alertmanager
# 配置报警
[root@es1 prometheus]# cat rules/node_alerts.yml
groups:
- name: node_alerts
  rules:
  - alert: HighNodeCpu
    expr: instance:node_cpu:avg_rate5m > 4 # 持久化查询
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: High Node Cpu in 1min 
      console: check it
  - alert: DiskFullIn4Hour
    expr: predict_linear(node_filesystem_avail_bytes{fstype=~"ext.*|xfs"}[1h], 4*3600)  < 0
    for: 1m 
    labels:
      severity: critical

# 22