Prometheus – 堂-DayDayUP

# 组件下载地址：
https://github.com/prometheus/

vim /etc/systemd/system/prometheus.service
# systemd unit that runs the Prometheus server straight from its unpacked
# 2.19.2 release directory.
[Unit]
Description=Prometheus Monitoring System
# Documentation= accepts a list of URIs only; free text is not a valid value.
Documentation=https://prometheus.io/docs/introduction/overview/

[Service]
# Flags passed to the server:
#   --config.file                  scrape/alerting configuration to load
#   --storage.tsdb.retention.time  keep samples for 20 days (this flag replaces
#                                  the deprecated --storage.tsdb.retention)
#   --web.listen-address           listen on port 9090 on all interfaces
# NOTE(review): neither --storage.tsdb.path nor WorkingDirectory= is set, so the
# TSDB directory is resolved relative to the service's working directory --
# confirm that location is intended before changing either.
ExecStart=/usr/local/prometheus/prometheus-2.19.2.linux-amd64/prometheus \
    --config.file=/usr/local/prometheus/prometheus-2.19.2.linux-amd64/prometheus.yml \
    --storage.tsdb.retention.time=20d \
    --web.listen-address=:9090
# Restart only after an unclean exit; a normal stop is left alone.
Restart=on-failure

[Install]
# Start with the normal multi-user boot target when the unit is enabled.
WantedBy=multi-user.target

cat prometheus.yml
# Settings that apply to every scrape job and rule group unless overridden.
global:
  # Scrape targets every 15 seconds (the built-in default is 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the built-in default is 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is not set here, so the global default of 10s applies.

# Alertmanager instances that receive the alerts fired by the rules.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Disabled alternative that addresses Alertmanager by host name:
            # - alertmanager:9093
            - '127.0.0.1:9093'

# Rule files loaded at startup; their rules are then evaluated periodically,
# once per global 'evaluation_interval'.
rule_files:
  - '/usr/local/prometheus/prometheus-2.19.2.linux-amd64/rules.yml'
  # - "second_rules.yml"

# Scrape configurations: one entry per job
# (prometheus, nodes, mysqld, pushgateway, jmx).
scrape_configs:
  # Prometheus scraping its own metrics endpoint. The job name is attached to
  # every series scraped by this job as the label `job=<job_name>`.
  - job_name: prometheus
    # metrics_path and scheme are not set, so the defaults '/metrics' and
    # 'http' apply.
    static_configs:
      - targets:
          - '192.168.254.216:9090'
  # Port-9100 endpoints of the monitored hosts
  # (presumably node_exporter -- confirm against the hosts).
  - job_name: nodes
    static_configs:
      # All hosts share one target group because no per-host labels are set.
      - targets:
          - '192.168.254.216:9100'
          - '192.168.110.233:9100'
          - '192.168.110.234:9100'
          - '192.168.253.120:9100'
          - '192.168.253.121:9100'
          # Disabled host (note the different port, 9110):
          # - '192.168.253.122:9110'
  # Port-9104 endpoints of the database hosts
  # (presumably mysqld_exporter -- confirm against the hosts).
  - job_name: mysqld
    static_configs:
      # Both hosts share one target group because no per-host labels are set.
      - targets:
          - '192.168.253.121:9104'
          - '192.168.110.234:9104'

  # Scrapes the endpoint on port 9091 of the Prometheus host.
  # honor_labels is left at its default (false), so `job`/`instance` labels
  # carried by pushed metrics are exposed as `exported_job`/`exported_instance`.
  # NOTE(review): the system-thread alert annotations in rules.yml reference
  # those exported_* labels -- presumably for metrics pushed through this
  # gateway; confirm before enabling honor_labels.
  - job_name: pushgateway
    static_configs:
      - targets:
          - '192.168.254.216:9091'

  # JMX metric endpoints on 192.168.253.120, one target group per application
  # so that each endpoint carries its own `app` label.
  - job_name: jmx
    static_configs:
      # Application "safe-pt" on port 9998.
      - targets:
          - '192.168.253.120:9998'
        labels:
          app: safe-pt
          job: jmx
      # Application "buswash" on port 9997.
      - targets:
          - '192.168.253.120:9997'
        labels:
          app: buswash
          job: jmx

cat rules.yml
groups:
# Host liveness, based on the `up` series of each scraped target.
- name: 主机存活
  rules:
    # Fires once `up` has been 0 for a target continuously for 5 minutes.
    - alert: InstanceDown
      expr: up == 0
      for: 5m
      labels:
        severity: page
      annotations:
        summary: 'Instance {{ $labels.instance }} down'
        description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'

# Memory usage: total minus (free + buffers + cached), as a percentage of total.
- name: 内存使用率
  rules:
    # Fires once the computed usage has stayed above 80% for 2 minutes.
    - alert: 内存负载警报
      # Folded scalar: the lines below are joined with single spaces into one
      # PromQL expression.
      expr: >-
        ((node_memory_MemTotal_bytes
        -(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)
        )/node_memory_MemTotal_bytes
        ) * 100 > 80
      for: 2m
      labels:
        severity: page
      annotations:
        summary: '主机 {{ $labels.instance }} 内存使用率超过80%'
        description: '{{ $labels.instance }} of job {{ $labels.job }} 主机2分钟内，内存使用率超过80%。'

# CPU usage: 100 minus the idle percentage, where idle is the per-second idle
# rate over a 60s window averaged by instance, job and env.
- name: CPU使用率
  rules:
    # Fires once the computed usage has stayed above 80% for 1 minute.
    - alert: CPU使用率警报
      expr: '100 - ((avg by (instance,job,env)(irate(node_cpu_seconds_total{mode="idle"}[60s]))) *100) > 80'
      for: 1m
      labels:
        severity: page
      annotations:
        summary: '主机 {{ $labels.instance }} CPU使用率超过80%'
        description: '{{ $labels.instance }} of job {{ $labels.job }} 主机1分钟内，CPU使用率超过80%。'

# Free disk space: available bytes as a percentage of filesystem size.
- name: 磁盘剩余空间
  rules:
    # Fires once a filesystem has had less than 20% available for 2 minutes.
    # Both selectors apply the same exclusions: the listed filesystem types,
    # the /etc/auto.misc device, and the listed mount points.
    - alert: 磁盘空闲率警报
      # Folded scalar: the lines below are joined with single spaces into one
      # PromQL expression.
      expr: >-
        (node_filesystem_avail_bytes{fstype !~ "nfs|rpc_pipefs|rootfs|tmpfs|fuse.gvfsd-fuse",device!~"/etc/auto.misc",mountpoint !~ "/mnt/ospatch|/mnt/dss|/mnt/osservice|/boot|/net|/selinux"}
        /node_filesystem_size_bytes{fstype !~ "nfs|rpc_pipefs|rootfs|tmpfs|fuse.gvfsd-fuse",device!~"/etc/auto.misc",mountpoint !~ "/mnt/ospatch|/mnt/dss|/mnt/osservice|/boot|/net|/selinux"}
        ) * 100 < 20
      for: 2m
      labels:
        severity: page
      annotations:
        summary: '主机 {{ $labels.instance }} 磁盘空闲不足20%'
        description: '{{ $labels.instance }} of job {{ $labels.job }} 主机磁盘空闲不足20%! '

# MySQL availability, based on the `mysql_up` series.
- name: 数据库存活
  rules:
    # Fires once `mysql_up` has been 0 continuously for 1 minute.
    - alert: MysqlDown
      expr: mysql_up == 0
      for: 1m
      labels:
        severity: page
      annotations:
        summary: 'Mysqld {{ $labels.instance }} down'
        description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'

# System thread count, based on the `system_threads` series.
- name: 系统线程
  rules:
    # Fires once `system_threads` has stayed above 5000 for 1 minute.
    # The annotations read `exported_instance`/`exported_job` rather than
    # `instance`/`job`.
    # NOTE(review): presumably the metric arrives through the pushgateway job,
    # which renames pushed labels this way -- confirm.
    - alert: 系统线程数量异常警报
      expr: system_threads > 5000
      for: 1m
      labels:
        severity: page
      annotations:
        summary: '主机 {{ $labels.exported_instance }} 系统线程数量异常，超过5000！'
        description: '{{ $labels.exported_instance }} of job {{ $labels.exported_job }} 系统线程数量异常，超过5000!。'