Prometheus-Operator: Monitoring Services Outside the Cluster

In Kubernetes we usually deploy Prometheus via the Operator, for example with the kube-prometheus-stack Helm chart:

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install my-prometheus prometheus-community/kube-prometheus-stack
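
Everything below relies on the Operator's CRDs, so it is worth confirming they are actually installed before going further:

kubectl get crd servicemonitors.monitoring.coreos.com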

That works nicely for everything running inside the Kubernetes cluster. Sometimes, however, services such as databases or storage live outside the cluster, and we do not want to run a separate Prometheus just for them, so a few extra tricks are needed.

node_exporter

Taking node_exporter as an example, first download the binary package on the node to be monitored:

cd /usr/local/bin
wget https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz

Extract the archive and copy the binary into place:

tar axvf node_exporter-1.3.1.linux-amd64.tar.gz
cp -ar node_exporter-1.3.1.linux-amd64/node_exporter ./
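
A quick sanity check that the binary is in place and runs:

/usr/local/bin/node_exporter --version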

Add a systemd service, running it as a dedicated node_exporter user:

export owner=node_exporter
useradd -m -s /bin/bash $owner

cat >/etc/systemd/system/node_exporter.service<<EOF
[Unit]
Description=Node Exporter

[Service]
User=$owner
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=default.target
EOF

Start the service and enable it at boot:

systemctl daemon-reload
systemctl start node_exporter
systemctl enable node_exporter
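
Before wiring the exporter into Prometheus, confirm it is up and serving metrics locally:

systemctl status node_exporter --no-pager
curl -s http://127.0.0.1:9100/metrics | head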

If a firewall is enabled, add a rule so that Prometheus can reach the exporter; the source range should cover the Kubernetes node addresses the scrapes originate from. With firewalld, for example:

firewall-cmd --new-zone=prometheus --permanent
firewall-cmd --zone=prometheus --add-source=172.30.109.0/24 --permanent
firewall-cmd --zone=prometheus --add-port=9100/tcp --permanent
firewall-cmd --reload
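
The resulting zone can be inspected with:

firewall-cmd --zone=prometheus --list-all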

Now add the corresponding resources in the Kubernetes cluster, assuming Prometheus itself runs in the monitoring namespace; the Endpoints and Service below live in the monitor namespace:

apiVersion: v1
kind: Endpoints
metadata:
  name: mysql-node-exporter
  namespace: monitor
subsets:
  - addresses:
    - ip: 172.30.109.88
    ports:
    - name: metrics
      port: 9100
      protocol: TCP

---

apiVersion: v1
kind: Service
metadata:
  name: mysql-node-exporter
  namespace: monitor
  labels:
    role: database
    hostname: mysql-01.vqiu.local
spec:
  ports:
  - name: metrics
    port: 9100
    targetPort: 9100
    protocol: TCP
  type: ClusterIP
  clusterIP: None

---

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: mysql-node-exporter
spec:
  endpoints:
  - path: /metrics
    port: metrics
  namespaceSelector:
    matchNames:
    - monitor
  selector:
    matchLabels:
      role: database
      hostname: mysql-01.vqiu.local
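
After applying the manifests, check that the Service picked up the manually defined endpoint (the file name below is illustrative):

kubectl apply -f mysql-node-exporter.yaml

# the ENDPOINTS column should show 172.30.109.88:9100
kubectl -n monitor get endpoints mysql-node-exporter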

At this point the host's metrics show up in Grafana.

mysqld_exporter

Create a MySQL account:

CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'PGAMDv0qhyHj1QSxQnJB' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost';
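
A quick way to verify the account through the local socket (the socket path matches the DSN used below; adjust it to your installation):

mysql -u exporter -p'PGAMDv0qhyHj1QSxQnJB' -S /var/lib/mysql/mysql.sock -e "SHOW GRANTS FOR CURRENT_USER();"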

Create a system account:

useradd exporter -s /sbin/nologin

Download the package:

cd /usr/local/bin/
wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz
tar axvf mysqld_exporter-0.14.0.linux-amd64.tar.gz
mv mysqld_exporter-0.14.0.linux-amd64/mysqld_exporter ./
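
As before, a quick check that the binary runs:

/usr/local/bin/mysqld_exporter --version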

Create the systemd unit file:

cat >/etc/systemd/system/mysqld_exporter.service <<EOF
[Unit]
Description=Prometheus mysql Exporter
After=network-online.target
StartLimitIntervalSec=0

[Service]
Type=simple
User=exporter
Group=exporter
Environment="DATA_SOURCE_NAME=exporter:PGAMDv0qhyHj1QSxQnJB@unix(/var/lib/mysql/mysql.sock)/?allowCleartextPasswords=true"
ExecStart=/usr/local/bin/mysqld_exporter \
            --web.listen-address 0.0.0.0:9104

SyslogIdentifier=mysqld_exporter
Restart=always
RestartSec=5

LockPersonality=true
NoNewPrivileges=true
MemoryDenyWriteExecute=true
PrivateTmp=true
ProtectHome=true
RemoveIPC=true
RestrictSUIDSGID=true
ProtectSystem=full

[Install]
WantedBy=multi-user.target
EOF

Tip: the credentials can also be decoupled from the unit file with the --config.my-cnf /etc/.mysqld_exporter.cnf flag.
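
A minimal sketch of that approach, assuming the same credentials as above; keep the file readable only by the exporter user:

cat >/etc/.mysqld_exporter.cnf <<EOF
[client]
user = exporter
password = PGAMDv0qhyHj1QSxQnJB
socket = /var/lib/mysql/mysql.sock
EOF
chown exporter:exporter /etc/.mysqld_exporter.cnf
chmod 600 /etc/.mysqld_exporter.cnf

Then drop the Environment= line from the unit file and start the exporter with:

ExecStart=/usr/local/bin/mysqld_exporter --config.my-cnf /etc/.mysqld_exporter.cnf --web.listen-address 0.0.0.0:9104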

Start and enable the service:

systemctl enable mysqld_exporter --now

Once the service is running, test it with curl:

# curl 127.0.0.1:9104/metrics
# TYPE go_gc_cycles_automatic_gc_cycles_total counter
go_gc_cycles_automatic_gc_cycles_total 0
# HELP go_gc_cycles_forced_gc_cycles_total Count of completed GC cycles forced by the application.
# TYPE go_gc_cycles_forced_gc_cycles_total counter
go_gc_cycles_forced_gc_cycles_total 0
# HELP go_gc_cycles_total_gc_cycles_total Count of all completed GC cycles.
# TYPE go_gc_cycles_total_gc_cycles_total counter
go_gc_cycles_total_gc_cycles_total 0
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
...
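
The alert rules below key off the replication metrics, so confirm they are exposed (they only appear when the instance is configured as a replica):

curl -s 127.0.0.1:9104/metrics | grep mysql_slave_status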

Create the corresponding objects in Kubernetes:

  • Endpoints and Service

apiVersion: v1
kind: Endpoints
metadata:
  name: mysql-exporter-master
  namespace: gitee
  labels:
    app: gitee-mysql-exporter
    role: master
subsets:
  - addresses:
    - ip: 172.30.109.88
    ports:
    - name: metrics
      port: 9104
---

apiVersion: v1
kind: Service
metadata:
  name: mysql-exporter-master
  namespace: gitee
  labels:
    app.kubernetes.io/name: mysql
    app.kubernetes.io/instance: gitee-mysql
    app.kubernetes.io/component: metrics
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: metrics
    port: 9104
    protocol: TCP

  • ServiceMonitor

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: gitee-mysql
  labels:
    app.kubernetes.io/name: mysql
    app.kubernetes.io/instance: gitee-mysql
spec:
  endpoints:
    - port: metrics
      interval: 30s
  namespaceSelector:
    matchNames:
      - "gitee"
  selector:
    matchLabels:
      app.kubernetes.io/name: mysql
      app.kubernetes.io/instance: gitee-mysql
      app.kubernetes.io/component: metrics
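
To confirm Prometheus has discovered the new target, port-forward to the Prometheus service and query the targets API (prometheus-operated is the headless service the Operator creates by default; the namespace assumes the monitoring namespace from earlier):

kubectl -n monitoring port-forward svc/prometheus-operated 9090 &
curl -s 'http://127.0.0.1:9090/api/v1/targets?state=active' | grep 172.30.109.88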

Optionally, add alerting rules such as these:

# Source: gitee-mysqlha/templates/prometheusrule.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: mysql
  labels:
    app.kubernetes.io/name: mysql
spec:
  groups:
    - name: mysql-master-slave
      rules:
        - alert: MysqlSlaveSqlThreadNotRunning
          expr: mysql_slave_status_slave_sql_running != 1
          for: 1m
          labels:
            severity: error
          annotations:
            summary: "Instance {{ $labels.instance }}: MySQL slave SQL thread is not running"
            description: "{{ $labels.instance }}: MySQL slave SQL thread is not running"

        - alert: MysqlSlaveIoThreadNotRunning
          expr: mysql_slave_status_slave_io_running != 1
          for: 1m
          labels:
            severity: error
          annotations:
            summary: "Instance {{ $labels.instance }}: MySQL slave IO thread is not running"
            description: "{{ $labels.instance }}: MySQL slave IO thread is not running"

        - alert: MysqlReplicationError
          expr: mysql_slave_status_last_errno > 0
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Instance {{ $labels.instance }}: MySQL replication error"
            description: "{{ $labels.instance }}: MySQL replication error (current errno: {{ $value }})"

In addition, some other node-level alert rules:

groups:
- name: node.alerts
  rules:
  - alert: NodeHighCPUUsage
    expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 15m
    labels:
      severity: warning
      context: node
    annotations:
      summary: High load on node
      description: "Node {{ $labels.instance }} has more than 90% CPU load"
  - alert: NodeDiskUsagePercentage
    expr: (100 - 100 * sum(node_filesystem_avail_bytes{device!~"tmpfs|by-uuid",fstype=~"xfs|ext"} / node_filesystem_size_bytes{device!~"tmpfs|by-uuid",fstype=~"xfs|ext"}) BY (instance,device)) > 85
    for: 5m
    labels:
      severity: warning
      context: node
    annotations:
      description: Node disk usage above 85%
      summary: Disk usage on target {{ $labels.instance }} at 85%
  - alert: KubernetesNodeContainerOOMKilled
    expr: sum by (instance) (changes(node_vmstat_oom_kill[24h])) > 3
    labels:
      severity: warning
      context: node
    annotations:
      description: More than 3 OOM killed pods on a node within 24h
      summary: More than 3 OOM killed pods on node {{ $labels.instance }} within 24h
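
Rules like these can be validated before they are loaded. promtool checks plain Prometheus rule files, so copy just the groups: section (not the PrometheusRule wrapper) into a file first:

# node-alerts.yaml contains only the groups: block above
promtool check rules node-alerts.yaml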