https://github.com/prometheus/mysqld_exporter
创建专门采集数据的数据库账号
CREATE USER 'mysqld-exporter' IDENTIFIED BY '123456' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, REPLICATION SLAVE, SELECT ON *.* TO 'mysqld-exporter';
flush privileges;
部署mysqld_exporter
apiVersion: apps/v1
kind: Deployment
metadata:
name: mysqld-exporter
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: mysqld-exporter
template:
metadata:
labels:
app: mysqld-exporter
spec:
containers:
- name: mysqld-exporter
image: prom/mysqld-exporter
args:
- --collect.info_schema.tables
- --collect.info_schema.innodb_tablespaces
- --collect.info_schema.innodb_metrics
- --collect.global_status
- --collect.global_variables
- --collect.slave_status
- --collect.info_schema.processlist
- --collect.perf_schema.tablelocks
- --collect.perf_schema.eventsstatements
- --collect.perf_schema.eventsstatementssum
- --collect.perf_schema.eventswaits
- --collect.auto_increment.columns
- --collect.binlog_size
- --collect.perf_schema.tableiowaits
- --collect.perf_schema.indexiowaits
- --collect.info_schema.userstats
- --collect.info_schema.clientstats
- --collect.info_schema.tablestats
- --collect.info_schema.schemastats
- --collect.perf_schema.file_events
- --collect.perf_schema.file_instances
- --collect.perf_schema.replication_group_member_stats
- --collect.perf_schema.replication_applier_status_by_worker
- --collect.slave_hosts
- --collect.info_schema.innodb_cmp
- --collect.info_schema.innodb_cmpmem
- --collect.info_schema.query_response_time
- --collect.engine_tokudb_status
- --collect.engine_innodb_status
ports:
- containerPort: 9104
protocol: TCP
env:
- name: DATA_SOURCE_NAME
value: "user:password@(hostname:3306)/"
---
apiVersion: v1
kind: Service
metadata:
name: mysqld-exporter
namespace: monitoring
labels:
app: mysqld-exporter
spec:
type: ClusterIP
ports:
- port: 9104
protocol: TCP
name: http
selector:
app: mysqld-exporter
user:password@(hostname:3306)
修改为刚刚创建的账号以及MySQL连接地址
部署监控采集配置
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: mysqld-exporter
name: mysqld-exporter
namespace: monitoring
spec:
endpoints:
- interval: 30s
port: http
relabelings:
- sourceLabels:
- __meta_kubernetes_service_name
targetLabel: service_name
jobLabel: mysqld-exporter
namespaceSelector:
matchNames:
- monitoring
selector:
matchLabels:
app: mysqld-exporter
这里使用了relabelings生成了名为service_name的label,如果使用https://github.com/percona/grafana-dashboards/tree/master/dashboards
这里的图形模板,则需要这个label
配置grafana
如果只是看 MySQL 或 MariaDB 的一些概览情况,可以导入模板:https://grafana.com/grafana/dashboards/7362
如果需要更丰富的面板,可以导入 percona 开源的一些面板,地址:MARKDOWN_HASH00769e3a4e9c1944ef660d5e6d0c10b0MARKDOWNHASH
(导入 MySQL 开头的 json 文件中的内容即可)
配置告警
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: mysqld-rules
namespace: monitoring
spec:
groups:
- name: mysqld-exporter.rules
rules:
- alert: MysqlDown
expr: mysql_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: MySQL down! instance {{ $labels.instance }}
description: MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
- alert: MysqlTooManyConnections
annotations:
summary: MySQL too many connections (instance {{ $labels.instance }})
description: More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
expr: |
avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
for: 5m
labels:
severity: warning
- alert: MysqlHighThreadsRunning
expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60
for: 5m
labels:
severity: warning
annotations:
summary: MySQL high threads running (instance {{ $labels.instance }})
description: More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
- alert: MysqlSlaveIoThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
description: MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
- alert: MysqlSlaveSqlThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }})
description: MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
- alert: MysqlSlaveReplicationLag
expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 300
for: 5m
labels:
severity: warning
annotations:
summary: MySQL Slave replication lag (instance {{ $labels.instance }})
description: MysqL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
- alert: MysqlSlowQueries
expr: rate(mysql_global_status_slow_queries[2m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: MySQL slow queries (instance {{ $labels.instance }})
description: MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
- alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 5m
labels:
severity: warning
annotations:
summary: MySQL restarted (instance {{ $labels.instance }})
description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}
依次创建上面的资源即可