赞
踩
# Quick-start: run Prometheus in a container; the web UI/API is exposed on host port 9090.
docker run -d --name prometheus -p 9090:9090 prom/prometheus
# --- Install Prometheus server from the official release tarball ---
curl -LO https://github.com/prometheus/prometheus/releases/download/v2.40.5/prometheus-2.40.5.linux-amd64.tar.gz
# Unpack and create a version-independent symlink.
tar xf prometheus-2.40.5.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/prometheus-2.40.5.linux-amd64 /usr/local/prometheus
# Create the TSDB data directory.
cd /usr/local/prometheus || exit 1
mkdir -p data
# Create the service account (skipped automatically if "prometheus" exists).
id prometheus &>/dev/null || useradd -r -s /sbin/nologin prometheus
# "user:group" is the portable chown separator (the "user.group" form is deprecated).
chown -R prometheus:prometheus /usr/local/prometheus/data
# Create the systemd unit, saved to /lib/systemd/system/prometheus.service.
# The original was missing the "cat > … <<EOF" opener. The delimiter is
# quoted ('EOF') so $ARGS and $MAINPID are written literally for systemd to
# expand, instead of being expanded (to empty) by the shell at creation time.
cat >/lib/systemd/system/prometheus.service<<'EOF'
[Unit]
Description=Monitoring system and time series database
Documentation=https://prometheus.io/docs/introduction/overview/

[Service]
Restart=always
User=prometheus
EnvironmentFile=-/etc/default/prometheus
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/data \
  --web.console.libraries=/usr/share/prometheus/console_libraries \
  --web.enable-lifecycle \
  $ARGS
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
SendSIGKILL=no
LimitNOFILE=8192

[Install]
WantedBy=multi-user.target
EOF
# Start the service.
systemctl daemon-reload
systemctl enable --now prometheus.service
systemctl is-active prometheus.service
# Verify the listening port and test the exposed metrics endpoint.
ss -tnlp | grep '9090'
# Web UI (adjust the IP for your environment):
#   http://10.0.0.100:9090/
# Reload after a configuration change (requires --web.enable-lifecycle):
curl -XPOST http://localhost:9090/-/reload
# --- Install node_exporter ---
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
# Unpack and create a version-independent symlink.
tar xf node_exporter-1.5.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/node_exporter-1.5.0.linux-amd64 /usr/local/node_exporter
# Create the service account (skipped if the "prometheus" user exists).
id prometheus &>/dev/null || useradd -r prometheus
# Create the systemd unit in /usr/lib/systemd/system/node_exporter.service.
# Quoted delimiter: $MAINPID must reach systemd unexpanded.
cat >/usr/lib/systemd/system/node_exporter.service<<'EOF'
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/usr/local/node_exporter/node_exporter \
  --collector.ntp \
  --collector.mountstats \
  --collector.systemd \
  --collector.ethtool \
  --collector.tcpstat
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
Restart=always

[Install]
WantedBy=multi-user.target
EOF
# Start the service.
systemctl daemon-reload
systemctl start node_exporter.service
systemctl enable node_exporter.service
# Verify the listening port and the exposed metrics endpoint.
ss -tnlp | grep '9100'
# Browser: http://10.0.0.100:9100/
# --- Install Consul (dev mode, single node) ---
curl -LO https://releases.hashicorp.com/consul/1.14.2/consul_1.14.2_linux_amd64.zip
mkdir -p /usr/local/consul/{data,config}
unzip consul_1.14.2_linux_amd64.zip -d /usr/local/consul
# Put the binary on PATH (the original linked the file into its own
# directory, which is a no-op).
ln -sv /usr/local/consul/consul /usr/local/bin/consul
# Create the service account (skipped if the "consul" user exists).
id consul &>/dev/null || useradd -r consul
chown consul:consul /usr/local/consul/{data,config}
# Create the systemd unit in /usr/lib/systemd/system/consul.service.
# Quoted delimiter: $MAINPID must reach systemd unexpanded.
cat >/usr/lib/systemd/system/consul.service<<'EOF'
[Unit]
Description="HashiCorp Consul - A service mesh solution"
Documentation=https://www.consul.io/
Requires=network-online.target
After=network-online.target

[Service]
EnvironmentFile=-/etc/consul.d/consul.env
User=consul
Group=consul
ExecStart=/usr/local/consul/consul agent -dev -bootstrap \
  -config-dir /usr/local/consul/config \
  -data-dir /usr/local/consul/data \
  -ui \
  -log-level INFO \
  -bind 127.0.0.1 \
  -client 0.0.0.0
ExecReload=/bin/kill --signal HUP $MAINPID
KillMode=process
KillSignal=SIGTERM
Restart=on-failure
LimitNOFILE=65536

[Install]
WantedBy=multi-user.target
EOF
# Start the service.
systemctl daemon-reload
systemctl start consul.service
systemctl enable consul.service
# Verify the listening port.
ss -tnlp | grep '8500'
# Browser: http://10.0.0.100:8500/
hosts 解析(各节点的 /etc/hosts 需包含以下条目):
root@lab1:/usr/local/consul# cat /etc/hosts
10.0.0.100 prometheus.lec.com
10.0.0.90 prometheus
10.0.0.91 node5
10.0.0.92 node6
#列出已经注册的服务: curl -XGET http://localhost:8500/v1/agent/services # 获取某个特定服务的配置信息: curl -XGET http://localhost:8500/v1/agent/service/<SERVICE_ID> # 注册一个服务到Consul上,请求报文的body必须遵循json语法规范,且要符合Consul Service的API要求: curl -XPUT --data @/path/to/payload_file.json http://localhost:8500/v1/agent/service/register # 注册node5的node_exporter服务 cat > node5.json<<EOF { "id": "node5", "name": "node5", "address": "node5", "port": 9100, "tags": ["node_exporter"], "checks": [{ "http": "http://node5:9100/metrics", "interval": "5s" }] } EOF # 服务注册 curl -XPUT --data @node5.json http://localhost:8500/v1/agent/service/register # 注销某个服务 curl -XPUT http://localhost:8500/v1/agent/service/deregister/<SERVICE_ID> # 如/usr/local/consul/consul services deregister -id node5
# --- Register services via the Consul CLI ---
# General form (note: payload, not "pyload"; the CLI file format wraps the
# definition in a "service" object):
consul services register /path/to/payload_file.json
# Payload for node6's node_exporter:
cat > node6.json<<'EOF'
{
  "service": {
    "id": "node6",
    "name": "node6",
    "address": "node6",
    "port": 9100,
    "tags": ["node_exporter"],
    "checks": [{
      "http": "http://node6:9100/metrics",
      "interval": "5s"
    }]
  }
}
EOF
# Register:
/usr/local/consul/consul services register node6.json
# Deregister, e.g.:
# /usr/local/consul/consul services deregister -id node6
# --- File-based service discovery ---
# Create the target file (paths are relative to the Prometheus working dir).
mkdir -p /usr/local/prometheus/targets
cat > /usr/local/prometheus/targets/nodes-linux.yml <<'EOF'
- targets:
    - prometheus.lec.com:9100
  labels:
    app: node-exporter
EOF
# Add this job under scrape_configs in /usr/local/prometheus/prometheus.yml:
#   - job_name: "node_exporter"
#     metrics_path: /metrics
#     scheme: http
#     file_sd_configs:
#       - files:
#           - targets/nodes-linux.yml
#         refresh_interval: 1m
# Validate the configuration:
/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml
# Reload after the change:
curl -XPOST http://localhost:9090/-/reload
# --- Install consul_exporter ---
curl -LO https://github.com/prometheus/consul_exporter/releases/download/v0.8.0/consul_exporter-0.8.0.linux-amd64.tar.gz
tar xf consul_exporter-0.8.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/consul_exporter-0.8.0.linux-amd64 /usr/local/consul_exporter
# Create the service account (skipped if the "consul" user exists).
id consul &>/dev/null || useradd -r consul
# Create the systemd unit in /usr/lib/systemd/system/consul_exporter.service.
# Quoted delimiter: $ARGS/$MAINPID must reach systemd unexpanded.
cat >/usr/lib/systemd/system/consul_exporter.service<<'EOF'
[Unit]
Description=consul_exporter
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target

[Service]
Type=simple
User=consul
EnvironmentFile=-/etc/default/consul_exporter
# If consul_exporter and the Consul server run on different hosts, point
# --consul.server at the real address.
ExecStart=/usr/local/consul_exporter/consul_exporter \
  --consul.server="http://localhost:8500" \
  --web.listen-address=":9107" \
  --web.telemetry-path="/metrics" \
  --log.level=info \
  $ARGS
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
Restart=always

[Install]
WantedBy=multi-user.target
EOF
# Start the service.
systemctl daemon-reload
systemctl start consul_exporter.service
systemctl enable consul_exporter.service
# Verify the listening port and the exposed metrics endpoint.
ss -tnlp | grep '9107'
# Browser: http://10.0.0.100:9107/
# Register the exporter in Consul:
cat > consul_exporter.json<<'EOF'
{
  "id": "consul_exporter",
  "name": "consul_exporter",
  "address": "lab1.lec.org",
  "port": 9107,
  "tags": ["consul_exporter"],
  "checks": [{
    "http": "http://lab1.lec.org:9107/metrics",
    "interval": "5s"
  }]
}
EOF
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'consul_exporter'
#     consul_sd_configs:
#       - server: "10.0.0.100:8500"   # Consul address
#         tags:
#           - "consul_exporter"
#         refresh_interval: 1m
curl -XPUT --data @consul_exporter.json http://localhost:8500/v1/agent/service/register
# Deregister:
# curl -XPUT http://localhost:8500/v1/agent/service/deregister/consul_exporter
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
# Install MySQL server (the scrape target for mysqld_exporter below).
apt -y install mysql-server
systemctl status mysql
# --- Install mysqld_exporter ---
curl -LO https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz
tar xf mysqld_exporter-0.14.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/mysqld_exporter-0.14.0.linux-amd64 /usr/local/mysqld_exporter
# Create the service account (skipped if the "mysql" user exists).
id mysql &>/dev/null || useradd -r mysql
# Create the systemd unit in /usr/lib/systemd/system/mysqld_exporter.service.
# Quoted delimiter: $ARGS/$MAINPID must reach systemd unexpanded.
# (Description fixed: the original said "consul_exporter".)
cat >/usr/lib/systemd/system/mysqld_exporter.service<<'EOF'
[Unit]
Description=mysqld_exporter
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target

[Service]
Type=simple
User=mysql
EnvironmentFile=-/etc/default/mysqld_exporter
# If mysqld_exporter and the MySQL server run on different hosts, point the
# DSN at the real address. User and password are both "exporter"; the
# account needs the grants shown below.
Environment='DATA_SOURCE_NAME=exporter:exporter@(localhost:3306)'
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter \
  --web.listen-address=":9104" \
  --web.telemetry-path="/metrics" \
  --collect.info_schema.innodb_tablespaces \
  --collect.info_schema.innodb_metrics \
  --collect.global_status \
  --collect.global_variables \
  --collect.slave_status \
  --collect.engine_innodb_status \
  $ARGS
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
Restart=always

[Install]
WantedBy=multi-user.target
EOF
# On the MySQL server, create the exporter account; mind the host scope of
# the grants:
#   CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'exporter';
#   GRANT PROCESS, REPLICATION CLIENT ON *.* TO 'exporter'@'localhost';
#   GRANT SELECT ON performance_schema.* TO 'exporter'@'localhost';
#   FLUSH PRIVILEGES;
# Start the service.
systemctl daemon-reload
systemctl start mysqld_exporter.service
systemctl enable mysqld_exporter.service
# Verify the listening port and the exposed metrics endpoint.
ss -tnlp | grep '9104'
# Browser: http://10.0.0.100:9104/
# Register the exporter in Consul:
cat > mysqld_exporter.json<<'EOF'
{
  "id": "mysqld_exporter",
  "name": "mysqld_exporter",
  "address": "lab1.lec.org",
  "port": 9104,
  "tags": ["mysqld_exporter"],
  "checks": [{
    "http": "http://lab1.lec.org:9104/metrics",
    "interval": "5s"
  }]
}
EOF
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'mysqld_exporter'
#     consul_sd_configs:
#       - server: "10.0.0.100:8500"   # Consul address
#         tags:
#           - "mysqld_exporter"
#         refresh_interval: 1m
curl -XPUT --data @mysqld_exporter.json http://localhost:8500/v1/agent/service/register
# Deregister:
# curl -XPUT http://localhost:8500/v1/agent/service/deregister/mysqld_exporter
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
# --- Install Docker and docker-compose ---
apt -y install docker.io
# Configure registry mirrors (mainland-China acceleration) and the systemd
# cgroup driver.
mkdir -p /etc/docker
cat > /etc/docker/daemon.json <<'EOF'
{
  "registry-mirrors": [
    "https://docker.mirrors.ustc.edu.cn",
    "https://hub-mirror.c.163.com",
    "https://reg-mirror.qiniu.com",
    "https://registry.docker-cn.com"
  ],
  "exec-opts": ["native.cgroupdriver=systemd"]
}
EOF
# Restart Docker to pick up daemon.json.
systemctl daemon-reload
systemctl restart docker.service
# Install docker-compose v1 (standalone binary); $( ) replaces the legacy
# backtick form and the URL is quoted.
curl -L "https://get.daocloud.io/docker/compose/releases/download/1.25.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
# --- nginx + nginx-prometheus-exporter via docker-compose (on node5) ---
cat > docker-compose.yml <<'EOF'
version: '3.6'

networks:
  monitoring:
    driver: bridge
    ipam:
      config:
        - subnet: 172.31.107.0/24

services:
  nginx:
    image: nginx:1.22.1
    volumes:
      - ./nginx/stub_status-server.conf:/etc/nginx/conf.d/stub_status-server.conf:ro
    networks:
      - monitoring
    expose:
      - 8080
      - 80
    ports:
      - 80:80

  nginx-exporter:
    image: nginx/nginx-prometheus-exporter:0.11
    command:
      - '-nginx.scrape-uri=http://nginx:8080/stub_status'
    networks:
      - monitoring
    ports:
      - '9113:9113'
    depends_on:
      - nginx
EOF
# stub_status vhost the exporter scrapes.
mkdir -p nginx
cat > nginx/stub_status-server.conf <<'EOF'
server {
    listen 8080;
    server_name localhost;

    location /stub_status {
        stub_status;
        access_log off;
        #allow 172.31.0.0/16;
        #deny all;
    }
}
EOF
# Run.
docker-compose up -d
# From the Prometheus host: curl 10.0.0.91:9113/metrics
# Verify the listening port and the exposed metrics endpoint.
ss -tnlp | grep '9113'
# Browser: http://10.0.0.91:9113/
# Register nginx_exporter in Consul:
cat > nginx_exporter.json<<'EOF'
{
  "service": {
    "id": "nginx_exporter",
    "name": "nginx_exporter",
    "address": "node5.lec.org",
    "port": 9113,
    "tags": ["nginx_exporter"],
    "checks": [{
      "http": "http://node5.lec.org:9113/metrics",
      "interval": "5s"
    }]
  }
}
EOF
/usr/local/consul/consul services register nginx_exporter.json
# Deregister: /usr/local/consul/consul services deregister -id nginx_exporter
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'nginx_exporter'
#     consul_sd_configs:
#       - server: "10.0.0.100:8500"   # Consul address
#         tags:
#           - "nginx_exporter"
#         refresh_interval: 1m
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
# --- Register tomcat (node6) in Consul ---
cat > tomcat.json<<'EOF'
{
  "service": {
    "id": "tomcat",
    "name": "tomcat",
    "address": "node6.lec.org",
    "port": 8080,
    "tags": ["tomcat"],
    "checks": [{
      "http": "http://node6.lec.org:8080/metrics",
      "interval": "5s"
    }]
  }
}
EOF
/usr/local/consul/consul services register tomcat.json
# Deregister (id fixed: the original example said "nginx_exporter"):
# /usr/local/consul/consul services deregister -id tomcat
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'tomcat'
#     consul_sd_configs:
#       - server: "10.0.0.100:8500"   # Consul address
#         tags:
#           - "tomcat"
#         refresh_interval: 1m
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
# --- blackbox_exporter via docker-compose ---
cat > docker-compose.yml <<'EOF'
version: '3.6'

networks:
  monitoring:
    driver: bridge
    ipam:
      config:
        - subnet: 172.31.136.0/24

services:
  blackbox_exporter:
    image: prom/blackbox-exporter:v0.22.0
    volumes:
      - ./configs/:/etc/blackboxexporter/
    command:
      - '--config.file=/etc/blackboxexporter/blackbox.yml'
    networks:
      - monitoring
    ports:
      - 9115:9115
EOF
# Probe modules; see
# https://github.com/prometheus/blackbox_exporter/blob/master/example.yml
mkdir -p configs
cat > configs/blackbox.yml <<'EOF'
modules:
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions:
        - "HTTP/1.1"
        - "HTTP/2"
      valid_status_codes: []  # Defaults to 2xx
      enable_http2: false
      method: GET
      no_follow_redirects: false
      # fail_if_ssl: the probe fails when the site uses SSL;
      # fail_if_not_ssl is the opposite.
      fail_if_ssl: false
      fail_if_not_ssl: false
      # fail_if_body_(not_)matches_regexp / fail_if_header_(not_)matches
      # validate the HTTP response against regular expressions.
      fail_if_body_matches_regexp:
        - "Could not connect to database"
      tls_config:
        insecure_skip_verify: false
      preferred_ip_protocol: "ip4"  # defaults to "ip6"
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp  # prober names are lowercase; "TCP" is rejected
EOF
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'blackbox'
#     metrics_path: /probe
#     params:
#       module: [http_2xx]
#     static_configs:
#       - targets:
#           - https://www.baidu.com    # edit here
#           - https://www.google.com   # edit here
#     relabel_configs:
#       - source_labels: [__address__]
#         target_label: __param_target
#       - source_labels: [__param_target]
#         target_label: instance
#       - target_label: __address__
#         replacement: "10.0.0.92:9115"  # blackbox exporter address
#       - target_label: region
#         replacement: "remote"
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
记录规则(recording rule)能够预先运行频繁用到或计算消耗较大的表达式,并将其结果保存为一组新的时间序列;
# --- Install Alertmanager ---
curl -LO https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
tar xf alertmanager-0.24.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/alertmanager-0.24.0.linux-amd64 /usr/local/alertmanager
# NOTE(review): the original ran "useradd -r mysql" here, which is unrelated
# to Alertmanager and was dropped; the unit below has no User= and runs as
# root.
# Create the systemd unit in /usr/lib/systemd/system/alertmanager.service.
cat >/usr/lib/systemd/system/alertmanager.service<<'EOF'
[Unit]
Description=alertmanager
After=network.target

[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file="/usr/local/alertmanager/alertmanager.yml"

[Install]
WantedBy=multi-user.target
EOF
# Start the service.
systemctl daemon-reload
systemctl start alertmanager
systemctl enable alertmanager
# Verify the listening port and the exposed metrics endpoint.
ss -tnlp | grep '9093'
# Browser: http://10.0.0.92:9093/
# Register Alertmanager in Consul:
cat > alertmanager.json<<'EOF'
{
  "service": {
    "id": "alertmanager",
    "name": "alertmanager",
    "address": "10.0.0.92",
    "port": 9093,
    "tags": ["alertmanager"],
    "checks": [{
      "http": "http://10.0.0.92:9093/metrics",
      "interval": "5s"
    }]
  }
}
EOF
/usr/local/consul/consul services register alertmanager.json
# Deregister: /usr/local/consul/consul services deregister -id alertmanager
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'alertmanager'
#     consul_sd_configs:
#       - server: "10.0.0.100:8500"   # Consul address
#         tags:
#           - "alertmanager"
#         refresh_interval: 1m
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
# --- Alerting rule ---
# Fixes over the original: YAML indentation restored; the alert expression
# now carries a threshold ("< 20") — without a comparison the rule fired
# whenever the series existed; the alert is renamed because the expression
# measures root-filesystem free space, not CPU usage.
mkdir -p /usr/local/prometheus/rules
cat >/usr/local/prometheus/rules/hoststats-alert.yml <<'EOF'
groups:
- name: hostStatsAlert
  rules:
  - alert: hostRootFsFreeAlert
    expr: 100 * node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} < 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "test--info"
EOF
# --- Recording rules (MySQL) ---
# (The original header mislabeled these as alert rules; they are recording
# rules that precompute frequently-used / expensive expressions.)
cat > /usr/local/prometheus/rules/record-rules-mysql.yml<<'EOF'
groups:
- name: mysqld_rules
  rules:
  - record: instance:mysql_slave_lag_seconds
    expr: mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay
  - record: instance:mysql_heartbeat_lag_seconds
    expr: mysql_heartbeat_now_timestamp_seconds - mysql_heartbeat_stored_timestamp_seconds
  - record: job:mysql_transactions:rate5m
    expr: sum without (command) (rate(mysql_global_status_commands_total{command=~"(commit|rollback)"}[5m]))
EOF
# --- Recording rules (node metrics) ---
# YAML indentation restored; "instace:" typo fixed to "instance:".
cat > /usr/local/prometheus/rules/record-rules-node.yml <<'EOF'
groups:
- name: custom_rules
  interval: 5s
  rules:
  - record: instance:node_cpu:avg_rate5m
    expr: (1 - avg(irate(node_cpu_seconds_total{job="node", mode="idle"}[5m])) by (instance)) * 100
  - record: instance:node_memory_MemFree_percent
    expr: 100 * (node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes) / node_memory_MemTotal_bytes
  - record: instance:root:node_filesystem_free_percent
    expr: 100 * node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}
EOF
# --- Alertmanager: e-mail notifications via a QQ mailbox ---
cat > /usr/local/alertmanager/alertmanager.yml <<'EOF'
global:
  resolve_timeout: 5m
  smtp_auth_username: "xxxxxx@qq.com"
  smtp_auth_password: "这是你的QQ邮箱授权码"
  #smtp_auth_secret: ""
  smtp_require_tls: false
  smtp_smarthost: "smtp.qq.com:465"
  smtp_from: "xxxxxx@qq.com"

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'email-lec'

receivers:
- name: 'email-lec'
  email_configs:
  - send_resolved: true
    to: xxxxxx@qq.com

inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
EOF
# --- Point Prometheus at Alertmanager and load the rule files ---
# Add to /usr/local/prometheus/prometheus.yml:
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets:
#               - 10.0.0.92:9093
#   # Rules are loaded once and evaluated on the global
#   # 'evaluation_interval'.
#   rule_files:
#     - rules/record-rules-*.yml
#     - rules/hoststats-alert.yml
# Reload after the change:
curl -XPOST http://localhost:9090/-/reload
# --- Install Grafana (download page:
# https://grafana.com/grafana/download/9.3.0?pg=oss-graf&plcmt=resources) ---
sudo apt-get install -y adduser libfontconfig1
wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.3.0_amd64.deb
sudo dpkg -i grafana-enterprise_9.3.0_amd64.deb
# Start the service.
systemctl daemon-reload
systemctl enable --now grafana-server.service
systemctl status grafana-server.service
# Grafana listens on port 3000.
ss -ntl
# Browser: http://10.0.0.100:3000/login (default credentials: admin/admin).
# --- Register Grafana in Consul ---
cat > grafana.json<<'EOF'
{
  "service": {
    "id": "grafana",
    "name": "grafana",
    "address": "10.0.0.100",
    "port": 3000,
    "tags": ["grafana"],
    "checks": [{
      "http": "http://10.0.0.100:3000/metrics",
      "interval": "5s"
    }]
  }
}
EOF
/usr/local/consul/consul services register grafana.json
# Deregister (id fixed: the original example said "nginx_exporter"):
# /usr/local/consul/consul services deregister -id grafana
# Add this job to /usr/local/prometheus/prometheus.yml:
#   - job_name: 'grafana'
#     consul_sd_configs:
#       - server: "10.0.0.100:8500"   # Consul address
#         tags:
#           - "grafana"
#         refresh_interval: 1m
# Reload Prometheus after the change:
curl -XPOST http://localhost:9090/-/reload
vminsert 8480
接收数据存储请求,并根据指标名称和标签等进行hash计算后分散存储于vmstorage节点上
vmstorage 8482
存储原始格式的指标数据,并基于标签过滤器返回请求的数据
vmselect 8481
接收查询请求,并从vmstorage节点上检索出相关数据以响应查询
# --- VictoriaMetrics cluster via docker-compose ---
cat > docker-compose.yml <<'EOF'
version: '3.6'

networks:
  vm_net:
    driver: bridge

volumes:
  strgdata-1: {}
  strgdata-2: {}
  grafanadata: {}

services:
  vmstorage-1:
    container_name: vmstorage-1
    image: victoriametrics/vmstorage:v1.83.1-cluster
    ports:
      - 8482
      - 8400
      - 8401
    volumes:
      - strgdata-1:/storage
    networks:
      - vm_net
    command:
      - '--storageDataPath=/storage'
    restart: always

  vmstorage-2:
    container_name: vmstorage-2
    image: victoriametrics/vmstorage:v1.83.1-cluster
    networks:
      - vm_net
    ports:
      - 8482
      - 8400
      - 8401
    volumes:
      - strgdata-2:/storage
    command:
      - '--storageDataPath=/storage'
    restart: always

  vminsert:
    container_name: vminsert
    image: victoriametrics/vminsert:v1.83.1-cluster
    depends_on:
      - "vmstorage-1"
      - "vmstorage-2"
    command:
      - '--storageNode=vmstorage-1:8400'
      - '--storageNode=vmstorage-2:8400'
    ports:
      - 8480:8480
    networks:
      - vm_net
    restart: always

  vmselect:
    container_name: vmselect
    image: victoriametrics/vmselect:v1.83.1-cluster
    depends_on:
      - "vmstorage-1"
      - "vmstorage-2"
    command:
      - '--storageNode=vmstorage-1:8401'
      - '--storageNode=vmstorage-2:8401'
      #- '--vmalert.proxyURL=http://vmalert:8880'
    networks:
      - vm_net
    ports:
      - 8481:8481
    restart: always
EOF
docker-compose up -d
# Configure Prometheus to use the cluster as remote storage; add to
# /usr/local/prometheus/prometheus.yml (the original left the URLs out —
# verify the tenant id "0" against your deployment):
#   remote_write:          # writes go through vminsert
#     - url: http://<vminsert-host>:8480/insert/0/prometheus/
#   remote_read:           # reads go through vmselect
#     - url: http://<vmselect-host>:8481/select/0/prometheus
# Reload after the change:
curl -XPOST http://localhost:9090/-/reload
# --- Install Alertmanager ---
curl -LO https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
tar xf alertmanager-0.24.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/alertmanager-0.24.0.linux-amd64 /usr/local/alertmanager
# NOTE(review): the original ran "useradd -r mysql" here, which is unrelated
# to Alertmanager and was dropped; the unit below has no User= and runs as
# root.
# Create the systemd unit in /usr/lib/systemd/system/alertmanager.service.
cat >/usr/lib/systemd/system/alertmanager.service<<'EOF'
[Unit]
Description=alertmanager
After=network.target

[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file="/usr/local/alertmanager/alertmanager.yml"

[Install]
WantedBy=multi-user.target
EOF
# Start the service.
systemctl daemon-reload
systemctl start alertmanager
systemctl enable alertmanager
# Verify the listening port and the exposed metrics endpoint.
ss -tnlp | grep '9093'
# Browser: http://10.0.0.92:9093/
# Helm values for the bitnami/kube-prometheus chart. NOTE(review): the YAML
# below was flattened onto single lines by the page scrape — all indentation
# (and therefore nesting) is lost; restore proper YAML structure from the
# chart's values.yaml before use.
kube-prometheus-values.yaml
root@master01:~/yaml# cat kube-prometheus-values.yaml global: imageRegistry: "" storageClass: "nfs-csi" clusterDomain: cluster.local operator: enabled: true image: registry: docker.io repository: bitnami/prometheus-operator tag: 0.60.1-debian-11-r9 prometheus: enabled: true replicaCount: 2 image: registry: docker.io repository: bitnami/prometheus tag: 2.39.1-debian-11-r10 ingress: enabled: true pathType: Prefix apiVersion: "" hostname: prometheus.lec.com path: / annotations: {} ingressClassName: "nginx" tls: false selfSigned: false externalUrl: "" persistence: enabled: false storageClass: "" accessModes: - ReadWriteOnce size: 8Gi annotations: {} priorityClassName: "" thanos: create: false image: registry: docker.io repository: bitnami/thanos tag: 0.28.1-scratch-r0 ingress: enabled: false pathType: Prefix hostname: thanos.lec.com path: / annotations: {} ingressClassName: "nginx" tls: false selfSigned: false alertmanager: enabled: true replicaCount: 2 image: registry: docker.io repository: bitnami/alertmanager tag: 0.24.0-debian-11-r55 ingress: enabled: true pathType: Prefix hostname: alertmanager.lec.com path: / annotations: {} ingressClassName: "nginx" tls: false selfSigned: false persistence: enabled: false storageClass: "" accessModes: - ReadWriteOnce size: 8Gi annotations: {} exporters: node-exporter: enabled: true kube-state-metrics: enabled: true node-exporter: service: labels: jobLabel: node-exporter serviceMonitor: enabled: true jobLabel: jobLabel extraArgs: collector.filesystem.ignored-mount-points: "^/(dev|proc|sys|var/lib/docker/.+)($|/)" collector.filesystem.ignored-fs-types: "^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$" kube-state-metrics: serviceMonitor: enabled: true kubelet: enabled: true namespace: kube-system blackboxExporter: enabled: true image: registry: docker.io repository: bitnami/blackbox-exporter tag: 0.22.0-debian-11-r32 
configuration: | "modules": "http_2xx": "http": "preferred_ip_protocol": "ip4" "prober": "http" "http_post_2xx": "http": "method": "POST" "preferred_ip_protocol": "ip4" "prober": "http" "irc_banner": "prober": "tcp" "tcp": "preferred_ip_protocol": "ip4" "query_response": - "send": "NICK prober" - "send": "USER prober prober prober :prober" - "expect": "PING :([^ ]+)" "send": "PONG ${1}" - "expect": "^:[^ ]+ 001" "pop3s_banner": "prober": "tcp" "tcp": "preferred_ip_protocol": "ip4" "query_response": - "expect": "^+OK" "tls": true "tls_config": "insecure_skip_verify": false "ssh_banner": "prober": "tcp" "tcp": "preferred_ip_protocol": "ip4" "query_response": - "expect": "^SSH-2.0-" "tcp_connect": "prober": "tcp" "tcp": "preferred_ip_protocol": "ip4" kubeApiServer: enabled: true kubeControllerManager: enabled: false kubeScheduler: enabled: false coreDns: enabled: true kubeProxy: enabled: false rbac: create: true pspEnabled: true
# --- Deploy kube-prometheus with Helm ---
helm install prometheus -f ./kube-prometheus-values.yaml bitnami/kube-prometheus -n prom
# Uninstall:
# helm uninstall prometheus -n prom
# Inspect the deployment. Expect: 2 alertmanager replicas, 2 prometheus
# replicas, the operator, blackbox/kube-state-metrics/node exporters, and
# ingresses for prometheus.lec.com / alertmanager.lec.com backed by the
# ingress-nginx LoadBalancer service.
kubectl get pods -n prom -o wide
kubectl get ingress -n prom -o wide
kubectl get svc -n ingress-nginx
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。