赞
踩
需求:报警信息太多,需要按紧急程度进行筛选分类(紧急处理 / 不紧急处理),然后分别发送到不同的钉钉群。
cat dingtalk-config.yaml
# ConfigMap for prometheus-webhook-dingtalk: `config.yml` declares the DingTalk
# robot targets and `template.tmpl` holds the Go templates used to render messages.
# NOTE(review): the Alertmanager receivers in this guide call
# dingtalk.monitoring.svc.cluster.local, while this ConfigMap is created in the
# "prometheus" namespace - confirm it matches the namespace of the workload that mounts it.
# NOTE(review): the access tokens and signing secrets below are credentials; consider
# supplying them from a Secret instead of committing them in a ConfigMap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: dingtalk-config
  namespace: prometheus
data:
  config.yml: |-
    templates:
      - /etc/prometheus-webhook-dingtalk/template.tmpl
    targets:
      webhook:
        # DingTalk robot webhook address
        url: https://oapi.dingtalk.com/robot/send?access_token=b5b550b72447d935572d5c717cd1ec4bed7f17cc82ef
        # Signing secret of the DingTalk robot
        secret: SECcbc9fe62f53d9a533d5e506f30722e0a1a39b36bd0b8e242c15772dc39e05aa1
        mention:
          # @-mention everyone in the group
          all: true
      webhook2:
        url: https://oapi.dingtalk.com/robot/send?access_token=4df2745e8df1de6d0429e35caf15e032e2b33ee2ba73899043c99953db9d68a3
        secret: SECe079af795abd316a7e1f431ee8ebcf082cc0b0611a859da37ec4d7ad6800b0f5
  # Message templates: "default.*" render firing and resolved alerts separately,
  # "legacy.*" render firing alerts only, "ding.link.*" alias the default ones.
  template.tmpl: |-
    {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
    {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
    {{ define "__text_alert_list" }}{{ range . }}
    **Labels**
    {{ range .Labels.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Annotations**
    {{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Source:** [{{ .GeneratorURL }}]({{ .GeneratorURL }})
    {{ end }}{{ end }}
    {{ define "default.__text_alert_list" }}{{ range . }}
    ---
    **告警级别:** {{ .Labels.severity | upper }}
    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    **事件信息:**
    {{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    {{ end }}
    {{ end }}
    {{ define "default.__text_alertresovle_list" }}{{ range . }}
    ---
    **告警级别:** {{ .Labels.severity | upper }}
    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    **结束时间:** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
    **事件信息:**
    {{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    {{ end }}
    {{ end }}
    {{/* Default */}}
    {{ define "default.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "default.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ if gt (len .Alerts.Firing) 0 -}}
    {{ template "default.__text_alert_list" .Alerts.Firing }}
    {{- end }}
    {{ if gt (len .Alerts.Resolved) 0 -}}
    {{ template "default.__text_alertresovle_list" .Alerts.Resolved }}
    {{- end }}
    {{- end }}
    {{/* Legacy */}}
    {{ define "legacy.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "legacy.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ template "__text_alert_list" .Alerts.Firing }}
    {{- end }}
    {{/* Following names for compatibility */}}
    {{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
    {{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
kubectl apply -f dingtalk-config.yaml
cd /home/k8s/kube-prometheus-0.10.0/manifests
#cat alertmanager-secret.yaml
# Alertmanager configuration Secret: two webhook receivers that forward to the
# DingTalk bridge, a default route plus two label-based sub-routes, and one inhibit rule.
# NOTE(review): `match`, `source_match` and `target_match` are marked deprecated in the
# Alertmanager docs in favour of `matchers` / `source_matchers` / `target_matchers`;
# they are kept unchanged here - confirm before migrating.
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.23.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    "global":
      "resolve_timeout": "5m"
    "receivers":
      # Name of the first DingTalk receiver
      - "name": "Webhook"
        "webhook_configs":
          # Forwarding address for the first DingTalk robot (target "webhook")
          - "url": "http://dingtalk.monitoring.svc.cluster.local:8060/dingtalk/webhook/send"
            # Also send a message when the alert is resolved; must be a boolean
            "send_resolved": true
      # Name of the second DingTalk receiver
      - "name": "Webhook2"
        "webhook_configs":
          # Forwarding address for the second DingTalk robot (target "webhook2")
          - "url": "http://dingtalk.monitoring.svc.cluster.local:8060/dingtalk/webhook2/send"
            "send_resolved": true
    "route":
      "group_by":
        - "alertname"
        - "namespace"
        - "team"
      "group_wait": "60s"
      "group_interval": "10s"
      "repeat_interval": "12h"
      # A default receiver is mandatory on the top-level route
      "receiver": "Webhook"
      "routes":
        # Alerts carrying the label service: diji are sent to Webhook
        - "match":
            "service": "diji"
          "receiver": "Webhook"
          "group_wait": "10s"
          "group_interval": "15s"
          "repeat_interval": "3h"
        # Alerts carrying the label service: gaoji are sent to Webhook2
        - "match":
            "service": "gaoji"
          "receiver": "Webhook2"
          "group_wait": "10s"
          "group_interval": "15s"
          "repeat_interval": "3h"
    "inhibit_rules":
      # Source side: while an alert with this severity fires, matching targets are inhibited
      - "source_match":
          "severity": 'error'
        # Target side: alerts with this severity are the ones being inhibited
        "target_match":
          "severity": 'warning'
        # The rule only applies between alerts that agree on these labels
        "equal": ['instance','namespace',"alertname","team"]
type: Opaque
kubectl apply -f alertmanager-secret.yaml
# Further sub-route examples showing exact and regular-expression label matching.
routes:
  # Exact match on a label value
  - match:
      team: test
    # Labels used to group alerts for this route
    group_by: [env,dc]
    receiver: 'ops'
  # Regular-expression match; separate alternatives with |
  - match_re:
      service: nginx|apache
    receiver: 'webhook3'
  # Regular-expression match on the alert severity
  - match_re:
      severity: critical
    receiver: 'webhook4'
cd /home/k8s/kube-prometheus-0.10.0/manifests
cat nodeExporter-prometheusRule.yaml
...
# Example rule: fires after the expression has held for 2 minutes.
# The `labels` map previously defined `team` twice (pods, then demon-pod); only the
# last value takes effect in last-wins parsers, so the shadowed entry was removed.
- alert: demon-pod
  annotations:
    description: filed demon-pod < 2
  # Illustrative expression only - write your own PromQL before using it; sum() adds up the series.
  expr: sum(node_namespace_pod:kube_pod_info:{namespace="demon"}) < 2
  for: 2m
  labels:
    severity: critical
    # service: diji is the label the Alertmanager route for receiver "Webhook" matches on
    service: diji
    team: demon-pod
# Example rule: fires after the expression has held for 2 minutes.
- alert: demon-nignx-stop
  annotations:
    description: nginx pod stop
  # Illustrative expression only - write your own PromQL before using it; sum() adds up the series.
  expr: sum(kube_pod_container_status_ready{namespace="demon"}) < 2
  for: 2m
  labels:
    severity: critical
    # service: gaoji is the label the Alertmanager route for receiver "Webhook2" matches on
    service: gaoji
    namespace: demon
...
kubectl apply -f nodeExporter-prometheusRule.yaml
在 Prometheus 页面上查看新增的告警规则是否出现
钉钉查看是否出现2个机器人分别发送,当然机器人可以放在不同的群里。我这测试无所谓了。
报错收集:
布尔值 true 不能写成字符串(不要加引号)
一级route里面必须有接收人receiver
没有webhook接收人
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。