赞
踩
调用 http://<prometheus.address>/api/v1/targets 接口,
并解析返回的 JSON 结果。
def getTargetsStatus(address, timeout=10):
    '''
    :param address: Prometheus base URL; '/api/v1/targets' is appended to it
    :param timeout: seconds to wait for the HTTP request before giving up
    :return:
    :description: query the targets endpoint and print how many active targets
                  are up, followed by the instance label of every target whose
                  health is not 'up'
    '''
    url = address + '/api/v1/targets'
    failureMessage = '\033[31m\033[1m' + 'Get targets status failed!' + '\033[0m'

    try:
        # Without a timeout an unreachable server would block this call forever.
        response = requests.request('GET', url, timeout=timeout)
    except requests.RequestException as e:
        # Report transport errors the same way as a non-200 answer instead of
        # aborting the caller with a traceback.
        print(e)
        print(failureMessage)
        return

    if response.status_code != 200:
        print(failureMessage)
        return

    targets = response.json()['data']['activeTargets']
    totalNum = len(targets)
    downList = [
        target['labels']['instance']
        for target in targets
        if target['health'] != 'up'
    ]
    aliveNum = totalNum - len(downList)

    print('-----------------------TargetsStatus--------------------------')
    print(str(aliveNum) + ' in ' + str(totalNum) + ' Targets are alive !!!')
    print('--------------------------------------------------------------')
    for down in downList:
        print('\033[31m\033[1m' + down + '\033[0m' + ' down !!!')
    print('-----------------------TargetsStatus--------------------------')
调用 http://<prometheus.address>/api/v1/query?query=<expr> 接口,
并解析返回的 JSON 结果,其中 expr 为 Prometheus 的查询语句。
### Per-resource state, filled by getCurrentUsageGreater:
### label -> (first seen over threshold, last seen over threshold, describe, value)
diskUsageDict = {}
cpuUsageDict = {}
memUsageDict = {}

### Sampling interval, in seconds
monitorInterval = 5

### Seconds a resource must stay over its threshold before it is reported
diskAlertTime = 5
cpuAlertTime = 300
memAlertTime = 300

### Alert thresholds, in percent
diskThreshold = 80
cpuThreshold = 60
memThreshold = 70


def queryUsage(address, expr, timeout=10):
    '''
    :param address: Prometheus base URL; '/api/v1/query' is appended to it
    :param expr: Prometheus query expression
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: the decoded JSON response, or {} when the request or the decoding failed
    :description: run a query against the Prometheus HTTP API
    '''
    url = address + '/api/v1/query'
    try:
        # Passing the expression through params lets requests URL-encode it, so
        # characters such as '+', '&' or '#' inside expr cannot corrupt the URL.
        response = requests.get(url=url, params={'query': expr}, timeout=timeout)
        return json.loads(response.content.decode('utf8', 'ignore'))
    except Exception as e:
        # Deliberately broad: this runs inside an endless monitoring loop, and a
        # single failed poll should be reported and skipped, not end the loop.
        print(e)
        return {}


def orderUsageDict(usageDict, currentTime, monitorInterval):
    '''
    :param usageDict: resource usage dictionary, modified in place
    :param currentTime: timestamp of the current monitoring sample
    :param monitorInterval: monitoring interval in seconds
    :return:
    :description: drop the entries that were last seen at least monitorInterval
                  before currentTime, i.e. that did not stay over the threshold
    '''
    # Iterate over a snapshot of the keys because entries are removed on the fly.
    for key in list(usageDict):
        lastSeen = usageDict[key][1]
        if currentTime - lastSeen >= monitorInterval:
            usageDict.pop(key)


def getCurrentUsageGreater(address, record, threshold, usageDict, monitorInterval):
    '''
    :param address: Prometheus address
    :param record: Prometheus rules record
    :param threshold: threshold
    :param usageDict: resource usage dictionary, modified in place
    :param monitorInterval: monitoring interval
    :return:
    :description: fetch the series whose usage is at or above the threshold and
                  record for each of them when it was first and last seen
    '''
    expr = record + '>=' + str(threshold)
    usage = queryUsage(address=address, expr=expr)

    if 'data' not in usage:
        # The query failed, so nothing is known about this poll: keep the state.
        return

    results = usage['data']['result']
    if not results:
        # The query succeeded and nothing is over the threshold any more. There
        # is no sample timestamp to prune with, so every entry is dropped here;
        # otherwise stale entries would be reported on every later poll and a
        # recurrence would inherit the old start time.
        usageDict.clear()
        return

    describe = record.split(':')[1]
    # Filesystem records are tracked per mountpoint rather than per instance.
    perMountpoint = record in ('node:fs_usage:ratio', 'node:fs_root_usage:ratio')

    currentTime = 0
    for metric in results:
        labels = metric['metric']
        metricLabel = labels['instance']
        if perMountpoint:
            metricLabel = metricLabel + ':' + labels['mountpoint']

        utctime, value = metric['value'][0], metric['value'][1]
        if metricLabel in usageDict:
            # Still over the threshold: keep the original start time.
            startTime = usageDict[metricLabel][0]
        else:
            startTime = utctime
        usageDict[metricLabel] = (startTime, utctime, describe, value)
        currentTime = utctime

    orderUsageDict(usageDict=usageDict, currentTime=currentTime, monitorInterval=monitorInterval)


def printUsageDict(usageDict, alertTime):
    '''
    :param usageDict: resource usage dictionary
    :param alertTime: monitoring alert time in seconds
    :return:
    :description: print the entries that have been over the threshold for at
                  least alertTime
    '''
    for key, (firstSeen, lastSeen, describe, value) in usageDict.items():
        deltaT = lastSeen - firstSeen
        if deltaT >= alertTime:
            duration = '\033[0m ----- lasted for\033[31m\033[1m %.2f \033[0mseconds' % deltaT
            print(key + ' ----- ' + describe + '\033[31m\033[1m ' + str(value) + duration)


def monitorUsageGreater(address):
    '''
    :param address: Prometheus address
    :return: never returns
    :description: poll disk, memory and CPU usage forever and print the alerts
    '''
    while True:
        getCurrentUsageGreater(address, 'node:fs_usage:ratio', diskThreshold, diskUsageDict, monitorInterval)
        printUsageDict(diskUsageDict, alertTime=diskAlertTime)
        # Each record is checked against its own threshold.
        getCurrentUsageGreater(address, 'node:memory_usage:ratio', memThreshold, memUsageDict, monitorInterval)
        printUsageDict(memUsageDict, alertTime=memAlertTime)
        getCurrentUsageGreater(address, 'node:cpu_usage:ratio', cpuThreshold, cpuUsageDict, monitorInterval)
        printUsageDict(cpuUsageDict, alertTime=cpuAlertTime)
        time.sleep(monitorInterval)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。