赞
踩
最近在做多变量时间序列异常检测相关的工作,顺带整理了目前比较常用的五个多变量时间序列异常检测数据集(SMD、SMAP、MSL、WADI、SWaT),它们的测试集都带有标注好的label。这五个数据集应该是该领域最常用的benchmark数据集,整理依据主要来自多篇顶会论文中的对比实验。
本文主要介绍五个数据集的具体信息和对应的标准化处理,并给出处理的代码和最终标准化的格式。
# Download the telemanom data archive, unpack it, and delete the zip afterwards.
# NOTE(review): the archive presumably unpacks into ./data, which the next command enters - confirm.
wget https://s3-us-west-2.amazonaws.com/telemanom/data.zip && unzip data.zip && rm data.zip
# Fetch labeled_anomalies.csv into ./data; the preprocessing script reads it from data/labeled_anomalies.csv.
cd data && wget https://raw.githubusercontent.com/khundman/telemanom/master/labeled_anomalies.csv
最终的格式为:
时间列统一为datetime格式;很多数据集的时间信息是匿名的,此时用从0开始的整数序号(0 到 len(dataset)-1)作为timestamp。标签列名统一为label,0为正常,1为异常。
import ast
import csv
import os
import sys
from pickle import dump

import pandas as pd
import numpy as np

# Root directory that receives every generated CSV.
output_folder = 'processed_csv'
os.makedirs(output_folder, exist_ok=True)


def _feature_columns(count):
    """Return the generic feature column names ``col_0 .. col_{count-1}``."""
    return ["col_%d" % i for i in range(count)]


def _to_indexed_frame(values, columns):
    """Wrap *values* in a DataFrame with a leading 0..n-1 ``timestamp`` column."""
    frame = pd.DataFrame(values, columns=columns).reset_index()
    return frame.rename(columns={'index': 'timestamp'})


def load_and_save(category, filename, dataset, dataset_folder):
    """Convert one comma-separated text file into a CSV with a header.

    Args:
        category: Sub-folder of ``dataset_folder`` to read from; when it is
            ``"test"`` the matching file in ``test_label`` is merged in as a
            ``label`` column.
        filename: Name of the text file inside the category folder.
        dataset: Prefix of the output file name.
        dataset_folder: Root folder of the raw dataset.

    The result is written to
    ``<output_folder>/<filename up to the first dot>/<dataset>_<category>.csv``.
    """
    target_dir = os.path.join(output_folder, filename.split('.')[0])
    os.makedirs(target_dir, exist_ok=True)

    temp = np.genfromtxt(os.path.join(dataset_folder, category, filename),
                         dtype=np.float32,
                         delimiter=',')
    data = _to_indexed_frame(temp, _feature_columns(temp.shape[1]))

    if category == "test":
        temp1 = np.genfromtxt(os.path.join(dataset_folder, "test_label", filename),
                              dtype=np.float32,
                              delimiter=',')
        data1 = _to_indexed_frame(temp1, ["label"])
        # Both frames carry the same positional timestamp, so this is a row-wise join.
        data = pd.merge(data, data1, how="left", on='timestamp')

    print(dataset, category, filename, temp.shape)
    data.to_csv(os.path.join(target_dir, dataset + "_" + category + ".csv"), index=False)


def load_data(dataset):
    """Preprocess one benchmark dataset into CSV files under ``output_folder``.

    Args:
        dataset: ``'SMD'``, ``'SMAP'`` or ``'MSL'``. Any other value is a no-op.

    ``'SMD'`` converts every ``.txt`` file found in
    ``ServerMachineDataset/train`` (train and test split) via
    :func:`load_and_save`. ``'SMAP'`` / ``'MSL'`` read
    ``data/labeled_anomalies.csv``, concatenate the listed ``.npy`` channels
    in sorted channel order and write ``<dataset>_train.csv`` and
    ``<dataset>_test.csv``; the test file gets a 0/1 ``label`` column.
    """
    if dataset == 'SMD':
        dataset_folder = 'ServerMachineDataset'
        for filename in os.listdir(os.path.join(dataset_folder, "train")):
            if filename.endswith('.txt'):
                # splitext removes exactly the extension; str.strip('.txt') would
                # strip any leading/trailing '.', 't' or 'x' characters instead.
                name = os.path.splitext(filename)[0]
                load_and_save('train', filename, name, dataset_folder)
                load_and_save('test', filename, name, dataset_folder)
    elif dataset == 'SMAP' or dataset == 'MSL':
        dataset_folder = 'data'
        with open(os.path.join(dataset_folder, 'labeled_anomalies.csv'), 'r', newline='') as file:
            res = list(csv.reader(file, delimiter=','))[1:]  # drop the header row
        res = sorted(res, key=lambda k: k[0])

        # Directory is created as before; nothing in this script writes into it.
        label_folder = os.path.join(dataset_folder, 'test_label')
        os.makedirs(label_folder, exist_ok=True)

        # Column 0 is the channel id, column 1 the dataset name; channel 'P-2' is skipped.
        data_info = [row for row in res if row[1] == dataset and row[0] != 'P-2']

        label_parts = []
        for row in data_info:
            anomalies = ast.literal_eval(row[2])
            length = int(row[-1])
            # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
            label = np.zeros([length], dtype=int)
            for anomaly in anomalies:
                # Anomaly ranges are inclusive on both ends.
                label[anomaly[0]:anomaly[1] + 1] = 1
            label_parts.append(label)
        labels = np.concatenate(label_parts) if label_parts else np.zeros([0], dtype=int)
        print(dataset, 'test_label', labels.shape)
        labels = _to_indexed_frame(labels, ["label"])

        def concatenate_and_save(category):
            """Stack all channels of *category* and write ``<dataset>_<category>.csv``."""
            chunks = []
            for row in data_info:
                path = os.path.join(dataset_folder, category, row[0] + '.npy')
                print(path)
                chunks.append(np.load(path))
            data = np.concatenate(chunks, axis=0)
            print(dataset, category, data.shape)

            data = _to_indexed_frame(data, _feature_columns(data.shape[1]))
            if category == "test":
                data = pd.merge(data, labels, how="left", on='timestamp')

            # The folder is also created at import time, but the working
            # directory may have changed since then.
            os.makedirs(output_folder, exist_ok=True)
            data.to_csv(os.path.join(output_folder, dataset + "_" + category + ".csv"), index=False)

        for c in ['train', 'test']:
            concatenate_and_save(c)


if __name__ == '__main__':
    datasets = ['SMD', 'SMAP', 'MSL']
    # Dataset names may be passed on the command line; default to MSL as before.
    requested = sys.argv[1:] or ['MSL']
    for name in requested:
        if name in datasets:
            load_data(name)
        else:
            print("unknown dataset %r, expected one of %s" % (name, datasets), file=sys.stderr)
以上代码改编自:https://github.com/NetManAIOps/OmniAnomaly/blob/master/data_preprocess.py
import os

import pandas as pd

# WADI preprocessing: attach real datetimes (taken from the A1 release) to the
# A2 release rows via the shared 'Row' column and write train/test CSVs.

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./processing', exist_ok=True)

train_new = pd.read_csv('./WADI.A2_19 Nov 2019/WADI_14days_new.csv')
test_new = pd.read_csv('./WADI.A2_19 Nov 2019/WADI_attackdataLABLE.csv', skiprows=1)
test = pd.read_csv('./WADI.A1_9 Oct 2017/WADI_attackdata.csv')
train = pd.read_csv('./WADI.A1_9 Oct 2017/WADI_14days.csv', skiprows=4)


def recover_date(str1, str2):
    """Join a date part and a time part with a single space.

    Works on plain strings and, element-wise, on pandas Series of strings.
    """
    return str1 + " " + str2


# ---- training split ----
# Column-wise concatenation replaces the row-by-row DataFrame.apply(axis=1),
# which is very slow on frames of this size.
train["datetime"] = pd.to_datetime(recover_date(train['Date'], train['Time']))
train_time = train[['Row', 'datetime']]
train_new_time = pd.merge(train_new, train_time, how='left', on='Row')
train_new_time = train_new_time.drop(columns=['Row', 'Date', 'Time'])
train_new_time.to_csv('./processing/WADI_train.csv', index=False)

# ---- test split ----
test["datetime"] = pd.to_datetime(recover_date(test['Date'], test['Time']))
# NOTE(review): .loc slices by label, so on the default integer index this
# appears to keep every row. If the intent was to drop the last two rows it
# should be test.iloc[:-2, :] - confirm against the raw file before changing.
test = test.loc[-2:, :]
# The A2 label file has trailing spaces in some header names ('Row ', 'Date ').
test_new = test_new.rename(columns={'Row ': 'Row'})
test_time = test[['Row', 'datetime']]
test_new_time = pd.merge(test_new, test_time, how='left', on='Row')
test_new_time = test_new_time.drop(columns=['Row', 'Date ', 'Time'])
test_new_time = test_new_time.rename(columns={'Attack LABLE (1:No Attack, -1:Attack)': 'label'})
# Remap to the common convention: 0 = normal, 1 = anomaly.
# Order matters: 1 -> 0 must run before -1 -> 1.
test_new_time.loc[test_new_time['label'] == 1, 'label'] = 0
test_new_time.loc[test_new_time['label'] == -1, 'label'] = 1
test_new_time.to_csv('./processing/WADI_test.csv', index=False)
import numpy as np
import pandas as pd


def _comma_to_dot(value):
    """Return ``str(value)`` with every ',' replaced by '.'."""
    return str(value).replace(",", ".")


# SWaT preprocessing: parse timestamps, convert every sensor column to float
# and write SWaT_train.csv (no label) and SWaT_test.csv (with a 0.0/1.0 label).
normal = pd.read_csv("input/SWaT_Dataset_Normal_v1.csv")
attack = pd.read_csv("input/SWaT_Dataset_Attack_v0.csv", sep=";")

# ---- training split (normal operation only) ----
# NOTE(review): pd.to_datetime is called without an explicit format; confirm
# the parse is right if the raw timestamps are written day-first.
normal['Timestamp'] = pd.to_datetime(normal['Timestamp'])
del normal['Normal/Attack']
normal = normal.rename(columns={'Timestamp': 'datetime'})
# Set the time column aside so the float conversion only sees sensor columns.
train_times = normal.pop('datetime')
for column in normal.columns.tolist():
    normal[column] = normal[column].map(_comma_to_dot)
normal = normal.astype(float)
normal['datetime'] = train_times
normal.to_csv('SWaT_train.csv', index=False)

# ---- test split (contains attacks) ----
attack['Timestamp'] = pd.to_datetime(attack['Timestamp'])
attack = attack.rename(columns={'Timestamp': 'datetime'})
test_times = attack.pop('datetime')
# Anything that is not exactly 'Normal' counts as an anomaly (1.0).
attack_flags = [
    1.0 if status != 'Normal' else 0.0
    for status in attack.pop("Normal/Attack").values
]
for column in attack.columns.tolist():
    attack[column] = attack[column].map(_comma_to_dot)
attack = attack.astype(float)
attack['datetime'] = test_times
attack['label'] = attack_flags
attack.to_csv('SWaT_test.csv', index=False)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。