1: Features constructed:
a: frequency of api, tid, and index calls per sample
b: numeric features over the numeric fields using aggregations such as mean, max, and min
c: per-api call counts over tid (built with pd.pivot_table)
d: per-api counts of distinct tids
Note: comparing the api categories in the train and test sets shows they do not fully overlap (though the intersection is large), so the three apis unique to the training set are dropped (see the sketch right below for how to find them). When selecting features, the test set's c and d features are then built from the training set's feature columns, which wastes some of the data; an alignment sketch appears after the pivot features in the code.
2: Algorithm: LightGBM (other algorithms were not really tried; this is a first pass to get familiar with the workflow)
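The three train-only apis can be recovered with a simple set difference, computed on the raw data before the filter in the code below (assuming train and test are loaded as shown there):

```python
# apis seen in the training set but never in the test set
train_only_apis = set(train['api'].unique()) - set(test['api'].unique())
print(train_only_apis)  # expected: {'EncryptMessage', 'WSASendTo', 'RtlCompressBuffer'}
```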
The code is as follows:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from neicunyasuo import _Data_Preprocess  # custom memory-compression helper module
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')


memory_process = _Data_Preprocess()

path = '../恶意程序检测分类/恶意程序数据/'
train = pd.read_csv(path + 'security_train.csv')
test = pd.read_csv(path + 'security_test.csv')
# drop the three apis that appear only in the training set:
# {'EncryptMessage', 'WSASendTo', 'RtlCompressBuffer'}
# drop=True: the data already has an 'index' column, so the old row index
# would otherwise come back as a stray 'level_0' column
train = train[~train.api.isin(['EncryptMessage', 'WSASendTo', 'RtlCompressBuffer'])].reset_index(drop=True)
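The neicunyasuo module (内存压缩, "memory compression") is the author's local helper and is not shown. As a rough idea of what its `_memory_process` presumably does, here is a minimal, hypothetical sketch that shrinks a DataFrame by downcasting numeric dtypes:

```python
import numpy as np

class _Data_Preprocess:
    """Hypothetical stand-in for the author's neicunyasuo helper."""
    def _memory_process(self, df):
        # downcast numeric columns to smaller dtypes to save memory
        for col in df.columns:
            if df[col].dtype == 'int64':
                c_min, c_max = df[col].min(), df[col].max()
                if np.iinfo(np.int16).min < c_min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif np.iinfo(np.int32).min < c_min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            elif df[col].dtype == 'float64':
                df[col] = df[col].astype(np.float32)
        return df
```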
# frequency features: how often each sample calls api / tid / index
def simple_sts_features(df):
    simple_fea = pd.DataFrame()
    simple_fea['file_id'] = df['file_id'].unique()
    # sort so the rows line up with the (sorted) groupby output below
    simple_fea = simple_fea.sort_values('file_id')

    df_grp = df.groupby('file_id')
    simple_fea['file_id_api_count'] = df_grp['api'].count().values
    simple_fea['file_id_api_nunique'] = df_grp['api'].nunique().values

    simple_fea['file_id_tid_count'] = df_grp['tid'].count().values
    simple_fea['file_id_tid_nunique'] = df_grp['tid'].nunique().values

    simple_fea['file_id_index_count'] = df_grp['index'].count().values
    simple_fea['file_id_index_nunique'] = df_grp['index'].nunique().values

    return simple_fea


simple_train_fea1 = simple_sts_features(train)
simple_test_fea1 = simple_sts_features(test)
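The positional alignment above works because groupby sorts its keys by default; if you would rather not rely on that, a hypothetical variant builds the frame directly from the grouped series, aligning on the file_id index instead:

```python
def simple_sts_features_v2(df):
    # same features, but indexed by file_id so no positional alignment is needed
    df_grp = df.groupby('file_id')
    simple_fea = pd.DataFrame({
        'file_id_api_count': df_grp['api'].count(),
        'file_id_api_nunique': df_grp['api'].nunique(),
        'file_id_tid_count': df_grp['tid'].count(),
        'file_id_tid_nunique': df_grp['tid'].nunique(),
        'file_id_index_count': df_grp['index'].count(),
        'file_id_index_nunique': df_grp['index'].nunique(),
    }).reset_index()  # file_id comes back as a regular column
    return simple_fea
```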
# numeric aggregation features over tid and index (mean / min / std / max)
def simple_numerical_sts_features(df):
    simple_numerical_fea = pd.DataFrame()
    simple_numerical_fea['file_id'] = df['file_id'].unique()
    simple_numerical_fea = simple_numerical_fea.sort_values('file_id')

    df_grp = df.groupby('file_id')

    simple_numerical_fea['file_id_tid_mean'] = df_grp['tid'].mean().values
    simple_numerical_fea['file_id_tid_min'] = df_grp['tid'].min().values
    simple_numerical_fea['file_id_tid_std'] = df_grp['tid'].std().values
    simple_numerical_fea['file_id_tid_max'] = df_grp['tid'].max().values

    simple_numerical_fea['file_id_index_mean'] = df_grp['index'].mean().values
    simple_numerical_fea['file_id_index_min'] = df_grp['index'].min().values
    simple_numerical_fea['file_id_index_std'] = df_grp['index'].std().values
    simple_numerical_fea['file_id_index_max'] = df_grp['index'].max().values

    return simple_numerical_fea


simple_train_fea2 = simple_numerical_sts_features(train)
simple_test_fea2 = simple_numerical_sts_features(test)
# how many times each api is called per file ("advanced" feature via a pivot table)
def api_pivot_count_features(df):
    tmp = df.groupby(['file_id', 'api'])['tid'].count().to_frame('api_tid_count').reset_index()
    tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api', values='api_tid_count', fill_value=0)
    tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_pivot_' + str(col) for col in tmp_pivot.columns]
    tmp_pivot.reset_index(inplace=True)
    tmp_pivot = memory_process._memory_process(tmp_pivot)
    return tmp_pivot


simple_train_fea3 = api_pivot_count_features(train)
simple_test_fea3 = api_pivot_count_features(test)
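To make the pivot step concrete, here is a toy example with made-up data:

```python
toy = pd.DataFrame({'file_id': [1, 1, 1, 2],
                    'api':     ['open', 'open', 'read', 'open'],
                    'tid':     [100, 101, 100, 200]})
cnt = toy.groupby(['file_id', 'api'])['tid'].count().to_frame('api_tid_count').reset_index()
print(pd.pivot_table(cnt, index='file_id', columns='api', values='api_tid_count', fill_value=0))
# api      open  read
# file_id
# 1           2     1
# 2           1     0
```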
# how many distinct threads call each api per file ("advanced" feature)
def api_pivot_nunique_features(df):
    tmp = df.groupby(['file_id', 'api'])['tid'].nunique().to_frame('api_tid_nunique').reset_index()
    tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api', values='api_tid_nunique', fill_value=0)
    # distinct prefix so these columns don't collide with the count pivot on merge
    tmp_pivot.columns = ['api_pivot_nunique_' + str(col) for col in tmp_pivot.columns]
    tmp_pivot.reset_index(inplace=True)
    tmp_pivot = memory_process._memory_process(tmp_pivot)
    return tmp_pivot


simple_train_fea4 = api_pivot_nunique_features(train)
simple_test_fea4 = api_pivot_nunique_features(test)
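As the note in point 1 says, the test set's c and d features are taken from the training set's columns. The two pivots above are built independently, so their columns can differ; a minimal alignment sketch, with the train columns as the reference and zero-fill for apis the test set lacks:

```python
# keep only the training-set pivot columns, zero-filling any the test set lacks
cols3 = [c for c in simple_train_fea3.columns if c != 'file_id']
simple_test_fea3 = simple_test_fea3.reindex(columns=['file_id'] + cols3, fill_value=0)

cols4 = [c for c in simple_train_fea4.columns if c != 'file_id']
simple_test_fea4 = simple_test_fea4.reindex(columns=['file_id'] + cols4, fill_value=0)
```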
train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

# assemble the train & test feature matrices
train_data = train_label.merge(simple_train_fea1, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea2, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea3, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea4, on='file_id', how='left')

test_submit = test_submit.merge(simple_test_fea1, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea2, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea3, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea4, on='file_id', how='left')


train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
label_col = 'label'  # a separate name, so the train_label DataFrame above is not shadowed

from sklearn.model_selection import KFold

params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',   # multiclass objective
    'num_class': 8,              # eight classes
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128,
    'random_state': 100,
    'metric': 'multi_logloss'
}

folds = KFold(n_splits=5, shuffle=True, random_state=15)

models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][label_col].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][label_col].values)

    clf = lgb.train(params, trn_data, num_boost_round=2000, valid_sets=[trn_data, val_data],
                    verbose_eval=50, early_stopping_rounds=100)
    models.append(clf)
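Note: `verbose_eval` and `early_stopping_rounds` only exist in older LightGBM; they were removed from `lgb.train` in LightGBM 4.x. On a recent version, the equivalent call uses callbacks:

```python
clf = lgb.train(params, trn_data, num_boost_round=2000,
                valid_sets=[trn_data, val_data],
                callbacks=[lgb.log_evaluation(50), lgb.early_stopping(100)])
```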

# feature importance analysis (from the last fold's model)
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = clf.feature_importance()
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)

# top-10 features
plt.figure(figsize=[40, 20])
sns.barplot(x=feature_importance.iloc[:10]['fea_name'], y=feature_importance.iloc[:10]['fea_imp'])
plt.show()

# all features
plt.figure(figsize=[40, 20])
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])
plt.show()
# inference: average the predictions of the five fold models
pred_res = 0
for model in models:
    pred_res += model.predict(test_submit[train_features]) / len(models)

prob_cols = ['prob%d' % i for i in range(8)]
for i, col in enumerate(prob_cols):
    test_submit[col] = pred_res[:, i]

test_submit[['file_id'] + prob_cols].to_csv('baseline2.csv', index=None)
Possible future improvements: try other algorithms, use stratified k-fold cross-validation, etc.; a stratified sketch follows.
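For the stratified variant, a minimal sketch (StratifiedKFold needs the labels passed to split; the loop body stays the same as in the KFold version above):

```python
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
for fold_, (trn_idx, val_idx) in enumerate(
        folds.split(train_data[train_features], train_data[label_col])):
    ...  # identical to the KFold loop above
```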