
Alibaba Cloud Tianchi Learning Competition: Malware Detection (Study Notes)

Multi-class classification on the Alibaba Cloud malware-detection dataset

 

1: Features constructed:

a: frequency statistics of each sample's api, tid, and index calls

b: numeric features generated by applying mean, max, min, and similar functions to the numeric fields

c: per-file counts of how many times each api is called by threads (tid), pivoted into per-api columns with pd.pivot_table

d: per-file counts of how many distinct threads (tid) call each api

Note: comparing the api vocabularies of the training and test sets shows they do not overlap completely (though the intersection is large), so the three apis unique to the training set are dropped; a sketch of how to find them follows this list. When building features, the training set's api vocabulary is used to construct the test set's type-c and type-d features (which wastes some of the data).
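The hard-coded api filter in the main listing comes from exactly this kind of comparison. A minimal sketch, assuming train and test have already been loaded as in the listing below:

# apis seen in the training set but never in the test set -> dropped from training
train_apis = set(train['api'].unique())
test_apis = set(test['api'].unique())
print(train_apis - test_apis)  # the three train-only apis: {'EncryptMessage', 'WSASendTo', 'RtlCompressBuffer'}
print(test_apis - train_apis)  # apis unique to the test set, if any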

2: Algorithm used: LightGBM (other algorithms were not really tried; this is a first pass to get familiar with the workflow)

The code is as follows:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from neicunyasuo import _Data_Preprocess  # author's own memory-compression helper ("neicunyasuo" = 内存压缩)
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
import warnings
warnings.filterwarnings('ignore')

memory_process = _Data_Preprocess()
path = '../恶意程序检测分类/恶意程序数据/'
train = pd.read_csv(path + 'security_train.csv')
test = pd.read_csv(path + 'security_test.csv')

# Drop the three apis that appear only in the training set:
# {'EncryptMessage', 'WSASendTo', 'RtlCompressBuffer'}
# drop=True is needed here because the data already has an 'index' column,
# and a plain reset_index() would collide with it.
train = train[(train.api != 'EncryptMessage') & (train.api != 'WSASendTo') & (train.api != 'RtlCompressBuffer')].reset_index(drop=True)

# Features reflecting how often each sample calls api, tid and index
def simple_sts_features(df):
    simple_fea = pd.DataFrame()
    simple_fea['file_id'] = df['file_id'].unique()
    simple_fea = simple_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    simple_fea['file_id_api_count'] = df_grp['api'].count().values
    simple_fea['file_id_api_nunique'] = df_grp['api'].nunique().values
    simple_fea['file_id_tid_count'] = df_grp['tid'].count().values
    simple_fea['file_id_tid_nunique'] = df_grp['tid'].nunique().values
    simple_fea['file_id_index_count'] = df_grp['index'].count().values
    simple_fea['file_id_index_nunique'] = df_grp['index'].nunique().values
    return simple_fea

simple_train_fea1 = simple_sts_features(train)
simple_test_fea1 = simple_sts_features(test)

# Numeric statistics (mean / min / std / max) of the numeric fields
def simple_numerical_sts_features(df):
    simple_numerical_fea = pd.DataFrame()
    simple_numerical_fea['file_id'] = df['file_id'].unique()
    simple_numerical_fea = simple_numerical_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    simple_numerical_fea['file_id_tid_mean'] = df_grp['tid'].mean().values
    simple_numerical_fea['file_id_tid_min'] = df_grp['tid'].min().values
    simple_numerical_fea['file_id_tid_std'] = df_grp['tid'].std().values
    simple_numerical_fea['file_id_tid_max'] = df_grp['tid'].max().values
    simple_numerical_fea['file_id_index_mean'] = df_grp['index'].mean().values
    simple_numerical_fea['file_id_index_min'] = df_grp['index'].min().values
    simple_numerical_fea['file_id_index_std'] = df_grp['index'].std().values
    simple_numerical_fea['file_id_index_max'] = df_grp['index'].max().values
    return simple_numerical_fea

simple_train_fea2 = simple_numerical_sts_features(train)
simple_test_fea2 = simple_numerical_sts_features(test)

# How many times each api is called (tid records per api) -- advanced feature via pivot table
def api_pivot_count_features(df):
    tmp = df.groupby(['file_id', 'api'])['tid'].count().to_frame('api_tid_count').reset_index()
    tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api', values='api_tid_count', fill_value=0)
    tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_pivot_' + str(col) for col in tmp_pivot.columns]
    tmp_pivot.reset_index(inplace=True)
    tmp_pivot = memory_process._memory_process(tmp_pivot)
    return tmp_pivot

simple_train_fea3 = api_pivot_count_features(train)
simple_test_fea3 = api_pivot_count_features(test)

# How many distinct threads call each api -- advanced feature
def api_pivot_nunique_features(df):
    tmp = df.groupby(['file_id', 'api'])['tid'].nunique().to_frame('api_tid_nunique').reset_index()
    tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api', values='api_tid_nunique', fill_value=0)
    tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_pivot_' + str(col) for col in tmp_pivot.columns]
    tmp_pivot.reset_index(inplace=True)
    tmp_pivot = memory_process._memory_process(tmp_pivot)
    return tmp_pivot

# Bug fix: the original code called api_pivot_count_features here, which just
# duplicated the count features; feature d requires the nunique variant.
simple_train_fea4 = api_pivot_nunique_features(train)
simple_test_fea4 = api_pivot_nunique_features(test)

train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

# Build the training & test sets
train_data = train_label.merge(simple_train_fea1, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea2, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea3, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea4, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea1, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea2, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea3, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea4, on='file_id', how='left')

train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
train_label = 'label'  # reusing the name: from here on it is the label column name

params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',  # multi-class objective
    'num_class': 8,             # eight classes
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128,
    'random_state': 100,
    'metric': 'multi_logloss'
}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values)
    # verbose_eval / early_stopping_rounds are the pre-4.0 LightGBM API;
    # LightGBM >= 4 expects log_evaluation / early_stopping callbacks instead.
    clf = lgb.train(params, trn_data, num_boost_round=2000, valid_sets=[trn_data, val_data],
                    verbose_eval=50, early_stopping_rounds=100)
    models.append(clf)

# Feature-importance analysis (based on the last fold's model)
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = clf.feature_importance()
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)

plt.figure(figsize=[40, 20])
sns.barplot(x=feature_importance.iloc[:10]['fea_name'], y=feature_importance.iloc[:10]['fea_imp'])
plt.show()

plt.figure(figsize=[40, 20])
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])
plt.show()

# Model inference: average the predictions of the five fold models
pred_res = 0
fold = 5
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / fold

prob_cols = ['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']
for i, col in enumerate(prob_cols):
    test_submit[col] = pred_res[:, i]
test_submit[['file_id'] + prob_cols].to_csv('baseline2.csv', index=None)
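The pivot-table step is the least obvious part of the pipeline above, so here is a toy illustration of what pd.pivot_table produces inside api_pivot_count_features; the file_id and api values are made up for the example:

import pandas as pd

# Toy version of the (file_id, api) count table fed into pd.pivot_table.
tmp = pd.DataFrame({'file_id': [1, 1, 2],
                    'api': ['LdrLoadDll', 'NtOpenFile', 'LdrLoadDll'],
                    'api_tid_count': [3, 1, 5]})
tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api',
                           values='api_tid_count', fill_value=0)
print(tmp_pivot)
# One row per file_id, one column per api, and 0 where a file never
# calls that api -- file 2 never calls NtOpenFile, so it gets a 0.

Each sample thus becomes a fixed-length vector indexed by the api vocabulary, which is why the test-set pivot must be built over the training-set apis: the train and test feature columns have to stay aligned.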

Possible improvements: try other algorithms, and switch to stratified k-fold cross-validation (a minimal sketch follows).
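A minimal sketch of the stratified variant, reusing train_data, train_features, and train_label from the listing above; StratifiedKFold only changes how the fold indices are drawn (each fold keeps roughly the full set's class proportions), so the training loop body stays the same:

from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
# Unlike KFold.split, StratifiedKFold.split needs the labels.
for fold_, (trn_idx, val_idx) in enumerate(
        folds.split(train_data[train_features], train_data[train_label])):
    print("fold n°{}".format(fold_))
    # ...build the lgb.Dataset pair and call lgb.train exactly as above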
