
Alchemy Diary: A Solo Rank-13 Solution for the Sichuan Fraud-Call Detection Finals

Competition page: http://www.scdata.net.cn/kfds/urgent2/pages/index.html . Fraud-call detection is one of the tracks in the algorithm-adversarial competition.

Note: because I signed an NDA, this post does not include any of the data the code touches.

How I got here: in the preliminary round, a publicly shared baseline already scored very high, so I only tuned it a little and then dropped out. When the preliminary round ended I unexpectedly received a text message saying I had advanced to the finals, so I submitted the NDA, downloaded the data, and only really started competing in the final round. Blending my output with another top player's results would have made top 4, but unfortunately that submission was not the one selected.

Baseline referenced in this post: https://github.com/biaobiao2/DC_phone

Solution overview:

Feature engineering: mainly statistical features. On top of the baseline's features I added aggregate statistics such as sem (standard error of the mean) and skew (skewness); in practice sem and skew proved to be excellent features for this competition's data, as the toy demo right below illustrates.
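To see why these two help, here is a toy demo on made-up numbers (not competition data): a "bursty" caller, many short calls plus one very long one, separates cleanly from an even caller on both statistics.

# Toy demo, synthetic durations in seconds -- not competition data.
# skew measures the asymmetry of the distribution; sem measures how
# unstable the estimated mean is (std / sqrt(n)).
import numpy as np
from scipy import stats

even_caller = np.array([30.0, 45.0, 60.0, 50.0, 40.0])  # fairly uniform durations
bursty_caller = np.array([5.0, 5.0, 5.0, 5.0, 600.0])   # short calls + one outlier

print(stats.skew(even_caller), stats.skew(bursty_caller))  # ~0 vs. strongly positive
print(stats.sem(even_caller), stats.sem(bursty_caller))    # small vs. large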

Model building: LightGBM + XGBoost + CatBoost. The three models' predicted probabilities are blended with weights 0.25 : 0.25 : 0.5 into a single probability of class 1; samples whose blended probability exceeds 0.6 are labeled 1, everything else 0.

Result: 0.9005 on the final-round B leaderboard.

Other tricks: the training features are left unfilled, while the test data get a quantile fill. For each single model, I pick the fill quantile that maximizes the sum over all test predictions of |P(1) − P(0)|, i.e., the quantile under which the model is most confident overall. Judging by the B-board scores, this is worth a few thousandths.
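The search code itself is not shown below, so here is a minimal sketch of the idea, assuming a fitted classifier `clf` and the unfilled test frame `test_userfs_ori` from the code further down; the candidate grid is illustrative:

# Sketch of the fill-quantile search described above (assumptions:
# `clf` is already fitted, `test_df` contains only numeric feature columns).
import numpy as np

def best_fill_quantile(clf, test_df, grid=np.arange(0.05, 0.96, 0.01)):
    best_q, best_score = grid[0], -np.inf
    for q in grid:
        filled = test_df.fillna(test_df.quantile(q))
        proba = clf.predict_proba(filled)                 # columns: [P(0), P(1)]
        score = np.abs(proba[:, 1] - proba[:, 0]).sum()   # total model confidence
        if score > best_score:
            best_q, best_score = q, score
    return best_q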

Code:

Feature engineering:

# coding=utf-8
'''
@author: csdn xuxml
'''
import os
import gc
import time
import psutil
import datetime
import numpy as np
import pandas as pd
import catboost as cat
import lightgbm as lgb
from scipy import stats
from scipy.stats import entropy, pearsonr
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)

path = "./0527/"
feat_path = path + "data/"

def get_app_feats(df):
    """App-usage features, aggregated per phone number."""
    phones_app = df[["phone_no_m"]].copy()
    phones_app = phones_app.drop_duplicates(subset=['phone_no_m'], keep='last')
    # number of distinct apps used
    tmp = df.groupby("phone_no_m")["busi_name"].agg(busi_count="nunique")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    # data-traffic statistics
    tmp = df.groupby("phone_no_m")["flow"].agg(flow_mean="mean",
                                               flow_median="median",
                                               flow_min="min",
                                               flow_max="max",
                                               flow_var="var",
                                               flow_skew="skew",
                                               flow_std="std",
                                               flow_quantile="quantile",
                                               flow_sem="sem",
                                               flow_sum="sum")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["month_id"].agg(month_ids="nunique")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    # average monthly traffic
    phones_app["flow_month"] = phones_app["flow_sum"] / phones_app["month_ids"]
    return phones_app

def get_voc_feat(df):
    """Voice-call features, aggregated per phone number."""
    df["start_datetime"] = pd.to_datetime(df['start_datetime'])
    df["hour"] = df['start_datetime'].dt.hour
    df["day"] = df['start_datetime'].dt.day
    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # number of contacts and number of calls
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(opposite_count="count", opposite_unique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    # outgoing calls (calltype_id == 1)
    df_call = df[df["calltype_id"] == 1].copy()
    tmp = df_call.groupby("phone_no_m")["imei_m"].agg(voccalltype1="count", imeis="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    phone_no_m["voc_calltype1"] = phone_no_m["voccalltype1"] / phone_no_m["opposite_count"]
    tmp = df_call.groupby("phone_no_m")["city_name"].agg(city_name_call="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df_call.groupby("phone_no_m")["county_name"].agg(county_name_call="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    # statistics of the conversations with each individual contact
    tmp = df.groupby(["phone_no_m", "opposite_no_m"])["call_dur"].agg(count="count", sum="sum")
    phone2opposite = tmp.groupby("phone_no_m")["count"].agg(phone2opposite_mean="mean",
                                                            phone2opposite_median="median",
                                                            phone2opposite_max="max",
                                                            phone2opposite_min="min",
                                                            phone2opposite_var="var",
                                                            phone2opposite_skew="skew",
                                                            phone2opposite_sem="sem",
                                                            phone2opposite_std="std",
                                                            phone2opposite_quantile="quantile")
    phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
    phone2opposite = tmp.groupby("phone_no_m")["sum"].agg(phone2oppo_sum_mean="mean",
                                                          phone2oppo_sum_median="median",
                                                          phone2oppo_sum_max="max",
                                                          phone2oppo_sum_min="min",
                                                          phone2oppo_sum_var="var",
                                                          phone2oppo_sum_skew="skew",
                                                          phone2oppo_sum_sem="sem",
                                                          phone2oppo_sum_std="std",
                                                          phone2oppo_sum_quantile="quantile")
    phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
    # call-duration statistics
    tmp = df.groupby("phone_no_m")["call_dur"].agg(call_dur_mean="mean",
                                                   call_dur_median="median",
                                                   call_dur_max="max",
                                                   call_dur_min="min",
                                                   call_dur_var="var",
                                                   call_dur_skew="skew",
                                                   call_dur_sem="sem",
                                                   call_dur_std="std",
                                                   call_dur_quantile="quantile")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["city_name"].agg(city_name_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["county_name"].agg(county_name_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["calltype_id"].agg(calltype_id_unique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    # preferred calling hour / day
    # (the stats.mode indexing below follows the older scipy API, pre-1.11)
    tmp = df.groupby("phone_no_m")["hour"].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],
                                               voc_hour_mode_count=lambda x: stats.mode(x)[1][0],
                                               voc_hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["day"].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],
                                              voc_day_mode_count=lambda x: stats.mode(x)[1][0],
                                              voc_day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    return phone_no_m

def get_sms_feats(df):
    """SMS features, aggregated per phone number."""
    df['request_datetime'] = pd.to_datetime(df['request_datetime'])
    df["hour"] = df['request_datetime'].dt.hour
    df["day"] = df['request_datetime'].dt.day
    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # number of contacts and number of messages
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(sms_count="count", sms_nunique="nunique")
    tmp["sms_rate"] = tmp["sms_count"] / tmp["sms_nunique"]
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    # share of downstream SMS (calltype_id == 2)
    calltype2 = df[df["calltype_id"] == 2].copy()
    calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(calltype_2="count")
    phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
    phone_no_m["calltype_rate"] = phone_no_m["calltype_2"] / phone_no_m["sms_count"]
    # preferred SMS hour / day
    tmp = df.groupby("phone_no_m")["hour"].agg(hour_mode=lambda x: stats.mode(x)[0][0],
                                               hour_mode_count=lambda x: stats.mode(x)[1][0],
                                               hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["day"].agg(day_mode=lambda x: stats.mode(x)[0][0],
                                              day_mode_count=lambda x: stats.mode(x)[1][0],
                                              day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    return phone_no_m

def feats():
    """Build and save voc / app / sms features for every data split."""
    # preliminary-round test data
    test_voc = pd.read_csv(path + 'test/test_voc.csv')
    test_voc_feat = get_voc_feat(test_voc)
    test_voc_feat.to_csv(feat_path + "test_voc_feat.csv", index=False)
    test_app = pd.read_csv(path + 'test/test_app.csv')
    test_app_feat = get_app_feats(test_app)
    test_app_feat.to_csv(feat_path + "test_app_feat.csv", index=False)
    test_sms = pd.read_csv(path + 'test/test_sms.csv')
    test_sms_feat = get_sms_feats(test_sms)
    test_sms_feat.to_csv(feat_path + "test_sms_feat.csv", index=False)
    # training data
    train_voc = pd.read_csv(path + 'train/train_voc.csv')
    train_voc_feat = get_voc_feat(train_voc)
    train_voc_feat.to_csv(feat_path + "train_voc_feat.csv", index=False)
    train_app = pd.read_csv(path + 'train/train_app.csv')
    train_app_feat = get_app_feats(train_app)
    train_app_feat.to_csv(feat_path + "train_app_feat.csv", index=False)
    train_sms = pd.read_csv(path + 'train/train_sms.csv')
    train_sms_feat = get_sms_feats(train_sms)
    train_sms_feat.to_csv(feat_path + "train_sms_feat.csv", index=False)
    # final-round ("fusai") test data
    test_vocfs = pd.read_csv(path + 'zpfsdata/test_voc.csv')
    test_voc_featfs = get_voc_feat(test_vocfs)
    test_voc_featfs.to_csv(path + "zpfsdata/test_voc_feat.csv", index=False)
    test_appfs = pd.read_csv(path + 'zpfsdata/test_app.csv')
    test_app_featfs = get_app_feats(test_appfs)
    test_app_featfs.to_csv(path + "zpfsdata/test_app_feat.csv", index=False)
    test_smsfs = pd.read_csv(path + 'zpfsdata/test_sms.csv')
    test_sms_featfs = get_sms_feats(test_smsfs)
    test_sms_featfs.to_csv(path + "zpfsdata/test_sms_feat.csv", index=False)

Generate the features:

# create and save the voc / app / sms features
feats()

Data loading:

# load the April (preliminary-round test) features
test_app_feat = pd.read_csv(feat_path + 'test_app_feat.csv')
test_voc_feat = pd.read_csv(feat_path + 'test_voc_feat.csv')
test_sms_feat = pd.read_csv(feat_path + "test_sms_feat.csv")
test_user = pd.read_csv(path + 'test/test_user.csv')
test_user = test_user.merge(test_app_feat, on="phone_no_m", how="left")
test_user = test_user.merge(test_voc_feat, on="phone_no_m", how="left")
test_user = test_user.merge(test_sms_feat, on="phone_no_m", how="left")
# caveat: fitting a separate LabelEncoder per split means the integer codes
# are not guaranteed to be consistent across train and test
test_user["city_name"] = LabelEncoder().fit_transform(test_user["city_name"].astype(str))
test_user["county_name"] = LabelEncoder().fit_transform(test_user["county_name"].astype(str))
# load the April labels
test_user_lb1 = pd.read_csv(path + 'zpfsdata/4yuelabel1.csv')
test_user_lb2 = pd.read_csv(path + 'zpfsdata/4yuelabel2.csv')
# concat the April labels and merge them with the features
test_user_label = pd.concat([test_user_lb1, test_user_lb2])
test_user = test_user.merge(test_user_label, on="phone_no_m", how="left")
# align the ARPU column name with the final-round test data
test_user.rename(columns={"arpu_202004": "arpu_202005"}, inplace=True)
# load the training features and labels
train_app_feat = pd.read_csv(feat_path + "train_app_feat.csv")
train_voc_feat = pd.read_csv(feat_path + "train_voc_feat.csv")
train_sms_feat = pd.read_csv(feat_path + "train_sms_feat.csv")
train_user = pd.read_csv(path + 'train/train_user.csv')
# keep only the most recent ARPU month and rename it to match the test data
drop_r = ["arpu_201908", "arpu_201909", "arpu_201910", "arpu_201911",
          "arpu_201912", "arpu_202001", "arpu_202002"]
train_user.drop(drop_r, axis=1, inplace=True)
train_user.rename(columns={"arpu_202003": "arpu_202005"}, inplace=True)
train_user = train_user.merge(train_app_feat, on="phone_no_m", how="left")
train_user = train_user.merge(train_voc_feat, on="phone_no_m", how="left")
train_user = train_user.merge(train_sms_feat, on="phone_no_m", how="left")
train_user["city_name"] = LabelEncoder().fit_transform(train_user["city_name"].astype(str))
train_user["county_name"] = LabelEncoder().fit_transform(train_user["county_name"].astype(str))
# concat the preliminary-round data (train plus labeled test) into one training set
train_user = pd.concat([train_user, test_user])
# final label
train_label = train_user[["label"]].copy()
# drop phone_no_m
test_user.drop(["phone_no_m"], axis=1, inplace=True)
train_user.drop(["phone_no_m", "label"], axis=1, inplace=True)
# load the final-round test features as *fs ("fs" = fusai, the final round)
test_app_featfs = pd.read_csv(path + 'zpfsdata/test_app_feat.csv')
test_voc_featfs = pd.read_csv(path + 'zpfsdata/test_voc_feat.csv')
test_sms_featfs = pd.read_csv(path + 'zpfsdata/test_sms_feat.csv')
test_userfs = pd.read_csv(path + 'zpfsdata/test_user.csv')
test_userfs = test_userfs.merge(test_app_featfs, on="phone_no_m", how="left")
test_userfs = test_userfs.merge(test_voc_featfs, on="phone_no_m", how="left")
test_userfs = test_userfs.merge(test_sms_featfs, on="phone_no_m", how="left")
test_userfs["city_name"] = LabelEncoder().fit_transform(test_userfs["city_name"].astype(str))
test_userfs["county_name"] = LabelEncoder().fit_transform(test_userfs["county_name"].astype(str))
# create the submission dataframe
submission = test_userfs[["phone_no_m"]].copy()
# drop phone_no_m
test_userfs.drop(["phone_no_m"], axis=1, inplace=True)
# the raw file marks missing values with the literal string '\N'
test_userfs.replace([u'\\N'], np.nan, inplace=True)
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
# keep an unfilled copy; each model fills NaNs with its own quantile later
test_userfs_ori = test_userfs
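A quick sanity check (not in the original code) before modeling, to confirm that features and labels line up and to see the class balance:

# optional sanity checks: row counts of features and labels must match,
# and value_counts shows how imbalanced the 0/1 split is
print(train_user.shape, train_label.shape)
print(train_label['label'].value_counts(normalize=True))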

Model building:

Because the dataset is small and the server is reasonably powerful, the modeling part is deliberately simple and brute-force: plain grid search.

depth = 8
cv = 5

# CatBoost: grid search over the (already narrowed-down) final parameters
catclf = cat.CatBoostClassifier(allow_writing_files=False,
                                od_type='Iter',
                                silent=True)
cat_grid = {'depth': [depth],
            'bootstrap_type': ['Bernoulli'],
            'od_type': ['Iter'],
            'l2_leaf_reg': [15],
            'learning_rate': [0.1],
            'allow_writing_files': [False],
            'silent': [True]}
catgrid = GridSearchCV(catclf, param_grid=cat_grid, cv=cv,
                       scoring='f1_macro', n_jobs=-1, verbose=10)
catgrid.fit(train_user, train_label['label'])
# fill the test NaNs with CatBoost's best quantile, then predict probabilities
test_userfs = test_userfs_ori.fillna(test_userfs_ori.quantile(0.39))
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
cat_proba = catgrid.predict_proba(test_userfs)
rslt_prob_cat = pd.DataFrame(cat_proba)
rslt_prob_cat.columns = ['lb0', 'lb1']

# LightGBM: final parameters, renamed to the documented sklearn-API
# equivalents (boosting_type, colsample_bytree, subsample, subsample_freq)
lgb_grid = {'boosting_type': ['gbdt'],
            'num_leaves': [256],
            'min_child_weight': [4],
            'colsample_bytree': [0.7],   # feature_fraction
            'subsample': [0.8],          # bagging_fraction
            'subsample_freq': [1]}       # bagging_freq
lgbgrid = GridSearchCV(lgb.LGBMClassifier(), param_grid=lgb_grid, cv=cv,
                       scoring='f1_macro', n_jobs=-1, verbose=10)
lgbgrid.fit(train_user, train_label['label'])
# fill the test NaNs with LightGBM's best quantile, then predict probabilities
test_userfs = test_userfs_ori.fillna(test_userfs_ori.quantile(0.34))
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
lgb_proba = lgbgrid.predict_proba(test_userfs)
rslt_prob_lgb = pd.DataFrame(lgb_proba)
rslt_prob_lgb.columns = ['lb0', 'lb1']

# XGBoost with hand-tuned final parameters
from xgboost import XGBClassifier
xgbclf = XGBClassifier(base_score=0.5,
                       booster='gbtree',
                       colsample_bytree=0.9,
                       learning_rate=0.1,
                       max_depth=8,
                       min_child_weight=7,
                       n_estimators=100,
                       n_jobs=-1,
                       objective='binary:logistic',
                       subsample=0.75,
                       verbosity=1)
xgbclf.fit(train_user, train_label['label'])
# fill the test NaNs with XGBoost's best quantile, then predict probabilities
test_userfs = test_userfs_ori.fillna(test_userfs_ori.quantile(0.319))
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
xgb_proba = xgbclf.predict_proba(test_userfs)
rslt_prob_xgb = pd.DataFrame(xgb_proba)  # bug fix: was pd.DataFrame(lgb_proba)
rslt_prob_xgb.columns = ['lb0', 'lb1']
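Not part of the original pipeline, but handy while iterating: the fitted GridSearchCV objects expose the winning parameters and the cross-validated macro-F1.

# inspect what each grid search settled on
for name, grid in [("catboost", catgrid), ("lightgbm", lgbgrid)]:
    print(name, grid.best_params_, "cv f1_macro = %.4f" % grid.best_score_)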

Blend and threshold the probabilities:

# weighted blend: 0.25 * lgb + 0.25 * xgb + 0.5 * cat
bestnew112 = 0.25 * rslt_prob_lgb + 0.25 * rslt_prob_xgb + 0.5 * rslt_prob_cat
# label 1 when the blended probability of class 1 exceeds 0.6, else 0
bestnew112["label"] = (bestnew112["lb1"] > 0.6).astype(int)
submission['label'] = bestnew112['label']
print(submission['label'].value_counts())
print(submission['label'].value_counts() / submission.shape[0])
submission.to_csv('lgb25xgb25cat50threshold60.csv', index=None)
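The 0.6 cutoff above was fixed by hand. If you keep labeled held-out predictions around (the `oof_proba` and `y_true` below are hypothetical; neither is produced by this pipeline), a simple sweep is one way to pick it instead:

# Sketch: choose the 0/1 cutoff that maximizes macro-F1 on held-out data.
# `oof_proba` = P(label=1) per held-out sample, `y_true` = its true labels;
# both are assumed to exist and do not appear in the code above.
import numpy as np
from sklearn.metrics import f1_score

def best_threshold(oof_proba, y_true, grid=np.arange(0.30, 0.71, 0.01)):
    scores = [f1_score(y_true, (oof_proba > t).astype(int), average='macro')
              for t in grid]
    return grid[int(np.argmax(scores))]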

The pipeline above scores 0.9005 on the B leaderboard.
