赞
踩
偶然参加了诈骗电话识别这个比赛项目，分享一个线上 0.76+ 的 baseline。本次只用了一张基础表（user 表），LightGBM 采用默认参数，并采用八折交叉验证、取验证得分最高的那一折模型做预测。后期如果有时间的话会把分析过程整理出来，欢迎捧场。
import warnings
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')
import os
from tqdm import tqdm
# Switch the working directory to the competition data folder so that the
# relative paths below resolve.
# NOTE(review): this absolute path is specific to the author's Windows machine;
# it must be changed before the script can run anywhere else.
os.chdir(r'C:\Users\lenovo\Desktop\诈骗电话识别\诈骗电话号码识别-0527')
from sklearn.metrics import f1_score
# Load the user tables for the training and test sets. The backslash
# separators in these relative paths only resolve on Windows.
train_user=pd.read_csv(r'train\train_user.csv')
test_user=pd.read_csv(r'test\test_user.csv')
# Bucket the number of phone numbers registered under one ID card.
def shouji(x):
    """Map a phone-number count to a coarse bucket code.

    Args:
        x: Number of phone numbers registered under the same ID card.

    Returns:
        1 if 0 < x < 3, 2 if 3 <= x <= 5, 4 if x > 5, otherwise 0.
        Non-positive values and NaN fall through to 0 because every
        comparison against NaN is false.
    """
    # NOTE(review): the top bucket is coded 4 (there is no 3); kept as in the
    # original so the generated feature values do not change.
    if x > 5:
        return 4
    if x >= 3:
        return 2
    if x > 0:
        return 1
    return 0
def user_table(train_user, col):
    """Build the per-user feature table from a raw user DataFrame.

    Args:
        train_user: DataFrame that contains at least the columns
            ``city_name``, ``county_name``, ``idcard_cnt`` and ``col``.
        col: Name of the monthly spend (ARPU) column to derive features from.

    Returns:
        A new DataFrame with the input columns plus:

        * ``city_name_mean_arup`` / ``county_name_mean_arup``: mean of ``col``
          within the row's city / county (missing spend is ignored, because
          the means are taken before the fill below);
        * ``arup_null``: 1 where ``col`` is 0 after missing values were
          filled with 0, else 0;
        * ``arup_high``: 1 where ``col`` >= 500, else 0;
        * ``idcard_cnt_count``: ``shouji`` bucket of ``idcard_cnt``.

        ``col`` has its missing values replaced by 0, and ``city_name`` /
        ``county_name`` are replaced by integer label codes.

    The generated column names keep the original "arup" spelling so that any
    code selecting features by name keeps working.
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    data = train_user.copy()

    # Select the spend column *before* aggregating. Averaging the whole frame
    # first also tries to average string columns, which raises TypeError on
    # pandas >= 2.0.
    city_mean = data.groupby('city_name')[col].mean()
    county_mean = data.groupby('county_name')[col].mean()
    data['city_name_mean_arup'] = data['city_name'].map(city_mean)
    data['county_name_mean_arup'] = data['county_name'].map(county_mean)

    # Missing spend becomes 0 from here on; the flag below therefore marks
    # both originally-missing and genuinely-zero spend.
    data[col] = data[col].fillna(0)
    data['arup_null'] = (data[col] == 0).astype('int64')
    data['arup_high'] = (data[col] >= 500).astype('int64')
    data['idcard_cnt_count'] = data['idcard_cnt'].apply(shouji)

    # Encode the region names as integers; NaN becomes the string 'nan' and
    # gets its own code. The encoder is fitted on this frame only.
    for name in tqdm(['city_name', 'county_name']):
        data[name] = LabelEncoder().fit_transform(data[name].astype(str))
    return data
# Snapshot the raw region names first. user_table() label-encodes them one
# frame at a time, and encoders fitted separately on train and test can assign
# different integers to the same city/county.
region_cols = ['city_name', 'county_name']
raw_train_regions = train_user[region_cols].astype(str)
raw_test_regions = test_user[region_cols].astype(str)

# Training set: features are derived from the 2020-03 spend column.
col = 'arpu_202003'
train_data = user_table(train_user, col)
# Test set: features are derived from the 2020-04 spend column.
col = 'arpu_202004'
test_data = user_table(test_user, col)

# Re-encode the region columns with one encoder per column fitted on the
# union of train and test values, so a given name has the same code in both.
for region in region_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([raw_train_regions[region], raw_test_regions[region]]))
    train_data[region] = encoder.transform(raw_train_regions[region])
    test_data[region] = encoder.transform(raw_test_regions[region])

# Select the modelling features: every training column except the label, the
# phone-number identifier and the raw monthly spend columns.
excluded = {
    'label', 'phone_no_m',
    'arpu_201908', 'arpu_201909', 'arpu_201910', 'arpu_201911',
    'arpu_201912', 'arpu_202001', 'arpu_202002', 'arpu_202003',
}
cat_col = [name for name in train_data.columns if name not in excluded]
target = train_data['label']
train_data = train_data[cat_col]
test_data = test_data[cat_col]
# 8-fold cross-validation with a default-parameter LightGBM regressor. The
# regressor's continuous output is thresholded at 0.5 to get 0/1 labels, each
# fold is scored with macro F1, and the test-set predictions of the
# best-scoring fold are kept in `keys` (still un-thresholded).
model_lgb = lgb.LGBMRegressor()
# Start below any possible F1 so the first fold always produces `keys`;
# starting at 0 would leave `keys` undefined if no fold scored above 0.
best = -1.0
sk = KFold(n_splits=8, shuffle=True, random_state=1000000)
for i, (train, test) in enumerate(sk.split(train_data, target), start=1):
    x_train, y_train = train_data.iloc[train], target.iloc[train]
    x_test, y_test = train_data.iloc[test], target.iloc[test]

    # fit() retrains from scratch, so reusing one estimator across folds is fine.
    model_lgb.fit(x_train, y_train)
    y_hat = (np.asarray(model_lgb.predict(x_test)) >= 0.5).astype(int)

    score = f1_score(y_test, y_hat, average='macro')
    print(score)
    if score > best:
        best = score
        print(i)  # 1-based number of the fold that is currently best
        keys = model_lgb.predict(test_data)
        print(keys)
# Fill the sample submission with the thresholded predictions and save it.
sub = pd.read_csv(r'submit_example.csv')
# Threshold at 0.5 and cast to int so the file always contains 0/1 labels,
# regardless of how the installed pandas handles assigning floats to the
# existing column.
keys = (np.asarray(keys) >= 0.5).astype(int)
# Store the predictions in the second column of the submission.
# NOTE(review): this is a positional assignment; it assumes the rows of
# submit_example.csv are in the same order as test_user.csv — confirm.
sub.iloc[:, 1] = keys
# NOTE(review): machine-specific output directory; adjust before running elsewhere.
os.chdir(r'C:\Users\lenovo\Desktop\诈骗电话识别\jg')
# The best fold's validation F1 is embedded in the file name.
sub.to_csv('lgbbaseline{}.csv'.format(best), encoding='utf-8', index=False)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。