当前位置:   article > 正文

2020首届数字四川创新大赛_诈骗电话识别baseline0.76+

2020首届数字四川创新大赛_诈骗电话识别baseline0.76+

偶然参加了诈骗电话识别这个比赛项目,分享一个线上得分0.76+的baseline。本次只用了一张基础表(用户表),LightGBM(lgb)采用默认参数,并做八折交叉验证,取验证集得分最高的那一折模型对测试集进行预测。后期如果有时间的话再把分析过程整理出来,欢迎捧场。

import warnings
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')
import  os
from tqdm import tqdm
os.chdir(r'C:\Users\lenovo\Desktop\诈骗电话识别\诈骗电话号码识别-0527')
from sklearn.metrics import f1_score

 

# Load the training and test user tables. The paths are relative to the
# current working directory; os.path.join is used so the directory separator
# is not hard-coded to the Windows backslash.
train_user=pd.read_csv(os.path.join('train','train_user.csv'))
test_user=pd.read_csv(os.path.join('test','test_user.csv'))

def shouji(x):
    """Bucket the count of phone numbers registered under one holder.

    Returns:
        1 when 0 < x < 3, 2 when 3 <= x <= 5, 4 when x > 5, and 0 for
        everything else (x <= 0, or NaN, for which every comparison below
        is false).
    """
    # Tested from the highest bucket downwards so each branch needs only a
    # single lower bound.
    if x > 5:
        return 4
    if x >= 3:
        return 2
    if x > 0:
        return 1
    return 0
def user_table(train_user,col):
    """Add region-level and spending features to a user table.

    The input DataFrame is modified in place (no copy is taken) and the same
    object is returned.

    Args:
        train_user: User table. The code reads the columns ``city_name``,
            ``county_name``, ``idcard_cnt`` and the column named by ``col``.
        col: Name of the spending column the features are derived from.

    Returns:
        ``train_user`` with these changes applied:
        - ``city_name_mean_arup`` / ``county_name_mean_arup``: mean of ``col``
          within the row's city / county.
        - ``col`` with missing values replaced by 0.
        - ``arup_null``: 1 where ``col`` equals 0 after the fill, else 0.
        - ``arup_high``: 1 where ``col`` is at least 500, else 0.
        - ``idcard_cnt_count``: ``idcard_cnt`` bucketed by ``shouji``.
        - ``city_name`` / ``county_name`` replaced by integer label codes.
    """
    data = train_user

    # Select the target column *before* aggregating. Calling ``mean()`` on the
    # whole grouped frame also tries to average the non-numeric columns, which
    # raises TypeError on pandas >= 2.0.
    # The means are computed before the fillna below, so missing values are
    # skipped instead of being counted as zero spending.
    city_mean = data.groupby('city_name')[col].mean()
    county_mean = data.groupby('county_name')[col].mean()
    data['city_name_mean_arup'] = data['city_name'].map(city_mean)
    data['county_name_mean_arup'] = data['county_name'].map(county_mean)

    data[col] = data[col].fillna(0)

    # Because of the fill above, this flag is set both for rows that had no
    # record and for rows whose recorded value is exactly 0.
    data['arup_null'] = (data[col] == 0).astype('int64')
    # Flag high spenders (threshold: 500).
    data['arup_high'] = (data[col] >= 500).astype('int64')
    data['idcard_cnt_count'] = data['idcard_cnt'].apply(shouji)

    # Replace the region names with integer codes.
    # NOTE(review): a fresh encoder is fitted on every call, so the codes
    # produced by two separate calls (e.g. one per data set) are not
    # guaranteed to agree for the same name -- confirm this is acceptable.
    for name in tqdm(['city_name', 'county_name']):
        encoder = LabelEncoder()
        data[name] = encoder.fit_transform(data[name].astype(str))

    return data

# Build features for the training table from its 'arpu_202003' column.
col='arpu_202003'
train_data=user_table(train_user,col)

# Build the same features for the test table, whose spending column is
# 'arpu_202004'.
col='arpu_202004'
test_data=user_table(test_user,col)

# Select the modelling features: every training column except the label, the
# phone-number key and the raw monthly ARPU columns. Column order follows the
# training table so both frames are laid out identically.
excluded_cols = {'label','phone_no_m','arpu_201908','arpu_201909','arpu_201910','arpu_201911','arpu_201912','arpu_202001','arpu_202002','arpu_202003'}
cat_col = [name for name in train_data.columns if name not in excluded_cols]

target=train_data['label']
train_data=train_data[cat_col]
test_data=test_data[cat_col]

# A regressor with default parameters; its continuous output is turned into a
# 0/1 label by thresholding at 0.5.
model_lgb = lgb.LGBMRegressor()

best=0
# Stays None until some fold beats `best`, so a run that never produces a
# prediction is reported explicitly instead of failing with a NameError.
keys=None
sk=KFold(n_splits=8,shuffle=True,random_state=1000000)
for i,(train,test) in enumerate(sk.split(train_data,target),start=1):
    x_train=train_data.iloc[train]
    y_train=target.iloc[train]
    x_test=train_data.iloc[test]
    y_test=target.iloc[test]
    model_lgb.fit(x_train,y_train)
    y_hat=(np.asarray(model_lgb.predict(x_test))>=0.5).astype(int)

    score = f1_score(y_test, y_hat, average='macro')
    print(score)
    # Keep only the test-set prediction of the fold with the highest
    # validation macro-F1; folds are not averaged.
    if score>best:
        best=score
        print(i)
        keys=model_lgb.predict(test_data)
        print(keys)

if keys is None:
    raise RuntimeError('no fold achieved a macro-F1 above 0; no prediction to submit')

sub=pd.read_csv(r'submit_example.csv')

# Write the labels as integers so the submission column does not depend on how
# the installed pandas version handles assigning floats through iloc.
# Rows are assigned by position: this assumes the example submission lists
# the phones in the same order as the test table -- TODO confirm.
sub.iloc[:,1]=(np.asarray(keys)>=0.5).astype(int)
os.chdir(r'C:\Users\lenovo\Desktop\诈骗电话识别\jg')
sub.to_csv('lgbbaseline{}.csv'.format(best),encoding='utf-8',index=None)

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Monodyee/article/detail/717546
推荐阅读
相关标签
  

闽ICP备14008679号