The notes below come from Coggle 30 Days of ML (Jan & Feb 2022).
Link: https://coggle.club/blog/30days-of-ml-202201
Competition link: https://aistudio.baidu.com/aistudio/competition/detail/45/0/task-definition
import numpy as np
import pandas as pd
import jieba
import Levenshtein  # for computing edit distance
from tqdm import tqdm
import warnings
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import glob
# Helper to read a TSV file into a DataFrame, skipping malformed lines
def read_tsv(input_file, columns):
    with open(input_file, "r", encoding="utf-8") as file:
        lines = []
        count = 1
        for line in file:
            # Keep only lines that actually split into multiple fields
            if len(line.strip().split("\t")) != 1:
                lines.append([count] + line.strip().split("\t"))
                count += 1
    df = pd.DataFrame(lines)
    df.columns = columns
    return df
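As a side note, the same loading could be done with pandas directly; a minimal sketch, assuming the files have no header row (the manual loop above is kept because it silently skips malformed lines):

import csv

# Hypothetical alternative loader: QUOTE_NONE avoids mis-parsing quote
# characters inside the sentences, and on_bad_lines='skip' roughly mirrors
# the malformed-line filtering of read_tsv above
def read_tsv_pandas(input_file, columns):
    df = pd.read_csv(input_file, sep='\t', header=None,
                     quoting=csv.QUOTE_NONE, on_bad_lines='skip')
    df.insert(0, 'index', range(1, len(df) + 1))
    df.columns = columns
    return df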
bq_train=read_tsv('./bq_corpus/train.tsv',['index','text1','text2','label'])
lcqmc_train=read_tsv('./lcqmc/train.tsv',['index','text1','text2','label'])
pawsx_train=read_tsv('./paws-x-zh/train.tsv',['index','text1','text2','label'])
bq_test=read_tsv('./bq_corpus/test.tsv',['index','text1','text2'])
lcqmc_test=read_tsv('./lcqmc/test.tsv',['index','text1','text2'])
pawsx_test=read_tsv('./paws-x-zh/test.tsv',['index','text1','text2'])
# Text preprocessing; some of these helpers may go unused downstream
import re
import string

with open("dict/stop_words.utf8", encoding="utf-8") as f:
    stopword_list = [w.strip() for w in f.readlines()]  # strip newlines so membership tests work

def tokenize_text(text):
    tokens = jieba.cut(text)
    tokens = [token.strip() for token in tokens]
    return tokens

def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    filtered_text = ''.join(filtered_tokens)
    return filtered_text

# Remove stop words
def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ''.join(filtered_tokens)
    return filtered_text

def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []  # renamed so it no longer shadows the function name
    for text in corpus:
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            normalized_corpus.append(tokenize_text(text))
        else:
            normalized_corpus.append(text)
    return normalized_corpus
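A minimal sanity check of these helpers on a made-up sentence (remove_stopwords additionally requires the stop-word file to be in place):

sample = '今天的天气真不错!'  # hypothetical example sentence
print(tokenize_text(sample))              # word list produced by jieba
print(remove_special_characters(sample))  # ASCII punctuation stripped, tokens re-joined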
df_train=pd.concat([bq_train,lcqmc_train,pawsx_train])
df_test=pd.concat([bq_test,lcqmc_test,pawsx_test])
corpus=df_train['text1'].values.tolist()+df_train['text2'].values.tolist()
tokenized_corpus=[' '.join(jieba.lcut(text)) for text in corpus]
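Each entry of tokenized_corpus is a space-joined word list, which is the whitespace-delimited format the default TfidfVectorizer tokenizer expects:

# jieba.lcut segments a sentence into words; joining with spaces yields
# whitespace-delimited text (e.g. '今天天气不错' -> roughly '今天 天气 不错')
print(tokenized_corpus[0])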
Build statistical features, including:
the number of characters in sentence A and in sentence B (implemented below as their absolute difference, len_diff)
the edit distance between sentence A and sentence B
the number of words shared by sentence A and sentence B
the number of characters shared by sentence A and sentence B
the shared word count divided by the character count of sentence A
the shared word count divided by the character count of sentence B
# Edit distance via dynamic programming; Levenshtein.distance computes the same value directly
def edit_distance(text1, text2):
    n, m = len(text1), len(text2)
    dp = np.zeros((n + 1, m + 1))
    for i in range(1, n + 1):
        dp[i][0] = i
    for j in range(1, m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            tmp = int(text1[i - 1] != text2[j - 1])
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + tmp)
    return dp[n][m]

# Number of elements two lists have in common (multiset intersection size)
def both_num(list1, list2):
    dict1, dict2, ans = {}, {}, 0
    for i in list1:
        dict1[i] = list1.count(i)
    for i in list2:
        dict2[i] = list2.count(i)
    for k, v in dict1.items():
        tmp = 0 if k not in list2 else dict2[k]
        ans += min(v, tmp)
    return ans

# Length difference between text1 and text2
def len_diff(text1, text2):
    return abs(len(text1) - len(text2))

# Number of words text1 and text2 have in common
def both_words_num(text1, text2):
    a, b = jieba.lcut(text1), jieba.lcut(text2)
    return both_num(a, b)

# Number of characters text1 and text2 have in common
def both_chars_num(text1, text2):
    a, b = list(text1), list(text2)
    return both_num(a, b)

# Shared word count / character count of text1
def both_words_divideby_char1(text1, text2):
    return both_words_num(text1, text2) / len(text1)

# Shared word count / character count of text2
def both_words_divideby_char2(text1, text2):
    return both_words_num(text1, text2) / len(text2)
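A quick toy check of these helpers (the pair below is invented for illustration):

t1, t2 = '花呗如何还款', '如何还花呗的钱'  # hypothetical example pair
print(edit_distance(t1, t2) == Levenshtein.distance(t1, t2))  # True: same metric
print(both_chars_num(t1, t2))  # 5 shared characters: 花 呗 如 何 还
print(len_diff(t1, t2))        # abs(6 - 7) = 1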
Compute TF-IDF features, transform sentence A and sentence B, and take the inner product of their TF-IDF vectors as a similarity feature.
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

# Segment a sentence into space-joined words
def participle_text(text):
    words_list = jieba.lcut(text)
    return ' '.join(words_list)

# Inner product of the two TF-IDF feature vectors
def get_tfidfvec_dis(tfidf_vectorizer, text1, text2):
    fit_text1 = tfidf_vectorizer.transform([participle_text(text1)])
    fit_text2 = tfidf_vectorizer.transform([participle_text(text2)])
    vec1 = fit_text1.toarray()[0]
    vec2 = fit_text2.toarray()[0]
    return np.dot(vec1, vec2)

def tfidfvec_dis_list(tfidf_vectorizer, df):
    dis_list = []
    for text1, text2 in zip(df['text1'], df['text2']):
        dis_list.append(get_tfidfvec_dis(tfidf_vectorizer, text1, text2))
    return dis_list
tfidf_vectorizer,tfidf_train_features=tfidf_extractor(tokenized_corpus)
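With the vectorizer fitted on the tokenized training corpus, the feature can be sanity-checked on a single pair; since norm='l2', the inner product equals the cosine similarity and lies in [0, 1]:

# Inner product of L2-normalised TF-IDF vectors = cosine similarity
t1, t2 = df_train['text1'].iloc[0], df_train['text2'].iloc[0]
print(get_tfidfvec_dis(tfidf_vectorizer, t1, t2))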
Example:
def feature_eng(data):
    """
    Statistical features
    """
    data['len_diff'] = [len_diff(t1, t2) for t1, t2 in zip(data['text1'], data['text2'])]
    data['both_words'] = [both_words_num(t1, t2) for t1, t2 in zip(data['text1'], data['text2'])]
    data['both_chars'] = [both_chars_num(t1, t2) for t1, t2 in zip(data['text1'], data['text2'])]
    data['both_words_div1'] = [both_words_divideby_char1(t1, t2) for t1, t2 in zip(data['text1'], data['text2'])]
    data['both_words_div2'] = [both_words_divideby_char2(t1, t2) for t1, t2 in zip(data['text1'], data['text2'])]
    data['tf_idf_dis'] = tfidfvec_dis_list(tfidf_vectorizer, data)
    data['edit_dis'] = [Levenshtein.distance(t1, t2) for t1, t2 in zip(data['text1'], data['text2'])]
    return data
df_train=feature_eng(df_train)
df_test=feature_eng(df_test)
After feature engineering, each row carries the seven new feature columns alongside the original fields.
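Printing the columns confirms what feature_eng appended:

# Quick check of the engineered frame's layout
print(df_train.columns.tolist())
# Expected: ['index', 'text1', 'text2', 'label', 'len_diff', 'both_words',
#            'both_chars', 'both_words_div1', 'both_words_div2',
#            'tf_idf_dis', 'edit_dis']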
eval_fun = accuracy_score

# Out-of-fold training: returns OOF probabilities for the training set
# and fold-averaged probabilities for the test set
def run_oof(clf, X_train, y_train, X_test, kf):
    print(clf)
    preds_train = np.zeros((len(X_train), 2), dtype=np.float64)
    preds_test = np.zeros((len(X_test), 2), dtype=np.float64)
    train_loss = []
    test_loss = []
    i = 1
    for train_index, test_index in kf.split(X_train, y_train):
        x_tr, x_te = X_train[train_index], X_train[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        clf.fit(x_tr, y_tr, eval_set=[(x_te, y_te)], verbose=False)
        train_loss.append(eval_fun(y_tr, np.argmax(clf.predict_proba(x_tr), 1)))
        test_loss.append(eval_fun(y_te, np.argmax(clf.predict_proba(x_te), 1)))
        preds_train[test_index] = clf.predict_proba(x_te)
        preds_test += clf.predict_proba(X_test)
        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(
            i, train_loss[-1], test_loss[-1], np.mean(test_loss)))
        print('-' * 50)
        i += 1
    print('Train: ', train_loss)
    print('Val: ', test_loss)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss)))
    preds_test /= n_fold  # average the per-fold test predictions
    return preds_train, preds_test
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.05,
    'num_leaves': 2 ** 6,
    'max_depth': 8,
    'tree_learner': 'serial',
    'colsample_bytree': 0.8,
    'subsample_freq': 1,
    'subsample': 0.8,
    'num_boost_round': 5000,
    'max_bin': 255,
    'verbose': -1,
    'seed': 2021,
    'bagging_seed': 2021,
    'feature_fraction_seed': 2021,
    'early_stopping_rounds': 100,
}

n_fold = 10
skf = StratifiedKFold(n_splits=n_fold, shuffle=True)

# Train on the engineered numeric features only; the raw text columns and
# the row index must not be fed to LightGBM, and the label is cast to int
feature_cols = ['len_diff', 'both_words', 'both_chars',
                'both_words_div1', 'both_words_div2', 'tf_idf_dis', 'edit_dis']
train_pred, test_pred = run_oof(lgb.LGBMClassifier(**params),
                                df_train[feature_cols].values,
                                df_train['label'].astype(int).values,
                                df_test[feature_cols].values,
                                skf)
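As an optional check (an addition, not part of the original notes), a single refit on the full training set can show which engineered features LightGBM actually splits on:

# Sanity check via feature_importances_; early stopping and the boosting-round
# override are dropped since no eval_set is passed here
probe = lgb.LGBMClassifier(**{k: v for k, v in params.items()
                              if k not in ('num_boost_round', 'early_stopping_rounds')})
probe.fit(df_train[feature_cols].values, df_train['label'].astype(int).values)
for name, imp in sorted(zip(feature_cols, probe.feature_importances_),
                        key=lambda t: -t[1]):
    print(name, imp)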
The per-fold training log is printed by run_oof during cross-validation. The averaged test probabilities are then turned into hard labels and split back into the three datasets:
df_test['prediction'] = np.argmax(test_pred, 1)
bq_test['prediction'] = df_test['prediction'].iloc[:len(bq_test)].tolist()
lcqmc_test['prediction'] = df_test['prediction'].iloc[len(bq_test):len(bq_test) + len(lcqmc_test)].tolist()
pawsx_test['prediction'] = df_test['prediction'].iloc[len(bq_test) + len(lcqmc_test):].tolist()
bq_test['index'] = list(range(len(bq_test)))
lcqmc_test['index'] = list(range(len(lcqmc_test)))
pawsx_test['index'] = list(range(len(pawsx_test)))
bq_test.to_csv('submit/bq_corpus.tsv', sep='\t',
index=False, columns=["index","prediction"], mode="w")
lcqmc_test.to_csv('submit/lcqmc.tsv', sep='\t',
index=False, columns=["index","prediction"], mode="w")
pawsx_test.to_csv('submit/paws-x.tsv', sep='\t',
index=False, columns=["index","prediction"], mode="w")
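The three prediction files are then packaged for upload; assuming the leaderboard expects a single archive of the submit directory (check the competition's submission instructions), something like:

import shutil

# Package the three TSVs for upload; the expected archive layout is an
# assumption here, not confirmed by the original notes
shutil.make_archive('submit', 'zip', 'submit')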
A score of 0.6495; plenty of room to improve.