赞
踩
对句子对进行二分类,检测两个句子是否表达同一个意思,模型数据来自天池全球人工智能技术创新大赛【赛道三】详情
模型在测试集上的准确率高达99.5%,线上准确率约为75%,略低于baseline
数据格式:
fasttext使用方法可查看:fasttext官网
import random

import fasttext
import pandas as pd

# Maps the human-readable class name to the numeric label used in the data.
cate_dic = {'same': 1, 'different': 0}

# Data loading; no validation split is built.
train_file = r'G:\chromeDownload\预测是否属于同一语义\baseline_tfidf_lr\oppo_breeno_round1_data\gaiic_track3_round1_train_20210228.tsv'
test_file = r'G:\chromeDownload\预测是否属于同一语义\baseline_tfidf_lr\oppo_breeno_round1_data\gaiic_track3_round1_testA_20210228.tsv'
# Single source of truth for the fastText training file, so the file that is
# written is the same file that is later trained and evaluated on.
train_data_file = r'G:\chromeDownload\预测是否属于同一语义\baseline_tfidf_lr\train_data.txt'

df_train = pd.read_table(train_file, names=['q1', 'q2', 'label']).fillna("0")  # (100000, 3)
df_test = pd.read_table(test_file, names=['q1', 'q2']).fillna("0")  # (25000, 2)

# NOTE: `label`, `df` and `df['text']` are not used by the rest of this
# script; they are kept so the module-level names stay available.
label = df_train['label'].values
df = pd.concat([df_train, df_test], ignore_index=True)  # (125000, 4)
df['text'] = df['q1'] + " " + df['q2']


def preprocess_text(content_lines, sentences, category):
    """Append fastText-formatted training lines to ``sentences`` in place.

    Each output line has the form ``__label__<category> , <text>``.

    Args:
        content_lines: iterable of text strings (one sample per item).
        sentences: list that receives the formatted lines (mutated).
        category: label value; converted with ``str()``.

    Items that cannot be concatenated with a string (non-``str`` values)
    are printed and skipped instead of aborting the whole run.
    """
    prefix = "__label__" + str(category) + " , "
    for line in content_lines:
        try:
            sentences.append(prefix + line)
        except TypeError:
            # Best effort: report the offending item and keep going.
            print(line)
            continue


def label_zero_probability(labels, probabilities):
    """Return the probability assigned to ``__label__0`` for one prediction.

    ``labels`` and ``probabilities`` are the parallel sequences returned by
    ``classifier.predict``. The lookup is done by label name rather than by
    position, so it does not depend on the order of the returned labels.
    If only ``__label__1`` was returned, its complement is used.

    Raises:
        ValueError: if neither expected label is present.
    """
    scores = dict(zip(labels, probabilities))
    if '__label__0' in scores:
        return scores['__label__0']
    if '__label__1' in scores:
        return 1.0 - scores['__label__1']
    raise ValueError("prediction contains neither __label__0 nor __label__1: %r" % (labels,))


# Build the training data.
sentences = []
same_sentences = df_train[df_train.label == 1]
same_sentences = (same_sentences['q1'] + " " + same_sentences['q2']).values.tolist()
diffent_sentences = df_train[df_train.label == 0]
diffent_sentences = (diffent_sentences['q1'] + " " + diffent_sentences['q2']).values.tolist()
preprocess_text(same_sentences, sentences, cate_dic['same'])
preprocess_text(diffent_sentences, sentences, cate_dic['different'])
random.shuffle(sentences)

# Write the training file. The context manager guarantees the data is flushed
# and the handle closed before fastText reads the file.
with open(train_data_file, 'w', encoding='utf-8') as out:
    out.writelines(sentence + "\n" for sentence in sentences)

# Train.
classifier = fasttext.train_supervised(
    input=train_data_file,
    lr=1.0,
    epoch=25,
    wordNgrams=3,
    bucket=200000,
    dim=50,
    loss='hs',
)

# Evaluate. NOTE: this runs on the training file itself, so the numbers are
# not a held-out estimate. The original author recorded: 100000, 0.9671, 0.9671
print(classifier.test(train_data_file))

# Predict: collect the probability of __label__0 for every test pair.
lr_0_predictions = []
test_sentences_list = (df_test['q1'] + " " + df_test['q2']).values.tolist()
for texts in test_sentences_list:
    labels, probabilities = classifier.predict(texts, k=2)
    print(labels, '--', probabilities)
    lr_0_predictions.append(label_zero_probability(labels, probabilities))

# One probability of __label__0 per row, written without index or header.
pd.DataFrame(lr_0_predictions).to_csv("result.csv", index=False, header=False)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。