This post is a hands-on walkthrough of text classification with fastText. If you are not familiar with the fastText model and the methods behind it, you may want to read the companion post FastText原理解析 first.
The training data is stored in a CSV file with two columns, labels and text, where labels takes one of three values: 0, 1, or 2.
labels,text
0,大华技术:超高精度人体热成像测温系统经信发布测温系统采
1,A股3月迎来艳阳天牛市布局正当时!这类股成主力新宠儿涨停战机
2,泰格医药—公司动态点评:业绩符合预期,三大业务板块值得期待
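Before preprocessing, it is worth a quick look at the class balance, since a skewed label distribution affects how the evaluation metrics later in this post should be read. A minimal sketch (assuming the data/train.csv path used by the training script below):

import pandas as pd

# Count how many documents fall into each of the three classes.
df = pd.read_csv('data/train.csv', encoding='utf-8')
print(df['labels'].value_counts())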
Before training, the text must be segmented into words and stripped of stop words. A domain dictionary can be supplied during segmentation so that domain-specific terms are split correctly. The text is then converted into the line format that fastText expects for training; after processing it looks like this:
国网 江苏 无锡 供电 保电 战疫 有数 服务 有心 中国 电力 报 02 2817 43 关注 __label__0
外汇 新三板 行情 行情 股 新闻 外汇 新三板 工程机械 板块 走强 山河 智能 涨幅 居前 工程机械 板块 走强 山河 智能 涨幅 居前 __label__2
广西 科技 特派员 指导 春耕 广西 农 __label__1
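Each line is simply the space-joined tokens followed by a __label__ marker. The sketch below shows this conversion for a single document (to_fasttext_line is a hypothetical helper; jieba and the stop-word list are set up as in the full script that follows):

import jieba

def to_fasttext_line(text, label, stop_words):
    # Segment with jieba, drop stop words, join with spaces,
    # and append the fastText label marker seen in the samples above.
    tokens = [w for w in jieba.lcut(text) if w not in stop_words]
    return ' '.join(tokens) + ' __label__' + str(label)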
The training script below loads the data, performs this preprocessing, and then trains and evaluates the model.
import pandas as pd
import jieba
import codecs
import fasttext
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

'''
Train the classifier, evaluate it, and save the trained model.
The validation set is a random 10% of the training data (adjust via test_size).
Label mapping: 0 = 机遇 (opportunity), 1 = 风险 (risk), 2 = 不确定 (uncertain).
'''
stop_data_dir = 'data/stop_words.txt'    # stop-word list
user_dict_dir = 'data/userdict_all.txt'  # domain user dictionary for jieba
train_data_dir = 'data/train.csv'

jieba.load_userdict(user_dict_dir)

# Load the training data and segment each text with jieba.
data = pd.read_csv(train_data_dir, encoding='utf-8')
data['segment'] = data['text'].apply(jieba.lcut)

print('---------- removing stop words ----------')
stop_list = []
with codecs.open(stop_data_dir, encoding='utf-8') as f:
    for line in f:
        stop_list.append(line.strip('\r\n'))
stop_set = set(stop_list)  # set membership test is O(1)

# Drop stop words from every segmented document.
data['segment'] = data['segment'].apply(
    lambda words: [w for w in words if w not in stop_set])

# Hold out 10% for validation; note the CSV column is named "labels".
train_data, test_data, train_label, test_label = train_test_split(
    data['segment'], data['labels'], test_size=0.1, random_state=42)
train_data = train_data.reset_index(drop=True)
train_label = train_label.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
test_label = test_label.reset_index(drop=True)

# Write both splits in the line format fastText expects.
with open('./data/train_semantic.txt', 'w', encoding='utf-8') as f:
    for i in range(len(train_data)):
        f.write(' '.join(train_data[i]) + '\t__label__' + str(train_label[i]) + '\n')
with open('./data/test_semantic.txt', 'w', encoding='utf-8') as f:
    for i in range(len(test_data)):
        f.write(' '.join(test_data[i]) + '\t__label__' + str(test_label[i]) + '\n')

def get_label(pred):
    # pred is a (labels, probabilities) pair; pick the most probable label
    # and strip the "__label__" prefix to recover the integer class id.
    index = np.argmax(pred[1])
    return int(pred[0][index].replace('__label__', ''))

print('------------------ training ------------------')
model = fasttext.train_supervised(input='./data/train_semantic.txt',
                                  lr=0.1, epoch=100, wordNgrams=3,
                                  dim=300, loss='softmax')
print('------------------ training finished ------------------')

# Save the trained model.
model_path = './model/model.bin'
model.save_model(model_path)

# Evaluate on the held-out split and log every prediction.
test_pred = []
with open('./data/yuce.txt', 'w', encoding='utf-8') as fout:
    for i in range(len(test_data)):
        r = model.predict(' '.join(test_data[i]), k=2)
        test_pred.append(get_label(r))
        fout.write(str(r) + '**' + str(get_label(r)) + '\n')

acc = accuracy_score(test_label, test_pred)
precision = precision_score(test_label, test_pred, average='micro')
recall = recall_score(test_label, test_pred, average='micro')
f1 = f1_score(test_label, test_pred, average='micro')
print('accuracy: ' + str(acc))
print('precision (micro): ' + str(precision))
print('recall (micro): ' + str(recall))
print('F1 (micro): ' + str(f1))
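The hyperparameters above (lr, epoch, wordNgrams, dim) were chosen by hand. Newer fastText releases can also search them automatically against a validation file; a minimal sketch, assuming the test_semantic.txt split written above serves as the validation set:

import fasttext

# Let fastText tune lr, epoch, wordNgrams, dim, etc. on its own
# (autotune is available in fastText >= 0.9.1).
model = fasttext.train_supervised(
    input='./data/train_semantic.txt',
    autotuneValidationFile='./data/test_semantic.txt',
    autotuneDuration=300)  # search time budget in seconds
model.save_model('./model/model_autotuned.bin')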
For prediction, first load the saved model, then apply the same segmentation and stop-word removal to the new data, and finally run the model.
import pandas as pd
import jieba
import codecs
import fasttext
import numpy as np

'''
News text classification with fastText:
load the trained model and predict on new data.
'''
stop_data_dir = 'data/stop_words.txt'    # stop-word list
user_dict_dir = 'data/userdict_all.txt'  # domain user dictionary for jieba
test_data_dir = 'data/file.csv'
output_dir = 'output.xlsx'

jieba.load_userdict(user_dict_dir)
real = pd.read_csv(test_data_dir, encoding='utf-8')

print('---------- removing stop words ----------')
stop_list = []
with codecs.open(stop_data_dir, encoding='utf-8') as f:
    for line in f:
        stop_list.append(line.strip('\r\n'))
stop_set = set(stop_list)

# Segment each text with jieba and drop stop words.
real['segment'] = real['text'].apply(
    lambda x: [w for w in jieba.lcut(x) if w not in stop_set])

def get_label(pred):
    # pred is a (labels, probabilities) pair; pick the most probable label
    # and strip the "__label__" prefix to recover the integer class id.
    index = np.argmax(pred[1])
    return int(pred[0][index].replace('__label__', ''))

def get_origin_label(number):
    # Map a class id back to its human-readable name.
    label_dict = {0: '机遇', 1: '风险', 2: '不确定'}
    return label_dict[number]

# Load the trained model.
model_path = './model/model.bin'
model = fasttext.load_model(model_path)

print('------------------ predicting ------------------')
pred_labels = []
type_list = []  # human-readable label names
for u in real['segment'].values:
    res = model.predict(' '.join(u), k=2)
    pred_labels.append(get_label(res))
    type_list.append(get_origin_label(get_label(res)))
real['label'] = pred_labels
real['type'] = type_list

print('------------------ writing predictions ------------------')
real[['text', 'label', 'type']].to_excel(output_dir)
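With dim=300 and 3-gram features the saved model.bin can be quite large. fastText's quantize call compresses a supervised model into a much smaller .ftz file at a small accuracy cost; a minimal sketch (paths follow the scripts above):

import fasttext

model = fasttext.load_model('./model/model.bin')
# Compress the model, optionally retraining during quantization;
# the quantized model is used exactly like the original.
model.quantize(input='./data/train_semantic.txt', retrain=True)
model.save_model('./model/model.ftz')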
Reference:
https://blog.csdn.net/ymaini/article/details/81489599