#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2019-01-01 21:03:59
# @Author  : cdl (1217096231@qq.com)
# @Link    : https://github.com/cdlwhm1217096231
# @Version : $Id$

import fasttext
import jieba
import os
import logging

logging.basicConfig(
    format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

"""
Step 1: Prepare the labelled text. The corpus is the Tsinghua University news dataset;
the train and test sets are already word-segmented, and each line is a sample followed
by a Tab and its label, e.g. __label__sports.
"""
# (omitted: the pre-tokenised train/test files are used directly; see the download links below)

"""
Step 2: Classify with the fasttext package.
"""
# Train the model
classifier = fasttext.supervised(
    "./dataset/news_fasttext_train.txt", "news_fasttext.model", label_prefix="__label__")
# Or load a previously trained model
# classifier = fasttext.load_model('news_fasttext.model.bin', label_prefix="__label__")

# Evaluate the model
result = classifier.test("./dataset/news_fasttext_test.txt")
print("Precision:", result.precision)
print("Recall:", result.recall)

# fasttext only reports precision/recall over the whole test set;
# per-class statistics have to be computed by hand.
classifier = fasttext.load_model(
    'news_fasttext.model.bin', label_prefix='__label__')
labels_right = []
texts = []
with open("./dataset/news_fasttext_test.txt") as f:
    for line in f:
        line = line.strip()
        labels_right.append(line.split('\t')[1].replace("__label__", ""))
        texts.append(line.split('\t')[0])

labels_predict = [e[0] for e in classifier.predict(texts)]  # predicted labels

text_labels = list(set(labels_right))            # distinct true labels
text_predict_labels = list(set(labels_predict))  # distinct predicted labels
# print("Predicted labels:", text_predict_labels)
# print("True labels:", text_labels)

A = dict.fromkeys(text_labels, 0)          # true positives (TP) per class
B = dict.fromkeys(text_labels, 0)          # number of test samples per class
C = dict.fromkeys(text_predict_labels, 0)  # number of predictions per class
for i in range(0, len(labels_right)):
    B[labels_right[i]] += 1
    C[labels_predict[i]] += 1
    if labels_right[i] == labels_predict[i]:   # true positive
        A[labels_right[i]] += 1

print("True positives per class (A):", A)
print("Test-set counts per class (B):", B)
print("Prediction counts per class (C):", C)

# Precision, recall and F1 per class
for key in B:
    try:
        r = float(A[key]) / float(B[key])   # recall
        p = float(A[key]) / float(C[key])   # precision
        f = p * r * 2 / (p + r)             # F1 score
        print("%s:\t p:%f\t r:%f\t f:%f" % (key, p, r, f))
    except (KeyError, ZeroDivisionError):
        print("error:", key, "TP:", A.get(key, 0), "real:", B.get(key, 0), "predicted:", C.get(key, 0))
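The script above targets the old, unofficial fasttext 0.8.x Python wrapper (fasttext.supervised, label_prefix, result.precision). The sketch below shows, under the assumption that the official fasttext package is installed, roughly what the same workflow looks like with its current API; the file paths and the __label__ prefix are carried over from the example, while the hyperparameters (epoch, wordNgrams, lr) are illustrative placeholders, not values from the original post. It also replaces the manual A/B/C counting with scikit-learn's classification_report as an optional shortcut for per-class metrics.

# Minimal sketch with the official fasttext bindings; hyperparameter values are assumptions.
import fasttext
from sklearn.metrics import classification_report  # optional, for per-class metrics

# Train a supervised classifier on the same Tab-separated, __label__-prefixed file.
model = fasttext.train_supervised(
    input="./dataset/news_fasttext_train.txt",
    label="__label__",
    epoch=25,        # assumed value
    wordNgrams=2,    # assumed value
    lr=0.5)          # assumed value
model.save_model("news_fasttext.bin")

# Overall evaluation: test() returns (number of samples, precision@1, recall@1).
n, p, r = model.test("./dataset/news_fasttext_test.txt")
print("samples:", n, "precision:", p, "recall:", r)

# Per-class metrics via scikit-learn instead of the manual counting loop.
texts, y_true = [], []
with open("./dataset/news_fasttext_test.txt", encoding="utf-8") as f:
    for line in f:
        text, label = line.rstrip("\n").split("\t")
        texts.append(text)
        y_true.append(label.replace("__label__", ""))

labels, _ = model.predict(texts)  # top-1 label per sample
y_pred = [lab[0].replace("__label__", "") for lab in labels]
print(classification_report(y_true, y_pred))  # precision/recall/F1 per class

classification_report reports 0.0 (with a warning) for classes that never appear in the predictions, so the try/except guard of the manual loop is no longer needed.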
1. Bag of Tricks for Efficient Text Classification
2. Enriching Word Vectors with Subword Information
3. Playing with fastText
4. fastText: Principles and Practice
5. A step-by-step worked example of the softmax function and its derivatives
6. Text classification with fastText
7. Training set download link
8. Test set download link