Reference: Hands-on text classification with 14 classification algorithms: https://blog.csdn.net/qq_41731978/article/details/109459234 (the blogger's write-up is very detailed and I learned a lot from it)
This post does not cover deep learning for now, since I haven't learned it yet.
This is mainly the text classification I worked on during my internship: single-label (one-vs-one) and multi-label (one-vs-rest) classification of Chinese text, implemented in Python. You should first be familiar with the basics of machine learning, as well as libraries such as pandas (scientific computing) and sklearn.
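As a quick illustration of the one-vs-rest idea mentioned above, here is a minimal sketch (not from the original post; the toy documents and labels are made up) using sklearn's OneVsRestClassifier, which trains one binary classifier per label:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# Toy corpus: each document may carry several labels (multi-label).
docs = ["cheap flight to tokyo", "hotel room discount code",
        "my flight was delayed", "refund for the hotel please"]
labels = [["travel"], ["deal"], ["travel", "complaint"], ["deal", "complaint"]]

X = TfidfVectorizer().fit_transform(docs)
Y = MultiLabelBinarizer().fit_transform(labels)   # binary indicator matrix, one column per label

clf = OneVsRestClassifier(LinearSVC())            # one binary classifier per label column
clf.fit(X, Y)
print(clf.predict(X))

For real data the documents would be the jieba-segmented Chinese text prepared later in this post.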
import os
import shutil
import zipfile
import jieba
import time
import warnings
import xgboost
import lightgbm
import numpy as np
import pandas as pd
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
# Models provided by sklearn
from sklearn import svm                                # support vector machine
from sklearn import metrics                            # accuracy and other evaluation metrics
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier        # decision tree
from sklearn.neighbors import KNeighborsClassifier     # k-nearest neighbors
from sklearn.naive_bayes import BernoulliNB            # Bernoulli naive Bayes
from sklearn.naive_bayes import GaussianNB             # Gaussian naive Bayes
from sklearn.naive_bayes import MultinomialNB          # multinomial naive Bayes
from sklearn.linear_model import LogisticRegression    # logistic regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
# Label encoding
from sklearn.preprocessing import LabelEncoder
# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Train/test split
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
Read the data from the database
# Import the pymysql module
import pymysql

# Connect to the database
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="123",
    database="kss",
    charset="utf8")

# Get a cursor object that can execute SQL statements
# (result sets are returned as tuples by default)
cursor = conn.cursor()

# Define the SQL statement to execute
sql = 'SELECT item_description, item_catalog_name FROM item'
Load the stop words
# Use a raw string so the backslashes in the Windows path are not treated as escape sequences
stoplist = [word.strip() for word in open(r'D:\Code\PythonCode\project01\project_01\db\stopwords.txt', encoding='utf-8').readlines()]
Read and process the records one by one
labels, texts = [], []
sentences = []
try:
    # Execute the SQL statement
    cursor.execute(sql)
    # Fetch all rows
    results = cursor.fetchall()
    for row in results:
        # Process each row: segment the description with jieba
        segs = jieba.lcut(row[0])
        segs = filter(lambda x: len(x) > 1, segs)         # keep only tokens longer than one character
        segs = filter(lambda x: x not in stoplist, segs)  # drop stop words
        # Be sure to join() the tokens back into a space-separated string; row[1] is the label
        sentences.append((" ".join(segs), row[1]))
except:
    print("Error: unable to fetch data")
Extension:
It's worth learning the train_test_split function (I'll go through the sklearn API separately later).
import random

random.shuffle(sentences)   # shuffle so the training data is not ordered by label
# Split the (text, label) pairs back into inputs and targets
x, y = zip(*sentences)
# Split the data into a training set and a test set
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.7, random_state=0)
I skipped this step; I ran into a problem here and haven't figured it out yet.
le = LabelEncoder()
# Why is accuracy much lower after encoding the labels?
# y_train_le = le.fit_transform(train_y)
# y_test_le = le.fit_transform(test_y)
y_train_le = train_y
y_test_le = test_y
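One possible cause of the accuracy drop (an assumption on my part, not something confirmed in the post): calling fit_transform separately on the training and test labels lets each call build its own label-to-integer mapping, so the same class name can end up encoded differently if the two label sets are not identical. The usual pattern is to fit the encoder once on the training labels and only transform the test labels, roughly:

# Sketch of the usual LabelEncoder pattern; variable names follow the post.
le = LabelEncoder()
y_train_le = le.fit_transform(train_y)   # learn the label-to-integer mapping on the training labels
y_test_le = le.transform(test_y)         # reuse that mapping; raises if the test set has unseen labels
# le.inverse_transform() maps integer predictions back to the original label names.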
You will need to learn how the TfidfVectorizer class is used.
# Convert the text data into a numeric feature matrix
# tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist)
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=10000)
'''Note: fit the vectorizer first on all the text it will transform, so that the
training and test sets share the same feature space; otherwise the feature counts
will not match and model building will raise an error.'''
# count.fit(list(train_x) + list(test_x))
tfidf_vectorizer.fit(train_x)
X_train_count = tfidf_vectorizer.transform(train_x)
X_test_count = tfidf_vectorizer.transform(test_x)
# Calling toarray() here can run out of memory on a large corpus, so keep the sparse matrices
# X_train_count = X_train_count.toarray()
# X_test_count = X_test_count.toarray()
# print(X_train_count.shape, X_test_count.shape)
This part uses that blogger's code directly. In a word: excellent!
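To round off the pipeline, here is a minimal sketch (not from the original post) of training one of the classifiers imported at the top on the TF-IDF matrices and checking its accuracy. MultinomialNB accepts the sparse matrices directly, and the unencoded string labels in y_train_le / y_test_le work as-is:

# Minimal sketch: train a classifier on the TF-IDF features and evaluate it.
clf = MultinomialNB()
clf.fit(X_train_count, y_train_le)

pred_y = clf.predict(X_test_count)
print("accuracy:", metrics.accuracy_score(y_test_le, pred_y))
print(metrics.classification_report(y_test_le, pred_y))

Any of the other imported models (LogisticRegression, RandomForestClassifier, svm.SVC, and so on) can be swapped in here with the same fit/predict interface.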