赞
踩
以购物评论为例:
# Load the raw dataset: one sheet of positive reviews and one of negative reviews.
import pandas as pd

# header=None: the sheets are read without a header row; column 0 is what the
# later cells use as the review text.
dfpos = pd.read_excel('../data/购物评论.xlsx', sheet_name='正向', header=None)
dfpos['y'] = 1  # positive reviews are labelled y=1
dfpos
dfneg = pd.read_excel('../data/购物评论.xlsx', sheet_name='负向', header=None)
dfneg['y'] = 0  # negative reviews are labelled y=0
dfneg
# Merge the positive and negative sets; y=1 / y=0 tells them apart.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are stacked with pd.concat, which gives the same result.
df0 = pd.concat([dfpos, dfneg], ignore_index=True)
df0
# Tokenisation and preprocessing
import jieba


def cut_txt(x):
    """Return `x` segmented by jieba, with the tokens joined by single spaces.

    No cleaning is done here, so that sentiment-bearing words are kept.
    """
    tokens = jieba.lcut(x)
    return " ".join(tokens)


df0['clean_txt'] = df0[0].apply(cut_txt)
df0
- from sklearn.feature_extraction.text import CountVectorizer
-
- count_vec = CountVectorizer(min_df=5) # min_df=5 keeps only terms found in at least 5 documents (a document-frequency cut-off, not a raw occurrence count)
- word_mtx = count_vec.fit_transform(df0.clean_txt) # fit the vocabulary on the space-joined tokens and build the term-count matrix
- word_mtx
- # Split into training and test sets at a 7:3 ratio
- from sklearn.model_selection import train_test_split
-
- x_train, x_test, y_train, y_test = train_test_split(word_mtx, df0.y, test_size=0.3) # the sparse matrix can be passed in directly; no random_state is given, so the split differs between runs
-
- x_train[0]
- # Fit an SVM classifier
- from sklearn.svm import SVC
-
- clf = SVC(kernel='rbf', verbose=True)
- clf.fit(x_train, y_train) # memory usage may be high
- clf.score(x_train, y_train) # NOTE: this is the score on the training rows themselves, not on held-out data
- # Evaluate the model on the test set
- from sklearn.metrics import classification_report
-
- print(classification_report(y_test, clf.predict(x_test)))
clf.predict(count_vec.transform([df0.clean_txt[0]]))[0] # predicted label for the review stored under index 0 of df0
- # 模型预测
- import jieba
-
def m_pred(string, count_vec, model):
    """Print the sentiment that `model` predicts for the raw text `string`.

    The text is segmented with jieba, re-joined with spaces and turned into a
    feature row by `count_vec.transform`. Nothing is returned; the verdict is
    printed after the original text.
    """
    tokens = jieba.lcut(string)
    # transform() takes an iterable of documents, hence the one-element list
    features = count_vec.transform([" ".join(tokens)])
    label = int(model.predict(features)[0])
    verdict = '正向情感!' if label == 1 else '负向情感!'
    print(string, verdict)
-
# Try the bag-of-words classifier on two hand-written reviews.
for comment in ('外观美观,速度也不错', '总的来说,给与好评!'):
    m_pred(comment, count_vec, clf)
【基于词袋的模型,对于复杂的情感分类还是存在问题的,还需要寻找更优的模型】
以购物评论为例:
# Load the raw dataset: one sheet of positive reviews and one of negative reviews.
import pandas as pd

# header=None: the sheets are read without a header row; column 0 is what the
# later cells use as the review text.
dfpos = pd.read_excel('../data/购物评论.xlsx', sheet_name='正向', header=None)
dfpos['y'] = 1  # positive reviews are labelled y=1

dfneg = pd.read_excel('../data/购物评论.xlsx', sheet_name='负向', header=None)
dfneg['y'] = 0  # negative reviews are labelled y=0

# Merge the positive and negative sets; y=1 / y=0 tells them apart.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are stacked with pd.concat, which gives the same result.
df0 = pd.concat([dfpos, dfneg], ignore_index=True)
df0
- # Tokenise and preprocess, producing a list-of-lists layout (one token list per review)
- import jieba
-
- df0['cut'] = df0[0].apply(jieba.lcut)
- df0
- # Split into training and test sets at a 7:3 ratio
- from sklearn.model_selection import train_test_split
-
- x_train, x_test, y_train, y_test = train_test_split(df0.cut, df0.y, test_size=0.3) # no random_state is given, so the split differs between runs
-
- x_train[:2]
# Configure the word2vec model
from gensim.models.word2vec import Word2Vec

n_dim = 300  # vector dimensionality; 300-500 works well on large samples

# gensim 4.0 renamed the `size` argument to `vector_size`. Try the current
# name first and fall back to the old one, so the cell runs on either side
# of the rename instead of raising TypeError.
try:
    w2v_model = Word2Vec(vector_size=n_dim, min_count=10)
except TypeError:
    w2v_model = Word2Vec(size=n_dim, min_count=10)
w2v_model.build_vocab(x_train)  # build the vocabulary from the training reviews
- # Train on the review training set (may take a few minutes on a large data set)
- # This example uses little memory
- %time w2v_model.train(x_train, total_examples=w2v_model.corpus_count, epochs=10)
- # Similarity between sentiment-word vectors
- w2v_model.wv.most_similar('不错')
w2v_model.wv.most_similar('失望')
- # Build whole-sentence vectors for sentiment prediction
- # For short texts such as shopping reviews or microblog posts, the mean of all word vectors is commonly used as the classifier input
-
- # Build the matrix of word vectors for the tokens of one sentence (tokens missing from the model vocabulary are skipped)
- print(len(df0.cut[0]))
- pd.DataFrame([w2v_model.wv[w] for w in df0.cut[0] if w in w2v_model.wv])
def m_avgvec(words, w2v_model):
    """Return the plain average of the word vectors of `words` as a Series.

    Args:
        words: iterable of tokens making up one sentence.
        w2v_model: object whose `.wv` supports `token in wv` and `wv[token]`
            and exposes `vector_size` (for example a trained gensim
            Word2Vec model).

    Returns:
        A pandas Series with one entry per vector dimension. Tokens that are
        not in `w2v_model.wv` are skipped. If no token is known at all, a zero
        vector of length `wv.vector_size` is returned: an empty Series would
        turn into an all-NaN row in the training matrix, or a 0-feature row
        at prediction time, and the classifier would then fail.
    """
    wv = w2v_model.wv
    known = [wv[w] for w in words if w in wv]
    if not known:
        return pd.Series([0.0] * wv.vector_size)
    return pd.DataFrame(known).agg('mean')
- # Build the modelling matrix, one averaged vector per training review (slow)
- %time train_vecs = pd.DataFrame([m_avgvec(s, w2v_model) for s in x_train])
- train_vecs
# Fit the sentiment classifier on the averaged sentence vectors
from sklearn.svm import SVC
clf2 = SVC(kernel='rbf', verbose=True)
clf2.fit(train_vecs, y_train)  # uses less than 1 GB of memory
clf2.score(train_vecs, y_train)
from sklearn.metrics import classification_report
# Training-set report: optimistic, because the model has already seen these rows.
print(classification_report(y_train, clf2.predict(train_vecs)))
# Held-out report: vectorise the test reviews the same way and score them.
# fillna(0) turns a review with no in-vocabulary token (an all-NaN row) into a
# zero vector so that predict() does not reject the matrix.
test_vecs = pd.DataFrame([m_avgvec(s, w2v_model) for s in x_test]).fillna(0)
print(classification_report(y_test, clf2.predict(test_vecs)))
- # Save the trained model so it can be reused later
- import joblib
-
- # joblib.dump(modelname, 'filename.pkl')
- # modelname = joblib.load('filename.pkl')
- # 模型预测
- import jieba
-
def m_pre(string, model, w2v=None):
    """Print the sentiment that `model` predicts for the raw text `string`.

    Args:
        string: the review text to classify.
        model: fitted classifier whose `predict` accepts a one-row frame of
            averaged word vectors.
        w2v: word2vec model used to embed the tokens. Defaults to the
            notebook-level `w2v_model`, which is what the original code
            always used; passing it explicitly mirrors `m_pred`.

    Nothing is returned; the verdict is printed after the original text.
    """
    if w2v is None:
        w2v = w2v_model
    words = jieba.lcut(string)

    # With no in-vocabulary token there is nothing to average, and predict()
    # would fail on the resulting feature row, so report that instead.
    if not any(w in w2v.wv for w in words):
        print(string, '无法判断情感(词表中没有对应词条)!')
        return

    # m_avgvec returns one Series; transpose it into a single-row frame
    words_vecs = pd.DataFrame(m_avgvec(words, w2v)).T
    result = model.predict(words_vecs)

    if int(result[0]) == 1:
        print(string, '情感正向!')
    else:
        print(string, '情感负向!')
-
-
- comment = '颜色不错,喜欢这个款式,好评!'
- m_pre(comment, clf2) # classify a hand-written sample review with the word2vec-based classifier
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。