AI in Practice, From Beginner to Expert: Sentiment Classification with a Perceptron (Part 1)
AI in Practice, From Beginner to Expert: Sentiment Classification with a Perceptron (Part 2)
Environment
Dataset
weibo_senti_100k, a Weibo sentiment dataset
Inspect the data
import pandas as pd
pd_all = pd.read_csv('./data/weibo_senti_100k.csv')
print('评论数目(总体):%d' % pd_all.shape[0])
print('评论数目(正向):%d' % pd_all[pd_all.label==1].shape[0])
print('评论数目(负向):%d' % pd_all[pd_all.label==0].shape[0])
print(pd_all.sample(10))
Output
Dataset statistics and sample rows:
Split the dataset into train and test sets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def train_valid_test_split(x_data, y_data, test_size=0.1, shuffle=True):
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=test_size, shuffle=shuffle)
    return x_train, x_test, y_train, y_test

if __name__ == '__main__':
    pd_all = pd.read_csv("./data/weibo_senti_100k.csv")
    pd_all = shuffle(pd_all)
    x_data, y_data = pd_all.review, pd_all.label
    x_train, x_test, y_train, y_test = train_valid_test_split(x_data, y_data, 0.1)
    # Save with the column name 'text' so the later scripts can read it as pd_all.text
    train = pd.DataFrame({'label': y_train, 'text': x_train})
    train.to_csv("./data/train.csv", index=False)
    test = pd.DataFrame({'label': y_test, 'text': x_test})
    test.to_csv("./data/test.csv", index=False)
The resulting files:
test.csv
train.csv
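If you want a quick sanity check (this snippet is an addition, not part of the original scripts), you can read the split files back and confirm the sizes and the label balance; it assumes the 'label' and 'text' column names used above:

import pandas as pd

# Read the split files back and inspect them
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
print(train.shape, test.shape)        # roughly a 90/10 split of the full dataset
print(train.columns.tolist())         # expected: ['label', 'text']
print(train.label.value_counts())     # positive (1) vs. negative (0) counts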
Generate word vectors
import multiprocessing

import pandas as pd
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

max_text_len = 50          # keep at most 50 words per comment
word2vec_dimension = 100   # dimensionality of the word vectors

def segment(text):
    # Tokenize with jieba and keep at most max_text_len words
    words = jieba.cut(text)
    words = list(words)[:max_text_len]
    return ' '.join(words)

def train_word2vec(train_path, test_path):
    # Load the data
    pd_all = pd.read_csv(train_path)
    train_data_x, train_data_y = pd_all.text.tolist(), pd_all.label.tolist()
    pd_all = pd.read_csv(test_path)
    test_data_x, test_data_y = pd_all.text.tolist(), pd_all.label.tolist()

    # Tokenize
    train_data_x = [segment(k) for k in train_data_x]
    test_data_x = [segment(k) for k in test_data_x]
    text = test_data_x + train_data_x

    # Write one tokenized sentence per line for LineSentence
    input_file_name = './data/sentence.txt'
    with open(input_file_name, 'w', encoding='utf-8') as f:
        f.write('\n'.join(text))

    # Train word2vec; in gensim 3.x the dimension argument is `size`
    # (renamed to `vector_size` in gensim 4.x)
    model = Word2Vec(LineSentence(input_file_name),
                     size=word2vec_dimension,   # word vector dimension: 100
                     window=5,
                     min_count=1,
                     workers=multiprocessing.cpu_count())
    model.save('./data/word2vec.model')
Run:
train_path = './data/train.csv'
test_path = './data/test.csv'
train_word2vec(train_path, test_path)
Output:
./data/word2vec.model
./data/word2vec.model.trainables.syn1neg.npy
./data/word2vec.model.wv.vectors.npy
Test the word vectors:
from gensim.models import Word2Vec
model = Word2Vec.load("./data/word2vec.model")
vector = model.wv['好吃']
print(vector)
Output:
[-0.73283386 0.86413544 0.75687975 -0.5055297 -0.42515302 0.18348737 2.3790839 1.0248554 2.101729 -0.4618316 0.43203285 -0.5404889 -1.017284 -2.2938926 2.3901055 -0.69351804 1.6101934 -0.59710294 -0.03683157 0.57503146 -1.250644 2.980576 1.1501396 -0.81633765 0.6402967 2.3379786 -0.877263 -1.9016323 1.1057235 0.06841037 -0.05232436 -0.08345098 -0.30685595 -1.1040177 -1.7571559 -1.7910484 -0.7331836 0.1513317 -0.621015 0.8975967 2.5499363 1.1568907 0.3688762 -0.5182226 -0.30297205 0.5822141 -1.0808538 -0.01062215 -1.4400197 -2.2848194 2.1822946 0.15740396 1.0032029 -0.8410342 -1.1311824 -0.33163172 1.3151053 -0.2986618 1.9608823 -0.2602172 0.63158864 1.239699 0.10924603 -1.7023547 -1.554196 0.03117983 0.6561903 -0.4397268 -1.9914472 0.79718435 -1.4864717 -2.9809866 -0.46287113 0.4837672 -0.71872777 2.4697163 -0.53781223 0.23790799 2.0566401 1.6394123 -0.9129417 1.5896504 1.5701648 1.1339688 -1.8522842 2.0832975 -1.9120314 -0.23889321 2.8850334 0.70530176 1.6666977 -1.0355597 0.36848044 -0.02313641 -1.3314507 -0.52943283 0.29032257 -1.952622 -0.674098 -0.20572844]
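Beyond looking up a single word vector, another quick check (not in the original post) is to ask gensim for the nearest neighbours of a word; with reasonably trained embeddings the top results should be semantically related words:

from gensim.models import Word2Vec

model = Word2Vec.load("./data/word2vec.model")
# Words whose vectors are closest to '好吃' by cosine similarity
print(model.wv.most_similar('好吃', topn=5))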
Data processing
data_helper.py
import numpy as np
import pandas as pd
import jieba
from gensim.models import Word2Vec

max_text_len = 50          # keep at most 50 words per comment
word2vec_dimension = 100   # dimensionality of the word vectors

def load_train_test_data(train_path, test_path, w2v_model_path):
    # Load the data
    pd_all = pd.read_csv(train_path)
    train_data_x, train_data_y = pd_all.text.tolist(), pd_all.label.tolist()
    pd_all = pd.read_csv(test_path)
    test_data_x, test_data_y = pd_all.text.tolist(), pd_all.label.tolist()

    # The full dataset is too large for my machine's memory,
    # so only the first 5000 samples are used for training
    train_data_x = train_data_x[:5000]
    train_data_y = train_data_y[:5000]

    # Tokenize
    train_data_x = [segment(k) for k in train_data_x]
    test_data_x = [segment(k) for k in test_data_x]

    # Convert the texts to word2vec representations
    w2v_model = Word2Vec.load(w2v_model_path)
    train_data_x = text_to_word2vec(w2v_model, train_data_x)
    test_data_x = text_to_word2vec(w2v_model, test_data_x)

    return train_data_x, test_data_x, train_data_y, test_data_y

def text_to_word2vec(w2v_model, text_list):
    # Convert each tokenized text into a (max_text_len, word2vec_dimension) matrix
    text_array = np.zeros((len(text_list), max_text_len, word2vec_dimension))
    i = 0
    while i < len(text_list):
        words = text_list[i].split(' ')
        for index, word in enumerate(words):
            if index >= max_text_len:
                break
            if word in w2v_model.wv:
                text_array[i, index] = w2v_model.wv[word]
            else:
                # Out-of-vocabulary words stay as zero vectors
                text_array[i, index] = [0.0] * word2vec_dimension
        i += 1
    return text_array

def segment(text):
    # Tokenize with jieba and keep at most max_text_len words
    words = jieba.cut(text)
    words = list(words)[:max_text_len]
    return ' '.join(words)
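As a rough illustration of what data_helper.py produces (this usage snippet is an addition, not from the original post): every comment becomes a 50 x 100 matrix of word vectors, zero-padded for short texts and out-of-vocabulary words:

import numpy as np
import data_helper

x_train, x_test, y_train, y_test = data_helper.load_train_test_data(
    './data/train.csv', './data/test.csv', './data/word2vec.model')
print(np.shape(x_train))   # (5000, 50, 100): samples x words x vector dimension
print(np.shape(x_test))    # (n_test, 50, 100)
print(y_train[:10])        # labels are 0 (negative) or 1 (positive)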
Train the perceptron
train.py
import data_helper
import perception

def train(train_path, test_path, w2v_model_path):
    x_train, x_test, y_train, y_test = data_helper.load_train_test_data(
        train_path, test_path, w2v_model_path)
    print(x_train[0], y_train[0])

    p = perception.Perceptron(data_helper.max_text_len,
                              data_helper.word2vec_dimension,
                              perception.f)
    # Train for 100 iterations with a learning rate of 0.05
    p.train(x_train, y_train, 100, 0.05)
    # Print the learned weights
    p.print_weights()
    # Evaluate on the test set
    p.test(x_test, y_test)

if __name__ == "__main__":
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    w2v_model_path = './data/word2vec.model'
    train(train_path, test_path, w2v_model_path)
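train.py imports a perception module that is built in parts (1) and (2) of this series and is not reproduced in this post. For readers who land here first, below is a minimal sketch of what such a module could look like; it only mirrors the calls made above (Perceptron(max_text_len, word2vec_dimension, f), train, print_weights, test) and the classic step-activation perceptron, so the actual implementation from the series may differ in detail.

# perception.py: a minimal sketch, not the original implementation
import numpy as np

def f(x):
    # Step activation: 1 for positive input, otherwise 0
    return 1 if x > 0 else 0

class Perceptron(object):
    def __init__(self, max_text_len, word2vec_dimension, activator):
        # One weight per (word position, vector dimension) cell, plus a bias
        self.weights = np.zeros((max_text_len, word2vec_dimension))
        self.bias = 0.0
        self.activator = activator

    def predict(self, x):
        # x has shape (max_text_len, word2vec_dimension)
        return self.activator(np.sum(self.weights * x) + self.bias)

    def train(self, x_list, y_list, iterations, learning_rate):
        for _ in range(iterations):
            for x, y in zip(x_list, y_list):
                delta = y - self.predict(x)
                # Classic perceptron update rule
                self.weights += learning_rate * delta * x
                self.bias += learning_rate * delta

    def print_weights(self):
        print('weights', self.weights)
        print('bias', self.bias)

    def test(self, x_list, y_list):
        correct = sum(1 for x, y in zip(x_list, y_list) if self.predict(x) == y)
        print('acc: %.2f%%' % (100.0 * correct / len(y_list)))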
Run: python3 train.py
Output:
weights [[-1.42558697 -2.48767451 1.28752376 ... -0.78376229 3.16459166
-3.28389434]
[-3.68905224 -4.80877013 -3.13396478 ... -4.25494364 -3.01798689
-4.91744347]
[ 1.94075086 -1.94479774 5.51378438 ... -5.19175698 -4.50725763
3.28213941]
...
[-0.60414949 0.84948442 3.2864892 ... -3.96489623 0.9902426
7.86129972]
[-0.52215719 -2.85837685 -0.89045009 ... -1.01795905 -1.21213078
-0.16342622]
[-0.12955836 -3.43814853 0.094599 ... -2.52779952 -4.71311826
-1.97031286]]
bias 4.099999999999993
acc: 81.89%
As you can see, the perceptron model for Weibo sentiment classification, trained on 5,000 samples, reaches an accuracy of 81.89% on the 5,000-sample test set, which is a decent result for such a simple model.
Training word vectors: https://radimrehurek.com/gensim/models/word2vec.html