Deep learning is a powerful machine learning technique that learns multi-layer features or representations of data. In this project, we use the TensorFlow deep learning platform to build a model and process the data for Weibo comment sentiment analysis, reproducing something like Baidu AI's sentiment analysis feature: the user enters text, and the system analyzes it and reports whether the sentiment is positive or negative.
The front end imitates Baidu AI's sentiment analysis page. It only needs to be a simple interactive page, so plain HTML + JS is enough. Of course, for a graduation project you can put more effort into polishing the front end, since the first impression during the defense is the front-end demo. You could also package it as an .exe program; it amounts to the same thing.
The model combines these two components: a bidirectional LSTM and an attention mechanism. Put plainly, it is "RNN plus".
from tensorflow.keras import backend as K
from tensorflow.keras import Input, Model, initializers, regularizers, constraints
from tensorflow.keras.layers import Layer, Embedding, Dense, Bidirectional, LSTM


class Attention(Layer):
    """Additive attention over the timestep dimension of a 3D tensor."""

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # One weight per feature dimension, used to score every timestep.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The output is a single context vector, so no mask is passed on.
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Score each timestep: e_t = tanh(x_t . W + b_t)
        e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                            K.reshape(self.W, (features_dim, 1))),
                      (-1, step_dim))
        if self.bias:
            e += self.b
        e = K.tanh(e)
        # Softmax over timesteps; masked positions get zero weight.
        a = K.exp(e)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        # Weighted sum of the hidden states -> fixed-size context vector.
        c = K.sum(a * x, axis=1)
        return c

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim


# Bidirectional LSTM with an attention layer on top.
class TextAttBiRNN(object):
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=10, last_activation='softmax'):
        # 'softmax' (rather than the original 'sigmoid') matches the
        # sparse_categorical_crossentropy loss used at training time.
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        inputs = Input((self.maxlen,))
        embedding = Embedding(self.max_features, self.embedding_dims,
                              input_length=self.maxlen)(inputs)
        x = Bidirectional(LSTM(128, return_sequences=True))(embedding)
        x = Attention(self.maxlen)(x)
        x = Dense(128, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        x = Dense(32, activation='relu')(x)
        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=inputs, outputs=output)
        model.summary()
        return model
With the model in place, the data has to be turned into something a machine can learn from, so every character is replaced with a number. (As for labeling the data: just call the Baidu AI API to label it. If you don't trust the labels, read through them yourself.)
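For reference, here is a minimal labeling sketch using Baidu's official baidu-aip Python SDK. The credential placeholders, the label_text helper, and the 0/1 label convention are my assumptions, not from the original post:

# Hypothetical labeling script using the baidu-aip SDK (pip install baidu-aip).
# APP_ID, API_KEY and SECRET_KEY are placeholders for your own credentials.
from aip import AipNlp

client = AipNlp('APP_ID', 'API_KEY', 'SECRET_KEY')

def label_text(text):
    """Return 1 for positive, 0 for negative (assumed label convention)."""
    result = client.sentimentClassify(text)
    # Baidu's API reports sentiment 0 = negative, 1 = neutral, 2 = positive.
    sentiment = result['items'][0]['sentiment']
    return 1 if sentiment == 2 else 0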
import pandas as pd
from tensorflow.keras.preprocessing import sequence

txt_path = 'D:/weibo/dataprocess/data/data_label.txt'
text_list, text_label = get_data(txt_path)

# Split every comment into a list of characters.
train_data = []
for text in text_list:
    train_data.append(build_chars(text))

# Build the character vocabulary; index 0 is reserved for padding.
chars = []
for word_list in train_data:
    for word in word_list:
        if word not in chars:
            chars.append(word)
chars = pd.Series(chars).value_counts()
chars[:] = range(1, len(chars) + 1)
build_dict(chars)  # persist the mapping for later inference

maxlen = 64
embedding_dim = 128

# Replace every character with its index, then pad to a fixed length.
train_x = []
for data in train_data:
    temp = []
    for d in data:
        temp.append(chars[d])
    train_x.append(temp)
train_x = sequence.pad_sequences(train_x, maxlen=maxlen, value=0, padding='post')
train_y = text_label
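The helpers get_data, build_chars, and build_dict called above are not shown in the post. A plausible minimal version, assuming data_label.txt stores one label<TAB>text pair per line and the dictionary is pickled for reuse:

import pickle

def get_data(txt_path):
    """Assumed file format: one 'label<TAB>text' pair per line."""
    text_list, text_label = [], []
    with open(txt_path, encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split('\t', 1)
            text_list.append(text)
            text_label.append(int(label))
    return text_list, text_label

def build_chars(text):
    """Split a comment into individual characters."""
    return list(text)

def build_dict(chars, path='chars.pkl'):
    """Persist the char-to-index mapping so inference can reuse it."""
    with open(path, 'wb') as f:
        pickle.dump(chars, f)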
import numpy as np

train_x = np.array(train_x)
train_y = np.array(train_y)

textRnn = TextAttBiRNN(maxlen, len(chars) + 1, embedding_dim, class_num=2)
model = textRnn.get_model()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

num_epochs = 5
history = model.fit(train_x, train_y, epochs=num_epochs, verbose=1)
model.save_weights(model_path)  # model_path is defined elsewhere in the project
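To reproduce the Baidu-AI-style interaction, the trained model still has to be queried with user input. A minimal inference sketch; the predict_sentiment helper is mine, and it assumes class 1 means positive and class 0 means negative (the actual convention depends on how the data was labeled):

# Hypothetical inference helper; chars, maxlen, sequence and build_chars
# all come from the preprocessing step above.
def predict_sentiment(text, model, chars, maxlen=64):
    ids = [chars[c] for c in build_chars(text) if c in chars.index]
    x = sequence.pad_sequences([ids], maxlen=maxlen, value=0, padding='post')
    probs = model.predict(x)[0]
    # Assumed convention: class 1 = positive, class 0 = negative.
    return 'positive' if probs.argmax() == 1 else 'negative'

model.load_weights(model_path)
print(predict_sentiment('这部电影太好看了', model, chars))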
The test results are shown below.
This project was put together in a hurry; the guiding principle was simply "whatever works". As long as the data processing behaves normally, that is enough.
For the model, I reused the LSTM + attention setup from the Tianchi news-classification competition I took part in. It feels decent enough, and it works as a basic exercise. If you want something stronger, try BERT: on the news-classification task it scored above 95%, while LSTM only reached a little over 93%.
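For reference, a minimal BERT fine-tuning sketch using the Hugging Face transformers library with the bert-base-chinese checkpoint; none of this is from the original project, and the hyperparameters are illustrative only:

# Hypothetical BERT baseline (pip install transformers); not the original code.
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert = TFBertForSequenceClassification.from_pretrained('bert-base-chinese',
                                                       num_labels=2)

# Reuse text_list / text_label from the preprocessing step above.
enc = tokenizer(text_list, truncation=True, padding='max_length',
                max_length=64, return_tensors='tf')
bert.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
             loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
             metrics=['accuracy'])
bert.fit(dict(enc), np.array(text_label), epochs=3)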
I've been held up by other things lately, so updates are slow; thanks for your understanding. (If you have questions, send me a private message; I'll reply as soon as I see it.)