2.1 The long short-term memory (LSTM) network is a variant of the basic recurrent neural network that effectively alleviates the gradient-explosion and gradient-vanishing problems of a simple RNN. The LSTM network improves on the simple RNN in the following two aspects:

- It introduces a new internal state (the cell state $\mathbf{c}_{t}$) that propagates information linearly across time steps, giving gradients a more stable path through time.
- It introduces a gating mechanism (input, forget, and output gates) that controls how much information is written to, retained in, and read out of that internal state.
2.2 The recurrent unit structure of the LSTM network is shown in the figure below, and its computation proceeds as follows:

[Figure: structure of the LSTM recurrent unit]
$$\tilde{\mathbf{c}}_{t}=\tanh\left(W_{c}\mathbf{x}_{t}+U_{c}\mathbf{h}_{t-1}+\mathbf{b}_{c}\right)$$
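The candidate state $\tilde{\mathbf{c}}_{t}$ is then combined with three gates to produce the new internal state and the hidden state. In the same notation, the standard LSTM update is ($\sigma$ is the logistic sigmoid and $\odot$ denotes element-wise multiplication):

$$\begin{aligned}
\mathbf{i}_{t}&=\sigma\left(W_{i}\mathbf{x}_{t}+U_{i}\mathbf{h}_{t-1}+\mathbf{b}_{i}\right)\\
\mathbf{f}_{t}&=\sigma\left(W_{f}\mathbf{x}_{t}+U_{f}\mathbf{h}_{t-1}+\mathbf{b}_{f}\right)\\
\mathbf{o}_{t}&=\sigma\left(W_{o}\mathbf{x}_{t}+U_{o}\mathbf{h}_{t-1}+\mathbf{b}_{o}\right)\\
\mathbf{c}_{t}&=\mathbf{f}_{t}\odot\mathbf{c}_{t-1}+\mathbf{i}_{t}\odot\tilde{\mathbf{c}}_{t}\\
\mathbf{h}_{t}&=\mathbf{o}_{t}\odot\tanh\left(\mathbf{c}_{t}\right)
\end{aligned}$$

The forget gate $\mathbf{f}_{t}$ decides how much of the previous internal state to keep, the input gate $\mathbf{i}_{t}$ how much of the candidate state to write, and the output gate $\mathbf{o}_{t}$ how much of the internal state to expose as $\mathbf{h}_{t}$.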
The following script applies this kind of recurrent network to a small Chinese text-classification task (four classes of domestic-violence reports):

```python
import random

import jieba
import numpy as np
import pandas as pd

# Load the stop-word list
stopwords = pd.read_csv(r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\stopwords.txt",
                        index_col=False, quoting=3, sep="\t",
                        names=["stopword"], encoding="utf-8")
stopwords = stopwords["stopword"].values

# Load the corpus
laogong_df = pd.read_csv(r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beilaogongda.csv", encoding="utf-8", sep=",")
laopo_df = pd.read_csv(r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beilaopoda.csv", encoding="utf-8", sep=",")
erzi_df = pd.read_csv(r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beierzida.csv", encoding="utf-8", sep=",")
nver_df = pd.read_csv(r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beinverda.csv", encoding="utf-8", sep=",")

# Drop NaN rows from the corpus
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)

# Convert to lists
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()

# Tokenize and remove stop words
def preprocess_text(content_lines, sentences, category):
    # content_lines is one of the lists converted above
    # sentences is an (initially empty) list that collects the labeled samples
    # category is the class label
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
            segs = [v for v in segs if not str(v).isdigit()]         # drop pure digits
            segs = list(filter(lambda x: x.strip(), segs))           # drop whitespace-only tokens
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # drop stop words
            sentences.append((" ".join(segs), category))             # attach the label
        except Exception:
            print(line)
            continue

# Build the training data with the function above
sentences = []
preprocess_text(laogong, sentences, 0)
preprocess_text(laopo, sentences, 1)
preprocess_text(erzi, sentences, 2)
preprocess_text(nver, sentences, 3)

# Shuffle first so the class distribution is even, then extract features and labels
random.shuffle(sentences)
for sentence in sentences[:10]:  # print the first 10 samples for a quick check
    print(sentence[0], sentence[1])

# All features and their corresponding labels
all_texts = [sentence[0] for sentence in sentences]
all_labels = [sentence[1] for sentence in sentences]

# Classify the data with an LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, LSTM, Embedding, GRU
from keras.models import Sequential

# Predefined hyperparameters
MAX_SEQUENCE_LENGTH = 100  # maximum sequence length
EMBEDDING_DIM = 200        # word-embedding dimension
VALIDATION_SPLIT = 0.16    # validation-set fraction
TEST_SPLIT = 0.2           # test-set fraction

# Vectorize the texts and pad the sequences with Keras
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(all_labels))
print("data shape:", data.shape)
print("labels shape:", labels.shape)

# Split the data
p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
p2 = int(len(data) * (1 - TEST_SPLIT))
# Training set
x_train = data[:p1]
y_train = labels[:p1]
# Validation set
x_val = data[p1:p2]
y_val = labels[p1:p2]
# Test set
x_test = data[p2:]
y_test = labels[p2:]

# LSTM model
model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dense(labels.shape[1], activation="softmax"))
model.summary()

# Compile and train
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])
print(model.metrics_names)
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=128)
model.save("lstm.h5")

# Evaluate on the test set
print(model.evaluate(x_test, y_test))

# The same pipeline with a GRU instead of an LSTM
model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(GRU(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dense(labels.shape[1], activation="softmax"))
model.summary()
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])
print(model.metrics_names)
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=128)
model.save("gru.h5")
print(model.evaluate(x_test, y_test))
```
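After training, the saved model can be reused for inference. The following is a minimal sketch, not part of the original script: it assumes the `tokenizer`, `stopwords`, and `MAX_SEQUENCE_LENGTH` defined above are still in scope, and the input sentence is a hypothetical example.

```python
from keras.models import load_model

# Load the LSTM classifier saved above (assumes "lstm.h5" exists)
model = load_model("lstm.h5")

# Preprocess a new sentence exactly like the training data
text = "对方喝醉酒后动手打人"  # hypothetical example sentence
segs = [w for w in jieba.lcut(text)
        if not str(w).isdigit() and w.strip() and len(w) > 1 and w not in stopwords]
seq = tokenizer.texts_to_sequences([" ".join(segs)])
x = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

# Predict class probabilities and take the argmax as the label id (0-3)
probs = model.predict(x)
print("predicted class id:", int(np.argmax(probs, axis=1)[0]))
```

Note that the tokenizer's vocabulary is part of the preprocessing: in a real deployment it would need to be persisted (e.g. with `pickle`) alongside the `.h5` file, since `load_model` restores only the network weights.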