当前位置:   article > 正文

lstm的文本分类_文本分类lstm

文本分类lstm

        数据处理:分词,去除停用词,转换词向量等

  1. import torch
  2. from torch.utils.data import Dataset
  3. from gensim.models import KeyedVectors
  4. #
  5. from tqdm import tqdm
  6. class text_Process:
  7. def __init__(self, sentences, sen_len,w2v_path):
  8. self.sentences = sentences # 句子列表
  9. self.sen_len = sen_len # 句子的最大长度
  10. self.w2v_path = w2v_path # word2vec模型路径
  11. self.index2word = [] # 实现index到word转换
  12. self.word2index = {} # 实现word到index转换
  13. self.embedding_matrix = []
  14. # load word2vec.model
  15. #self.embedding = KeyedVectors.load_word2vec_format(self.w2v_path, binary=False,limit=100000)
  16. print('loading_word2vec...')
  17. self.embedding = KeyedVectors.load_word2vec_format(self.w2v_path, binary=True)
  18. #Word2Vec.load(self.w2v_path)
  19. self.embedding_dim = self.embedding.vector_size
  20. def make_embedding(self):
  21. # 为model里面的单词构造word2index, index2word 和 embedding
  22. for i, word in enumerate(self.embedding.key_to_index.keys()):
  23. if i % 5 == 0:
  24. print('get word #{}'.format(i+1), end='\r')
  25. self.word2index[word] = len(self.word2index)
  26. self.index2word.append(word)
  27. self.embedding_matrix.append(self.embedding[word])
  28. self.embedding_matrix = torch.tensor(self.embedding_matrix)
  29. # 將"<PAD>""<UNK>"加进embedding里面
  30. self.add_embedding("<PAD>")
  31. self.add_embedding("<UNK>")
  32. print("total words: {}".format(len(self.embedding_matrix)))
  33. return self.embedding_matrix
  34. def add_embedding(self, word):
  35. # 将新词添加进embedding中
  36. vector = torch.empty(1, self.embedding_dim)
  37. torch.nn.init.uniform_(vector)
  38. self.word2index[word] = len(self.word2index)
  39. self.index2word.append(word)
  40. self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
  41. def sentence_word2idx(self):
  42. sentence_list = []
  43. for i, sentence in enumerate(self.sentences):
  44. # 将句子中的单词表示成index
  45. sentence_index = []
  46. for word in sentence:
  47. if word in self.word2index.keys():
  48. # 如果单词在字典中则直接读取index
  49. sentence_index.append(self.word2index[word])
  50. else:
  51. # 否则赋予<UNK>
  52. sentence_index.append(self.word2index["<UNK>"])
  53. # 统一句子长度
  54. sentence_index = self.pad_sequence(sentence_index)
  55. sentence_list.append(sentence_index)
  56. return torch.LongTensor(sentence_list)
  57. def pad_sequence(self, sentence):
  58. # 统一句子长度
  59. if len(sentence) > self.sen_len:
  60. sentence = sentence[:self.sen_len]
  61. else:
  62. pad_len = self.sen_len - len(sentence)
  63. for _ in range(pad_len):
  64. sentence.append(self.word2index["<PAD>"])
  65. assert len(sentence) == self.sen_len
  66. return sentence
  67. def labels2tensor(self, y):
  68. y = [int(label) for label in y]
  69. return torch.LongTensor(y)
  70. # Dataset wrapper and stop-word loader (no tokenization is performed in this listing)
  71. class MyDataset(Dataset):
  72. def __init__(self, x, y):
  73. self.data = x
  74. self.label = y
  75. def __getitem__(self, index):
  76. if self.label is None:
  77. return self.data[index]
  78. return self.data[index], self.label[index]
  79. def __len__(self):
  80. return len(self.data)
  81. def stopwords(stopwords_path):
  82. # 加载停用词
  83. with open(stopwords_path, 'r', encoding='utf-8') as f:
  84. stopwords = [line.strip() for line in f.readlines()]
  85. return stopwords

        定义module

  1. # module.py
  2. import torch
  3. from sklearn.metrics import roc_auc_score,accuracy_score
  4. class evaluate:
  5. def __init__(self):
  6. pass
  7. def set_lable(self,y_ture,y_pre):
  8. self.y_ture = torch.concat(y_ture).flatten().cpu().numpy()
  9. self.y_pre = torch.concat(y_pre).flatten().cpu().numpy()
  10. def evaluation(self, outputs, labels): # 定义自己的评价函数,用分类的准确率来评价
  11. # outputs => probability (float)
  12. # labels => labels
  13. outputs[outputs >= 0.5] = 1 # 大於等於0.5為有惡意
  14. outputs[outputs < 0.5] = 0 # 小於0.5為無惡意
  15. correct = torch.sum(torch.eq(outputs, labels)).item()
  16. return correct
  17. def acc(self):
  18. return accuracy_score(self.y_ture, self.y_pre)
  19. def auc(self):
  20. return roc_auc_score(self.y_ture,self.y_pre)
  21. class LSTMModel(torch.nn.Module,evaluate):
  22. def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, requires_grad=True):
  23. super(LSTMModel, self).__init__()
  24. # 这是一个保存了固定字典和大小的简单查找表。这个模块常用来保存词嵌入和用下标检索它们。模块的输入是一个下标的列表,输出是对应的词嵌入。
  25. # embedding: (嵌入字典的大小, 每个嵌入向量的维度)
  26. self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
  27. # 将一个不可训练的类型为Tensor的参数转化为可训练的类型为parameter的参数,并将这个参数绑定到module里面,成为module中可训练的参数。
  28. self.embedding.weight = torch.nn.Parameter(embedding, requires_grad=requires_grad)
  29. self.LSTM = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
  30. self.linner = torch.nn.Sequential(
  31. torch.nn.Dropout(dropout),
  32. torch.nn.Linear(hidden_dim, 2),
  33. #torch.nn.Sigmoid()
  34. )
  35. def forward(self, inputs):
  36. inputs = self.embedding(inputs)
  37. x, _ = self.LSTM(inputs, None)
  38. # x.shape = (batch_size, seq_len, hidden_size)
  39. # 取用 LSTM 最后一个的 hidden state
  40. x = x[:, -1, :]
  41. x = self.linner(x)
  42. return x

        然后是训练

  1. from torch.utils.data import DataLoader
  2. from sklearn.model_selection import train_test_split
  3. from tqdm import tqdm
  4. from module import LSTMModel,evaluate
  5. from processing import text_Process,MyDataset
  6. import pandas as pd
  7. import torch
  8. from torch import nn
  9. import torch.optim as optim
  10. df = pd.read_csv('data2/sms.csv')
  11. #df = df[df['label'] != 2]
  12. #df = df.sample(100)
  13. # df['review'] = df['review'].map(lambda x:str(x).lower())
  14. # df['review'] = df['review'].map(cut_text)
  15. device = 'cuda' if torch.cuda.is_available() else 'cpu'
  16. #criterion = nn.BCELoss()
  17. criterion = nn.CrossEntropyLoss()
  18. data_x = df['MSG_CONTENT']
  19. data_y = df['is_overdue']
  20. # data pre_processing
  21. preprocess = text_Process(sentences=data_x,
  22. sen_len=25,
  23. w2v_path="./GoogleNews-vectors-negative300.bin"
  24. )
  25. embedding = preprocess.make_embedding()
  26. data_x = preprocess.sentence_word2idx()
  27. data_y = preprocess.labels2tensor(data_y)
  28. model = LSTMModel(
  29. embedding,
  30. embedding_dim=300,
  31. hidden_dim=128,
  32. num_layers=1,
  33. dropout=0.5,
  34. requires_grad=True
  35. ).to(device)
  36. lr = 0.01
  37. batch_size = 128
  38. optimizer = optim.Adam(model.parameters(), lr=lr)
  39. best_acc = 0.
  40. x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=5)
  41. train_dataset = MyDataset(x_train, y_train)
  42. val_dataset = MyDataset(x_test, y_test)
  43. train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,num_workers=16)
  44. val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True,num_workers=16)
  45. e1 = evaluate()
  46. e2 = evaluate()
  47. val_len = len(val_loader)
  48. train_len = len(train_loader)
  49. epochs = 10
  50. for epoch in tqdm(range(epochs)):
  51. model.train()
  52. total_loss, total_acc = 0, 0
  53. pre_list_train,train_label = [],[]
  54. if epoch % 3 == 0:
  55. optimizer.param_groups[0]['lr'] *= 0.9
  56. for i, (inputs, labels) in enumerate(train_loader):
  57. inputs = inputs.to(device, dtype=torch.long)
  58. labels = labels.to(device, dtype=torch.float) # 类型为float
  59. # 2. 清空梯度
  60. optimizer.zero_grad()
  61. outputs = model(inputs)
  62. #print(outputs)
  63. loss = criterion(outputs, labels.long())
  64. _,pre_train = outputs.max(1)
  65. pre_list_train.append(pre_train)
  66. train_label.append(labels)
  67. loss.backward()
  68. optimizer.step()
  69. print(f'train:{i/train_len}',end='\r')
  70. model.eval()
  71. total_loss, total_acc = 0, 0
  72. pre_list_test, test_label = [],[]
  73. for i, (inputs, labels) in tqdm(enumerate(val_loader)):
  74. inputs = inputs.to(device, dtype=torch.long)
  75. labels = labels.to(device, dtype=torch.float)
  76. outputs = model(inputs)
  77. _,pre_test = outputs.max(1)
  78. loss = criterion(outputs, labels.long())
  79. total_loss += loss.item()
  80. pre_list_test.append(pre_test)
  81. test_label.append(labels)
  82. print(f'val:{i/val_len}',end='\r')
  83. e1.set_lable(test_label,pre_list_test)
  84. e2.set_lable(train_label, pre_list_train)
  85. print('\n','epoch:',epoch,
  86. 'train_acc:',e1.acc(),
  87. 'test_acc:',e2.acc())

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小小林熬夜学编程/article/detail/471923
推荐阅读
相关标签
  

闽ICP备14008679号