Data processing: tokenization, stopword removal, conversion to word vectors, and so on.
```python
# processing.py: vocabulary / embedding construction, dataset wrapper, stopword helper
import torch
from torch.utils.data import Dataset
from gensim.models import KeyedVectors


class text_Process:
    def __init__(self, sentences, sen_len, w2v_path):
        self.sentences = sentences        # list of (tokenized) sentences
        self.sen_len = sen_len            # maximum sentence length
        self.w2v_path = w2v_path          # path to the word2vec model
        self.index2word = []              # index -> word
        self.word2index = {}              # word -> index
        self.embedding_matrix = []
        # load the pre-trained word2vec model
        # (for a plain-text model use binary=False, optionally with limit=100000)
        print('loading word2vec...')
        self.embedding = KeyedVectors.load_word2vec_format(self.w2v_path, binary=True)
        self.embedding_dim = self.embedding.vector_size

    def make_embedding(self):
        # build word2index, index2word and the embedding matrix for every word in the model
        for i, word in enumerate(self.embedding.key_to_index.keys()):
            if i % 5 == 0:
                print('get word #{}'.format(i + 1), end='\r')
            self.word2index[word] = len(self.word2index)
            self.index2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)

        # append "<PAD>" and "<UNK>" to the embedding
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))

        return self.embedding_matrix

    def add_embedding(self, word):
        # add a new word with a randomly initialized vector
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2index[word] = len(self.word2index)
        self.index2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def sentence_word2idx(self):
        sentence_list = []
        for i, sentence in enumerate(self.sentences):
            # map each word of the sentence to its index
            sentence_index = []
            for word in sentence:
                if word in self.word2index:
                    # the word is in the vocabulary: use its index
                    sentence_index.append(self.word2index[word])
                else:
                    # otherwise map it to <UNK>
                    sentence_index.append(self.word2index["<UNK>"])

            # pad / truncate every sentence to the same length
            sentence_index = self.pad_sequence(sentence_index)
            sentence_list.append(sentence_index)

        return torch.LongTensor(sentence_list)

    def pad_sequence(self, sentence):
        # pad or truncate a sentence to exactly sen_len tokens
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2index["<PAD>"])
        assert len(sentence) == self.sen_len

        return sentence

    def labels2tensor(self, y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)


class MyDataset(Dataset):
    def __init__(self, x, y):
        self.data = x
        self.label = y

    def __getitem__(self, index):
        if self.label is None:
            return self.data[index]
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)


# tokenization / stopword-removal helper
def stopwords(stopwords_path):
    # load the stopword list, one word per line
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords
```
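Note that `text_Process` expects each sentence to already be a list of tokens; if raw strings are passed (as the training script below does with `df['MSG_CONTENT']`), the inner loop iterates character by character. The training script only references a `cut_text` helper in a commented-out line, so here is a minimal sketch of what such a helper might look like, assuming jieba for tokenization and the `stopwords()` loader above (the file names are placeholders):

```python
# Hypothetical tokenization helper; `cut_text` only appears in a commented-out
# line of the training script, so this is a sketch, not the author's actual code.
import jieba

def cut_text(text, stopword_set):
    # tokenize a sentence and drop stopwords / whitespace-only tokens
    return [w for w in jieba.lcut(str(text)) if w.strip() and w not in stopword_set]

# usage (assumed stopword file path):
# stopword_set = set(stopwords('./stopwords.txt'))
# df['MSG_CONTENT'] = df['MSG_CONTENT'].map(lambda x: cut_text(x, stopword_set))
```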
Define the model.
```python
# module.py (imported by the training script as `module`)
import torch
from sklearn.metrics import roc_auc_score, accuracy_score


class evaluate:
    def __init__(self):
        pass

    def set_label(self, y_true, y_pre):
        # flatten the per-batch tensors collected during an epoch into 1-D numpy arrays
        self.y_true = torch.cat(y_true).flatten().cpu().numpy()
        self.y_pre = torch.cat(y_pre).flatten().cpu().numpy()

    def evaluation(self, outputs, labels):
        # simple evaluation helper: count correct predictions,
        # treating a probability >= 0.5 as the positive class
        outputs[outputs >= 0.5] = 1
        outputs[outputs < 0.5] = 0
        correct = torch.sum(torch.eq(outputs, labels)).item()
        return correct

    def acc(self):
        return accuracy_score(self.y_true, self.y_pre)

    def auc(self):
        # note: computed on hard 0/1 predictions rather than probabilities
        return roc_auc_score(self.y_true, self.y_pre)


class LSTMModel(torch.nn.Module, evaluate):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, requires_grad=True):
        super(LSTMModel, self).__init__()

        # nn.Embedding is a simple lookup table of shape (vocab_size, embedding_dim):
        # given a list of indices it returns the corresponding word vectors.
        self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
        # bind the pre-trained matrix to the module as a Parameter;
        # requires_grad controls whether the embeddings are fine-tuned during training
        self.embedding.weight = torch.nn.Parameter(embedding, requires_grad=requires_grad)

        self.LSTM = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)

        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, 2),   # two-class logits for CrossEntropyLoss
        )

    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.LSTM(inputs, None)
        # x.shape = (batch_size, seq_len, hidden_size)
        # keep only the hidden state of the last time step
        x = x[:, -1, :]
        x = self.classifier(x)

        return x
```
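As a quick sanity check (my addition, not from the original post), the model can be instantiated with a random embedding matrix and run on dummy indices to confirm it returns a batch of two-class logits:

```python
# minimal shape check with a random embedding matrix (illustrative values only)
import torch

fake_embedding = torch.randn(1000, 300)            # (vocab_size, embedding_dim)
m = LSTMModel(fake_embedding, embedding_dim=300, hidden_dim=128, num_layers=1)
dummy = torch.randint(0, 1000, (4, 25))            # batch of 4 sentences, 25 tokens each
print(m(dummy).shape)                              # torch.Size([4, 2])
```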
Then comes training.
```python
# train.py
import pandas as pd
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from module import LSTMModel, evaluate
from processing import text_Process, MyDataset

df = pd.read_csv('data2/sms.csv')
# df = df[df['label'] != 2]
# df = df.sample(100)
# df['review'] = df['review'].map(lambda x: str(x).lower())
# df['review'] = df['review'].map(cut_text)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
criterion = nn.CrossEntropyLoss()

data_x = df['MSG_CONTENT']
data_y = df['is_overdue']

# data pre-processing: build the vocabulary / embedding matrix and index the sentences
preprocess = text_Process(sentences=data_x,
                          sen_len=25,
                          w2v_path="./GoogleNews-vectors-negative300.bin")
embedding = preprocess.make_embedding()
data_x = preprocess.sentence_word2idx()
data_y = preprocess.labels2tensor(data_y)

model = LSTMModel(
    embedding,
    embedding_dim=300,
    hidden_dim=128,
    num_layers=1,
    dropout=0.5,
    requires_grad=True
).to(device)

lr = 0.01
batch_size = 128
optimizer = optim.Adam(model.parameters(), lr=lr)

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=5)
train_dataset = MyDataset(x_train, y_train)
val_dataset = MyDataset(x_test, y_test)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=16)

e_train = evaluate()
e_val = evaluate()
val_len = len(val_loader)
train_len = len(train_loader)
epochs = 10

for epoch in tqdm(range(epochs)):
    # ---- training ----
    model.train()
    total_loss = 0
    pre_list_train, train_label = [], []
    if epoch % 3 == 0:
        # simple learning-rate decay every 3 epochs
        optimizer.param_groups[0]['lr'] *= 0.9
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.long)   # class indices for CrossEntropyLoss
        optimizer.zero_grad()                          # clear the gradients
        outputs = model(inputs)                        # logits of shape (batch, 2)
        loss = criterion(outputs, labels)
        _, pre_train = outputs.max(1)                  # predicted class = argmax of the logits
        pre_list_train.append(pre_train)
        train_label.append(labels)
        loss.backward()
        optimizer.step()
        print(f'train:{i / train_len}', end='\r')

    # ---- validation ----
    model.eval()
    total_loss = 0
    pre_list_test, test_label = [], []
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(val_loader):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.long)
            outputs = model(inputs)
            _, pre_test = outputs.max(1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            pre_list_test.append(pre_test)
            test_label.append(labels)
            print(f'val:{i / val_len}', end='\r')

    e_train.set_label(train_label, pre_list_train)
    e_val.set_label(test_label, pre_list_test)
    print('\n', 'epoch:', epoch,
          'train_acc:', e_train.acc(),
          'val_acc:', e_val.acc())
```
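After training, the fitted `preprocess` object can be reused to score new messages. A minimal inference sketch (the sample message, the softmax call, and reassigning `preprocess.sentences` are my additions, not from the original post):

```python
# inference sketch: reuse the fitted vocabulary to index a new, already-tokenized message
model.eval()
new_msgs = [["please", "pay", "your", "bill"]]      # hypothetical tokenized message
preprocess.sentences = new_msgs
new_x = preprocess.sentence_word2idx().to(device)
with torch.no_grad():
    probs = torch.softmax(model(new_x), dim=1)      # class probabilities
    pred = probs.argmax(dim=1)                      # predicted class index
print(probs, pred)
```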