Some of my own NLP study notes
- import torch
- import numpy as np
- x = torch.Tensor(2,3) # create an uninitialized 2x3 torch tensor
-
- print(x.type()) # type() is a method of the Tensor class; it returns a Python string
- # torch.FloatTensor is the default type for real numbers, and GPUs generally handle it well
-
- x = torch.rand(2,3) # uniform distribution
- x = torch.randn(2,3) # normal distribution
-
- x = torch.zeros(2, 3) # tensor of all zeros
- x = torch.ones(2, 3) # tensor of all ones
- x.fill_(5) # fill the whole tensor with the same value (in place)
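- print(x) # all entries are now 5.0 (note, my addition: the trailing underscore marks an in-place op)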
-
-
- #Tensor from list
- x = torch.Tensor([[1, 2, 3], [4, 5, 6]]) # build a tensor from a (nested) list
-
- #From numpy to torch
- a = np.random.rand(2, 3)
- x = torch.from_numpy(a) # convert a numpy array to a tensor with from_numpy
- x = torch.from_numpy(a).type(torch.FloatTensor) # type() can be used to specify the data type
- y = torch.from_numpy(a).type_as(x) # type_as() matches the data type of another tensor
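- # Quick sketch (my addition): from_numpy shares memory with the numpy array,
- # so in-place changes to the array show up in the tensor; converting the dtype
- # with type()/type_as() copies the data instead.
- b = np.ones(3)
- t = torch.from_numpy(b)
- b[0] = 7.0
- print(t) # tensor([7., 1., 1.], dtype=torch.float64)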
-
- # data types and type conversion; the default is FloatTensor
- z = x.long() # convert to long
- # summation
- print(torch.add(x,x))
- print(torch.sum(x, dim=0)) # sum along dim 0 (one value per column)
-
- # element-wise product
- print(torch.mul(x,x))
- print(x*x)
-
- # range tensor
- print(torch.arange(6))
-
- # return a view of the tensor with a different shape
- print(x.view(3, 2))
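- # Note (my addition): the new shape must keep the same number of elements;
- # -1 lets PyTorch infer one dimension.
- print(x.view(-1)) # flatten to shape (6,)
- print(x.view(2, -1)) # infer the second dimension -> shape (2, 3)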
-
- x1 = torch.arange(6).view(2,3)
-
- # indexing + sum
- x2 = torch.ones(3, 2).long()
- x2[:, 1] += 1
-
- print('x1 =', x1)
- print('x2 =', x2)
-
- # matrix multiplication
- print(torch.mm(x1, x2))
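- # Note (my addition): x1 @ x2 and torch.matmul(x1, x2) compute the same product here.
- print(x1 @ x2)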
- import torch
-
- print(torch.cuda.is_available())
- print(torch.cuda.current_device())
- print(torch.cuda.device(0))
- print(torch.cuda.device_count())
- print(torch.cuda.get_device_name(0))
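Given the checks above, a common pattern (a sketch I'm adding, not part of the original notes) is to pick a device once and move tensors or models onto it:
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- t = torch.ones(2, 3).to(device) # move a tensor to the GPU if one is available
- # model = model.to(device) # modules are moved the same way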
- x = torch.ones(1, requires_grad=True)
- print(x)
-
- y = x+42
- print(y)
-
- z = 3*y*y
- print(z)
-
- z.backward() # compute gradients via backpropagation
- print(x.grad) # ∂z/∂x = 6(x+42) = 6*(1+42) = 258
- print(y.grad) # None: y is a non-leaf tensor, so its gradient is not retained by default
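If we do want the gradient of an intermediate tensor such as y, we can ask PyTorch to keep it; a minimal sketch (my addition):
- x = torch.ones(1, requires_grad=True)
- y = x + 42
- y.retain_grad() # keep the gradient of this non-leaf tensor
- z = 3 * y * y
- z.backward()
- print(y.grad) # ∂z/∂y = 6*(x+42) = 258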
- import torch
- import torch.nn as nn
-
- # nn.Module is the base class for all neural network modules
- class Perceptron(nn.Module):
- """Our perceptron class"""
-
- def __init__(self, input_dim):
- """
- Constructor
- """
- super().__init__()
- self.fc = nn.Linear(input_dim, 1)
- self.sigmoid = torch.nn.Sigmoid()
-
- def forward(self, x_in):
- # squeeze would drop the trailing singleton dimension from the result
- return self.sigmoid(self.fc(x_in)) #.squeeze()
-
- print(Perceptron(10).forward(torch.ones(10)))
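Calling the module directly, model(x), invokes forward() under the hood; with a batched input the commented-out .squeeze() would drop the trailing singleton dimension. A small shape sketch (my addition, assuming a batch of 4):
- p = Perceptron(10)
- batch = torch.ones(4, 10)
- out = p(batch) # shape (4, 1)
- print(out.shape, out.squeeze().shape) # torch.Size([4, 1]) torch.Size([4])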
Sigmoid: σ(x) = 1 / (1 + e^(−x)), squashes any real value into (0, 1).
Tanh: tanh(x) = (e^x − e^(−x)) / (e^x + e^(−x)), squashes values into (−1, 1).
ReLU: ReLU(x) = max(0, x), zeroes out negative inputs.
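A quick sketch (my addition) of these activations in PyTorch:
- x = torch.linspace(-3, 3, 7)
- print(torch.sigmoid(x)) # values in (0, 1)
- print(torch.tanh(x)) # values in (-1, 1)
- print(torch.relu(x)) # negative values become 0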
MSE Loss: MSE(y, ŷ) = (1/n) Σᵢ (yᵢ − ŷᵢ)², the mean squared difference between predictions and targets.
- import torch
- import torch.nn as nn
-
- mse_loss = nn.MSELoss()
- produced = torch.randn(2, 4, requires_grad=True)
- print(produced)
- expected = torch.randn(2, 4)
- print(expected)
- loss = mse_loss(produced, expected)
- print(loss)
Categorical cross-entropy loss: CE(y, ŷ) = −Σᵢ yᵢ log ŷᵢ. With nn.CrossEntropyLoss the inputs are raw scores (softmax is applied internally) and the targets are class indices.
- import torch
- import torch.nn as nn
-
- ce_loss = nn.CrossEntropyLoss() # for binary classification, we can use nn.BCELoss()
- produced = torch.randn(2, 4, requires_grad=True) # 2*4, normal distribution
- print(produced)
- # the target is a class index for each input row indicating the correct category/class
- expected = torch.tensor([1, 0], dtype=torch.int64)
- loss = ce_loss(produced, expected)
- print(loss)
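The comment above mentions nn.BCELoss for binary classification; a minimal sketch (my addition) of how it pairs with sigmoid outputs:
- bce_loss = nn.BCELoss()
- logits = torch.randn(3, requires_grad=True)
- probs = torch.sigmoid(logits) # BCELoss expects probabilities in (0, 1)
- targets = torch.tensor([1.0, 0.0, 1.0]) # float targets, not class indices
- print(bce_loss(probs, targets))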
- from random import randint
-
- import torch
- from torch.utils.data import Dataset, DataLoader
-
- import torch.nn as nn
- import torch.nn.functional as F
- import torch.optim as optim
We define a LanguageRecognitionDataset class that processes the raw data and builds the dataset we need to train our language classification model.
- class LanguageRecognitionDataset(Dataset):
- """An automatically generated dataset for our language classification task."""
-
- def _get_bigrams(self, sentence_list):
- bigrams = {} # collect the set of character bigrams
- # for each sentence
- for s in sentence_list:
- # for each bigram
- for k in range(len(s)-1):
- bigrams[s[k:k+2]] = 1.0
- return bigrams.keys()
-
- def _get_bigram_vector(self, sentence):
- sent_bigrams = self._get_bigrams([sentence])
- vector = []
-
- for bigram in self.bigrams:
- vector.append(1.0 if bigram in sent_bigrams else 0.0)
-
- return vector
-
- def __init__(self, sample, training_bigrams = None):
- """
- Args:
- sample: list of (sentence, label) pairs, where the label is 1.0 or 0.0
- """
-
- self.num_samples = len(sample)
- if not training_bigrams:
- self.bigrams = self._get_bigrams([x for x, _ in sample])
- else:
- self.bigrams = training_bigrams
-
- self.data = []
- for sentence, gold_label in sample:
- sentence = sentence.lower()
- item = {'inputs': torch.tensor(self._get_bigram_vector(sentence)), 'outputs': torch.tensor([gold_label])}
- self.data.append(item)
-
- def __len__(self):
- return self.num_samples
-
- def __getitem__(self, idx):
- return self.data[idx]
-
- LanguageRecognitionDataset([("ciao ciao pippo", 1), ("la casa si trova in collina", 1)])[1]
- training_sentences = [("Scienziata italiana scopre la più grande esplosione nell’Universo.", 1.0),
- ("Nell’ammasso di galassie di Ofiuco, distante 390 milioni di anni luce.", 1.0),
- ("Ha rilasciato una quantità di energia 5 volte più grande della precedente che deteneva il primato.", 1.0),
- ("Syria war: Turkey says thousands of migrants have crossed to EU.", 0.0),
- ("Turkey could no longer deal with the amount of people fleeing Syria's civil war, he added.", 0.0),
- ("Greece says it has blocked thousands of migrants from entering illegally from Turkey.", 0.0),
- ("Tutto perfetto? Non proprio. Ci sono elementi problematici che vanno considerati.", 1.0),
- ("Il primo è l’autonomia degli studenti, che devono essere in grado di gestire la tecnologia.", 1.0),
- ("Il secondo, è la durata e la cadenza delle lezioni.", 1.0),
- ("Per motivi di connessione, di competenze, di strumenti.", 1.0),
- ("Serve un’assistenza dedicata.", 1.0),
- ("Potremmo completare l’anno scolastico in versione virtuale?", 1.0),
- ("Siamo preparati per affiancare la didattica tradizionale a quella virtuale, ma non siamo pronti per sostituirla", 1.0),
- ("Various architectures of recurrent neural networks have been successful.", 0.0),
- ("They perform tasks relating to sequence measuring", 0.0),
- ("The networks operate by processing input components sequentially", 0.0),
- ("They retain a hidden vector between iterations", 0.0),
- ("It is constantly used and modified throughout the sequence.", 0.0),
- ("They are able to model arbitrarily complicated programs.", 0.0),
- ("L’Istituto, che raccoglie studenti di liceo scientifico, linguistico e tecnico economico, è l’esempio ideale.", 1.0),
- ]
-
- validation_sentences = [("L’Istituto superiore di sanità ha confermato tutti i casi esaminati.", 1.0),
- ("Measures announced after an emergency cabinet meeting also include the cancellation of the Paris half-marathon which was to be held on Sunday.", 0.0),
- ("Lavagne in condivisione, documenti scaricabili sulla piattaforma gratuita, esercizi collaborativi.", 1.0),
- ("Each encoder consists of two major components", 0.0),
- ]
-
-
-
- test_sentences = [("Il ministro della Salute francese ha raccomandato di salutarsi mantenendo le distanze, mentre l’Organizzazione mondiale della sanità alza l’allerta a molto alta.", 1.0),
- ("Possiamo riammalarci ma in questo caso si parla di ricaduta.", 1.0),
- ("The vast majority of infections and deaths are in China, where the virus originated late last year.", 0.0),
- ("France has banned all indoor gatherings of more than 5,000 people, as part of efforts to contain the country's coronavirus outbreak", 0.0)]
-
- def test_dataset_class():
- simple_dataset = LanguageRecognitionDataset(training_sentences)
-
- print('Dataset test:')
- for i in range(len(training_sentences)):
- print(f' sample {i}: {simple_dataset[i]}')
-
- test_dataset_class()
We now build a Trainer class, which brings together the model, the loss function, and the optimizer.
For the model to learn properly, we need a loss function to measure the gap between the model output and the gold labels, and an optimizer to update the model parameters based on that loss.
- class Trainer():
- """Utility class to train and evaluate a model."""
-
- def __init__(
- self,
- model,
- loss_function,
- optimizer):
- """
- Args:
- model: the model we want to train.
- loss_function: the loss_function to minimize.
- optimizer: the optimizer used to minimize the loss_function.
- """
- self.model = model
- self.loss_function = loss_function
- self.optimizer = optimizer
-
- def train(self, train_dataset, valid_dataset, epochs=1):
- """
- Args:
- train_dataset: a Dataset or DataLoader instance containing
- the training instances.
- valid_dataset: a Dataset or DataLoader instance used to evaluate
- learning progress.
- epochs: the number of times to iterate over train_dataset.
- Returns:
- avg_train_loss: the average training loss on train_dataset over
- epochs.
- """
- assert isinstance(epochs, int) and epochs >= 1
- print('Training...')
-
- train_loss = 0.0
- for epoch in range(epochs):
- print(' Epoch {:03d}'.format(epoch + 1))
-
- epoch_loss = 0.0
-
- for step, sample in enumerate(train_dataset):
- inputs = sample['inputs']
- labels = sample['outputs']
-
- # we need to set the gradients to zero before starting to do backpropagation
- # because PyTorch accumulates the gradients on subsequent backward passes
- self.optimizer.zero_grad()
-
- predictions = self.model(inputs)
-
- sample_loss = self.loss_function(predictions, labels)
-
- #print("Before BP:", list(model.parameters()))
-
- sample_loss.backward()
- self.optimizer.step()
-
- #print("After BP:", list(model.parameters()))
-
- # sample_loss is a Tensor, tolist returns a float (alternative: use float() instead of .tolist())
- epoch_loss += sample_loss.tolist()
-
- print(' [E: {:2d} @ step {}] current avg loss = {:0.4f}'.format(epoch, step, epoch_loss / (step + 1)))
-
- avg_epoch_loss = epoch_loss / len(train_dataset)
- train_loss += avg_epoch_loss
- print(' [E: {:2d}] train loss = {:0.4f}'.format(epoch, avg_epoch_loss))
-
- valid_loss = self.evaluate(valid_dataset)
-
- print(' [E: {:2d}] valid loss = {:0.4f}'.format(epoch, valid_loss))
-
- print('... Done!')
-
- avg_epoch_loss = train_loss / epochs
- return avg_epoch_loss
-
-
- def evaluate(self, valid_dataset):
- """
- Args:
- valid_dataset: the dataset to use to evaluate the model.
- Returns:
- avg_valid_loss: the average validation loss over valid_dataset.
- """
- valid_loss = 0.0
-
- # no gradient updates here
- with torch.no_grad():
- for sample in valid_dataset:
- inputs = sample['inputs']
- labels = sample['outputs']
-
- predictions = self.model(inputs)
- sample_loss = self.loss_function(predictions, labels)
- valid_loss += sample_loss.tolist()
-
- return valid_loss / len(valid_dataset)
-
-
- def predict(self, x):
- """
- Returns: hopefully the right prediction.
- """
- return self.model(x).tolist()
- training_dataset = DataLoader(LanguageRecognitionDataset(training_sentences), batch_size=6)
- validation_dataset = DataLoader(LanguageRecognitionDataset(validation_sentences, training_dataset.dataset.bigrams), batch_size=2)
- test_dataset = DataLoader(LanguageRecognitionDataset(test_sentences, training_dataset.dataset.bigrams), batch_size=2)
-
- print("Number of input dimensions", len(training_dataset.dataset.bigrams))
- model = Perceptron(len(training_dataset.dataset.bigrams))
- trainer = Trainer(
- model,
- loss_function = nn.MSELoss(),
- optimizer = optim.SGD(model.parameters(), lr=0.01)
- )
-
-
- avg_epoch_loss = trainer.train(training_dataset, validation_dataset,
- epochs=50)
Let's check whether our model has actually learned something.
- trainer.evaluate(test_dataset)
-
- for step, batch in enumerate(test_dataset):
- print(step, trainer.predict(batch['inputs']), batch['outputs'])
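We can also turn the raw sigmoid outputs into a simple accuracy figure by thresholding at 0.5 (a small sketch I'm adding, not part of the original notes):
- correct, total = 0, 0
- for batch in test_dataset:
-     preds = torch.tensor(trainer.predict(batch['inputs']))
-     correct += ((preds > 0.5).float() == batch['outputs']).sum().item()
-     total += batch['outputs'].numel()
- print('accuracy =', correct / total)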
- class LanguageRecognitionFF(nn.Module):
- """A simple model that classifies language"""
-
- def __init__(self, input_dim, hparams):
- super().__init__()
-
- # Hidden layer: transforms the input bigram vector into
- # a hidden vector representation.
- self.fc1 = nn.Linear(input_dim, hparams.hidden_size)
-
- self.relu = nn.ReLU()
-
- # Output layer: transforms the hidden vector representation
- # into a single score for the binary language decision.
- self.fc2 = nn.Linear(hparams.hidden_size, 1)
-
- self.sigmoid = nn.Sigmoid()
-
-
- def forward(self, x):
- hidden = self.fc1(x)
- relu = self.relu(hidden)
- result = self.fc2(relu)
- return self.sigmoid(result)
Try to keep the hyperparameters separate from the model definition, so that we can change the hyperparameters without touching the model.
- class HParams():
- hidden_size = 16
Create an instance of the feed-forward model:
- model_ff = LanguageRecognitionFF(len(training_dataset.dataset.bigrams), HParams)
- trainer = Trainer(
- model = model_ff,
- loss_function = nn.MSELoss(),
- optimizer = optim.SGD(model_ff.parameters(), lr=1e-5)
- )
- trainer.train(training_dataset, validation_dataset, 50)
- trainer.evaluate(test_dataset)
-
- for step, batch in enumerate(test_dataset):
- print(trainer.predict(batch['inputs']), batch['outputs'])