2.The Multilayer Perceptron(多层感知器)


2.1.1 A Simple Example:XOR





2.2 Implementing MLPs in PyTorch


例2-1:Multilayer Perceptron
  1. import torch.nn as nn
  2. import torch.nn.functional as F
  3. class MultilayerPerceptron(nn.Module):
  4. def __init__(self, input_dim, hidden_dim, output_dim):
  5. """
  6. Args:
  7. input_dim (int): the size of the input vectors
  8. hidden_dim (int): the output size of the first Linear layer
  9. output_dim (int): the output size of the second Linear layer
  10. """
  11. super(MultilayerPerceptron, self).__init__()
  12. self.fc1 = nn.Linear(input_dim, hidden_dim)
  13. self.fc2 = nn.Linear(hidden_dim, output_dim)
  14. def forward(self, x_in, apply_softmax=False):
  15. """The forward pass of the MLP
  16. Args:
  17. x_in (torch.Tensor): an input data tensor.
  18. x_in.shape should be (batch, input_dim)
  19. apply_softmax (bool): a flag for the softmax activation
  20. should be false if used with the Cross Entropy losses
  21. Returns:
  22. the resulting tensor. tensor.shape should be (batch, output_dim)
  23. """
  24. # 通过第一个全连接层并应用ReLU激活函数
  25. intermediate = F.relu(self.fc1(x_in))
  26. # 通过第二个全连接层得到模型输出
  27. output = self.fc2(intermediate)
  28. # 如果apply_softmax为真,则对输出应用softmax函数
  29. if apply_softmax:
  30. output = F.softmax(output, dim=1) # 在第1维(通常是类别维度)上应用softmax
  31. return output


例2-2: An example instantiation of an MLP
  1. batch_size = 2 # number of samples input at once
  2. input_dim = 3
  3. hidden_dim = 100
  4. output_dim = 4
  5. # Initialize model
  6. mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)
  7. print(mlp)


例2-3:Testing the MLP with random inputs
  1. import torch
  2. def describe(x):
  3. print("Type: {}".format(x.type()))
  4. print("Shape/size: {}".format(x.shape))
  5. print("Values: \n{}".format(x))
  6. x_input = torch.rand(batch_size, input_dim)
  7. describe(x_input)




例2-4:MLP with apply_softmax=True
  1. y_output = mlp(x_input, apply_softmax=True)
  2. describe(y_output)



3.Convolutional Neural Network(卷积神经网络)

        卷积神经网络(Convolutional Neural Network,简称CNN)特别适用于处理具有网格结构的数据,如图像。CNN的设计灵感来源于对生物视觉系统的理解,尤其是猫和猴子的视觉皮层中的细胞如何响应特定类型的刺激。









        激活函数:如ReLU(Rectified Linear Unit),用于引入非线性,帮助网络学习更复杂的模式。





4.1.1:Surname Dataset 数据集

        姓氏数据集收集了来自18个不同国家的10,000个姓氏,这些姓氏是作者从互联网上不同的姓名来源收集的。该数据集将在本课程实验的几个示例中重复使用,并具有一些有趣的性质。第一个性质是它相当不平衡:排名前三的类别占数据的60%以上——27%是英语,21%是俄语,14%是阿拉伯语;其余15个国籍的出现频率依次递减,这也是语言数据特有的性质。第二个性质是,国籍与姓氏的正字法(拼写)之间存在有效且直观的联系:有些拼写变体与原籍国的联系非常紧密(比如“O'Neill”、“Antonopoulos”、“Nagasawa”或“Zhu”)。


例2-5: Implementing SurnameDataset.__getitem__()
  1. class SurnameDataset(Dataset):
  2. # Implementation is nearly identical to Section 3.5
  3. def __getitem__(self, index):
  4. # 获取目标数据框中对应索引的行
  5. row = self._target_df.iloc[index]
  6. # 使用_vectorizer的vectorize方法处理姓氏,得到向量表示
  7. surname_vector = self._vectorizer.vectorize(row.surname)
  8. # 查找国籍词汇表中对应国籍的索引(标签)
  9. nationality_index = self._vectorizer.nationality_vocab.lookup_token(row.nationality)
  10. # 返回包含特征和标签的字典
  11. return {'x_surname': surname_vector,
  12. 'y_nationality': nationality_index}

        为了使用字符对姓氏进行分类,我们使用词汇表(Vocabulary)、向量化器(Vectorizer)和DataLoader将姓氏字符串转换为向量化的小批量(minibatch)。这些数据结构与“Example: Classifying Sentiment of Restaurant Reviews”中使用的数据结构相同,它们体现了一种多态性:姓氏的字符标记与Yelp评论的单词标记以相同的方式处理。区别在于,这里的数据不是通过将单词标记映射到整数来向量化的,而是通过将字符映射到整数来向量化的。

例2-6:Implementing SurnameVectorizer and Vocabulary
  1. class SurnameVectorizer(object):
  2. """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
  3. def __init__(self, surname_vocab, nationality_vocab):
  4. """
  5. Args:
  6. surname_vocab (Vocabulary): maps characters to integers
  7. nationality_vocab (Vocabulary): maps nationalities to integers
  8. """
  9. self.surname_vocab = surname_vocab
  10. self.nationality_vocab = nationality_vocab
  11. def vectorize(self, surname):
  12. """
  13. Args:
  14. surname (str): the surname
  15. Returns:
  16. one_hot (np.ndarray): a collapsed one-hot encoding
  17. """
  18. vocab = self.surname_vocab
  19. one_hot = np.zeros(len(vocab), dtype=np.float32)
  20. for token in surname:
  21. one_hot[vocab.lookup_token(token)] = 1
  22. return one_hot
  23. @classmethod
  24. def from_dataframe(cls, surname_df):
  25. """Instantiate the vectorizer from the dataset dataframe
  26. Args:
  27. surname_df (pandas.DataFrame): the surnames dataset
  28. Returns:
  29. an instance of the SurnameVectorizer
  30. """
  31. surname_vocab = Vocabulary(unk_token="@")
  32. nationality_vocab = Vocabulary(add_unk=False)
  33. for index, row in surname_df.iterrows():
  34. for letter in row.surname:
  35. surname_vocab.add_token(letter)
  36. nationality_vocab.add_token(row.nationality)
  37. return cls(surname_vocab, nationality_vocab)
  38. @classmethod
  39. def from_serializable(cls, contents):
  40. surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
  41. nationality_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])
  42. return cls(surname_vocab=surname_vocab, nationality_vocab=nationality_vocab)
  43. def to_serializable(self):
  44. return {'surname_vocab': self.surname_vocab.to_serializable(),
  45. 'nationality_vocab': self.nationality_vocab.to_serializable()}
  46. class Vocabulary(object):
  47. """Class to process text and extract vocabulary for mapping"""
  48. def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
  49. """
  50. Args:
  51. token_to_idx (dict): a pre-existing map of tokens to indices
  52. add_unk (bool): a flag that indicates whether to add the UNK token
  53. unk_token (str): the UNK token to add into the Vocabulary
  54. """
  55. if token_to_idx is None:
  56. token_to_idx = {}
  57. self._token_to_idx = token_to_idx
  58. self._idx_to_token = {idx: token
  59. for token, idx in self._token_to_idx.items()}
  60. self._add_unk = add_unk
  61. self._unk_token = unk_token
  62. self.unk_index = -1
  63. if add_unk:
  64. self.unk_index = self.add_token(unk_token)
  65. def to_serializable(self):
  66. """ returns a dictionary that can be serialized """
  67. return {'token_to_idx': self._token_to_idx,
  68. 'add_unk': self._add_unk,
  69. 'unk_token': self._unk_token}
  70. @classmethod
  71. def from_serializable(cls, contents):
  72. """ instantiates the Vocabulary from a serialized dictionary """
  73. return cls(**contents)
  74. def add_token(self, token):
  75. """Update mapping dicts based on the token.
  76. Args:
  77. token (str): the item to add into the Vocabulary
  78. Returns:
  79. index (int): the integer corresponding to the token
  80. """
  81. try:
  82. index = self._token_to_idx[token]
  83. except KeyError:
  84. index = len(self._token_to_idx)
  85. self._token_to_idx[token] = index
  86. self._idx_to_token[index] = token
  87. return index
  88. def add_many(self, tokens):
  89. """Add a list of tokens into the Vocabulary
  90. Args:
  91. tokens (list): a list of string tokens
  92. Returns:
  93. indices (list): a list of indices corresponding to the tokens
  94. """
  95. return [self.add_token(token) for token in tokens]
  96. def lookup_token(self, token):
  97. """Retrieve the index associated with the token
  98. or the UNK index if token isn't present.
  99. Args:
  100. token (str): the token to look up
  101. Returns:
  102. index (int): the index corresponding to the token
  103. Notes:
  104. `unk_index` needs to be >=0 (having been added into the Vocabulary)
  105. for the UNK functionality
  106. """
  107. if self.unk_index >= 0:
  108. return self._token_to_idx.get(token, self.unk_index)
  109. else:
  110. return self._token_to_idx[token]
  111. def lookup_index(self, index):
  112. """Return the token associated with the index
  113. Args:
  114. index (int): the index to look up
  115. Returns:
  116. token (str): the token corresponding to the index
  117. Raises:
  118. KeyError: if the index is not in the Vocabulary
  119. """
  120. if index not in self._idx_to_token:
  121. raise KeyError("the index (%d) is not in the Vocabulary" % index)
  122. return self._idx_to_token[index]
  123. def __str__(self):
  124. return "<Vocabulary(size=%d)>" % len(self)
  125. def __len__(self):
  126. return len(self._token_to_idx)
4.1.3:The Dataset
  1. import torch.utils.data.dataset as Dataset
  2. class SurnameDataset(Dataset.Dataset):
  3. def __init__(self, surname_df, vectorizer):
  4. """
  5. Args:
  6. surname_df (pandas.DataFrame): the dataset
  7. vectorizer (SurnameVectorizer): vectorizer instatiated from dataset
  8. """
  9. self.surname_df = surname_df
  10. self._vectorizer = vectorizer
  11. self.train_df = self.surname_df[self.surname_df.split == 'train']
  12. self.train_size = len(self.train_df)
  13. self.val_df = self.surname_df[self.surname_df.split == 'val']
  14. self.validation_size = len(self.val_df)
  15. self.test_df = self.surname_df[self.surname_df.split == 'test']
  16. self.test_size = len(self.test_df)
  17. self._lookup_dict = {'train': (self.train_df, self.train_size),
  18. 'val': (self.val_df, self.validation_size),
  19. 'test': (self.test_df, self.test_size)}
  20. self.set_split('train')
  21. # Class weights
  22. class_counts = surname_df.nationality.value_counts().to_dict()
  23. def sort_key(item):
  24. return self._vectorizer.nationality_vocab.lookup_token(item[0])
  25. sorted_counts = sorted(class_counts.items(), key=sort_key)
  26. frequencies = [count for _, count in sorted_counts]
  27. self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
  28. @classmethod
  29. def load_dataset_and_make_vectorizer(cls, surname_csv):
  30. """Load dataset and make a new vectorizer from scratch
  31. Args:
  32. surname_csv (str): location of the dataset
  33. Returns:
  34. an instance of SurnameDataset
  35. """
  36. surname_df = pd.read_csv(surname_csv)
  37. train_surname_df = surname_df[surname_df.split == 'train']
  38. return cls(surname_df, SurnameVectorizer.from_dataframe(train_surname_df))
  39. @classmethod
  40. def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
  41. """Load dataset and the corresponding vectorizer.
  42. Used in the case in the vectorizer has been cached for re-use
  43. Args:
  44. surname_csv (str): location of the dataset
  45. vectorizer_filepath (str): location of the saved vectorizer
  46. Returns:
  47. an instance of SurnameDataset
  48. """
  49. surname_df = pd.read_csv(surname_csv)
  50. vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
  51. return cls(surname_df, vectorizer)
  52. @staticmethod
  53. def load_vectorizer_only(vectorizer_filepath):
  54. """a static method for loading the vectorizer from file
  55. Args:
  56. vectorizer_filepath (str): the location of the serialized vectorizer
  57. Returns:
  58. an instance of SurnameVectorizer
  59. """
  60. with open(vectorizer_filepath) as fp:
  61. return SurnameVectorizer.from_serializable(json.load(fp))
  62. def save_vectorizer(self, vectorizer_filepath):
  63. """saves the vectorizer to disk using json
  64. Args:
  65. vectorizer_filepath (str): the location to save the vectorizer
  66. """
  67. with open(vectorizer_filepath, "w") as fp:
  68. json.dump(self._vectorizer.to_serializable(), fp)
  69. def get_vectorizer(self):
  70. """ returns the vectorizer """
  71. return self._vectorizer
  72. def set_split(self, split="train"):
  73. """ selects the splits in the dataset using a column in the dataframe """
  74. self._target_split = split
  75. self._target_df, self._target_size = self._lookup_dict[split]
  76. def __len__(self):
  77. return self._target_size
  78. def __getitem__(self, index):
  79. """the primary entry point method for PyTorch datasets
  80. Args:
  81. index (int): the index to the data point
  82. Returns:
  83. a dictionary holding the data point's:
  84. features (x_surname)
  85. label (y_nationality)
  86. """
  87. row = self._target_df.iloc[index]
  88. surname_vector = \
  89. self._vectorizer.vectorize(row.surname)
  90. nationality_index = \
  91. self._vectorizer.nationality_vocab.lookup_token(row.nationality)
  92. return {'x_surname': surname_vector,
  93. 'y_nationality': nationality_index}
  94. def get_num_batches(self, batch_size):
  95. """Given a batch size, return the number of batches in the dataset
  96. Args:
  97. batch_size (int)
  98. Returns:
  99. number of batches in the dataset
  100. """
  101. return len(self) // batch_size
  102. def generate_batches(dataset, batch_size, shuffle=True,
  103. drop_last=True, device="cpu"):
  104. """
  105. A generator function which wraps the PyTorch DataLoader. It will
  106. ensure each tensor is on the write device location.
  107. """
  108. dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
  109. shuffle=shuffle, drop_last=drop_last)
  110. for data_dict in dataloader:
  111. out_data_dict = {}
  112. for name, tensor in data_dict.items():
  113. out_data_dict[name] = data_dict[name].to(device)
  114. yield out_data_dict
4.1.4:The Surname Classifier Model


例2-7:The SurnameClassifier as an MLP
  1. import torch.nn as nn
  2. import torch.nn.functional as F
  3. class SurnameClassifier(nn.Module):
  4. """ A 2-layer Multilayer Perceptron for classifying surnames """
  5. def __init__(self, input_dim, hidden_dim, output_dim):
  6. """
  7. Args:
  8. input_dim (int): the size of the input vectors
  9. hidden_dim (int): the output size of the first Linear layer
  10. output_dim (int): the output size of the second Linear layer
  11. """
  12. super(SurnameClassifier, self).__init__()
  13. self.fc1 = nn.Linear(input_dim, hidden_dim)
  14. self.fc2 = nn.Linear(hidden_dim, output_dim)
  15. def forward(self, x_in, apply_softmax=False):
  16. """The forward pass of the classifier
  17. Args:
  18. x_in (torch.Tensor): an input data tensor.
  19. x_in.shape should be (batch, input_dim)
  20. apply_softmax (bool): a flag for the softmax activation
  21. should be false if used with the Cross Entropy losses
  22. Returns:
  23. the resulting tensor. tensor.shape should be (batch, output_dim)
  24. """
  25. # 通过第一层全连接层,并使用ReLU激活函数
  26. intermediate_vector = F.relu(self.fc1(x_in))
  27. # 通过第二层全连接层得到预测向量
  28. prediction_vector = self.fc2(intermediate_vector)
  29. # 如果apply_softmax为True,则对预测向量应用softmax函数
  30. if apply_softmax:
  31. prediction_vector = F.softmax(prediction_vector, dim=1)
  32. # 返回处理后的预测向量
  33. return prediction_vector

        一般来说,神经网络设计的目标是找到一组能够完成任务的超参数配置。我们再次考虑在“示例:带有多层感知器的姓氏分类”中介绍过、现在已经很熟悉的姓氏分类任务,但这次使用CNN而不是MLP。我们仍然需要在最后应用一个线性层,它将学习从一系列卷积层产生的特征向量生成预测向量。这意味着目标是确定卷积层的配置,从而得到所需的特征向量。所有CNN应用都是如此:首先由一组卷积层提取特征图(feature map),然后将其作为后续处理的输入。在分类任务中,后续处理几乎总是应用线性(或fc)层。

        本课程中的实现将逐步讲解各项设计决策,以构建一个特征向量。我们首先构造一个人工数据张量,以反映实际数据的形状。数据张量是三维的——这正是向量化文本数据的小批量(minibatch)的形状。如果对字符序列中的每个字符使用one-hot向量,那么one-hot向量序列就是一个矩阵,而one-hot矩阵的小批量就是一个三维张量。用卷积的术语来说,每个one-hot向量的大小(通常是词汇表的大小)是“input channels”的数量,字符序列的长度是“width”。

  82. class SurnameVectorizer(object):
  83. """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
  84. def __init__(self, surname_vocab, nationality_vocab, max_surname_length):
  85. """
  86. Args:
  87. surname_vocab (Vocabulary): maps characters to integers
  88. nationality_vocab (Vocabulary): maps nationalities to integers
  89. max_surname_length (int): the length of the longest surname
  90. """
  91. self.surname_vocab = surname_vocab
  92. self.nationality_vocab = nationality_vocab
  93. self._max_surname_length = max_surname_length
  94. def vectorize(self, surname):
  95. """
  96. Args:
  97. surname (str): the surname
  98. Returns:
  99. one_hot_matrix (np.ndarray): a matrix of one-hot vectors
  100. """
  101. one_hot_matrix_size = (len(self.surname_vocab), self._max_surname_length)
  102. one_hot_matrix = np.zeros(one_hot_matrix_size, dtype=np.float32)
  103. for position_index, character in enumerate(surname):
  104. character_index = self.surname_vocab.lookup_token(character)
  105. one_hot_matrix[character_index][position_index] = 1
  106. return one_hot_matrix
  107. @classmethod
  108. def from_dataframe(cls, surname_df):
  109. """Instantiate the vectorizer from the dataset dataframe
  110. Args:
  111. surname_df (pandas.DataFrame): the surnames dataset
  112. Returns:
  113. an instance of the SurnameVectorizer
  114. """
  115. surname_vocab = Vocabulary(unk_token="@")
  116. nationality_vocab = Vocabulary(add_unk=False)
  117. max_surname_length = 0
  118. for index, row in surname_df.iterrows():
  119. max_surname_length = max(max_surname_length, len(row.surname))
  120. for letter in row.surname:
  121. surname_vocab.add_token(letter)
  122. nationality_vocab.add_token(row.nationality)
  123. return cls(surname_vocab, nationality_vocab, max_surname_length)
  124. @classmethod
  125. def from_serializable(cls, contents):
  126. surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
  127. nationality_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])
  128. return cls(surname_vocab=surname_vocab, nationality_vocab=nationality_vocab,
  129. max_surname_length=contents['max_surname_length'])
  130. def to_serializable(self):
  131. return {'surname_vocab': self.surname_vocab.to_serializable(),
  132. 'nationality_vocab': self.nationality_vocab.to_serializable(),
  133. 'max_surname_length': self._max_surname_length}
4.2.2:The Dataset
例2-8:The SurnameClassifier as a CNN
  1. import torch.nn as nn
  2. import torch.nn.functional as F
  3. class SurnameClassifier(nn.Module):
  4. def __init__(self, initial_num_channels, num_classes, num_channels):
  5. """
  6. Args:
  7. initial_num_channels (int): size of the incoming feature vector
  8. num_classes (int): size of the output prediction vector
  9. num_channels (int): constant channel size to use throughout network
  10. """
  11. super(SurnameClassifier, self).__init__()
  12. self.convnet = nn.Sequential(
  13. nn.Conv1d(in_channels=initial_num_channels,
  14. out_channels=num_channels, kernel_size=3),
  15. nn.ELU(),
  16. nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
  17. kernel_size=3, stride=2),
  18. nn.ELU(),
  19. nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
  20. kernel_size=3, stride=2),
  21. nn.ELU(),
  22. nn.Conv1d(in_channels=num_channels, out_channels=num_channels,
  23. kernel_size=3),
  24. nn.ELU()
  25. )
  26. self.fc = nn.Linear(num_channels, num_classes)
  27. def forward(self, x_surname, apply_softmax=False):
  28. """The forward pass of the classifier
  29. Args:
  30. x_surname (torch.Tensor): an input data tensor.
  31. x_surname.shape should be (batch, initial_num_channels, max_surname_length)
  32. apply_softmax (bool): a flag for the softmax activation
  33. should be false if used with the Cross Entropy losses
  34. Returns:
  35. the resulting tensor. tensor.shape should be (batch, num_classes)
  36. """
  37. features = self.convnet(x_surname).squeeze(dim=2)
  38. prediction_vector = self.fc(features)
  39. if apply_softmax:
  40. prediction_vector = F.softmax(prediction_vector, dim=1)
  41. return prediction_vector
  1. def make_train_state(args):
  2. return {'stop_early': False,
  3. 'early_stopping_step': 0,
  4. 'early_stopping_best_val': 1e8,
  5. 'learning_rate': args.learning_rate,
  6. 'epoch_index': 0,
  7. 'train_loss': [],
  8. 'train_acc': [],
  9. 'val_loss': [],
  10. 'val_acc': [],
  11. 'test_loss': -1,
  12. 'test_acc': -1,
  13. 'model_filename': args.model_state_file}
  14. def update_train_state(args, model, train_state):
  15. """Handle the training state updates.
  16. Components:
  17. - Early Stopping: Prevent overfitting.
  18. - Model Checkpoint: Model is saved if the model is better
  19. :param args: main arguments
  20. :param model: model to train
  21. :param train_state: a dictionary representing the training state values
  22. :returns:
  23. a new train_state
  24. """
  25. # Save one model at least
  26. if train_state['epoch_index'] == 0:
  27. torch.save(model.state_dict(), train_state['model_filename'])
  28. train_state['stop_early'] = False
  29. # Save model if performance improved
  30. elif train_state['epoch_index'] >= 1:
  31. loss_tm1, loss_t = train_state['val_loss'][-2:]
  32. # If loss worsened
  33. if loss_t >= train_state['early_stopping_best_val']:
  34. # Update step
  35. train_state['early_stopping_step'] += 1
  36. # Loss decreased
  37. else:
  38. # Save the best model
  39. if loss_t < train_state['early_stopping_best_val']:
  40. torch.save(model.state_dict(), train_state['model_filename'])
  41. # Reset early stopping step
  42. train_state['early_stopping_step'] = 0
  43. # Stop early ?
  44. train_state['stop_early'] = \
  45. train_state['early_stopping_step'] >= args.early_stopping_criteria
  46. return train_state
  47. def compute_accuracy(y_pred, y_target):
  48. y_pred_indices = y_pred.max(dim=1)[1]
  49. n_correct = torch.eq(y_pred_indices, y_target).sum().item()
  50. return n_correct / len(y_pred_indices) * 100
  51. args = Namespace(
  52. # Data and Path information
  53. surname_csv="data/surnames/surnames_with_splits.csv",
  54. vectorizer_file="vectorizer.json",
  55. model_state_file="model.pth",
  56. save_dir="model_storage/ch4/cnn",
  57. # Model hyper parameters
  58. hidden_dim=100,
  59. num_channels=256,
  60. # Training hyper parameters
  61. seed=1337,
  62. learning_rate=0.001,
  63. batch_size=128,
  64. num_epochs=100,
  65. early_stopping_criteria=5,
  66. dropout_p=0.1,
  67. # Runtime options
  68. cuda=False,
  69. reload_from_files=False,
  70. expand_filepaths_to_save_dir=True,
  71. catch_keyboard_interrupt=True
  72. )
  73. if args.expand_filepaths_to_save_dir:
  74. args.vectorizer_file = os.path.join(args.save_dir,
  75. args.vectorizer_file)
  76. args.model_state_file = os.path.join(args.save_dir,
  77. args.model_state_file)
  78. print("Expanded filepaths: ")
  79. print("\t{}".format(args.vectorizer_file))
  80. print("\t{}".format(args.model_state_file))
  81. # Check CUDA
  82. if not torch.cuda.is_available():
  83. args.cuda = False
  84. args.device = torch.device("cuda" if args.cuda else "cpu")
  85. print("Using CUDA: {}".format(args.cuda))
  86. def set_seed_everywhere(seed, cuda):
  87. np.random.seed(seed)
  88. torch.manual_seed(seed)
  89. if cuda:
  90. torch.cuda.manual_seed_all(seed)
  91. def handle_dirs(dirpath):
  92. if not os.path.exists(dirpath):
  93. os.makedirs(dirpath)
  94. # Set seed for reproducibility
  95. set_seed_everywhere(args.seed, args.cuda)
  96. # handle dirs
  97. handle_dirs(args.save_dir)


  1. if args.reload_from_files:
  2. # 如果reload_from_files参数为True,表示从已有的文件中加载数据集和向量化器
  3. # 这意味着我们将从上次保存的检查点继续训练
  4. dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv,
  5. args.vectorizer_file)
  6. else:
  7. # 如果reload_from_files参数为False,表示创建新的数据集和向量化器
  8. # 这通常在首次训练模型时使用
  9. dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
  10. # 保存新创建的向量化器到指定文件,供将来使用或继续训练
  11. dataset.save_vectorizer(args.vectorizer_file)
  12. # 获取数据集的向量化器,用于将文本数据转换为数值向量
  13. vectorizer = dataset.get_vectorizer()
  14. # 初始化分类器模型,使用向量化器的大小作为输入和输出层的维度
  15. classifier = SurnameClassifier(initial_num_channels=len(vectorizer.surname_vocab),
  16. num_classes=len(vectorizer.nationality_vocab),
  17. num_channels=args.num_channels)
  18. # 将模型移动到指定的设备(CPU或GPU)
  19. classifier = classifier.to(args.device)
  20. # 将数据集的类别权重也移动到相同的设备
  21. dataset.class_weights = dataset.class_weights.to(args.device)
  22. # 定义损失函数,这里使用加权的交叉熵损失,权重来源于数据集的类别权重
  23. loss_func = nn.CrossEntropyLoss(weight=dataset.class_weights)
  24. # 定义优化器,这里使用Adam优化器,学习率由args.learning_rate参数指定
  25. optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
  26. # 定义学习率调度器,当验证集的损失不再下降时,按一定比例降低学习率
  27. scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
  28. mode='min', factor=0.5,
  29. patience=1)
  30. # 创建一个训练状态字典,用于跟踪训练过程中的关键信息,如epoch、loss等
  31. train_state = make_train_state(args)
  32. # 创建一个描述为'training routine'的进度条,总长度为训练周期数
  33. epoch_bar = tqdm_notebook(desc='training routine',
  34. total=args.num_epochs,
  35. position=0)
  36. # 将数据集设置为训练模式
  37. dataset.set_split('train')
  38. # 创建一个描述为'split=train'的进度条,总长度为训练集的批次数
  39. train_bar = tqdm_notebook(desc='split=train',
  40. total=dataset.get_num_batches(args.batch_size),
  41. position=1,
  42. leave=True)
  43. # 将数据集设置为验证模式
  44. dataset.set_split('val')
  45. # 创建一个描述为'split=val'的进度条,总长度为验证集的批次数
  46. val_bar = tqdm_notebook(desc='split=val',
  47. total=dataset.get_num_batches(args.batch_size),
  48. position=1,
  49. leave=True)
  50. # 尝试执行以下训练循环
  51. try:
  52. for epoch_index in range(args.num_epochs):
  53. # 更新训练状态字典中的epoch_index
  54. train_state['epoch_index'] = epoch_index
  55. # 开始迭代训练数据集
  56. # 准备工作:生成批次,初始化损失和准确率为0,设置模型为训练模式
  57. dataset.set_split('train')
  58. batch_generator = generate_batches(dataset,
  59. batch_size=args.batch_size,
  60. device=args.device)
  61. running_loss = 0.0
  62. running_acc = 0.0
  63. classifier.train()
  64. # 遍历训练数据集的每个批次
  65. for batch_index, batch_dict in enumerate(batch_generator):
  66. # 训练循环分为以下5个步骤:
  67. # --------------------------------------
  68. # 步骤1:清零梯度
  69. optimizer.zero_grad()
  70. # 步骤2:计算模型输出
  71. y_pred = classifier(batch_dict['x_surname'])
  72. # 步骤3:计算损失
  73. loss = loss_func(y_pred, batch_dict['y_nationality'])
  74. loss_t = loss.item()
  75. # 更新运行时的平均损失
  76. running_loss += (loss_t - running_loss) / (batch_index + 1)
  77. # 步骤4:使用损失反向传播计算梯度
  78. loss.backward()
  79. # 步骤5:使用优化器更新权重
  80. optimizer.step()
  81. # -----------------------------------------
  82. # 计算准确率
  83. acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
  84. # 更新运行时的平均准确率
  85. running_acc += (acc_t - running_acc) / (batch_index + 1)
  86. # 更新进度条信息
  87. train_bar.set_postfix(loss=running_loss, acc=running_acc,
  88. epoch=epoch_index)
  89. train_bar.update()
  90. # 将当前epoch的训练损失和准确率添加到训练状态字典中
  91. train_state['train_loss'].append(running_loss)
  92. train_state['train_acc'].append(running_acc)
  93. # 开始迭代验证数据集
  94. # 准备工作:生成批次,初始化损失和准确率为0,设置模型为评估模式
  95. dataset.set_split('val')
  96. batch_generator = generate_batches(dataset,
  97. batch_size=args.batch_size,
  98. device=args.device)
  99. running_loss = 0.
  100. running_acc = 0.
  101. classifier.eval()
  102. # 遍历验证数据集的每个批次
  103. for batch_index, batch_dict in enumerate(batch_generator):
  104. # 计算模型输出
  105. y_pred = classifier(batch_dict['x_surname'])
  106. # 计算损失
  107. loss = loss_func(y_pred, batch_dict['y_nationality'])
  108. loss_t = loss.item()
  109. # 更新运行时的平均损失
  110. running_loss += (loss_t - running_loss) / (batch_index + 1)
  111. # 计算准确率
  112. acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
  113. # 更新运行时的平均准确率
  114. running_acc += (acc_t - running_acc) / (batch_index + 1)
  115. # 更新进度条信息
  116. val_bar.set_postfix(loss=running_loss, acc=running_acc,
  117. epoch=epoch_index)
  118. val_bar.update()
  119. # 将当前epoch的验证损失和准确率添加到训练状态字典中
  120. train_state['val_loss'].append(running_loss)
  121. train_state['val_acc'].append(running_acc)
  122. # 更新训练状态字典,检查是否需要提前终止训练
  123. train_state = update_train_state(args=args, model=classifier,
  124. train_state=train_state)
  125. # 根据最新的验证损失更新学习率
  126. scheduler.step(train_state['val_loss'][-1])
  127. # 如果训练状态表明需要提前终止训练,则跳出循环
  128. if train_state['stop_early']:
  129. break
  130. # 重置进度条的位置
  131. train_bar.n = 0
  132. val_bar.n = 0
  133. epoch_bar.update()
  134. except KeyboardInterrupt:
  135. print("Exiting loop")


  1. # 加载之前保存的最佳模型参数
  2. classifier.load_state_dict(torch.load(train_state['model_filename']))
  3. # 将模型移动到指定的设备(如GPU)上
  4. classifier = classifier.to(args.device)
  5. # 将数据集的类别权重也移动到相同的设备上,这通常用于处理类别不平衡问题
  6. dataset.class_weights = dataset.class_weights.to(args.device)
  7. # 定义交叉熵损失函数,并将它也移动到相应的设备上
  8. loss_func = nn.CrossEntropyLoss(dataset.class_weights)
  9. # 设置数据集为测试模式
  10. dataset.set_split('test')
  11. # 创建一个生成器,用于产生测试数据集的批次
  12. batch_generator = generate_batches(dataset,
  13. batch_size=args.batch_size,
  14. device=args.device)
  15. # 初始化测试损失和准确率为0
  16. running_loss = 0.
  17. running_acc = 0.
  18. # 将模型设置为评估模式
  19. classifier.eval()
  20. # 遍历测试数据集的每一个批次
  21. for batch_index, batch_dict in enumerate(batch_generator):
  22. # 使用模型预测当前批次的输出
  23. y_pred = classifier(batch_dict['x_surname'])
  24. # 计算损失
  25. loss = loss_func(y_pred, batch_dict['y_nationality'])
  26. loss_t = loss.item()
  27. # 更新运行时的平均损失
  28. running_loss += (loss_t - running_loss) / (batch_index + 1)
  29. # 计算准确率
  30. acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
  31. # 更新运行时的平均准确率
  32. running_acc += (acc_t - running_acc) / (batch_index + 1)
  33. # 将测试集上的最终损失和准确率记录在训练状态字典中
  34. train_state['test_loss'] = running_loss
  35. train_state['test_acc'] = running_acc
  36. print("Test loss: {};".format(train_state['test_loss']))
  37. print("Test Accuracy: {}".format(train_state['test_acc']))


  1. def predict_nationality(surname, classifier, vectorizer):
  2. """Predict the nationality from a new surname
  3. Args:
  4. surname (str): the surname to classifier
  5. classifier (SurnameClassifer): an instance of the classifier
  6. vectorizer (SurnameVectorizer): the corresponding vectorizer
  7. Returns:
  8. a dictionary with the most likely nationality and its probability
  9. """
  10. vectorized_surname = vectorizer.vectorize(surname)
  11. vectorized_surname = torch.tensor(vectorized_surname).unsqueeze(0)
  12. result = classifier(vectorized_surname, apply_softmax=True)
  13. probability_values, indices = result.max(dim=1)
  14. index = indices.item()
  15. predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)
  16. probability_value = probability_values.item()
  17. return {'nationality': predicted_nationality, 'probability': probability_value}
  18. new_surname = input("Enter a surname to classify: ")
  19. classifier = classifier.cpu()
  20. prediction = predict_nationality(new_surname, classifier, vectorizer)
  21. print("{} -> {} (p={:0.2f})".format(new_surname,
  22. prediction['nationality'],
  23. prediction['probability']))


  1. def predict_topk_nationality(surname, classifier, vectorizer, k=5):
  2. """Predict the top K nationalities from a new surname
  3. Args:
  4. surname (str): the surname to classifier
  5. classifier (SurnameClassifer): an instance of the classifier
  6. vectorizer (SurnameVectorizer): the corresponding vectorizer
  7. k (int): the number of top nationalities to return
  8. Returns:
  9. list of dictionaries, each dictionary is a nationality and a probability
  10. """
  11. vectorized_surname = vectorizer.vectorize(surname)
  12. vectorized_surname = torch.tensor(vectorized_surname).unsqueeze(dim=0)
  13. prediction_vector = classifier(vectorized_surname, apply_softmax=True)
  14. probability_values, indices = torch.topk(prediction_vector, k=k)
  15. # returned size is 1,k
  16. probability_values = probability_values[0].detach().numpy()
  17. indices = indices[0].detach().numpy()
  18. results = []
  19. for kth_index in range(k):
  20. nationality = vectorizer.nationality_vocab.lookup_index(indices[kth_index])
  21. probability_value = probability_values[kth_index]
  22. results.append({'nationality': nationality,
  23. 'probability': probability_value})
  24. return results
  25. new_surname = input("Enter a surname to classify: ")
  26. k = int(input("How many of the top predictions to see? "))
  27. if k > len(vectorizer.nationality_vocab):
  28. print("Sorry! That's more than the # of nationalities we have.. defaulting you to max size :)")
  29. k = len(vectorizer.nationality_vocab)
  30. predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)
  31. print("Top {} predictions:".format(k))
  32. print("===================")
  33. for prediction in predictions:
  34. print("{} -> {} (p={:0.2f})".format(new_surname,
  35. prediction['nationality'],
  36. prediction['probability']))

