In natural language processing, encoding the text is an essential step. The purpose of text encoding is to convert text data into a numerical representation that a computer can understand and process.
The following are several commonly used encoding methods and the main content of this article: one-hot encoding, word2vec, and BERT.
One-hot is a word-embedding scheme and a fairly simple one: every word (or character) is represented as a vector that is 1 at the position corresponding to that word and 0 everywhere else (a short sketch follows the list below). The advantages and disadvantages of this encoding are as follows:
Advantages: simple to understand and implement, and requires no training.
Disadvantages: the vectors are as long as the vocabulary and extremely sparse, and all words are equally distant from one another, so no semantic similarity between words is captured.
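For concreteness, here is a minimal one-hot sketch in plain Python; the toy vocabulary and the helper names (word2idx, one_hot) are invented for illustration:

```python
# Toy vocabulary and a one-hot encoder for it
vocab = ["i", "like", "natural", "language", "processing"]
word2idx = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    vec = [0] * len(vocab)      # one slot per vocabulary word, all zeros
    vec[word2idx[word]] = 1     # set the slot of this word to 1
    return vec

print(one_hot("language"))      # [0, 0, 0, 1, 0]
```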
word2vec is another word-embedding approach and has two training schemes, CBOW and Skip-gram. A PyTorch implementation of the CBOW variant is shown below:
```python
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Dataset: builds (center word, context words) pairs from a tokenized corpus
class Word2VecDataset(Dataset):
    def __init__(self, corpus, window_size):
        self.window_size = window_size
        # Map each word to an integer index so it can be fed to nn.Embedding
        self.word2idx = {w: i for i, w in enumerate(sorted({w for s in corpus for w in s}))}
        self.data = self.generate_data(corpus)

    def generate_data(self, corpus):
        data = []
        for sentence in corpus:
            ids = [self.word2idx[w] for w in sentence]
            # Keep only positions with a full context window so every sample has the same shape
            for i in range(self.window_size, len(ids) - self.window_size):
                center_word = ids[i]
                context_words = ids[i - self.window_size:i] + ids[i + 1:i + self.window_size + 1]
                data.append((center_word, context_words))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        center_word, context_words = self.data[index]
        return torch.tensor(center_word), torch.tensor(context_words)

# Model -- CBOW: predict the center word from the averaged context embeddings
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(embed_size, vocab_size)

    def forward(self, context_words):
        embedded = self.embeddings(context_words)   # (batch, 2 * window_size, embed_size)
        hidden = torch.mean(embedded, dim=1)        # average the context embeddings
        output = self.linear1(hidden)               # (batch, vocab_size) logits for the center word
        return output, embedded

# Training function
def train(model, dataset, batch_size, num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        total_loss = 0
        for center_word, context_words in dataloader:
            center_word = center_word.to(device)
            context_words = context_words.to(device)

            optimizer.zero_grad()
            output, _ = model(context_words)
            loss = criterion(output, center_word)   # CBOW objective: context -> center word
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

# Prepare the data and train
corpus = [...]  # raw corpus: each element is one sentence given as a list of words
window_size = 2          # context window size
embedding_size = 100     # embedding dimension
batch_size = 64
num_epochs = 10
learning_rate = 0.01

dataset = Word2VecDataset(corpus, window_size)
vocab_size = len(dataset.word2idx)
model = Word2Vec(vocab_size, embedding_size)
train(model, dataset, batch_size, num_epochs, learning_rate)
```
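After training, the learned word vectors can be read straight from the embedding layer. A brief sketch (the lookup of the word "language" is illustrative and assumes that word occurs in the corpus):

```python
# The learned word vectors are the rows of the embedding layer's weight matrix
word_vectors = model.embeddings.weight.detach().cpu()    # shape: (vocab_size, embedding_size)
vec = word_vectors[dataset.word2idx["language"]]          # vector of one word, shape: (embedding_size,)
print(vec.shape)
```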
The Skip-gram variant only redefines the model and the training loop. It expects a dataset that yields (target_word, context_word) index pairs, one pair per context position (a pair-generation sketch follows the explanation below):

```python
# Model -- Skip-gram: predict a context word from the target (center) word
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(embed_size, vocab_size)

    def forward(self, target_word):
        embedded = self.embeddings(target_word)   # (batch, embed_size)
        output = self.linear1(embedded)           # (batch, vocab_size) logits for a context word
        return output, embedded

# Training function: the dataset is expected to yield (target_word, context_word) index pairs
def train(model, dataset, batch_size, num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        total_loss = 0
        for target_word, context_word in dataloader:
            target_word = target_word.to(device)
            context_word = context_word.to(device)

            optimizer.zero_grad()
            output, _ = model(target_word)
            loss = criterion(output, context_word.flatten())  # Skip-gram objective: target -> context word
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")
```
Compared with the CBOW model, the Skip-gram model differs in that its input is the target word (target_word) rather than the context words. In the model's forward method, the target word's embedding is simply passed to the linear layer (self.linear1) to produce the output. The rest of the training procedure is similar to that of the CBOW model.
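Since the Skip-gram code above does not define its own dataset, here is a minimal sketch of how (target, context) index pairs could be generated from one sentence; the function name generate_skipgram_pairs is invented for illustration:

```python
def generate_skipgram_pairs(ids, window_size):
    # ids: one sentence as a list of word indices
    pairs = []
    for i, target in enumerate(ids):
        for j in range(max(i - window_size, 0), min(i + window_size + 1, len(ids))):
            if j != i:
                pairs.append((target, ids[j]))  # one (target, context) pair per context word
    return pairs

print(generate_skipgram_pairs([0, 1, 2, 3], window_size=1))
# [(0, 1), (1, 0), (1, 2), (2, 1), (2, 3), (3, 2)]
```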
BERT-based encoding relies on the pretrained models provided on the Hugging Face hub: depending on the type of text and the task at hand, you download a suitable model from the hub and call it through PyTorch to encode your data. In the example below, we first load the pretrained BERT model and its tokenizer (bert-base-uncased). We then tokenize the input text and convert the tokens to integer IDs, feed them to the pretrained BERT model to obtain the embedded representation, and finally print the embeddings of the input text:
```python
import torch
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Input text
text = "Hello, how are you?"

# Tokenize the text into a sequence of tokens
tokens = tokenizer.tokenize(text)
tokens = ['[CLS]'] + tokens + ['[SEP]']  # add the start and end markers

# Convert the tokens to their integer IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids).unsqueeze(0)  # add a batch dimension

# Run BERT to obtain the encodings
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

# embeddings is the embedded representation of the input text and can be used in downstream tasks

# Print the embeddings of the input text
print(embeddings)
```
The output is a tensor of shape (1, sequence_length, 768): one 768-dimensional vector per token, where 768 is the hidden size of bert-base-uncased.
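As a sketch of how these token embeddings might be turned into a single sentence vector for downstream use (an addition to the original example, not part of it), two common choices are the [CLS] token embedding and mean pooling:

```python
# Sentence-level representations derived from the token embeddings above
cls_vector = embeddings[:, 0, :]        # embedding of the [CLS] token, shape (1, 768)
mean_vector = embeddings.mean(dim=1)    # mean over all tokens, shape (1, 768)
print(cls_vector.shape, mean_vector.shape)
```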