BERT(Bidirectional Encoder Representations from Transformers)模型是一种基于Transformer架构的深度学习模型,主要用于自然语言处理任务。以下是对BERT模型的系统解释:
双向编码器(Bidirectional Encoder):BERT由多层Transformer编码器堆叠而成,在编码每个词时会同时利用其左侧和右侧的上下文,而不是只按单一方向读取文本。
为了使用BERT进行文本分类,我们可以使用Hugging Face的transformers库。以下是一个简单的例子,展示如何加载预训练的BERT模型,在带标签的文本上进行微调,并在测试集上评估分类准确率:
- from transformers import BertTokenizer, BertForSequenceClassification, AdamW
- from torch.utils.data import DataLoader, Dataset
- from sklearn.model_selection import train_test_split
- import torch
- import torch.nn as nn
- from tqdm import tqdm
# Sample data: a tiny toy corpus of sentences paired with sentiment labels.
texts = [
    "This is a positive sentence.",
    "This is a negative sentence.",
    "Another positive example.",
]
labels = [1, 0, 1]  # 1 represents positive, 0 represents negative

# Split into training and test sets. A fixed random_state is passed so the
# same split is requested on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
# Custom dataset that tokenizes raw text on demand.
class CustomDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Items are tokenized lazily in ``__getitem__``; nothing is pre-computed
    in the constructor.

    Args:
        texts: Sized, indexable collection of input texts.
        labels: Sized, indexable collection of labels aligned with ``texts``.
        tokenizer: Object exposing an ``encode_plus`` method that accepts the
            keyword arguments used below (e.g. a Hugging Face tokenizer).
        max_length: Value forwarded to the tokenizer as ``max_length`` for
            padding and truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early: a mismatch would otherwise only show up as an
        # IndexError deep inside a DataLoader worker, or go unnoticed.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict.

        Returns:
            A dict with keys ``'text'`` (the raw string), ``'input_ids'``,
            ``'attention_mask'`` (both flattened tensors from the tokenizer)
            and ``'label'`` (a scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # flatten() collapses the tokenizer output to 1-D; with
            # return_tensors='pt' it is expected to carry a leading batch
            # dimension of 1 (verify against the tokenizer in use).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }
# Define the BERT classification model.
class BERTForTextClassification(nn.Module):
    """Wrapper around ``BertForSequenceClassification`` that returns logits.

    Args:
        num_classes: Passed to ``from_pretrained`` as ``num_labels``.
        model_name: Checkpoint name or path handed to ``from_pretrained``.
            Defaults to ``'bert-base-uncased'``.
    """

    def __init__(self, num_classes=2, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_classes
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only its ``logits`` attribute."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.logits
def fine_tune_bert(train_loader, test_loader, num_epochs=3, learning_rate=2e-5):
    """Fine-tune a two-class BERT classifier and print its test accuracy.

    Trains a fresh ``BERTForTextClassification`` with cross-entropy loss,
    printing the mean batch loss after every epoch, then evaluates once on
    ``test_loader`` and prints the accuracy.

    Args:
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        test_loader: Iterable of batches in the same format.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the AdamW optimizer.

    Returns:
        The fine-tuned model, left in eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTForTextClassification(num_classes=2).to(device)
    # Use PyTorch's AdamW: the AdamW class shipped with transformers is
    # deprecated upstream in favour of this one. eps and weight_decay are set
    # explicitly to mirror the defaults of the optimizer being replaced.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        # Count batches while iterating so loaders without __len__ also work.
        num_batches = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader yields NaN instead of a ZeroDivisionError.
        average_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

    # Evaluate on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty test set yields NaN instead of a ZeroDivisionError.
    accuracy = correct / total if total else float("nan")
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return model
# Build the tokenizer and the training / test datasets.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 32
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_length)

# Wrap the datasets in DataLoaders; only the training data is shuffled.
batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Run fine-tuning.
fine_tune_bert(train_loader, test_loader)

