Artificial Intelligence (AI): Keras / PyTorch / MXNet / TensorFlow / PaddlePaddle deep learning in practice (updated from time to time)
Overall architecture and tooling of the online chat system: Flask web service, Redis, Gunicorn server component, Supervisor process monitor, Neo4j graph database
Installing Neo4j on Linux, installing Redis on Linux, installing Supervisor
Online part: werobot service, main logic service, sentence-relatedness model service, BERT Chinese pre-trained model + fine-tuned model (goal: decide whether two sentences text1 and text2 are related), model deployment with Flask
Offline part + online part: RNN model for the named-entity review task, BiLSTM+CRF model for the named-entity recognition task, BERT Chinese pre-trained + fine-tuned model, werobot service + Flask
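As a rough illustration of the "model deployment with Flask" point above, the sketch below shows one way a fine-tuned sentence-relatedness model could be wrapped in a small HTTP service. It is a minimal sketch and not the project's actual service code: the route name, the text1/text2 form fields and the predict_relation() stub are assumptions made for the example.
- from flask import Flask, request, jsonify
-
- app = Flask(__name__)
-
- def predict_relation(text1, text2):
-     # Placeholder: a real service would run the fine-tuned BERT model here and return 0 or 1.
-     return 1 if text1 and text2 else 0
-
- @app.route("/v1/recognize", methods=["POST"])
- def recognize():
-     text1 = request.form.get("text1", "")
-     text2 = request.form.get("text2", "")
-     return jsonify({"related": predict_relation(text1, text2)})
-
- if __name__ == "__main__":
-     # In production this app would typically be served by Gunicorn and monitored by Supervisor.
-     app.run(host="0.0.0.0", port=5000)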
- import torch
- import torch.nn as nn
-
- # 导入bert的模型
- model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-chinese')
-
- # 导入字符映射器
- tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-chinese')
-
-
- def get_bert_encode_for_single(text):
- """
- 功能: 使用bert-chinese预训练模型对中文文本进行编码
- text: 要进行编码的中文文本
- return : 编码后的张量
- """
-
- # 首先使用字符映射器对每个汉字进行映射
- # bert的tokenizer映射后会加入开始和结束标记101和102, 这两个标记我们不需要, 采用切片的方式去除
- indexed_tokens = tokenizer.encode(text)[1:-1]
-
- # 封装成tensor张量
- tokens_tensor = torch.tensor([indexed_tokens])
- # print(tokens_tensor)
-
- # 预测部分需要使得模型不自动求导
- with torch.no_grad():
- encoded_layers, _ = model(tokens_tensor)
- # 注: 旧版pytorch-transformers在此返回元组; 若使用新版transformers, 需写成 model(tokens_tensor, return_dict=False) 才能按元组解包
-
- # print(encoded_layers.shape)
- # 模型的输出都是三维张量,第一维是1,使用[0]来进行降维,只提取我们需要的后两个维度的张量
- encoded_layers = encoded_layers[0]
- return encoded_layers
-
-
- if __name__ == '__main__':
- text = "你好,周杰伦"
- outputs = get_bert_encode_for_single(text)
- # print(outputs)
- # print(outputs.shape)
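- # Note (illustrative): bert-base-chinese has hidden size 768, so for an n-character input,
- # outputs.shape here should be [n, 768] after the [CLS]/[SEP] positions are sliced off.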
-
- import torch
- import torch.nn as nn
-
-
- class RNN(nn.Module):
- def __init__(self, input_size, hidden_size, output_size):
- # input_size: 输入张量最后一个维度的大小
- # hidden_size: 隐藏层张量最后一个维度的大小
- # output_size: 输出张量最后一个维度的大小
- super(RNN, self).__init__()
-
- # 将隐藏层的大小写成类的内部变量
- self.hidden_size = hidden_size
-
- # 构建第一个线性层, 输入尺寸是input_size + hidden_size,因为真正进入全连接层的张量是X(t) + h(t-1)
- # 输出尺寸是hidden_size
- self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
-
- # 构建第二个线性层, 输入尺寸是input_size + hidden_size
- # 输出尺寸是output_size
- self.i2o = nn.Linear(input_size + hidden_size, output_size)
-
- # 定义最终输出的softmax处理层
- self.softmax = nn.LogSoftmax(dim=-1)
-
- def forward(self, input1, hidden1):
- # 首先要进行输入张量的拼接, 将X(t)和h(t-1)拼接在一起
- combined = torch.cat((input1, hidden1), 1)
-
- # 让输入经过隐藏层获得hidden
- hidden = self.i2h(combined)
-
- # 让输入经过输出层获得output
- output = self.i2o(combined)
-
- # 让output经过softmax层
- output = self.softmax(output)
-
- # 返回两个张量,output, hidden
- return output, hidden
-
- def initHidden(self):
- # 将隐藏层初始化为一个[1, hidden_size]的全0张量
- return torch.zeros(1, self.hidden_size)
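- # Illustrative only: a single step of this RNN cell, with sizes that mirror the later training
- # settings (768-dim BERT character vectors, hidden size 128, 2 output classes); the random
- # input is just a stand-in for one encoded character.
- if __name__ == '__main__':
-     rnn = RNN(input_size=768, hidden_size=128, output_size=2)
-     hidden = rnn.initHidden()
-     input1 = torch.randn(1, 768)
-     output, hidden = rnn(input1, hidden)
-     print(output.shape, hidden.shape)  # expected: torch.Size([1, 2]) torch.Size([1, 128])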
-
- import pandas as pd
- from collections import Counter
- import random
- from bert_chinese_encode import get_bert_encode_for_single
- import torch
- import torch.nn as nn
- import math
- import time
- import matplotlib.pyplot as plt
-
-
- # 读取数据
- train_data_path = './train_data.csv'
- train_data = pd.read_csv(train_data_path, header=None, sep='\t')
-
- # 打印一下正负标签比例
- # print(dict(Counter(train_data[0].values)))
-
- # 打印若干数据展示一下
- train_data = train_data.values.tolist()
- # print(train_data[:10])
-
-
- def randomTrainingExample(train_data):
- # 随机选取数据, train_data是训练集的列表形式的数据
- category, line = random.choice(train_data)
-
- # 首先将文字部分利用bert进行编码
- line_tensor = get_bert_encode_for_single(line)
-
- # 将分类标签封装成tensor
- category_tensor = torch.tensor([int(category)])
-
- # 依次将读取出来的原始数据,以及封装后的tensor返回
- return category, line, category_tensor, line_tensor
-
-
- # for i in range(10):
- # category, line, category_tensor, line_tensor = randomTrainingExample(train_data)
- # print('category = ', category, ' / line = ', line)
-
-
- # 编写RNN类的代码
- class RNN(nn.Module):
- def __init__(self, input_size, hidden_size, output_size):
- # input_size: 输入张量最后一个维度的大小
- # hidden_size: 隐藏层张量最后一个维度的大小
- # output_size: 输出张量最后一个维度的大小
- super(RNN, self).__init__()
-
- # 将隐藏层的大小写成类的内部变量
- self.hidden_size = hidden_size
-
- # 构建第一个线性层, 输入尺寸是input_size + hidden_size,因为真正进入全连接层的张量是X(t) + h(t-1)
- # 输出尺寸是hidden_size
- self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
-
- # 构建第二个线性层, 输入尺寸是input_size + hidden_size
- # 输出尺寸是output_size
- self.i2o = nn.Linear(input_size + hidden_size, output_size)
-
- # 定义最终输出的softmax处理层
- self.softmax = nn.LogSoftmax(dim=-1)
-
- def forward(self, input1, hidden1):
- # 首先要进行输入张量的拼接, 将X(t)和h(t-1)拼接在一起
- combined = torch.cat((input1, hidden1), 1)
-
- # 让输入经过隐藏层获得hidden
- hidden = self.i2h(combined)
-
- # 让输入经过输出层获得output
- output = self.i2o(combined)
-
- # 让output经过softmax层
- output = self.softmax(output)
-
- # 返回两个张量,output, hidden
- return output, hidden
-
- def initHidden(self):
- # 将隐藏层初始化为一个[1, hidden_size]的全0张量
- return torch.zeros(1, self.hidden_size)
-
-
- # 选取损失函数为nn.NLLLoss()
- criterion = nn.NLLLoss()
-
- hidden_size = 128
- # 预训练模型bert输出的维度
- input_size = 768
- n_categories = 2
- rnn = RNN(input_size, hidden_size, n_categories)
-
- # 把学习率设定为0.005
- learning_rate = 0.005
-
- def train(category_tensor, line_tensor):
- # category_tensor: 代表类别的张量, line_tensor: 代表经过bert编码后的文本张量
- # 初始化隐藏层
- hidden = rnn.initHidden()
-
- # 训练前一定要将梯度归零
- rnn.zero_grad()
-
- # 遍历line_tensor中的每一个字符的张量
- for i in range(line_tensor.size()[0]):
- # 传入rnn中的参数必须是二维张量,如果不是,需要扩展维度 unsqueeze(0)
- output, hidden = rnn(line_tensor[i].unsqueeze(0), hidden)
-
- # 调用损失函数, 输入分别是rnn预测的结果和真实的类别标签
- loss = criterion(output, category_tensor)
-
- # 开启反向传播
- loss.backward()
-
- # 显式地更新模型中的所有参数
- for p in rnn.parameters():
- # 利用梯度下降法更新: 将参数的梯度乘以负的学习率, 再累加到参数上
- p.data.add_(p.grad.data, alpha=-learning_rate)
-
-
- return output, loss.item()
-
-
- def valid(category_tensor, line_tensor):
- # category_tensor: 类别标签的张量, line_tensor: 经过了bert编码后的文本张量
- # 初始化隐藏层
- hidden = rnn.initHidden()
-
- # 注意: 验证函数中要保证模型不自动求导
- with torch.no_grad():
- # 遍历文本张量中的每一个字符的bert编码
- for i in range(line_tensor.size()[0]):
- # 注意: 输入rnn的参数必须是二维张量,如果不足,利用unsqueeze()来进行扩展
- output, hidden = rnn(line_tensor[i].unsqueeze(0), hidden)
-
- loss = criterion(output, category_tensor)
-
- return output, loss.item()
-
-
- def timeSince(since):
- # 功能:获取每次打印的时间消耗, since是训练开始的时间
- # 获取当前的时间
- now = time.time()
-
- # 获取时间差, 就是时间消耗
- s = now - since
-
- # 获取时间差的分钟数
- m = math.floor(s/60)
-
- # 获取时间差的秒数
- s -= m*60
-
- return '%dm %ds' % (m, s)
-
-
- # 设置训练的迭代次数
- n_iters = 1000
-
- # 设置打印间隔为100
- plot_every = 100
-
- # 初始化训练和验证的损失,准确率
- train_current_loss = 0
- train_current_acc = 0
- valid_current_loss = 0
- valid_current_acc = 0
-
- # 为后续的画图做准备,存储每次打印间隔之间的平均损失和平均准确率
- all_train_loss = []
- all_train_acc = []
- all_valid_loss = []
- all_valid_acc = []
-
- # 获取整个训练的开始时间
- start = time.time()
-
- # 进入主循环,遍历n_iters次
- for iter in range(1, n_iters + 1):
- # 分别调用两次随机获取数据的函数,分别获取训练数据和验证数据
- category, line, category_tensor, line_tensor = randomTrainingExample(train_data)
- category_, line_, category_tensor_, line_tensor_ = randomTrainingExample(train_data)
-
- # 分别调用训练函数,和验证函数,得到输出和损失
- train_output, train_loss = train(category_tensor, line_tensor)
- valid_output, valid_loss = valid(category_tensor_, line_tensor_)
-
- # 累加训练的损失,训练的准确率,验证的损失,验证的准确率
- train_current_loss += train_loss
- train_current_acc += (train_output.argmax(1) == category_tensor).sum().item()
- valid_current_loss += valid_loss
- valid_current_acc += (valid_output.argmax(1) == category_tensor_).sum().item()
-
- # 每隔plot_every次数打印一下信息
- if iter % plot_every == 0:
- train_average_loss = train_current_loss / plot_every
- train_average_acc = train_current_acc / plot_every
- valid_average_loss = valid_current_loss / plot_every
- valid_average_acc = valid_current_acc / plot_every
-
- # 打印迭代次数,时间消耗,训练损失,训练准确率,验证损失,验证准确率
- print("Iter:", iter, "|", "TimeSince:", timeSince(start))
- print("Train Loss:", train_average_loss, "|", "Train Acc:", train_average_acc)
- print("Valid Loss:", valid_average_loss, "|", "Valid Acc:", valid_average_acc)
-
- # 将损失,准确率的结果保存起来,为后续的画图使用
- all_train_loss.append(train_average_loss)
- all_train_acc.append(train_average_acc)
- all_valid_loss.append(valid_average_loss)
- all_valid_acc.append(valid_average_acc)
-
- # 将每次打印间隔的训练损失,准确率,验证损失,准确率,归零操作
- train_current_loss = 0
- train_current_acc = 0
- valid_current_loss = 0
- valid_current_acc = 0
-
-
- plt.figure(0)
- plt.plot(all_train_loss, label="Train Loss")
- plt.plot(all_valid_loss, color="red", label="Valid Loss")
- plt.legend(loc="upper left")
- plt.savefig("./loss.png")
-
- plt.figure(1)
- plt.plot(all_train_acc, label="Train Acc")
- plt.plot(all_valid_acc, color="red", label="Valid Acc")
- plt.legend(loc="upper left")
- plt.savefig("./acc.png")
-
-
- # 模型的保存,首先给定保存的路径
- MODEL_PATH = './BERT_RNN.pth'
-
- torch.save(rnn.state_dict(), MODEL_PATH)
- # 导入若干包
- import os
- import torch
- import torch.nn as nn
-
- # 导入RNN类
- from RNN_MODEL import RNN
-
- # 导入bert预训练模型的编码函数
- from bert_chinese_encode import get_bert_encode_for_single
-
- # 设定预加载的模型路径
- MODEL_PATH = './BERT_RNN.pth'
-
- # 设定若干参数, 注意:这些参数一定要和训练的时候保持完全一致
- n_hidden = 128
- input_size = 768
- n_categories = 2
-
- # 实例化RNN模型,并加载保存的模型参数
- rnn = RNN(input_size, n_hidden, n_categories)
- rnn.load_state_dict(torch.load(MODEL_PATH))
-
-
- # 编写测试函数
- def _test(line_tensor):
- # 功能:本函数为预测函数服务,用于调用RNN模型并返回结果
- # line_tensor: 代表输入中文文本的张量标识
- # 初始化隐藏层
- hidden = rnn.initHidden()
-
- # 遍历输入文本中的每一个字符张量
- for i in range(line_tensor.size()[0]):
- output, hidden = rnn(line_tensor[i].unsqueeze(0), hidden)
-
- # 返回RNN模型的最终输出
- return output
-
-
- # 编写预测函数
- def predict(input_line):
- # 功能:完成模型的预测
- # input_line: 代表需要预测的中文文本信息
- # 注意: 所有的预测必须保证不自动求解梯度
- with torch.no_grad():
- # 将input_line使用bert模型进行编码,然后将张量传输给_test()函数
- output = _test(get_bert_encode_for_single(input_line))
-
- # 从output中取出最大值对应的索引,比较的维度是1
- _, topi = output.topk(1, 1)
- return topi.item()
-
-
- # 编写批量预测的函数
- def batch_predict(input_path, output_path):
- # 功能: 批量预测函数
- # input_path: 原始文本的输入路径(等待进行命名实体审核的文件)
- # output_path: 预测后的输出文件路径(经过命名实体审核通过的所有数据)
- csv_list = os.listdir(input_path)
-
- # 遍历每一个csv文件
- for csv in csv_list:
- # 要以读的方式打开每一个csv文件
- with open(os.path.join(input_path, csv), "r") as fr:
- # 要以写的方式打开输出路径下的同名csv文件
- with open(os.path.join(output_path, csv), "w") as fw:
- # 读取csv文件的每一行
- input_lines = fr.readlines()
- for input_line in input_lines:
- # 调用预测函数,利用RNN模型进行审核
- res = predict(input_line)
- if res:
- # 如果res==1, 说明通过了审核, 写入输出文件 (readlines()读出的行尾已带换行符, 无需再额外添加)
- fw.write(input_line)
- else:
- pass
-
-
-
- if __name__ == '__main__':
- # input_line = "点淤样尖针性发多"
- # result = predict(input_line)
- # print("result:", result)
- input_path = "/data/doctor_offline/structured/noreview/"
- output_path = "/data/doctor_offline/structured/reviewed/"
- batch_predict(input_path, output_path)
-
- import os
- import fileinput
- from neo4j import GraphDatabase
- from config import NEO4J_CONFIG
-
- driver = GraphDatabase.driver( **NEO4J_CONFIG)
-
- # 导入数据的函数
- def _load_data(path):
- """
- 功能:将path参数目录下的csv文件以指定的格式加载到内存中
- path: 经历了命名实体审核后,所有的疾病-症状的csv文件
- return: 返回疾病:症状的字典 {疾病1:[症状1,症状2,...],疾病2:[症状1,症状2,...]}
- """
-
- # 获得所有疾病对应的csv文件的列表
- disease_csv_list = os.listdir(path)
-
- # 将文件名的后缀.csv去除掉,获得所有疾病名称的列表
- disease_list = list(map(lambda x: x.split('.')[0], disease_csv_list))
-
- # 将每一种疾病对应的所有症状放在症状列表中
- symptom_list = []
- for disease_csv in disease_csv_list:
- # 将一个疾病文件中所有的症状提取到一个列表中
- symptom = list(map(lambda x: x.strip(), fileinput.FileInput(os.path.join(path, disease_csv))))
-
- # 过滤掉所有长度异常的症状名称
- symptom = list(filter(lambda x: 0<len(x)<100, symptom))
- symptom_list.append(symptom)
-
- return dict(zip(disease_list, symptom_list))
-
-
- # 写入图数据库的函数
- def write(path):
- """
- 功能: 将csv数据全部写入neo4j图数据库中
- path: 经历了命名实体审核后,所有的疾病-症状的csv文件
- """
-
- # 导入数据成为字典类型
- disease_symptom_dict = _load_data(path)
-
- # 开启一个会话,进行数据库的操作
- with driver.session() as session:
- for key, value in disease_symptom_dict.items():
- # 创建疾病名的节点
- cypher = "MERGE (a:Disease{name:%r}) RETURN a" %key
- session.run(cypher)
- # 循环处理症状名称的列表
- for v in value:
- # 创建症状的节点
- cypher = "MERGE (b:Symptom{name:%r}) RETURN b" %v
- session.run(cypher)
- # 创建疾病名-疾病症状之间的关系
- cypher = "MATCH (a:Disease{name:%r}) MATCH (b:Symptom{name:%r}) \
- WITH a,b MERGE (a)-[r:dis_to_sym]-(b)" %(key, v)
- session.run(cypher)
-
- # 创建Disease节点的索引 (注: 此为Neo4j 3.x的旧语法, 4.x及以上版本请使用 CREATE INDEX FOR (n:Disease) ON (n.name))
- cypher = "CREATE INDEX ON :Disease(name)"
- session.run(cypher)
- # 创建Symptom节点的索引
- cypher = "CREATE INDEX ON :Symptom(name)"
- session.run(cypher)
-
-
- if __name__ == '__main__':
- path = "./structured/reviewed/"
- write(path)
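- # Optional spot check (illustrative; assumes the write above succeeded and NEO4J_CONFIG is valid):
- with driver.session() as session:
-     result = session.run("MATCH (a:Disease)-[r:dis_to_sym]-(b:Symptom) "
-                          "RETURN a.name AS disease, b.name AS symptom LIMIT 5")
-     for record in result:
-         print(record["disease"], "->", record["symptom"])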
- # 导入包
- import numpy as np
- import torch
- import torch.utils.data as Data
-
-
- # 创建生成批量训练数据的函数
- def load_dataset(data_file, batch_size):
- '''
- data_file: 代表待处理的文件
- batch_size: 代表每一个批次样本的数量
- '''
- # 将train.npz文件带入到内存中
- data = np.load(data_file)
-
- # 分别提取data中的特征和标签
- x_data = data['x_data']
- y_data = data['y_data']
-
- # 将数据封装成Tensor张量
- x = torch.tensor(x_data, dtype=torch.long)
- y = torch.tensor(y_data, dtype=torch.long)
-
- # 将数据再次封装
- dataset = Data.TensorDataset(x, y)
-
- # 求解一下数据的总量
- total_length = len(dataset)
-
- # 确认一下将80%的数据作为训练集, 剩下的20%的数据作为测试集
- train_length = int(total_length * 0.8)
- validation_length = total_length - train_length
-
- # 利用Data.random_split()直接切分数据集, 按照80%, 20%的比例进行切分
- train_dataset, validation_dataset = Data.random_split(dataset=dataset, lengths=[train_length, validation_length])
-
- # 将训练数据集进行DataLoader封装
- # dataset: 代表训练数据集
- # batch_size: 代表一个批次样本的数量, 若数据集的总样本数无法被batch_size整除, 则最后一批数据的大小为余数,
- # 若设置另一个参数drop_last=True, 则自动忽略最后不能被整除的数量
- # shuffle: 是否将数据打乱后再抽取, 若设置为True, 代表每个批次的数据样本都是从数据集中随机抽取的
- # num_workers: 设置有多少子进程负责数据加载, 默认为0, 即数据将被加载到主进程中
- # drop_last: 是否把最后一个批次的数据(指那些无法被batch_size整除的余数数据)忽略掉
- train_loader = Data.DataLoader(dataset=train_dataset, batch_size=batch_size,
- shuffle=True, num_workers=2, drop_last=False)
-
- validation_loader = Data.DataLoader(dataset=validation_dataset, batch_size=batch_size,
- shuffle=True, num_workers=2, drop_last=False)
-
- # 将两个数据生成器封装成一个字典类型
- data_loaders = {'train': train_loader, 'validation': validation_loader}
-
- # 将两个数据集的长度也封装成一个字典类型
- data_size = {'train': train_length, 'validation': validation_length}
-
- return data_loaders, data_size
-
-
- # 批次的大小
- BATCH_SIZE = 32
-
- # 训练数据集的文件路径
- DATA_FILE = './data/total.npz'
-
- if __name__ == '__main__':
- data_loader, data_size = load_dataset(DATA_FILE, BATCH_SIZE)
- print('data_loader:', data_loader, '\ndata_size:', data_size)
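- # Illustrative peek at one batch (assumes total.npz was built with max_length=100 as below):
- x_batch, y_batch = next(iter(data_loader['train']))
- print(x_batch.shape, y_batch.shape)  # expected roughly: torch.Size([32, 100]) torch.Size([32, 100])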
-
- import json
- import numpy as np
-
- # 创建训练数据集, 从原始训练文件中将中文字符进行数字化编码, 同时也将标签进行数字化的编码
- def create_train_data(train_data_file, result_file, json_file, tag2id, max_length=100):
- '''
- train_data_file: 原始训练文件
- result_file: 处理后的结果文件
- json_file: 中文字符向id的映射表, 也是一个文件char_to_id.json
- tag2id: 标签向id的映射表, 提前已经定义好
- max_length: 单条语句的最大长度限制
- '''
- # 导入json格式的中文字符向id的映射表
- char2id = json.load(open(json_file, mode='r', encoding='utf-8'))
-
- char_data, tag_data = [], []
-
- # 打开原始训练文件
- with open(train_data_file, mode='r', encoding='utf-8') as f:
- # 初始化一条语句数字化编码后的列表
- char_ids = [0] * max_length
- tag_ids = [0] * max_length
- idx = 0
- # 遍历文件中的每一行
- for line in f.readlines():
- # char \t tag
- line = line.strip('\n').strip()
- # 如果不是空行, 并且当前语句的长度没有超过max_length,则进行字符到id的映射
- if line and len(line) > 0 and idx < max_length:
- ch, tag = line.split('\t')
- # 如果当前字符在映射表中,则直接映射为对应的id值
- if char2id.get(ch):
- char_ids[idx] = char2id[ch]
- # 否则直接用"UNK"的id值进行赋值, 代表的是未知的字符
- else:
- char_ids[idx] = char2id['UNK']
- # 将标签对应的id值进行数字化编码映射
- tag_ids[idx] = tag2id[tag]
- idx += 1
- # 如果是空行, 或者当前语句的长度超过了max_length
- else:
- # 如果当前语句的长度超过了max_length,直接将[0: max_length]的部分直接进行结果赋值
- if idx <= max_length:
- char_data.append(char_ids)
- tag_data.append(tag_ids)
- # 遇到空行, 说明当前一条完整的语句已经结束了, 需要将初始化列表进行清零操作, 为了下一个句子的迭代做准备
- char_ids = [0] * max_length
- tag_ids = [0] * max_length
- idx = 0
-
- # 将数字化编码后的数据封装成numpy的数组类型, 数字化编码采用int32
- x_data = np.array(char_data, dtype=np.int32)
- y_data = np.array(tag_data, dtype=np.int32)
-
- # 直接利用np.savez()将数据存储成.npz类型的文件
- np.savez(result_file, x_data=x_data, y_data=y_data)
- print("create_train_data Finished!".center(100, "-"))
-
- json_file = './data/char_to_id.json'
-
- # 参数2:标签码表对照字典
- tag2id = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4, "<START>": 5, "<STOP>": 6}
-
- # 参数3:训练数据文件路径
- train_data_file = './data/total.txt'
-
- # 参数4:创建的npz文件保路径(训练数据)
- result_file = './data/total.npz'
-
-
- if __name__ == '__main__':
- create_train_data(train_data_file, result_file, json_file, tag2id)
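- # Optional sanity check (illustrative; assumes total.npz was just written above):
- data = np.load(result_file)
- print(data['x_data'].shape, data['y_data'].shape)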
-
- import torch
- import torch.nn as nn
- import torch.optim as optim
-
-
- # 添加几个辅助函数, 为log_sum_exp()服务
- def to_scalar(var):
- # 返回一个python float类型的值
- return var.view(-1).data.tolist()[0]
-
-
- def argmax(vec):
- # 返回列维度上最大值的下标, 返回值是一个python整数
- _, idx = torch.max(vec, 1)
- return to_scalar(idx)
-
-
- def log_sum_exp(vec):
- # 求向量中的最大值
- max_score = vec[0, argmax(vec)]
- # 构造一个最大值的广播变量
- max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
- # 先减去最大值, 再求解log_sum_exp, 最终的返回值上再加上max_score
- return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
-
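- # A tiny numeric check of log_sum_exp (illustrative): it should agree with torch.logsumexp.
- # log(e^1 + e^2 + e^3) ≈ 3.4076
- # vec = torch.tensor([[1.0, 2.0, 3.0]])
- # print(log_sum_exp(vec))               # tensor(3.4076)
- # print(torch.logsumexp(vec, dim=1))    # tensor([3.4076])
-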
- # 函数sentence_map()完成中文文本信息的数字编码, 将中文语句变成数字化张量
- def sentence_map(sentence_list, char_to_id, max_length):
- # 首先对一个批次的所有语句按照句子的长短进行排序, 这个操作并非必须
- sentence_list.sort(key=lambda x: len(x), reverse=True)
- # 定义一个最终存储结果特征张量的空列表
- sentence_map_list = []
- # 循环遍历一个批次内所有的语句
- for sentence in sentence_list:
- # 采用列表生成式来完成中文字符到id值的映射
- sentence_id_list = [char_to_id[c] for c in sentence]
- # 长度不够max_length的部分用0填充
- padding_list = [0] * (max_length - len(sentence))
- # 将每一个语句扩充为相同长度的张量
- sentence_id_list.extend(padding_list)
- # 追加进最终存储结果的列表中
- sentence_map_list.append(sentence_id_list)
-
- # 返回一个二维的长整型张量, 每行对应一条语句的id序列
- return torch.tensor(sentence_map_list, dtype=torch.long)
-
-
- class BiLSTM_CRF(nn.Module):
- def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,
- num_layers, batch_size, sequence_length):
- '''
- vocab_size: 单词总数量
- tag_to_ix: 标签到id的映射字典
- embedding_dim: 词嵌入的维度
- hidden_dim: 隐藏层的维度
- num_layers: 堆叠的LSTM层数
- batch_size: 批次的大小
- sequence_length: 语句的最大长度
- '''
-
- # 继承函数的初始化
- super(BiLSTM_CRF, self).__init__()
- # 设置单词的总数量
- self.vocab_size = vocab_size
- # 设置标签到id的映射字典
- self.tag_to_ix = tag_to_ix
- # 设置标签的总数
- self.tagset_size = len(tag_to_ix)
- # 设置词嵌入的维度
- self.embedding_dim = embedding_dim
- # 设置隐藏层的维度
- self.hidden_dim = hidden_dim
- # 设置LSTM层数
- self.num_layers = num_layers
- # 设置批次的大小
- self.batch_size = batch_size
- # 设置语句的长度
- self.sequence_length = sequence_length
-
- # 构建词嵌入层, 两个参数分别单词总数量, 词嵌入维度
- self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
-
- # 构建双向LSTM层, 输入参数包括词嵌入维度, 隐藏层大小, LSTM层数, 是否双向标志
- self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=self.num_layers, bidirectional=True)
-
- # 构建全连线性层, 一端对接BiLSTM, 另一端对接输出层, 注意输出层维度是tagset_size
- self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
-
- # 初始化转移矩阵, 注意转移矩阵的维度[tagset_size, tagset_size]
- self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
-
- # 任何合法的句子不会转移到"START_TAG",设置为-10000
- # 任何合法的句子不会从"STOP_TAG"继续转移, 设置为-10000
- self.transitions.data[tag_to_ix["<START>"], :] = -10000
- self.transitions.data[:, tag_to_ix["<STOP>"]] = -10000
-
- # 初始化隐藏层, 利用类中的函数init_hidden()来完成
- self.hidden = self.init_hidden()
-
- def init_hidden(self):
- # 为了符合LSTM的要求, 返回h0, c0, 这两个张量拥有相同的shape
- # shape: [2 * num_layers, batch_size, hidden_dim // 2]
- return (torch.randn(2 * self.num_layers, self.batch_size, self.hidden_dim // 2),
- torch.randn(2 * self.num_layers, self.batch_size, self.hidden_dim //2))
-
- # 在类中将文本信息经过词嵌入层, BiLSTM层, 线性层的处理, 最终输出句子的张量
- def _get_lstm_features(self, sentence):
- self.hidden = self.init_hidden()
-
- # 让sentence经历词嵌入层
- embeds = self.word_embeds(sentence).view(self.sequence_length, self.batch_size, -1)
-
- # 将词嵌入层的输出, 进入BiLSTM层, LSTM输入的两个参数: 词嵌入后的张量, 随机初始化的隐藏层张量
- lstm_out, self.hidden = self.lstm(embeds, self.hidden)
-
- # 保证输出张量的形状:[sequence_length, batch_size, hidden_dim]
- lstm_out = lstm_out.view(self.sequence_length, self.batch_size, self.hidden_dim)
-
- # 最后经过线性层的处理, 得到最后输出张量的shape: [sequence_length, batch_size, tagset_size]
- lstm_feats = self.hidden2tag(lstm_out)
- return lstm_feats
-
-
- def _forward_alg(self, feats):
- # 初始化一个alphas张量, 代表前向计算的初始分数
- init_alphas = torch.full((1, self.tagset_size), -10000.)
- # 仅仅将"START_TAG"赋值为0, 代表着接下来的矩阵转移只能从START_TAG开始
- init_alphas[0][self.tag_to_ix["<START>"]] = 0
-
- # 将初始化的init_alphas赋值为前向计算变量, 为了后续在反向传播求导的时候可以自动更新参数
- forward_var = init_alphas
-
- # 输入进来的feats - shape:[20, 8, 7], 为了后续按句子为单位进行计算, 需要将batch_size放在第一个维度上
- feats = feats.transpose(1, 0)
-
- # 初始化一个最终的结果张量
- result = torch.zeros((1, self.batch_size))
- idx = 0
-
- # 遍历每一行文本, 总共循环batch_size次
- for feat_line in feats:
- # feats: [8, 20, 7], feat_line: [20, 7]
- # 遍历每一行, 每一个feat代表一个time_step
- for feat in feat_line:
- # 当前的time_step,初始化一个前向计算张量
- alphas_t = []
- # 每一个时间步, 遍历所有可能的转移标签, 进行累加计算
- for next_tag in range(self.tagset_size):
- # 构造发射分数的广播张量
- emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
-
- # 当前时间步, 转移到next_tag标签的转移分数
- trans_score = self.transitions[next_tag].view(1, -1)
-
- # 将前向计算矩阵, 发射矩阵, 转移矩阵累加
- next_tag_var = forward_var + trans_score + emit_score
-
- # 计算log_sum_exp()的值, 并添加进alphas_t列表中
- alphas_t.append(log_sum_exp(next_tag_var).view(1))
-
- # 将列表张量转换为二维张量
- forward_var = torch.cat(alphas_t).view(1, -1)
-
- # 添加最后一步转移到"STOP_TAG"的分数, 就完成了整条语句的分数计算
- terminal_var = forward_var + self.transitions[self.tag_to_ix["<STOP>"]]
-
- # 将terminal_var放进log_sum_exp()中进行计算, 得到一条样本语句最终的分数
- alpha = log_sum_exp(terminal_var)
- # 将得分添加进最终的结果列表中, 作为整个函数的返回结果
- result[0][idx] = alpha
- idx += 1
- return result
-
-
- def _score_sentence(self, feats, tags):
- '''
- feats: [20, 8, 7], 经历了_get_lstm_features()处理后的特征张量
- tags: [8, 20], 代表的是训练语句真实的标签矩阵
- '''
- # 初始化一个0值的tensor,为后续的累加做准备
- score = torch.zeros(1)
- # 在tags矩阵的第一列添加一列START_TAG对应的id
- temp = torch.full((self.batch_size, 1), self.tag_to_ix["<START>"], dtype=torch.long)
- tags = torch.cat((temp, tags), dim=1)
-
- # 将传入的feats形状转变为[batch_size, sequence_length, tagset_size]
- feats = feats.transpose(1, 0)
-
- # 初始化最终的结果分数张量, 每一个句子得到一个分数
- result = torch.zeros((1, self.batch_size))
- idx = 0
- # 遍历所有的语句特征向量
- for feat_line in feats:
- # 此处feat_line: [20, 7]
- # 遍历每一个时间步, 注意: 最重要的区别在于这里是在真实标签tags的指导下进行的转移矩阵和发射矩阵的累加分数求和
- for i, feat in enumerate(feat_line):
- score = score + self.transitions[tags[idx][i+1], tags[idx][i]] + feat[tags[idx][i+1]]
- # 遍历完当前语句所有的时间步之后, 最后添加上"STOP_TAG"的转移分数
- score = score + self.transitions[self.tag_to_ix["<STOP>"], tags[idx][-1]]
- # 将该条语句的最终得分添加进结果列表中
- result[0][idx] = score
- idx += 1
- score = torch.zeros(1)
- return result
-
-
- def _viterbi_decode(self, feats):
- # 根据传入的语句特征feats,推断出标签序列
- # 初始化一个最佳路径结果的存放列表
- result_best_path = []
- # 将输入的张量形状变为 [batch_size, sequence_length, tagset_size]
- feats = feats.transpose(1, 0)
-
- # 对批次中的每一个语句进行遍历, 每个语句产生一个最优的标注序列
- for feat_line in feats:
- backpointers = []
-
- # 初始化前向传播的张量, 同时设置START_TAG等于0, 约束了合法的序列只能从START_TAG开始
- init_vvars = torch.full((1, self.tagset_size), -10000.)
- init_vvars[0][self.tag_to_ix["<START>"]] = 0
-
- # 将初始化的变量赋值给forward_var, 在第i个time_step中, 张量forward_var保存的是第i-1个time_step的viterbi张量
- forward_var = init_vvars
-
- # 遍历从i=0, 到序列最后一个time_step, 每一个时间步
- for feat in feat_line:
- # 初始化保存当前time_step的回溯指针
- bptrs_t = []
- # 初始化保存当前time_step的viterbi变量
- viterbivars_t = []
-
- # 遍历所有可能的转移标签
- for next_tag in range(self.tagset_size):
- # next_tag_var[i]保存了tag_i在前一个time_step的viterbi变量
- # 通过前向传播张量forward_var加上从tag_i转移到next_tag的转移分数, 赋值给next_tag_var
- # 注意: 在这里不去加发射矩阵的分数, 因为发射矩阵分数一致, 不影响求最大值下标
- next_tag_var = forward_var + self.transitions[next_tag]
-
- # 将最大的标签所对应的id加入到当前time_step的回溯列表中
- best_tag_id = argmax(next_tag_var)
- bptrs_t.append(best_tag_id)
- viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
-
- # 此处再将发射矩阵的分数feat添加上来, 继续赋值给forward_var, 作为下一个time_step的前向传播变量
- forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
-
- # 将当前time_step的回溯指针添加进当前样本行的总体回溯指针中
- backpointers.append(bptrs_t)
-
- # 最后加上转移到STOP_TAG的分数
- terminal_var = forward_var + self.transitions[self.tag_to_ix["<STOP>"]]
- best_tag_id = argmax(terminal_var)
-
- # 根据回溯指针, 解码最佳路径
- best_path = [best_tag_id]
- # 从后向前回溯最佳路径
- for bptrs_t in reversed(backpointers):
- # 通过第i个time_step得到的最佳id, 找到第i-1个time_step的最佳id
- best_tag_id = bptrs_t[best_tag_id]
- best_path.append(best_tag_id)
-
- # 将START_TAG去除掉
- start = best_path.pop()
- # print(start)
- # 确认一下最佳路径的第一个标签是START_TAG
- # if start != self.tag_to_ix["<START>"]:
- # print(start)
- assert start == self.tag_to_ix["<START>"]
-
- # 因为是从后向前进行回溯, 所以在此对列表进行逆序操作得到从前向后的真实路径
- best_path.reverse()
- # 将当前这一行的样本结果添加到最终的结果列表中
- result_best_path.append(best_path)
-
- return result_best_path
-
-
- # 对数似然函数, 输入两个参数: 数字化编码后的张量, 和真实的标签
- # 注意: 这个函数是未来真实训练中要用到的损失函数, 虚拟化的forward()
- def neg_log_likelihood(self, sentence, tags):
- # 第一步先得到BiLSTM层的输出特征张量
- feats = self._get_lstm_features(sentence)
-
- # feats: [20, 8, 7] 代表一个批次8个样本, 每个样本长度20, 每个字符对应7个标签的得分
- # feats本质上就是发射矩阵
- # forward_score, 代表公式推导中损失函数loss的第一项
- forward_score = self._forward_alg(feats)
-
- # gold_score, 代表公式推导中损失函数loss的第二项
- gold_score = self._score_sentence(feats, tags)
-
- # 注意: 在这里,通过forward_score和gold_score的差值作为loss,进行梯度下降的优化求解训练模型
- # 按行求和的时候, 在torch.sum()函数中, 需要设置dim=1;同理, 如果要按列求和, 需要设置dim=0
- return torch.sum(forward_score - gold_score, dim=1)
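- # For reference, these two terms are the standard CRF negative log-likelihood
- # (general formulation, not specific to this code):
- #     loss = log Σ_y' exp(score(x, y')) - score(x, y_true)
- #          = forward_score             - gold_score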
-
-
- # 编写正式的forward()函数, 注意应用场景是在预测的时候, 模型训练的时候并没有用到forward()函数
- def forward(self, sentence):
- # 首先获取BiLSTM层的输出特征, 得到发射矩阵
- lstm_feats = self._get_lstm_features(sentence)
-
- # 通过维特比算法直接解码出最优路径
- result_sequence = self._viterbi_decode(lstm_feats)
- return result_sequence
-
-
-
- # 开始字符和结束字符
- START_TAG = "<START>"
- STOP_TAG = "<STOP>"
- # 标签和序号的对应码表
- tag_to_ix = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4, START_TAG: 5, STOP_TAG: 6}
- # 词嵌入的维度
- EMBEDDING_DIM = 200
- # 隐藏层神经元的数量
- HIDDEN_DIM = 100
- # 批次的大小
- BATCH_SIZE = 8
- """ 在仅运行当前文件进行测试时,设置SENTENCE_LENGTH为20 """
- # 设置最大语句限制长度 (单独运行本文件做测试时需设为20, 与下方tag_list的宽度一致; 正式训练时使用100)
- SENTENCE_LENGTH = 20
- # SENTENCE_LENGTH = 100
- # 默认神经网络的层数
- NUM_LAYERS = 1
- # 初始化的字符和序号的对应码表
- # char_to_id = {"双": 0, "肺": 1, "见": 2, "多": 3, "发": 4, "斑": 5, "片": 6,
- # "状": 7, "稍": 8, "高": 9, "密": 10, "度": 11, "影": 12, "。": 13}
-
- '''
- model = BiLSTM_CRF(vocab_size=len(char_to_id), tag_to_ix=tag_to_ix, embedding_dim=EMBEDDING_DIM,
- hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS, batch_size=BATCH_SIZE, sequence_length=SENTENCE_LENGTH)
- print(model)
- '''
-
- sentence_list = [
- "确诊弥漫大b细胞淋巴瘤1年",
- "反复咳嗽、咳痰40年,再发伴气促5天。",
- "生长发育迟缓9年。",
- "右侧小细胞肺癌第三次化疗入院",
- "反复气促、心悸10年,加重伴胸痛3天。",
- "反复胸闷、心悸、气促2多月,加重3天",
- "咳嗽、胸闷1月余, 加重1周",
- "右上肢无力3年, 加重伴肌肉萎缩半年"
- ]
-
-
- # 真实标签数据, 对应为tag_to_ix中的数字标签
- tag_list = [
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- ]
- # 将标签列表转为张量tags
- tags = torch.tensor(tag_list, dtype=torch.long)
-
-
- char_to_id = {"<PAD>": 0}
-
- """ 在仅运行当前文件进行测试时,设置SENTENCE_LENGTH为20 """
- if __name__ == '__main__':
- for sentence in sentence_list:
- for c in sentence:
- # 如果当前字符不在映射字典中, 追加进字典
- if c not in char_to_id:
- char_to_id[c] = len(char_to_id)
-
- # 首先利用char_to_id完成中文文本的数字化编码
- sentence_sequence = sentence_map(sentence_list, char_to_id, SENTENCE_LENGTH)
- # print("sentence_sequence:\n", sentence_sequence)
-
- # 构建类的实例, 去得到语句的特征张量
- model = BiLSTM_CRF(vocab_size=len(char_to_id), tag_to_ix=tag_to_ix, embedding_dim=EMBEDDING_DIM,
- hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS, batch_size=BATCH_SIZE,
- sequence_length=SENTENCE_LENGTH)
-
- # 调用类内部的_get_lstm_features()函数, 得到特征张量
- # sentence_features = model._get_lstm_features(sentence_sequence)
- # print("sentence_features:\n", sentence_features)
-
- # 定义优化器
- optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
-
- for epoch in range(1):
- model.zero_grad()
-
- # feats = model._get_lstm_features(sentence_sequence)
-
- # forward_score = model._forward_alg(feats)
- # print(forward_score)
-
- # gold_score = model._score_sentence(feats, tags)
- # print(gold_score)
-
- # result_tag = model._viterbi_decode(feats)
- # print(result_tag)
-
- loss = model.neg_log_likelihood(sentence_sequence, tags)
- print(loss)
-
- loss.backward()
- optimizer.step()
-
- result = model(sentence_sequence)
- print(result)
-
- import torch
- import torch.nn as nn
-
- # 评估模型的准确率, 召回率, F1等指标
- def evaluate(sentence_list, true_tag, predict_tag, id2char, id2tag):
- '''
- sentence_list: 文本向量化后的句子张量
- true_tag: 真实的标签
- predict_tag: 预测的标签
- id2char: id值到中文字符的映射表
- id2tag: id值到tag标签的映射表
- '''
- # 初始化真实的命名实体, 预测的命名实体, 接下来比较两者的异同来评估指标
- true_entities, true_entity = [], []
- predict_entities, predict_entity = [], []
-
- # 逐条的遍历批次中的所有语句
- for line_num, sentence in enumerate(sentence_list):
- # 遍历一条样本语句中的每一个字符编码(这里面都是数字化之后的编码)
- for char_num in range(len(sentence)):
- # 如果编码等于0, 表示PAD, 说明后续全部都是填充的0, 可以跳出当前for循环
- if sentence[char_num] == 0:
- break
-
- # 依次提取真实的语句字符, 真实的样本标签, 预测的样本标签
- char_text = id2char[sentence[char_num]]
- true_tag_type = id2tag[true_tag[line_num][char_num]]
- predict_tag_type = id2tag[predict_tag[line_num][char_num]]
-
- # 先对真实的标签进行命名实体的匹配
- # 如果第一个字符是"B", 表示一个实体的开始, 将"字符/标签"的格式添加进实体列表中
- if true_tag_type[0] == "B":
- true_entity = [char_text + "/" + true_tag_type]
- # 如果第一个字符是"I", 表示处于一个实体的中间
- # 如果真实的命名实体列表非空, 并且最后一个添加进去的标签类型和当前的标签类型一样, 则继续添加
- # 意思就是比如true_entity = ["中/B-Person", "国/I-Person"], 此时"人/I-Person"就可以进行添加
- elif true_tag_type[0] == "I" and len(true_entity) != 0 and true_entity[-1].split("/")[1][1:] == true_tag_type[1:]:
- true_entity.append(char_text + "/" + true_tag_type)
- # 如果第一个字符是"O", 并且true_entity非空, 表示一个命名实体已经匹配结束
- elif true_tag_type[0] == "O" and len(true_entity) != 0:
- true_entity.append(str(line_num) + "_" + str(char_num))
- # 将匹配结束的一个命名实体加入到最终的真实实体列表中
- true_entities.append(true_entity)
- # 清空true_entity,为了下一个命名实体的匹配做准备
- true_entity = []
- # 除了上述3种情况, 说明当前没有匹配出任何的实体, 则清空true_entity, 继续下一轮匹配
- else:
- true_entity = []
-
- # 对预测的标签进行命名实体的匹配
- # 如果第一个字符是"B", 表示一个实体的开始, 将"字符/标签"的格式添加进实体列表中
- if predict_tag_type[0] == "B":
- predict_entity = [char_text + "/" + predict_tag_type]
- # 如果第一个字符是"I", 表示处于一个实体的中间
- # 如果预测命名实体列表非空, 并且最后一个添加进去的标签类型和当前的标签类型一样, 则继续添加
- elif predict_tag_type[0] == "I" and len(predict_entity) != 0 and predict_entity[-1].split("/")[1][1:] == predict_tag_type[1:]:
- predict_entity.append(char_text + "/" + predict_tag_type)
- # 如果第一个字符是"O", 并且predict_entity非空, 表示一个完整的命名实体已经匹配结束了
- elif predict_tag_type[0] == "O" and len(predict_entity) != 0:
- predict_entity.append(str(line_num) + "_" + str(char_num))
- # 将这个匹配结束的预测命名实体添加到最终的预测实体列表中
- predict_entities.append(predict_entity)
- # 清空predict_entity, 为下一个命名实体的匹配做准备
- predict_entity = []
- # 除了上述3种情况, 说明当前没有匹配出任何的实体, 则清空predict_entity, 继续下一轮的匹配
- else:
- predict_entity = []
-
- # 遍历所有预测出来的实体列表, 只有那些在真实命名实体列表中的实体才是正确的预测
- acc_entities = [entity for entity in predict_entities if entity in true_entities]
-
- # 计算正确实体的个数, 预测实体的个数, 真实实体的个数
- acc_entities_length = len(acc_entities)
- predict_entities_length = len(predict_entities)
- true_entities_length = len(true_entities)
-
- # 只有在至少正确预测出一个实体的情况下, 才计算准确率(这里实际计算的是精确率precision), 召回率和F1值
- if acc_entities_length > 0:
- accuracy = float(acc_entities_length / predict_entities_length)
- recall = float(acc_entities_length / true_entities_length)
- f1_score = 2.0 * accuracy * recall / (accuracy + recall)
- return accuracy, recall, f1_score, acc_entities_length, predict_entities_length, true_entities_length
- else:
- return 0, 0, 0, acc_entities_length, predict_entities_length, true_entities_length
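- # A quick worked example of the metric arithmetic above (numbers are made up):
- # 5 correct entities, 8 predicted entities, 10 gold entities
- # precision = 5 / 8 = 0.625, recall = 5 / 10 = 0.5
- # f1 = 2 * 0.625 * 0.5 / (0.625 + 0.5) ≈ 0.556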
-
-
- # 真实标签数据
- tag_list = [
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- ]
-
- # 预测标签数据
- predict_tag_list = [
- [0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0],
- [3, 4, 0, 3, 4, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- ]
-
- # 编码与字符对照字典
- id2char = {0: '<PAD>', 1: '确', 2: '诊', 3: '弥', 4: '漫', 5: '大', 6: 'b', 7: '细', 8: '胞', 9: '淋', 10: '巴', 11: '瘤', 12: '1', 13: '年', 14: '反', 15: '复', 16: '咳', 17: '嗽', 18: '、', 19: '痰', 20: '4', 21: '0', 22: ',', 23: '再', 24: '发', 25: '伴', 26: '气', 27: '促', 28: '5', 29: '天', 30: '。', 31: '生', 32: '长', 33: '育', 34: '迟', 35: '缓', 36: '9', 37: '右', 38: '侧', 39: '小', 40: '肺', 41: '癌', 42: '第', 43: '三', 44: '次', 45: '化', 46: '疗', 47: '入', 48: '院', 49: '心', 50: '悸', 51: '加', 52: '重', 53: '胸', 54: '痛', 55: '3', 56: '闷', 57: '2', 58: '多', 59: '月', 60: '余', 61: ' ', 62: '周', 63: '上', 64: '肢', 65: '无', 66: '力', 67: '肌', 68: '肉', 69: '萎', 70: '缩', 71: '半'}
-
- # 编码与标签对照字典
- id2tag = {0: 'O', 1: 'B-dis', 2: 'I-dis', 3: 'B-sym', 4: 'I-sym'}
-
- # 输入的数字化sentences_sequence, 由下面的sentence_list经过映射函数sentence_map()转化后得到
- sentence_list = [
- "确诊弥漫大b细胞淋巴瘤1年",
- "反复咳嗽、咳痰40年,再发伴气促5天。",
- "生长发育迟缓9年。",
- "右侧小细胞肺癌第三次化疗入院",
- "反复气促、心悸10年,加重伴胸痛3天。",
- "反复胸闷、心悸、气促2多月,加重3天",
- "咳嗽、胸闷1月余, 加重1周",
- "右上肢无力3年, 加重伴肌肉萎缩半年"
- ]
-
-
- # 添加中文字符的数字化编码函数
- def sentence_map(sentence_list, char_to_id, max_length=100):
- sentence_list.sort(key=lambda x: len(x), reverse=True)
- sentence_map_list = []
- for sentence in sentence_list:
- sentence_id_list = [char_to_id[c] for c in sentence]
- padding_list = [0] * (max_length - len(sentence))
- sentence_id_list.extend(padding_list)
- sentence_map_list.append(sentence_id_list)
- return torch.tensor(sentence_map_list, dtype=torch.long)
-
- char_to_id = {"<PAD>": 0}
-
- SENTENCE_LENGTH = 20
-
- for sentence in sentence_list:
- for c in sentence:
- if c not in char_to_id:
- char_to_id[c] = len(char_to_id)
-
-
- if __name__ == '__main__':
- sentence_sequence = sentence_map(sentence_list, char_to_id, SENTENCE_LENGTH)
- accuracy, recall, f1_score, acc_entities_length, predict_entities_length, true_entities_length = evaluate(sentence_sequence.tolist(), tag_list, predict_tag_list, id2char, id2tag)
-
- print("accuracy:", accuracy,
- "\nrecall:", recall,
- "\nf1_score:", f1_score,
- "\nacc_entities_length:", acc_entities_length,
- "\npredict_entities_length:", predict_entities_length,
- "\ntrue_entities_length:", true_entities_length)
-
- # 导入包
- import json
- import time
- from tqdm import tqdm
- import matplotlib.pyplot as plt
- import torch
- import torch.optim as optim
- from torch.autograd import Variable
- # 导入之前编写好的包, 包括类, 数据集加载, 评估函数
- from 项目一.AI_doctor.doctor_offline.ner_model.bilstm_crf import BiLSTM_CRF
- from 项目一.AI_doctor.doctor_offline.ner_model.loader_data import load_dataset
- from 项目一.AI_doctor.doctor_offline.ner_model.evaluate_model import evaluate
-
- # 训练模型的函数
- def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
- sentence_length, num_layers, epochs, learning_rate, tag2id,
- model_saved_path, train_log_path,
- validate_log_path, train_history_image_path):
- '''
- data_loader: 数据集的加载器, 之前已经通过load_dataset完成了构造
- data_size: 训练集和测试集的样本数量
- batch_size: 批次的样本个数
- embedding_dim: 词嵌入的维度
- hidden_dim: 隐藏层的维度
- sentence_length: 文本限制的长度
- num_layers: 神经网络堆叠的LSTM层数
- epochs: 训练迭代的轮次
- learning_rate: 学习率
- tag2id: 标签到id的映射字典
- model_saved_path: 模型保存的路径
- train_log_path: 训练日志保存的路径
- validate_log_path: 测试集日志保存的路径
- train_history_image_path: 训练数据的相关图片保存路径
- '''
- # 将中文字符和id的对应码表加载进内存
- char2id = json.load(open("./data/char_to_id.json", mode="r", encoding="utf-8"))
- # 初始化BiLSTM_CRF模型
- model = BiLSTM_CRF(vocab_size=len(char2id), tag_to_ix=tag2id,
- embedding_dim=embedding_dim, hidden_dim=hidden_dim,
- batch_size=batch_size, num_layers=num_layers,
- sequence_length=sentence_length)
-
- # 定义优化器, 使用SGD作为优化器(pytorch中Embedding支持的GPU加速为SGD, SparseAdam)
- # 参数说明如下:
- # lr: 优化器学习率
- # momentum: 优化下降的动量因子, 加速梯度下降过程
- # optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.85, weight_decay=1e-4)
- optimizer = optim.Adam(params=model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4)
-
- # 设定优化器学习率更新策略
- # 参数说明如下:
- # optimizer: 优化器
- # step_size: 更新频率, 每过多少个epoch更新一次优化器学习率
- # gamma: 学习率衰减幅度,
- # 按照什么比例调整(衰减)学习率(相对于上一轮epoch), 默认0.1
- # 例如:
- # 初始学习率 lr = 0.5, step_size = 20, gamma = 0.1
- # lr = 0.5 if epoch < 20
- # lr = 0.05 if 20 <= epoch < 40
- # lr = 0.005 if 40 <= epoch < 60
- # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)
-
- # 初始化存放训练中损失, 准确率, 召回率, F1等数值指标
- train_loss_list = []
- train_acc_list = []
- train_recall_list = []
- train_f1_list = []
- train_log_file = open(train_log_path, mode="w", encoding="utf-8")
- # 初始化存放测试中损失, 准确率, 召回率, F1等数值指标
- validate_loss_list = []
- validate_acc_list = []
- validate_recall_list = []
- validate_f1_list = []
- validate_log_file = open(validate_log_path, mode="w", encoding="utf-8")
- # 利用tag2id生成id到tag的映射字典
- id2tag = {v:k for k, v in tag2id.items()}
- # 利用char2id生成id到字符的映射字典
- id2char = {v:k for k, v in char2id.items()}
-
- # 按照参数epochs的设定来循环epochs次
- for epoch in range(epochs):
- # 在进度条打印前, 先输出当前所执行批次
- tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
- # 定义要记录的正确总实体数, 识别实体数以及真实实体数
- total_acc_entities_length, \
- total_predict_entities_length, \
- total_gold_entities_length = 0, 0, 0
- # 定义每batch步数, 批次loss总值, 准确度, f1值
- step, total_loss, correct, f1 = 1, 0.0, 0, 0
-
- # 开启当前epochs的训练部分
- for inputs, labels in tqdm(data_loader["train"]):
- # 将数据以Variable进行封装
- inputs, labels = Variable(inputs), Variable(labels)
- # 在训练模型期间, 要在每个样本计算梯度前将优化器归零, 不然梯度会被累加
- optimizer.zero_grad()
- # 此处调用的是BiLSTM_CRF类中的neg_log_likelihood()函数
- loss = model.neg_log_likelihood(inputs, labels)
- # 获取当前步的loss, 由tensor转为数字
- step_loss = loss.data
- # 累计每步损失值
- total_loss += step_loss
- # 获取解码最佳路径列表, 此时调用的是BiLSTM_CRF类中的forward()函数
- best_path_list = model(inputs)
- # 模型评估指标值获取包括:当前批次准确率, 召回率, F1值以及对应的实体个数
- step_acc, step_recall, f1_score, acc_entities_length, \
- predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
- labels.tolist(),
- best_path_list,
- id2char,
- id2tag)
- # 训练日志内容
- '''
- log_text = "Epoch: %s | Step: %s " \
- "| loss: %.5f " \
- "| acc: %.5f " \
- "| recall: %.5f " \
- "| f1 score: %.5f" % \
- (epoch, step, step_loss, step_acc, step_recall,f1_score)
- '''
-
- # 分别累计正确总实体数、识别实体数以及真实实体数
- total_acc_entities_length += acc_entities_length
- total_predict_entities_length += predict_entities_length
- total_gold_entities_length += gold_entities_length
-
- # 对损失函数进行反向传播
- loss.backward()
- # 通过optimizer.step()利用反向传播得到的梯度更新模型参数
- optimizer.step()
- # 记录训练日志
- # train_log_file.write(log_text + "\n")
- step += 1
-
- # 获取当前epochs平均损失值(每一轮迭代的损失总值除以总数据量)
- epoch_loss = total_loss / data_size["train"]
- # 计算当前epochs准确率与召回率 (先初始化为0, 避免分母为0时变量未定义)
- total_acc, total_recall = 0, 0
- if total_predict_entities_length > 0:
- total_acc = total_acc_entities_length / total_predict_entities_length
- # 计算当前epochs召回率
- if total_gold_entities_length > 0:
- total_recall = total_acc_entities_length / total_gold_entities_length
- # 计算当前epochs的F1值
- total_f1 = 0
- if total_acc + total_recall != 0:
- total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
- log_text = "Epoch: %s " \
- "| mean loss: %.5f " \
- "| total acc: %.5f " \
- "| total recall: %.5f " \
- "| total f1 scroe: %.5f" % (epoch, epoch_loss,
- total_acc,
- total_recall,
- total_f1)
- print(log_text)
- # 当前epochs训练后更新学习率, 必须在优化器更新之后
- # scheduler.step()
-
- # 记录当前epochs训练loss值(用于图表展示), 准确率, 召回率, f1值
- train_loss_list.append(epoch_loss)
- train_acc_list.append(total_acc)
- train_recall_list.append(total_recall)
- train_f1_list.append(total_f1)
- train_log_file.write(log_text + "\n")
-
-
- # 定义要记录的正确总实体数, 识别实体数以及真实实体数
- total_acc_entities_length, \
- total_predict_entities_length, \
- total_gold_entities_length = 0, 0, 0
- # 定义每batch步数, 批次loss总值, 准确度, f1值
- step, total_loss, correct, f1 = 1, 0.0, 0, 0
-
- # 开启当前epochs的验证部分
- with torch.no_grad():
- for inputs, labels in tqdm(data_loader["validation"]):
- # 将数据以 Variable 进行封装
- inputs, labels = Variable(inputs), Variable(labels)
- # 此处调用的是 BiLSTM_CRF 类中的 neg_log_likelihood 函数
- # 返回最终的 CRF 的对数似然结果
- try:
- loss = model.neg_log_likelihood(inputs, labels)
- except:
- continue
- # 获取当前步的 loss 值,由 tensor 转为数字
- step_loss = loss.data
- # 累计每步损失值
- total_loss += step_loss
- # 获取解码最佳路径列表, 此时调用的是BiLSTM_CRF类中的forward()函数
- best_path_list = model(inputs)
- # 模型评估指标值获取: 当前批次准确率, 召回率, F1值以及对应的实体个数
- step_acc, step_recall, f1_score, acc_entities_length, \
- predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
- labels.tolist(),
- best_path_list,
- id2char,
- id2tag)
-
- # 训练日志内容
- '''
- log_text = "Epoch: %s | Step: %s " \
- "| loss: %.5f " \
- "| acc: %.5f " \
- "| recall: %.5f " \
- "| f1 score: %.5f" % \
- (epoch, step, step_loss, step_acc, step_recall,f1_score)
- '''
-
- # 分别累计正确总实体数、识别实体数以及真实实体数
- total_acc_entities_length += acc_entities_length
- total_predict_entities_length += predict_entities_length
- total_gold_entities_length += gold_entities_length
-
- # 记录验证集损失日志
- # validate_log_file.write(log_text + "\n")
- step += 1
-
- # 获取当前批次平均损失值(每一批次损失总值除以数据量)
- epoch_loss = total_loss / data_size["validation"]
- # 计算总批次准确率与召回率 (先初始化为0, 避免分母为0时变量未定义)
- total_acc, total_recall = 0, 0
- if total_predict_entities_length > 0:
- total_acc = total_acc_entities_length / total_predict_entities_length
- # 计算总批次召回率
- if total_gold_entities_length > 0:
- total_recall = total_acc_entities_length / total_gold_entities_length
- # 计算总批次F1值
- total_f1 = 0
- if total_acc + total_recall != 0.0:
- total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
- log_text = "Epoch: %s " \
- "| mean loss: %.5f " \
- "| total acc: %.5f " \
- "| total recall: %.5f " \
- "| total f1 scroe: %.5f" % (epoch, epoch_loss,
- total_acc,
- total_recall,
- total_f1)
- print(log_text)
- # 记录当前批次验证loss值(用于图表展示)准确率, 召回率, f1值
- validate_loss_list.append(epoch_loss)
- validate_acc_list.append(total_acc)
- validate_recall_list.append(total_recall)
- validate_f1_list.append(total_f1)
- validate_log_file.write(log_text + "\n")
-
-
- # 保存模型
- torch.save(model.state_dict(), model_saved_path)
-
- # 将loss下降历史数据转为图片存储
- save_train_history_image(train_loss_list,
- validate_loss_list,
- train_history_image_path,
- "Loss")
- # 将准确率提升历史数据转为图片存储
- save_train_history_image(train_acc_list,
- validate_acc_list,
- train_history_image_path,
- "Acc")
- # 将召回率提升历史数据转为图片存储
- save_train_history_image(train_recall_list,
- validate_recall_list,
- train_history_image_path,
- "Recall")
- # 将F1上升历史数据转为图片存储
- save_train_history_image(train_f1_list,
- validate_f1_list,
- train_history_image_path,
- "F1")
- print("train Finished".center(100, "-"))
-
-
- # 按照传入的不同路径, 绘制不同的训练曲线
- def save_train_history_image(train_history_list,
- validate_history_list,
- history_image_path,
- data_type):
- # 根据训练集的数据列表, 绘制折线图
- plt.plot(train_history_list, label="Train %s History" % (data_type))
- # 根据测试集的数据列表, 绘制折线图
- plt.plot(validate_history_list, label="Validate %s History" % (data_type))
- # 将图片放置在最优位置
- plt.legend(loc="best")
- # 设置x轴的图标为轮次Epochs
- plt.xlabel("Epochs")
- # 设置y轴的图标为参数data_type
- plt.ylabel(data_type)
- # 将绘制好的图片保存在特定的路径下面, 并修改图片名字中的"plot"为对应的data_type
- plt.savefig(history_image_path.replace("plot", data_type))
- plt.close()
-
-
-
- # 参数1:批次大小
- BATCH_SIZE = 8
- # 参数2:训练数据文件路径
- train_data_file_path = "./data/total.npz"
- # 参数3:加载 DataLoader 数据
- data_loader, data_size = load_dataset(train_data_file_path, BATCH_SIZE)
- # 参数4:记录当前训练时间(拼成字符串用)
- time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime(time.time()))
- # 参数5:标签码表对照
- tag_to_id = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4, "<START>": 5, "<STOP>": 6}
- # 参数6:训练文件存放路径
- model_saved_path = "model/bilstm_crf_state_dict_%s.pt" % (time_str)
- # 参数7:训练日志文件存放路径
- train_log_path = "log/train_%s.log" % (time_str)
- # 参数8:验证打印日志存放路径
- validate_log_path = "log/validate_%s.log" % (time_str)
- # 参数9:训练历史记录图存放路径
- train_history_image_path = "log/bilstm_crf_train_plot_%s.png" % (time_str)
- # 参数10:字向量维度
- EMBEDDING_DIM = 300
- # 参数11:隐层维度
- HIDDEN_DIM = 128
- # 参数12:句子长度
- SENTENCE_LENGTH = 100
- # 参数13:堆叠 LSTM 层数
- NUM_LAYERS = 1
- # 参数14:训练批次
- EPOCHS = 3
- # 参数15:初始化学习率
- LEARNING_RATE = 0.05
-
-
- if __name__ == '__main__':
- train(data_loader, data_size, BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM,
- SENTENCE_LENGTH, NUM_LAYERS, EPOCHS, LEARNING_RATE, tag_to_id,
- model_saved_path, train_log_path, validate_log_path,
- train_history_image_path)
-
- import os
- import torch
- import json
- from bilstm_crf import BiLSTM_CRF
-
- def single_predict(model_path, content, char_to_id_json_path, batch_size, embedding_dim,
- hidden_dim, num_layers, sentence_length, offset, target_type_list, tag2id):
-
- char_to_id = json.load(open(char_to_id_json_path, mode="r", encoding="utf-8"))
- # 将字符串转为码表id列表
- char_ids = content_to_id(content, char_to_id)
- # 处理成 batch_size * sentence_length 的 tensor 数据
- # 定义模型输入列表
- model_inputs_list, model_input_map_list = build_model_input_list(content,
- char_ids,
- batch_size,
- sentence_length,
- offset)
- # 加载模型
- model = BiLSTM_CRF(vocab_size=len(char_to_id),
- tag_to_ix=tag2id,
- embedding_dim=embedding_dim,
- hidden_dim=hidden_dim,
- batch_size=batch_size,
- num_layers=num_layers,
- sequence_length=sentence_length)
- # 加载模型字典
- model.load_state_dict(torch.load(model_path))
-
- tag_id_dict = {v: k for k, v in tag2id.items() if k[2:] in target_type_list}
- # 定义返回实体列表
- entities = []
- with torch.no_grad():
- for step, model_inputs in enumerate(model_inputs_list):
- prediction_value = model(model_inputs)
- # 获取每一行预测结果
- for line_no, line_value in enumerate(prediction_value):
- # 定义将要识别的实体
- entity = None
- # 获取当前行每个字的预测结果
- for char_idx, tag_id in enumerate(line_value):
- # 若预测结果 tag_id 属于目标字典数据 key 中
- if tag_id in tag_id_dict:
- # 取符合匹配字典id的第一个字符,即B, I
- tag_index = tag_id_dict[tag_id][0]
- # 计算当前字符确切的下标位置
- current_char = model_input_map_list[step][line_no][char_idx]
- # 若当前字标签起始为 B, 则设置为实体开始
- if tag_index == "B":
- entity = current_char
- # 若当前字标签起始为 I, 则进行字符串追加
- elif tag_index == "I" and entity:
- entity += current_char
- # 当实体不为空且当前标签类型为 O 时,加入实体列表
- if tag_id == tag_to_id["O"] and entity:
- # 满足当前字符为O,上一个字符为目标提取实体结尾时,将其加入实体列表
- entities.append(entity)
- # 重置实体
- entity = None
- return entities
-
-
- def content_to_id(content, char_to_id):
- # 定义字符串对应的码表 id 列表
- char_ids = []
- for char in list(content):
- # 判断若字符不在码表对应字典中, 则取 UNK 的编码(即 unknown), 否则取对应的字符编码
- if char_to_id.get(char):
- char_ids.append(char_to_id[char])
- else:
- char_ids.append(char_to_id["UNK"])
- return char_ids
-
-
- def build_model_input_list(content, char_ids, batch_size, sentence_length, offset):
- # 定义模型输入数据列表
- model_input_list = []
- # 定义每个批次句子 id 数据
- batch_sentence_list = []
- # 将文本内容转为列表
- content_list = list(content)
- # 定义与模型 char_id 对照的文字
- model_input_map_list = []
- # 定义每个批次句子字符数据
- batch_sentence_char_list = []
- # 判断是否需要 padding
- if len(char_ids) % sentence_length > 0:
- # 将不足 batch_size * sentence_length 的部分填充0
- padding_length = (batch_size * sentence_length
- - len(char_ids) % batch_size * sentence_length
- - len(char_ids) % sentence_length)
- char_ids.extend([0] * padding_length)
- content_list.extend(["#"] * padding_length)
- # 迭代字符 id 列表
- # 数据满足 batch_size * sentence_length 将加入 model_input_list
- for step, idx in enumerate(range(0, len(char_ids) + 1, sentence_length)):
- # 起始下标,从第一句开始增加 offset 个字的偏移
- start_idx = 0 if idx == 0 else idx - step * offset
- # 获取长度为 sentence_length 的字符 id 数据集
- sub_list = char_ids[start_idx:start_idx + sentence_length]
- # 获取长度为 sentence_length 的字符数据集
- sub_char_list = content_list[start_idx:start_idx + sentence_length]
- # 加入批次数据集中
- batch_sentence_list.append(sub_list)
- # 批量句子包含字符列表
- batch_sentence_char_list.append(sub_char_list)
- # 每当批次长度达到 batch_size 时候,将其加入 model_input_list
- if len(batch_sentence_list) == batch_size:
- # 将数据格式转为 tensor 格式,大小为 batch_size * sentence_length
- model_input_list.append(torch.tensor(batch_sentence_list))
- # 重置 batch_sentence_list
- batch_sentence_list = []
- # 将 char_id 对应的字符加入映射表中
- model_input_map_list.append(batch_sentence_char_list)
- # 重置批字符串内容
- batch_sentence_char_list = []
- # 返回模型输入列表
- return model_input_list, model_input_map_list
-
-
- # 参数1:待识别文本
- content = "本病是由DNA病毒的单纯疱疹病毒所致。人类单纯疱疹病毒分为两型," \
- "即单纯疱疹病毒Ⅰ型(HSV-Ⅰ)和单纯疱疹病毒Ⅱ型(HSV-Ⅱ)。" \
- "Ⅰ型主要引起生殖器以外的皮肤黏膜(口腔黏膜)和器官(脑)的感染。" \
- "Ⅱ型主要引起生殖器部位皮肤黏膜感染。" \
- "病毒经呼吸道、口腔、生殖器黏膜以及破损皮肤进入体内," \
- "潜居于人体正常黏膜、血液、唾液及感觉神经节细胞内。" \
- "当机体抵抗力下降时,如发热胃肠功能紊乱、月经、疲劳等时," \
- "体内潜伏的HSV被激活而发病。"
- # 参数2:模型保存文件路径
- model_path = "./model/bilstm_crf_state_dict_20200129_210417.pt"
- # 参数3:批次大小
- BATCH_SIZE = 8
- # 参数4:字向量维度
- EMBEDDING_DIM = 300
- # 参数5:隐层维度
- HIDDEN_DIM = 128
- NUM_LAYERS = 1
- # 参数6:句子长度
- SENTENCE_LENGTH = 100
- # 参数7:偏移量
- OFFSET = 10
- # 参数8:标签码表对照字典
- tag_to_id = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4, "<START>": 5, "<STOP>": 6}
- # 参数9:字符码表文件路径
- char_to_id_json_path = "./data/char_to_id.json"
- # 参数10:预测结果存储路径
- prediction_result_path = "prediction_result"
- # 参数11:待匹配标签类型
- target_type_list = ["sym"]
-
-
- entities = single_predict(model_path,
- content,
- char_to_id_json_path,
- BATCH_SIZE,
- EMBEDDING_DIM,
- HIDDEN_DIM,
- NUM_LAYERS,
- SENTENCE_LENGTH,
- OFFSET,
- target_type_list,
- tag_to_id)
-
- # print("entities:\n", entities)
-
-
- # 构建批量文本预测的函数
- def batch_predict(data_path, model_path, char_to_id_json_path, batch_size, embedding_dim, hidden_dim, sentence_length,
- offset, target_type_list, prediction_result_path, tag_to_id):
- # data_path: 待预测的批量文本所在的文件夹路径
- # 遍历文件夹下的所有文件
- for fn in os.listdir(data_path):
- # 拼接出完整的文件路径
- fullpath = os.path.join(data_path, fn)
- # 定义输出实体结果的文件
- entities_file = open(os.path.join(prediction_result_path, fn), mode="w", encoding="utf-8")
-
- # 打开文件进行预测
- with open(fullpath, mode="r", encoding="utf-8") as f:
- # 读取文件的内容
- content = f.readline()
- # 通过单文本预测函数进行预测
- entities = single_predict(model_path, content, char_to_id_json_path, batch_size, embedding_dim,
- hidden_dim, sentence_length, offset, target_type_list, tag_to_id)
-
- # 将预测出的实体写入到结果文件中
- entities_file.write("\n".join(entities))
-
- print("batch_predict Finished.".center(100, "-"))
-
-
- data_path = "origin_data"
-
- # 进行批量预测函数的调用
- batch_predict(data_path, model_path, char_to_id_json_path, BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM, SENTENCE_LENGTH,
- OFFSET, target_type_list, prediction_result_path, tag_to_id)
- # -*- coding: utf-8 -*-#
- import torch
- import torch.autograd as autograd
- import torch.nn as nn
- import torch.optim as optim
-
- torch.manual_seed(1)
- START_TAG = "<START>"
- STOP_TAG = "<STOP>"
-
-
- # 获取最大值的下标
- def argmax(vec):
- # 返回列维度上的最大值下标, 此下标是一个python整数
- _, idx = torch.max(vec, 1)
- return idx.item()
-
-
- # 辅助完成损失函数中的公式计算
- def log_sum_exp(vec):
- max_score = vec[0, argmax(vec)]
- # max_score维度是1, max_score.view(1,-1)维度是1 * 1, max_score.view(1, -1).expand(1, vec.size()[1])的维度1 * 7
- # 经过expand()之后的张量, 里面所有的值都相同, 都是最大值max_score
- max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
- # 先减去max_score,最后再加上max_score, 是为了防止数值爆炸, 纯粹是代码上的小技巧
- return max_score + \
- torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
-
-
- class BiLSTM_CRF(nn.Module):
-
- def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
- '''
- description: 模型初始化
- :param vocab_size: 所有句子包含字符大小
- :param tag_to_ix: 标签与id对照字典
- :param embedding_dim: 字嵌入维度(即LSTM输入层维度input_size)
- :param hidden_dim: 隐藏层向量维度
- '''
- super(BiLSTM_CRF, self).__init__()
- self.embedding_dim = embedding_dim
- self.hidden_dim = hidden_dim
- self.vocab_size = vocab_size
- self.tag_to_ix = tag_to_ix
- self.tagset_size = len(tag_to_ix)
-
- # 构建词嵌入层, 两个参数分别是单词总数, 词嵌入维度
- self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
- # 构建双向LSTM层, 输入参数包括词嵌入维度, 隐藏层大小, 堆叠的LSTM层数, 是否双向标志位
- self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
- num_layers=1, bidirectional=True)
-
- # 构建全连接线性层, 一端对接LSTM隐藏层, 另一端对接输出层, 相应的维度就是标签数量tagset_size
- self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
-
- # 初始化转移矩阵
- self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
-
- # 按照损失函数小节的定义, 任意的合法句子不会转移到"START_TAG", 因此设置为-10000
- # 同理, 任意合法的句子不会从"STOP_TAG"继续向下转移, 也设置为-10000
- self.transitions.data[tag_to_ix[START_TAG], :] = -10000
- self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
-
- # 初始化隐藏层, 利用单独的类函数init_hidden()来完成
- self.hidden = self.init_hidden()
-
- def init_hidden(self):
- # 为了符合LSTM的输入要求, 我们返回h0, c0, 这两个张量的shape完全一致
- return (torch.randn(2, 1, self.hidden_dim // 2),
- torch.randn(2, 1, self.hidden_dim // 2))
-
- # 计算损失函数第一项的分值函数, 本质上是发射矩阵和转移矩阵的累加和
- def _forward_alg(self, feats):
- # 初始化一个alphas张量, 代表转移矩阵的起始位置
- init_alphas = torch.full((1, self.tagset_size), -10000.)
- # 仅仅把START_TAG赋值为0, 代表着接下来的转移只能从START_TAG开始
- init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
-
- # 前向计算变量的赋值, 这样在反向求导的过程中就可以自动更新参数
- forward_var = init_alphas
-
- # 遍历一行语句, 每一个feat代表一个time_step
- for feat in feats:
- # 当前time_step的一个forward tensors
- alphas_t = []
- # 在当前time_step, 遍历所有可能的转移标签, 进行累加计算
- for next_tag in range(self.tagset_size):
- # 广播发射矩阵的分数
- emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
-
- # 第i个time_step循环时, 转移到next_tag标签的转移概率
- trans_score = self.transitions[next_tag].view(1, -1)
-
- # 将前向矩阵, 转移矩阵, 发射矩阵累加
- next_tag_var = forward_var + trans_score + emit_score
-
- # 计算log_sum_exp()函数值
- alphas_t.append(log_sum_exp(next_tag_var).view(1))
-
- # 将列表张量转变为二维张量
- forward_var = torch.cat(alphas_t).view(1, -1)
-
- # 添加最后一步转移到"STOP_TAG"的分数, 就完成了整条语句的分数计算
- terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
-
- # 计算log_sum_exp()函数值, 作为一条样本语句的最终得分
- alpha = log_sum_exp(terminal_var)
- return alpha
-
- # 在类中将文本信息经过词嵌入层, BiLSTM层, 线性层的处理, 最终输出句子张量
- def _get_lstm_features(self, sentence):
- self.hidden = self.init_hidden()
-
- # LSTM的输入要求形状为 [sequence_length, batch_size, embedding_dim]
- embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
-
- # LSTM的两个输入参数: 词嵌入后的张量, 随机初始化的隐藏层张量
- lstm_out, self.hidden = self.lstm(embeds, self.hidden)
-
- # 要保证输出张量的shape: [sequence_length, hidden_dim]
- lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
-
- # 将BiLSTM的输出经过一个全连接层
- lstm_feats = self.hidden2tag(lstm_out)
- return lstm_feats
-
- # 计算损失函数第二项的分值函数
- def _score_sentence(self, feats, tags):
- # 初始化一个0值的tensor, 为后续累加做准备
- score = torch.zeros(1)
-
- # 将START_TAG和真实标签tags做列维度上的拼接
- tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
-
- # 注意: 此处最重要的是这是在真实标签指导下的转移矩阵和发射矩阵的累加分数
- for i, feat in enumerate(feats):
- score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
- score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
- return score
-
- # 根据传入的语句特征feats, 推断出标签序列
- def _viterbi_decode(self, feats):
- backpointers = []
-
- # 初始化前向传播的张量, 设置START_TAG等于0, 约束合法序列只能从START_TAG开始
- init_vvars = torch.full((1, self.tagset_size), -10000.)
- init_vvars[0][self.tag_to_ix[START_TAG]] = 0
-
- # 在第i个time_step, 张量forward_var保存第i-1个time_step的viterbi变量
- forward_var = init_vvars
-
- # 依次遍历i=0, 到序列最后的每一个time_step
- for feat in feats:
- # 保存当前time_step的回溯指针
- bptrs_t = []
- # 保存当前time_step的viterbi变量
- viterbivars_t = []
-
- for next_tag in range(self.tagset_size):
- # next_tag_var[i]保存了tag_i 在前一个time_step的viterbi变量
- # 前向传播张量forward_var加上从tag_i转移到next_tag的分数, 赋值给next_tag_var
- # 注意此处没有加发射矩阵分数, 因为求最大值不需要发射矩阵
- next_tag_var = forward_var + self.transitions[next_tag]
-
- # 将最大的标签id加入到当前time_step的回溯列表中
- best_tag_id = argmax(next_tag_var)
- bptrs_t.append(best_tag_id)
- viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
-
- # 此处再将发射矩阵分数feat加上, 赋值给forward_var, 作为下一个time_step的前向传播张量
- forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
- backpointers.append(bptrs_t)
-
- # 最后加上转移到STOP_TAG的分数
- terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
- best_tag_id = argmax(terminal_var)
- path_score = terminal_var[0][best_tag_id]
-
- # 首先把最后一步的id值加入
- best_path = [best_tag_id]
- # 从后向前回溯最佳路径
- for bptrs_t in reversed(backpointers):
- best_tag_id = bptrs_t[best_tag_id]
- best_path.append(best_tag_id)
-
- # 将START_TAG删除
- start = best_path.pop()
- # 确认一下最佳路径中的第一个标签是START_TAG
- assert start == self.tag_to_ix[START_TAG]
- # 因为是从后向前回溯, 所以再次逆序得到从前向后的真实路径
- best_path.reverse()
- return path_score, best_path
-
- # 对数似然函数的计算, 输入的是数字化编码后的语句, 和真实的标签
- # 注意: 这个函数是未来真实训练中要用到的"虚拟化的forward()"
- def neg_log_likelihood(self, sentence, tags):
- # 第一步先得到BiLSTM层的输出特征张量
- feats = self._get_lstm_features(sentence)
-
- # forward_score 代表公式推导中损失函数loss的第一项
- forward_score = self._forward_alg(feats)
-
- # gold_score 代表公式推导中损失函数loss的第二项
- gold_score = self._score_sentence(feats, tags)
- return forward_score - gold_score
-
- # 此处的forward()真实场景是用在预测部分, 训练的时候并没有用到
- def forward(self, sentence):
- # 获取从BiLSTM层得到的发射矩阵
- lstm_feats = self._get_lstm_features(sentence)
-
- # 通过维特比算法直接解码最佳路径
- score, tag_seq = self._viterbi_decode(lstm_feats)
- return score, tag_seq
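下面给出一个最小化的使用示意(其中的词表、标签表和超参数均为演示用的假设值, 并非原教程数据): 训练时调用 neg_log_likelihood() 得到损失并反向传播, 预测时直接调用模型本身(即 forward()), 其内部通过维特比解码返回路径得分和标签id序列。
- # 最小化使用示意(假设 BiLSTM_CRF、START_TAG、STOP_TAG 已按上文定义)
- import torch
- import torch.optim as optim
-
- # 演示用的小词表和标签表(假设值), 标签表需要包含 "<START>" 和 "<STOP>"
- char_to_id = {"<PAD>": 0, "咳": 1, "嗽": 2, "发": 3, "热": 4}
- tag_to_ix = {"O": 0, "B-sym": 1, "I-sym": 2, "<START>": 3, "<STOP>": 4}
-
- # 实例化模型, embedding_dim=16, hidden_dim=8 仅为演示值
- model = BiLSTM_CRF(len(char_to_id), tag_to_ix, 16, 8)
- optimizer = optim.SGD(model.parameters(), lr=0.01)
-
- # "咳嗽发热"对应的id序列和真实标签id序列
- sentence = torch.tensor([1, 2, 3, 4], dtype=torch.long)
- tags = torch.tensor([1, 2, 1, 2], dtype=torch.long)
-
- # 训练的一步: 负对数似然作为损失
- model.zero_grad()
- loss = model.neg_log_likelihood(sentence, tags)
- loss.backward()
- optimizer.step()
-
- # 预测: forward()内部调用维特比解码
- with torch.no_grad():
-     score, tag_seq = model(sentence)
-     print(score, tag_seq)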
- # -*- coding: utf-8 -*-#
-
- import os
- import torch
- import json
- from tqdm import tqdm
- from pytorch_ner import BiLSTM_CRF
-
- def prepare_sequence(seq, char_to_id):
- char_ids = []
- for idx, char in enumerate(seq):
- # 判断若字符不在码表对应字典中,则取 UNK 的编码(即 unknown),否则取对应的字符编码
- if char_to_id.get(char):
- char_ids.append(char_to_id[char])
- else:
- char_ids.append(char_to_id["<UNK>"])
- return torch.tensor(char_ids, dtype=torch.long)
-
- def singel_predict(model_path,
- content,
- char_to_id_json_path,
- embedding_dim,
- hidden_dim,
- target_type_list,
- tag_to_id):
- """
- description: 单句命名实体识别预测,返回实体列表
- :param model_path: 模型文件路径
- :param content: 待预测文本
- :param char_to_id_json_path: 字符码表文件路径
- :param embedding_dim: 字向量维度
- :param hidden_dim: BiLSTM 隐藏层向量维度
- :param target_type_list: 待匹配类型,符合条件的实体将会被提取出来
- :param tag_to_id: 标签码表对照字典,标签对应 id
- :return: 实体集合(set, 去重后的实体)
- """
- # 加载码表文件,转为码表字典
- char_to_id = json.load(open(char_to_id_json_path, mode="r", encoding="utf8"))
- # 加载模型
- model = BiLSTM_CRF(len(char_to_id), tag_to_id, embedding_dim, hidden_dim)
- # 加载模型字典
- model.load_state_dict(torch.load(model_path))
- # 获取需要提取的 tag 对应的 id 列表
- tag_id_dict = {v: k for k, v in tag_to_id.items() if k[2:] in target_type_list}
- # 定义返回实体列表
- entities = []
- # 预测部分不进行梯度计算
- with torch.no_grad():
- # 将组装的模型输入数据分批进行预测
- sentence_in = prepare_sequence(content, char_to_id)
- score, best_path_list = model(sentence_in)
- entity = None
- for char_idx, tag_id in enumerate(best_path_list):
- # 若预测结果 tag_id 属于目标字典数据 key 中
- if tag_id in tag_id_dict:
- # 取符合匹配字典id的第一个字符,即【B、I】
- tag_index = tag_id_dict[tag_id][0]
- # 取出当前下标位置对应的字符
- current_char = content[char_idx]
- # 若当前字标签起始为 B,则设置为实体开始
- if tag_index == "B":
- entity = current_char
- # 若当前字标签起始为 I,则进行字符串追加
- elif tag_index == "I" and entity:
- entity += current_char
- # 当实体不为空且当前标签类型为 O 时,加入实体列表
- if tag_id == tag_to_id["O"] and entity:
- # 满足当前字符为O,上一个字符为目标提取实体结尾时,将其加入实体列表
- if "、" not in entity\
- and "~" not in entity\
- and "。" not in entity\
- and "”" not in entity\
- and ":" not in entity\
- and ":" not in entity\
- and "," not in entity\
- and "," not in entity\
- and "." not in entity\
- and ";" not in entity\
- and ";" not in entity\
- and "【" not in entity\
- and "】" not in entity\
- and "[" not in entity\
- and "]" not in entity:
- entities.append(entity)
- # 重置实体
- entity = None
- return set(entities)
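上面用一长串 "not in" 判断来过滤含标点的实体, 也可以用集合配合 any() 写成等价的形式, 以下仅是一个等价写法的示意(并非原代码):
- # 示意: 用集合统一维护需要过滤的标点, 与上面的逐项判断等价
- PUNCTS = {"、", "~", "。", "”", ":", ":", ",", ",", ".", ";", ";", "【", "】", "[", "]"}
-
- def is_clean_entity(entity):
-     # 实体中不包含任何上述标点时, 才认为是有效实体
-     return not any(p in entity for p in PUNCTS)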
-
- def batch_predict(data_path,
- model_path,
- char_to_id_json_path,
- embedding_dim,
- hidden_dim,
- target_type_list,
- prediction_result_path,
- tag_to_id):
- """
- description: 批量预测,查询文件目录下数据,
- 从中提取符合条件的实体并存储至新的目录【prediction_result_path】
- :param data_path: 数据文件路径
- :param model_path: 模型文件路径
- :param char_to_id_json_path: 字符码表文件路径
- :param embedding_dim: 字向量维度
- :param hidden_dim: BiLSTM 隐藏层向量维度
- :param target_type_list: 待匹配类型,符合条件的实体将会被提取出来
- :param prediction_result_path: 预测结果保存路径
- :param tag_to_id: 标签码表对照字典,标签对应 id
- :return: 无返回
- """
- # 迭代路径,读取文件名
- for fn in tqdm(os.listdir(data_path)):
- # 拼装全路径
- fullpath = os.path.join(data_path, fn)
- # 定义输出结果文件
- entities_file = open(os.path.join(prediction_result_path, fn.replace("txt", "csv")),
- mode="w",
- encoding="utf8")
- with open(fullpath, mode="r", encoding="utf8") as f:
- # 读取文件内容
- content = f.readline()
- # 调用单个预测模型,输出为目标类型实体文本列表
- entities = singel_predict(model_path,
- content,
- char_to_id_json_path,
- embedding_dim,
- hidden_dim,
- target_type_list,
- tag_to_id)
- # 写入识别结果文件
- entities_file.write("\n".join(entities))
- # 关闭结果文件
- entities_file.close()
- print("batch_predict Finished".center(100, "-"))
-
-
- if __name__ == '__main__':
- # 待识别文本
- # content = "本病是由DNA病毒的单纯疱疹病毒所致。人类单纯疱疹病毒分为两型," \
- # "即单纯疱疹病毒Ⅰ型(HSV-Ⅰ)和单纯疱疹病毒Ⅱ型(HSV-Ⅱ)。" \
- # "Ⅰ型主要引起生殖器以外的皮肤黏膜(口腔黏膜)和器官(脑)的感染。" \
- # "Ⅱ型主要引起生殖器部位皮肤黏膜感染。" \
- # "病毒经呼吸道、口腔、生殖器黏膜以及破损皮肤进入体内," \
- # "潜居于人体正常黏膜、血液、唾液及感觉神经节细胞内。" \
- # "当机体抵抗力下降时,如发热胃肠功能紊乱、月经、疲劳等时," \
- # "体内潜伏的HSV被激活而发病。"
- # 模型保存路径
- model_path = "model/bilstm_crf_state_dict_20200603_172556.pt"
- # 字向量维度
- EMBEDDING_DIM = 200
- # 隐层维度
- HIDDEN_DIM = 100
- # 标签码表对照字典
- tag_to_id = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4, "<START>": 5, "<STOP>": 6}
- # 字符码表文件路径
- char_to_id_json_path = "char_to_id.json"
- # 预测结果存储路径
- prediction_result_path = "prediction_result"
- # 待匹配标签类型
- target_type_list = ["sym"]
- # # 单独文本预测,获得实体结果
- # entities = singel_predict(model_path,
- # content,
- # char_to_id_json_path,
- # EMBEDDING_DIM,
- # HIDDEN_DIM,
- # target_type_list,
- # tag_to_id)
- # # 打印实体结果
- # print("entities:\n", entities)
- # 待预测文本文件所在目录
- data_path = "./data/unstructed_data"
- # 批量文本预测,并将结果写入文件中
- batch_predict(data_path,
- model_path,
- char_to_id_json_path,
- EMBEDDING_DIM,
- HIDDEN_DIM,
- target_type_list,
- prediction_result_path,
- tag_to_id)
- # -*- coding: utf-8 -*-#
- import torch
- import torch.autograd as autograd
- import torch.nn as nn
- import torch.optim as optim
- from pytorch_ner import BiLSTM_CRF
- import train_indicator
- from tqdm import tqdm
- import json
- import random
- import time
- import matplotlib.pyplot as plt
-
-
- # 通过函数来准备训练序列
- def prepare_sequence(seq, char_to_id):
- char_ids = []
- for idx, char in enumerate(seq):
- # 判断若字符不在码表对应字典中,则取 UNK 的编码(即 unknown),否则取对应的字符编码
- if char_to_id.get(char):
- char_ids.append(char_to_id[char])
- else:
- char_ids.append(char_to_id["UNK"])
- return torch.tensor(char_ids, dtype=torch.long)
-
-
- # 读取文件中的数据
- def get_train_data():
- train_data_file_path = "data/train_data.txt"
- validate_file_path = "data/validate_data.txt"
- train_data_list = []
- validate_data_list = []
- for line in open(train_data_file_path, mode="r", encoding="utf8"):
- data = json.loads(line)
- train_data_list.append(data)
- for line in open(validate_file_path, mode="r", encoding="utf8"):
- data = json.loads(line)
- validate_data_list.append(data)
- return train_data_list, validate_data_list
-
-
- # 保存训练中的图片
- def save_train_history_image(train_history_list,
- validate_history_list,
- history_image_path,
- data_type):
- """
- description: 存储训练历史图片
- :param train_history_list: 训练历史结果
- :param validate_history_list: 验证历史结果
- :param history_image_path: 历史数据生成图像保存路径
- :param data_type: 数据类型[用于替换label,y轴以及保存文件名中数据类型]
- :return: 无,直接将数据转为图片进行存储
- """
- # 存储训练历史图片
- plt.plot(train_history_list, label="Train %s History" % (data_type))
- plt.plot(validate_history_list, label="Validate %s History" % (data_type))
- plt.legend(loc="best")
- plt.xlabel("Epochs")
- plt.ylabel(data_type)
- plt.savefig(history_image_path.replace("plot", data_type))
- plt.close()
-
-
- # 训练模型的主函数
- if __name__ == '__main__':
- # 初始化若干参数
- EMBEDDING_DIM = 200
- HIDDEN_DIM = 100
- train_data_list, validate_data_list = get_train_data()
- char_to_id_json_path = "char_to_id.json"
- char_to_id = json.load(open('char_to_id.json', mode='r', encoding='utf8'))
- tag_to_ix = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4, "<START>": 5, "<STOP>": 6}
-
- # 实例化模型对象, 实例化优化器
- model = BiLSTM_CRF(len(char_to_id), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
- optimizer = optim.SGD(model.parameters(), lr=0.5, momentum=0.85, weight_decay=1e-4)
- # 调转字符标签与id值
- id_to_tag = {v: k for k, v in tag_to_ix.items()}
- # 调转字符编码与id值
- id_to_char = {v: k for k, v in char_to_id.items()}
- time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime(time.time()))
- model_saved_path = "model/bilstm_crf_state_dict_%s.pt" % (time_str)
- train_history_image_path = "log/bilstm_crf_train_plot_%s.png" % (time_str)
- log_file = open("log/train_%s.log"%(time_str), mode="w", encoding="utf8")
-
- # 模型训练10个轮次
- epochs = 10
- # 初始化未来的画图数据列表
- train_loss_history, train_acc_history, train_recall_history, train_f1_history = [], [], [], []
- validate_loss_history, validate_acc_history, validate_recall_history, validate_f1_history = [], [], [], []
-
- for epoch in range(epochs):
- tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
- mode = "train"
- total_acc_length, total_prediction_length, total_gold_length, total_loss = 0, 0, 0, 0
- # 在每一个轮次epoch中, 首先应用训练集来训练模型
- for train_data in tqdm(train_data_list):
- model.zero_grad()
-
- # 获取特征数据和标签, 并进行数字化封装
- sentence, tags = train_data.get("text"), train_data.get("label")
- sentence_in = prepare_sequence(sentence, char_to_id)
- targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
-
- # 得到损失值并反向传播, 更新参数
- loss = model.neg_log_likelihood(sentence_in, targets)
- loss.backward()
- optimizer.step()
-
- # 直接调用当前模型得到最佳路径的解码结果
- score, best_path_list = model(sentence_in)
- # 累加损失值
- step_loss = loss.data.numpy()
- total_loss += step_loss
- sentence_in_unq = sentence_in.unsqueeze(0)
- targets_unq = targets.unsqueeze(0)
- best_path_list_up = [best_path_list]
- # 调用评估函数得到当前的准确率, 召回率, F1的累加值
- step_acc, step_recall, f1_score, acc_entities_length, predict_entities_length, gold_entities_length = train_indicator.indicator(sentence_in_unq.tolist(), targets_unq.tolist(), best_path_list_up, id_to_char, id_to_tag)
- total_acc_length += acc_entities_length
- total_prediction_length += predict_entities_length
- total_gold_length += gold_entities_length
-
- print("train:", total_acc_length, total_prediction_length, total_gold_length)
- # 具体的计算平均损失值, 准确率, 召回率, F1值
- if total_prediction_length > 0:
- train_mean_loss = total_loss / len(train_data_list)
- train_epoch_acc = total_acc_length / total_prediction_length
- train_epoch_recall = total_acc_length / total_gold_length
- train_epoch_f1 = 2 * train_epoch_acc * train_epoch_recall / (train_epoch_acc + train_epoch_recall)
- else:
- log_file.write("train_total_prediction_length is zero" + "\n")
-
- # 训练之后, 直接在当前轮次epoch下进入验证集的验证过程
- mode = "validate"
- total_acc_length, total_prediction_length, total_gold_length, total_loss = 0, 0, 0, 0
- # 验证保持模型参数不变
- with torch.no_grad():
- for validate_data in tqdm(validate_data_list):
- # 获取验证集的特征数据和标签
- sentence, tags = validate_data.get("text"), validate_data.get("label")
- sentence_in = prepare_sequence(sentence, char_to_id)
- targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
-
- # 得到损失值
- loss = model.neg_log_likelihood(sentence_in, targets)
-
- # 直接调用当前模型得到最佳路径的解码结果
- score, best_path_list = model(sentence_in)
- # 累加损失值
- step_loss = loss.data.numpy()
- total_loss += step_loss
- sentence_in_unq = sentence_in.unsqueeze(0)
- targets_unq = targets.unsqueeze(0)
- best_path_list_up = [best_path_list]
-
- # 调用评估函数得到当前的准确率, 召回率, F1的累加值
- step_acc, step_recall, f1_score, acc_entities_length, predict_entities_length, gold_entities_length = train_indicator.indicator(sentence_in_unq.tolist(), targets_unq.tolist(), best_path_list_up, id_to_char, id_to_tag)
- total_acc_length += acc_entities_length
- total_prediction_length += predict_entities_length
- total_gold_length += gold_entities_length
-
- print("validate:", total_acc_length, total_prediction_length, total_gold_length)
-
- # 具体的计算平均损失值, 准确率, 召回率, F1值
- # 这里面有一个前提, 就是最少要预测正确一个才有意义
- if total_acc_length != 0 and total_prediction_length != 0:
- validate_mean_loss = total_loss / len(validate_data_list)
- validate_epoch_acc = total_acc_length / total_prediction_length
- validate_epoch_recall = total_acc_length / total_gold_length
- validate_epoch_f1 = 2 * validate_epoch_acc * validate_epoch_recall / (validate_epoch_acc + validate_epoch_recall)
- log_text = "Epoch: %s | train loss: %.5f |train acc: %.3f |train recall: %.3f |train f1 score: %.3f" \
- " | validate loss: %.5f |validate acc: %.3f |validate recall: %.3f |validate f1 score: %.3f" % \
- (epoch,
- train_mean_loss, train_epoch_acc, train_epoch_recall, train_epoch_f1,
- validate_mean_loss, validate_epoch_acc, validate_epoch_recall, validate_epoch_f1)
- log_file.write(log_text+"\n")
- train_loss_history.append(train_mean_loss)
- train_acc_history.append(train_epoch_acc)
- train_recall_history.append(train_epoch_recall)
- train_f1_history.append(train_epoch_f1)
- validate_loss_history.append(validate_mean_loss)
- validate_acc_history.append(validate_epoch_acc)
- validate_recall_history.append(validate_epoch_recall)
- validate_f1_history.append(validate_epoch_f1)
- else:
- log_file.write("validate_total_prediction_length is zero" + "\n")
-
- # 当10轮epochs全部完成之后, 就可以保存模型, 并开始绘图了
- # 保存模型
- torch.save(model.state_dict(), model_saved_path)
-
- # 将 loss 下降历史数据转为图片存储
- save_train_history_image(train_loss_history,
- validate_loss_history,
- train_history_image_path,
- "Loss")
- # 将准确率提升历史数据转为图片存储
- save_train_history_image(train_acc_history,
- validate_acc_history,
- train_history_image_path,
- "Acc")
- # 将召回提升历史数据转为图片存储
- save_train_history_image(train_recall_history,
- validate_recall_history,
- train_history_image_path,
- "Recall")
- # 将F1上升历史数据转为图片存储
- save_train_history_image(train_f1_history,
- validate_f1_history,
- train_history_image_path,
- "F1")
- print("train Finished".center(100, "-"))
- for name, parameters in model.named_parameters():
- print(name, ':', parameters.size())
- # -*- coding: utf-8 -*-#
- import torch
-
-
- def indicator(sentence_list, gold_tags, predict_tags, id2char, id2tag):
- """
- description: 评价模型指标方法
- :param sentence_list: 句子列表
- :param gold_tags: 标签序列
- :param predict_tags: 预测标签序列
- :param id2char: 文字码表
- :param id2tag: 标签码表
- :return: step_acc, 当前批次准确率
- step_recall, 当前批次召回率
- f1_score, 当前批次 f1 值
- acc_entities_length, 当前批次正确识别实体总数
- predict_entities_length, 当前批次识别出的实体总数
- gold_entities_length 当前批次金标准的实体总数
- """
- # 金标准实体集合以及每个实体的字与标签列表
- gold_entities, gold_entity = [], []
- # 预测实体集合以及每个实体的字与标签列表
- predict_entities, predict_entity = [], []
- # 迭代句子列表
- for line_no, sentence in enumerate(sentence_list):
- # 迭代句子每个字符
- for char_no in range(len(sentence)):
- # 判断:若句子的id值对应的是0(即:<PAD>),则跳出当前句子的遍历
- if sentence[char_no]==0:
- break
- # 获取当前句子中的每一个文字
- char_text = id2char[sentence[char_no]]
- # 获取当前字金标准实体标注类型
- gold_tag_type = id2tag[gold_tags[line_no][char_no]]
- # 获取当前预测实体标注类型
- predict_tag_type = id2tag[predict_tags[line_no][char_no]]
- # 判断 id2tag 第一个字符是否为 B ,表示实体开始
- if gold_tag_type[0] == "B":
- # 将实体文字与类别加入实体列表中
- gold_entity = [char_text + "/" + gold_tag_type]
- # 判断 id2tag 第一个字符是否为 I ,表示实体中部到结尾
- # 判断依据: I 开头; entity 不为空; 实体类别相同.
- elif gold_tag_type[0] == "I" \
- and len(gold_entity) != 0 \
- and gold_entity[-1].split("/")[1][1:] == gold_tag_type[1:]:
- # 加入实体列表中
- gold_entity.append(char_text + "/" + gold_tag_type)
- # 判断依据: O 开头; entity 不为空.
- elif gold_tag_type[0] == "O" and len(gold_entity) != 0 :
- # 增加唯一标识[实体后的O的索引位置]
- gold_entity.append(str(line_no) + "_" + str(char_no))
- # 将实体文字与类别加入实体列表中
- gold_entities.append(gold_entity)
- # 重置实体列表
- gold_entity=[]
- else:
- # 重置实体列表
- gold_entity=[]
-
-
- # 判断 id2tag 第一个字符是否为 B ,表示实体开始
- if predict_tag_type[0] == "B":
- # 将实体文字与类别加入实体列表中
- predict_entity = [char_text + "/" + predict_tag_type]
- # 判断 id2tag 第一个字符是否为 I ,表示实体中部到结尾
- # 判断依据: I 开头; entity 不为空; 实体类别相同.
- elif predict_tag_type[0] == "I" \
- and len(predict_entity) != 0 \
- and predict_entity[-1].split("/")[1][1:] == predict_tag_type[1:]:
- # 将实体文字与类别加入实体列表中
- predict_entity.append(char_text + "/" + predict_tag_type)
- # 判断依据: O 开头; entity 不为空.
- elif predict_tag_type[0] == "O" and len(predict_entity) != 0:
- # 增加唯一标识[实体后的O的索引位置]
- predict_entity.append(str(line_no) + "_" + str(char_no))
- # 将实体加入列表中
- predict_entities.append(predict_entity)
- # 重置实体列表
- predict_entity = []
- else:
- # 重置实体列表
- predict_entity = []
- # 获取预测正确的实体集合
- acc_entities = [entity for entity in predict_entities if entity in gold_entities]
- # 预测正确实体长度[用于计算准确率\召回\F1值]
- acc_entities_length = len(acc_entities)
- # 预测出实体个数
- predict_entities_length = len(predict_entities)
- # 真实实体列表个数
- gold_entities_length = len(gold_entities)
- # 如果准确实体个数大于 0,则计算准确度\召回率\f1值
- if acc_entities_length > 0:
- # 准确率
- step_acc = float(acc_entities_length / predict_entities_length)
- # 召回率
- step_recall = float(acc_entities_length / gold_entities_length)
- # f1 值
- f1_score = 2 * step_acc * step_recall / (step_acc + step_recall)
- # 返回评估值与各个实体长度(用于整体计算)
- return step_acc, step_recall, f1_score, acc_entities_length, predict_entities_length, gold_entities_length
- else:
- # 准确率\召回率\f1值均为0
- return 0, 0, 0, acc_entities_length, predict_entities_length, gold_entities_length
-
-
- def sentence_map(sentence_list, char_to_id, max_length):
- """
- description: 将句子中的每一个文字映射到码表中
- :param sentence_list: 待映射的句子列表
- :param char_to_id: 码表,类型为字典,格式为{"字1":1, "字2":2}
- :param max_length: 句子的最大长度, 不足部分用0(即<PAD>)补齐
- :return: 每个句子中每个字对应的id组成的张量
- """
- # 按照句子长度从长到短排序
- sentence_list.sort(key=lambda c: len(c), reverse=True)
- # 定义句子映射列表
- sentence_map_list = []
- for sentence in sentence_list:
- # 生成句子中每个字对应的 id 列表
- sentence_id_list = [char_to_id[c] for c in sentence]
- # 计算所要填充 0 的长度
- padding_list = [0] * (max_length - len(sentence))
- # 组合
- sentence_id_list.extend(padding_list)
- # 将填充后的列表加入句子映射总表中
- sentence_map_list.append(sentence_id_list)
- # 返回句子映射集合,转为标量
- return torch.tensor(sentence_map_list, dtype=torch.long)
-
-
- if __name__ == '__main__':
- sentence_list = [
- "确诊弥漫大b细胞淋巴瘤1年",
- "反复咳嗽、咳痰40年,再发伴气促5天。",
- "生长发育迟缓9年。",
- "右侧小细胞肺癌第三次化疗入院",
- "反复气促、心悸10年,加重伴胸痛3天。",
- "反复胸闷、心悸、气促2多月,加重3天",
- "咳嗽、胸闷1月余, 加重1周",
- "右上肢无力3年, 加重伴肌肉萎缩半年"
- ]
- # 码表与id对照
- char_to_id = {"<PAD>": 0}
- # 迭代句子集合,获取每一个句子
- for sentence in sentence_list:
- # 获取句子中的每一个字
- for _char in sentence:
- # 判断是否在码表 id 对照字典中存在
- if _char not in char_to_id:
- # 加入字符id对照字典
- char_to_id[_char] = len(char_to_id)
- # 标签码表对照
- tag_to_id = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4}
- sentences_sequence = sentence_map(sentence_list, char_to_id, 20)
-
- # 真实标签数据,对应为 tag_to_id 中的 id
- tag_list = [
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- ]
-
- # 预测标签数据,对应为 tag_to_id 中的 id
- predict_tag_list = [
- [0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
- [0, 0, 3, 4, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0],
- [3, 4, 0, 3, 4, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
- [0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
- ]
- # 调转字符标签与id值
- id_to_tag = {v:k for k, v in tag_to_id.items()}
- # 调转字符编码与id值
- id_to_char = {v:k for k, v in char_to_id.items()}
- # 获取返回结果
- step_acc, \
- step_recall, \
- f1_score, \
- acc_entities_length, \
- predict_entities_length, \
- gold_entities_length = indicator(sentences_sequence.tolist(),
- tag_list,
- predict_tag_list,
- id_to_char,
- id_to_tag)
- # 打印输出
- print("step_acc:", step_acc, #当前批次准确率
- "\nstep_recall:", step_recall, #当前批次召回率
- "\nf1_score:", f1_score, #当前批次 f1 值
- "\nacc_entities_length:", acc_entities_length, #当前批次正确识别实体总数
- "\npredict_entities_length:", predict_entities_length, #当前批次识别出的实体总数
- "\ngold_entities_length:", gold_entities_length) #当前批次金标准的实体总数
- import torch
- import torch.nn as nn
-
- # 使用torch.hub加载bert中文模型的字映射器
- tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-chinese')
- # 使用torch.hub加载bert中文模型
- model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-chinese')
-
-
- # 编写获取bert编码的函数
- def get_bert_encode(text_1, text_2, mark=102, max_len=10):
- '''
- 功能: 使用bert中文模型对输入的文本进行编码
- text_1: 代表输入的第一句话
- text_2: 代表输入的第二句话
- mark: 分隔标记, 是bert预训练模型tokenizer的一个自身特殊标记, 当输入两个文本的时候, 有中间的特殊分隔符, 102
- max_len: 限制的最大语句长度, 如果大于max_len, 进行截断处理, 如果小于max_len, 进行0填充的处理
- return: 输入文本的bert编码
- '''
- # 第一步使用tokenizer进行两个文本的字映射
- indexed_tokens = tokenizer.encode(text_1, text_2)
- # 接下来要对两个文本进行补齐, 或者截断的操作
- # 首先要找到分隔标记的位置
- k = indexed_tokens.index(mark)
-
- # 第二步处理第一句话, 第一句话是[:k]
- if len(indexed_tokens[:k]) >= max_len:
- # 长度大于max_len, 进行截断处理
- indexed_tokens_1 = indexed_tokens[:max_len]
- else:
- # 长度小于max_len, 需要对剩余的部分进行0填充
- indexed_tokens_1 = indexed_tokens[:k] + (max_len - len(indexed_tokens[:k])) * [0]
-
- # 第三步处理第二句话, 第二句话是[k:]
- if len(indexed_tokens[k:]) >= max_len:
- # 长度大于max_len, 进行截断处理
- indexed_tokens_2 = indexed_tokens[k:k+max_len]
- else:
- # 长度小于max_len, 需要对剩余的部分进行0填充
- indexed_tokens_2 = indexed_tokens[k:] + (max_len - len(indexed_tokens[k:])) * [0]
-
- # 接下来将处理后的indexed_tokens_1和indexed_tokens_2进行相加合并
- indexed_tokens = indexed_tokens_1 + indexed_tokens_2
-
- # 需要一个额外的标志列表, 来告诉模型哪部分是第一句话, 哪部分是第二句话
- # 利用0元素来表示第一句话, 利用1元素来表示第二句话
- # 注意: 两句话的长度都已经被我们规范成了max_len
- segments_ids = [0] * max_len + [1] * max_len
-
- # 利用torch.tensor将两个列表封装成张量
- tokens_tensor = torch.tensor([indexed_tokens])
- segments_tensor = torch.tensor([segments_ids])
-
- # 利用模型进行编码不求导
- with torch.no_grad():
- # 使用bert模型进行编码, 传入参数tokens_tensor和segments_tensor, 最终得到模型的输出encoded_layers
- encoded_layers, _ = model(tokens_tensor, token_type_ids=segments_tensor)
-
- return encoded_layers
-
-
- text_1 = "人生该如何起头"
- text_2 = "改变要如何起手"
-
- encoded_layers = get_bert_encode(text_1, text_2)
- print(encoded_layers)
- print(encoded_layers.shape)
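注意: 这里通过 torch.hub 加载的是旧版 'huggingface/pytorch-transformers' 接口, 模型输出为元组, 可以直接解包得到 encoded_layers。如果改用较新的 transformers 库(以下仅为假设环境下的等价写法, 不是原教程的代码), 模型默认返回对象, 需要取 last_hidden_state:
- # 示意: 在较新的 transformers 库下得到同样形状的编码张量(假设已安装 transformers)
- from transformers import BertTokenizer, BertModel
- import torch
-
- tokenizer_new = BertTokenizer.from_pretrained('bert-base-chinese')
- model_new = BertModel.from_pretrained('bert-base-chinese')
-
- with torch.no_grad():
-     inputs = tokenizer_new("人生该如何起头", "改变要如何起手", return_tensors="pt")
-     outputs = model_new(**inputs)
-     # 最后一层的隐层输出, 形状为 [1, seq_len, 768]
-     encoded = outputs.last_hidden_state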
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
-
- # 创建微调模型类Net
- class Net(nn.Module):
- def __init__(self, char_size=20, embedding_size=768, dropout=0.2):
- '''
- char_size: 输入句子中的字符数量, 因为bert编码时将单句长度规范为10, 两句拼接后这里等于2*10=20
- embedding_size: 字嵌入的维度, 因为使用了bert中文模型, 而bert的嵌入维度是768, 因此这里的词嵌入维度也要是768
- dropout: 为了防止过拟合, 网络中引入dropout层, dropout为置0的比例, 默认等于0.2
- '''
- super(Net, self).__init__()
- # 将传入的参数变成类内部的变量
- self.char_size = char_size
- self.embedding_size = embedding_size
- # 实例化Dropout层
- self.dropout = nn.Dropout(p=dropout)
- # 定义第一个全连接层
- self.fc1 = nn.Linear(char_size * embedding_size, 8)
- # 定义第二个全连接层
- self.fc2 = nn.Linear(8, 2)
-
- def forward(self, x):
- # 首先要对输入的张量形状进行变化, 要满足匹配全连接层
- x = x.view(-1, self.char_size * self.embedding_size)
-
- # 使用Dropout层
- x = self.dropout(x)
-
- # 将x输入进第一个全连接层
- x = F.relu(self.fc1(x))
-
- # 再次使用Dropout层
- x = self.dropout(x)
-
- # 将x输入进第二个全连接层
- x = F.relu(self.fc2(x))
-
- return x
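一点可选的说明(仅为示意, 并非原教程的实现): 由于训练时使用 nn.CrossEntropyLoss, 其内部已包含 log_softmax, 最后一层通常直接输出原始 logits 即可; 对输出再做一次 relu 会把负的 logit 全部截断为0。下面是一个假设性的替代写法 NetLogits(类名为示意):
- # 示意: 一种常见的替代写法(非原教程实现), 最后一层直接输出logits
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
- class NetLogits(nn.Module):
-     def __init__(self, char_size=20, embedding_size=768, dropout=0.2):
-         super(NetLogits, self).__init__()
-         self.char_size = char_size
-         self.embedding_size = embedding_size
-         self.dropout = nn.Dropout(p=dropout)
-         self.fc1 = nn.Linear(char_size * embedding_size, 8)
-         self.fc2 = nn.Linear(8, 2)
-
-     def forward(self, x):
-         x = x.view(-1, self.char_size * self.embedding_size)
-         x = self.dropout(x)
-         x = F.relu(self.fc1(x))
-         x = self.dropout(x)
-         # 不再对输出做relu, 直接返回logits, 交给CrossEntropyLoss内部的log_softmax处理
-         return self.fc2(x)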
-
-
- embedding_size = 768
- char_size = 20
- dropout = 0.2
-
- x = torch.randn(1, 20, 768)
-
- net = Net(char_size, embedding_size, dropout)
- res = net(x)
- print(res)
-
-
-
- import pandas as pd
- from sklearn.utils import shuffle
- from functools import reduce
- from collections import Counter
- from bert_chinese_encode import get_bert_encode
- import torch
- import torch.nn as nn
-
-
- # 定义数据加载器构造函数
- def data_loader(data_path, batch_size, split=0.2):
- '''
- data_path: 训练数据的路径
- batch_size: 训练集和验证集的批次大小
- split: 训练集和验证集的划分比例
- return: 训练数据生成器, 验证数据的生成器, 训练数据的大小, 验证数据的大小
- '''
- # 首先读取数据
- data = pd.read_csv(data_path, header=None, sep="\t")
-
- # 打印一下整体数据集上正负样本的数量
- print("数据集的正负样本数量:")
- print(dict(Counter(data[0].values)))
-
- # 要对读取的数据进行散乱顺序的操作
- data = shuffle(data).reset_index(drop=True)
-
- # 划分训练集和验证集
- split_point = int(len(data) * split)
- valid_data = data[:split_point]
- train_data = data[split_point:]
-
- # 保证验证集中的数据总数至少能够满足一个批次
- if len(valid_data) < batch_size:
- raise ValueError("Batch size or split not match!")
-
- # 定义获取每个批次数据生成器的函数
- def _loader_generator(data):
- # data: 训练数据或者验证数据
- # 以每个批次大小的间隔来遍历数据集
- for batch in range(0, len(data), batch_size):
- # 初始化batch数据的存放张量列表
- batch_encoded = []
- batch_labels = []
- # 逐条进行数据的遍历
- for item in data[batch: batch + batch_size].values.tolist():
- # 对每条数据进行bert预训练模型的编码
- encoded = get_bert_encode(item[1], item[2])
- # 将编码后的每条数据放进结果列表中
- batch_encoded.append(encoded)
- # 将标签放入结果列表中
- batch_labels.append([item[0]])
-
- # 使用reduce高阶函数将列表中的数据转换成模型需要的张量形式
- # encoded的形状[batch_size, 2 * max_len, embedding_size]
- encoded = reduce(lambda x, y: torch.cat((x, y), dim=0), batch_encoded)
- labels = torch.tensor(reduce(lambda x, y: x + y, batch_labels))
- # 以生成器的方式返回数据和标签
- yield (encoded, labels)
-
- return _loader_generator(train_data), _loader_generator(valid_data), len(train_data), len(valid_data)
-
-
- data_path = "./train_data.csv"
- batch_size = 32
- max_len = 10
-
- train_data_labels, valid_data_labels, train_data_length, valid_data_length = data_loader(data_path, batch_size)
- # print(next(train_data_labels))
- # print(next(valid_data_labels))
- # print("train_data_length:", train_data_length)
- # print("valid_data_length:", valid_data_length)
-
-
- from finetuning_net import Net
- import torch.optim as optim
-
- # 初始化若干参数
- embedding_size = 768
- char_size = 2 * max_len
-
- # 实例化微调网络
- net = Net(char_size, embedding_size)
- # 定义交叉熵损失函数
- criterion = nn.CrossEntropyLoss()
- # 定义优化器
- optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
-
- def train(train_data_labels):
- # train_data_labels: 代表训练数据和标签的生成器对象
- # return: 整个训练过程的平均损失和, 正确标签数量的累加和
- # 初始化损失变量和准确数量
- train_running_loss = 0.0
- train_running_acc = 0.0
-
- # 遍历数据生成器
- for train_tensor, train_labels in train_data_labels:
- # 首先将优化器的梯度归零
- optimizer.zero_grad()
- # 将训练数据传入模型得到输出结果
- train_outputs = net(train_tensor)
- # 计算当前批次的平均损失
- train_loss = criterion(train_outputs, train_labels)
- # 累加损失
- train_running_loss += train_loss.item()
- # 训练模型, 反向传播
- train_loss.backward()
- # 优化器更新模型参数
- optimizer.step()
- # 将该批次样本中正确的预测数量进行累加
- train_running_acc += (train_outputs.argmax(1) == train_labels).sum().item()
-
- # 整个循环结束后, 训练完毕, 得到损失和以及正确样本的总量
- return train_running_loss, train_running_acc
-
-
- def valid(valid_data_labels):
- # valid_data_labels: 代表验证数据和标签的生成器对象
- # return: 整个验证过程中的平均损失和和正确标签的数量和
- # 初始化损失值和正确标签数量
- valid_running_loss = 0.0
- valid_running_acc = 0
-
- # 循环遍历验证数据集的生成器
- for valid_tensor, valid_labels in valid_data_labels:
- # 测试阶段梯度不被更新
- with torch.no_grad():
- # 将特征输入网络得到预测张量
- valid_outputs = net(valid_tensor)
- # 计算当前批次的损失值
- valid_loss = criterion(valid_outputs, valid_labels)
- # 累加损失和
- valid_running_loss += valid_loss.item()
- # 累加正确预测的标签数量
- valid_running_acc += (valid_outputs.argmax(1) == valid_labels).sum().item()
-
- # 返回整个验证过程中的平均损失和, 累加的正确标签数量
- return valid_running_loss, valid_running_acc
-
- epochs = 20
-
- # 定义每个轮次的损失和准确率的列表初始化, 用于未来画图
- all_train_losses = []
- all_valid_losses = []
- all_train_acc = []
- all_valid_acc = []
-
- for epoch in range(epochs):
- # 打印轮次
- print("Epoch:", epoch + 1)
- # 首先通过数据加载函数, 获得训练数据和验证数据的生成器, 以及对应的训练样本数和验证样本数
- train_data_labels, valid_data_labels, train_data_len, valid_data_len = data_loader(data_path, batch_size)
-
- # 调用训练函数进行训练
- train_running_loss, train_running_acc = train(train_data_labels)
- # 调用验证函数进行验证
- valid_running_loss, valid_running_acc = valid(valid_data_labels)
-
- # 计算平均损失, 每个批次的平均损失之和乘以批次样本数量, 再除以本轮次的样本总数
- train_average_loss = train_running_loss * batch_size / train_data_len
- valid_average_loss = valid_running_loss * batch_size / valid_data_len
-
- # 计算准确率, 本轮次总的准确样本数除以本轮次的总样本数
- train_average_acc = train_running_acc / train_data_len
- valid_average_acc = valid_running_acc / valid_data_len
-
- # 接下来将4个值添加进画图的列表中
- all_train_losses.append(train_average_loss)
- all_valid_losses.append(valid_average_loss)
- all_train_acc.append(train_average_acc)
- all_valid_acc.append(valid_average_acc)
-
- # 打印本轮次的训练损失, 准确率, 以及验证损失, 准确率
- print("Train Loss:", train_average_loss, "|", "Train Acc:", train_average_acc)
- print("Valid Loss:", valid_average_loss, "|", "Valid Acc:", valid_average_acc)
-
- print("Finished Training.")
-
-
- # 导入画图的工具包
- import matplotlib.pyplot as plt
- from matplotlib.pyplot import MultipleLocator
-
- # 创建第一张画布
- plt.figure(0)
-
- # 绘制训练损失曲线
- plt.plot(all_train_losses, label="Train Loss")
- # 绘制验证损失曲线, 同时将颜色设置为红色
- plt.plot(all_valid_losses, color="red", label="Valid Loss")
-
- # 定义横坐标间隔对象, 间隔等于1, 代表一个轮次一个坐标点
- x_major_locator = MultipleLocator(1)
- # 获得当前坐标图的句柄
- ax = plt.gca()
- # 在句柄上设置横坐标的刻度间隔
- ax.xaxis.set_major_locator(x_major_locator)
- # 设置横坐标取值范围
- plt.xlim(1, epochs)
- # 将图例放在左上角
- plt.legend(loc='upper left')
- # 保存图片
- plt.savefig("./loss.png")
-
-
- # 创建第二张画布
- plt.figure(1)
-
- # 绘制训练准确率曲线
- plt.plot(all_train_acc, label="Train Acc")
- # 绘制验证准确率曲线, 同时将颜色设置为红色
- plt.plot(all_valid_acc, color="red", label="Valid Acc")
- # 定义横坐标间隔对象, 间隔等于1, 代表一个轮次一个坐标点
- x_major_locator = MultipleLocator(1)
- # 获得当前坐标图的句柄
- ax = plt.gca()
- # 在句柄上设置横坐标的刻度间隔
- ax.xaxis.set_major_locator(x_major_locator)
- # 设置横坐标的取值范围
- plt.xlim(1, epochs)
- # 将图例放在左上角
- plt.legend(loc='upper left')
- # 保存图片
- plt.savefig("./acc.png")
-
-
- # 保存模型时间(需要导入time工具包)
- import time
- time_ = int(time.time())
- # 设置保存路径和模型名称
- MODEL_PATH = './model/BERT_net_%d.pth' % time_
- # 保存训练好的微调网络net的参数
- torch.save(net.state_dict(), MODEL_PATH)
- # 导入若干工具包
- from flask import Flask
- from flask import request
- app = Flask(__name__)
-
- import torch
- # 导入中文预训练模型的编码函数
- from bert_chinese_encode import get_bert_encode
- # 导入微调网络模型
- from finetuning_net import Net
-
- # 设置训练好的模型路径
- MODEL_PATH = "./model/BERT_net.pth"
-
- # 定义实例化的模型参数
- embedding_size = 768
- char_size = 20
- dropout = 0.2
-
- # 初始化微调模型
- net = Net(char_size, embedding_size, dropout)
- # 加载已经训练好的模型
- net.load_state_dict(torch.load(MODEL_PATH))
- # 因为是在线部分, 所以采用评估模式, 本质是模型参数不发生变化
- net.eval()
-
- # 定义服务请求的路径和方式
- @app.route('/v1/recognition/', methods=["POST"])
- def recognition():
- # 首先接收数据
- text_1 = request.form['text1']
- text_2 = request.form['text2']
- # 对原始文本进行编码
- inputs = get_bert_encode(text_1, text_2, mark=102, max_len=10)
- # 使用微调模型进行预测
- outputs = net(inputs)
- # 从预测张量中获取结果
- _, predicted = torch.max(outputs, 1)
- # 返回字符串类型的结果
- return str(predicted.item())
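该识别服务写好后, 可以按照总体架构中的做法用 gunicorn 启动, 端口与主逻辑服务配置中的 model_serve_url 保持一致(5001); 下面的文件名 app.py 以及验证代码仅为示意:
- # 启动命令(示意, 假设该文件保存为app.py):
- #     gunicorn -w 1 -b 0.0.0.0:5001 app:app
- # 用requests简单验证服务是否可用(示意)
- import requests
-
- data = {"text1": "人生该如何起头", "text2": "改变要如何起手"}
- res = requests.post("http://0.0.0.0:5001/v1/recognition/", data=data, timeout=2)
- # 预期返回字符串 "0" 或 "1"
- print(res.text)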
- # 服务框架使用Flask
- # 导入相关的包
- from flask import Flask
- from flask import request
- app = Flask(__name__)
-
- # 导入发送http请求的requests工具
- import requests
-
- # 导入redis
- import redis
-
- # 导入json工具
- import json
-
- # 导入已经编写好的Unit API文件
- from unit import unit_chat
-
- # 导入操作neo4j数据库的工具
- from neo4j import GraphDatabase
-
- # 从配置文件config.py导入需要的若干配置信息
- # 导入neo4j的相关信息
- from config import NEO4J_CONFIG
- # 导入redis的相关信息
- from config import REDIS_CONFIG
- # 导入句子相关模型服务的请求地址
- from config import model_serve_url
- # 导入句子相关模型服务的超时时间
- from config import TIMEOUT
- # 导入规则对话模型的加载路径
- from config import reply_path
- # 导入用户对话信息保存的过期时间
- from config import ex_time
-
- # 建立redis的连接池
- pool = redis.ConnectionPool( **REDIS_CONFIG)
-
- # 初始化neo4j的驱动对象
- _driver = GraphDatabase.driver( **NEO4J_CONFIG)
-
-
- # 查询neo4j图数据的函数
- def query_neo4j(text):
- '''
- 功能: 根据用户对话文本中可能存在的疾病症状, 来查询图数据库, 返回对应的疾病名称
- text: 用户输入的文本语句
- return: 用户描述的症状所对应的的疾病名称列表
- '''
- # 开启一个会话session来操作图数据库
- with _driver.session() as session:
- # 构建查询的cypher语句, 匹配句子中存在的所有症状节点
- # 保存这些临时的节点, 并通过关系dis_to_sym进行对应疾病名称的查找, 返回找到的疾病名称列表
- cypher = "MATCH(a:Symptom) WHERE(%r contains a.name) WITH \
- a MATCH(a)-[r:dis_to_sym]-(b:Disease) RETURN b.name LIMIT 5" %text
- # 通过会话session来运行cypher语句
- record = session.run(cypher)
- # 从record中读取真正的疾病名称信息, 并封装成List返回
- result = list(map(lambda x: x[0], record))
- return result
-
-
- # 主要逻辑服务类Handler类
- class Handler(object):
- def __init__(self, uid, text, r, reply):
- '''
- uid: 用户唯一标识uid
- text: 标识该用户本次输入的文本信息
- r: 代表redis数据库的一个链接对象
- reply: 规则对话模板加载到内存中的对象(字典对象)
- '''
- self.uid = uid
- self.text = text
- self.r = r
- self.reply = reply
-
- # 编写非首句处理函数, 该用户不是第一句问话
- def non_first_sentence(self, previous):
- '''
- previous: 代表该用户当前语句的上一句文本信息
- '''
- # 尝试请求语句模型服务, 如果失败, 打印错误信息
- # 在此处打印信息, 说明服务已经可以进入到非首句处理函数中
- print("准备请求句子相关模型服务!")
- try:
- data = {"text1": previous, "text2": self.text}
- # 直接向语句服务模型发送请求
- result = requests.post(model_serve_url, data=data, timeout=TIMEOUT)
- # 如果回复为空, 说明服务暂时不提供信息, 转去百度机器人回复
- if not result.text:
- return unit_chat(self.text)
- # 此处打印信息, 说明句子相关模型服务请求成功且不为空
- print("句子相关模型服务请求成功, 返回结果为:", result.text)
- except Exception as e:
- print("模型服务异常:", e)
- return unit_chat(self.text)
-
- # 此处打印信息, 说明程序已经准备进行neo4j数据库查询
- print("骑牛模型服务后, 准备请求neo4j查询服务!")
- # 查询图数据库, 并得到疾病名称的列表结果
- s = query_neo4j(self.text)
- # 此处打印信息, 说明已经成功获得了neo4j的查询结果
- print("neo4j查询服务请求成功, 返回结果是:", s)
- # 判断如果结果为空, 继续用百度机器人回复
- if not s:
- return unit_chat(self.text)
- # 如果结果不是空, 从redis中获取上一次已经回复给用户的疾病名称
- old_disease = self.r.hget(str(self.uid), "previous_d")
- # 如果曾经回复过用户若干疾病名称, 将新查询的疾病和已经回复的疾病做并集, 再次存储
- # 新查询的疾病, 要和曾经回复过的疾病做差集, 这个差集再次回复给用户
- if old_disease:
- # new_disease是本次需要存储进redis数据库的疾病, 做并集得来
- new_disease = list(set(s) | set(eval(old_disease)))
- # 返回给用户的疾病res, 是本次查询结果和曾经的回复结果之间的差集
- res = list(set(s) - set(eval(old_disease)))
- else:
- # 如果曾经没有给该用户的回复疾病, 则存储的数据和返回给用户的数据相同, 都是从neo4j数据库查询返回的结果
- res = new_disease = list(set(s))
-
- # 将new_disease存储进redis数据库中, 同时覆盖掉之前的old_disease
- self.r.hset(str(self.uid), "previous_d", str(new_disease))
- # 设置redis数据的过期时间
- self.r.expire(str(self.uid), ex_time)
-
- # 此处打印信息, 说明neo4j查询后已经处理完了redis任务, 开始使用规则对话模板
- print("使用规则对话模板进行返回对话的生成!")
- # 将列表转化为字符串, 添加进规则对话模板中返回给用户
- if not res:
- return self.reply["4"]
- else:
- res = ",".join(res)
- return self.reply["2"] %res
-
- # 编写首句处理函数, 该用户是第一次发言
- def first_sentence(self):
- # 此处打印信息, 说明程序逻辑进入了首句处理函数, 并且马上要进行neo4j查询
- print("该用户近期首次发言, 不必请求模型服务, 准备请求neo4j查询服务!")
- # 直接查询neo4j图数据库, 并得到疾病名称列表的结果
- s = query_neo4j(self.text)
- # 此处打印信息, 说明已经成功完成了neo4j查询服务
- print("neo4j查询服务请求成功, 返回结果:", s)
- # 判断如果结果为空列表, 再次访问百度机器人
- if not s:
- return unit_chat(self.text)
-
- # 将查询回来的结果存储进redis, 并且做为下一次访问的"上一条语句"previous
- self.r.hset(str(self.uid), "previous_d", str(s))
- # 设置数据库的过期时间
- self.r.expire(str(self.uid), ex_time)
- # 将列表转换为字符串, 添加进规则对话模板中返回给用户
- res = ",".join(s)
- # 此处打印信息, 说明neo4j查询后有结果并且非空, 接下来将使用规则模板进行对话生成
- print("使用规则对话生成模板进行返回对话的生成!")
- return self.reply["2"] %res
-
-
- # 定义主要逻辑服务的主函数
- # 首先设定主要逻辑服务的路由和请求方法
- @app.route('/v1/main_serve/', methods=["POST"])
- def main_serve():
- # 此处打印信息, 说明werobot服务成功的发送了请求
- print("已经进入主要逻辑服务, werobot服务正常运行!")
- # 第一步接收来自werobot服务的相关字段, uid: 用户唯一标识, text: 用户输入的文本信息
- uid = request.form['uid']
- text = request.form['text']
-
- # 从redis连接池中获得一个活跃的连接
- r = redis.StrictRedis(connection_pool=pool)
-
- # 获取该用户上一次说的话(注意: 可能为空)
- previous = r.hget(str(uid), "previous")
- # 将当前输入的text存入redis, 作为下一次访问时候的"上一句话"
- r.hset(str(uid), "previous", text)
-
- # 此处打印信息, 说明redis能够正常读取数据和写入数据
- print("已经完成了初次会话管理, redis运行正常!")
- # 将规则对话模板的文件Load进内存
- reply = json.load(open(reply_path, "r"))
-
- # 实例化Handler类
- handler = Handler(uid, text, r, reply)
-
- # 如果上一句话存在, 调用非首句服务函数
- if previous:
- return handler.non_first_sentence(previous)
- # 如果上一句话不存在, 调用首句服务函数
- else:
- return handler.first_sentence()
-
- '''
- if __name__ == '__main__':
- text = "我最近腹痛!"
- result = query_neo4j(text)
- print("疾病列表:", result)
- '''
-
- # 设置redis相关的配置信息
- REDIS_CONFIG = {
- "host": "0.0.0.0",
- "port": 6379,
- "decode_responses":True
- }
-
- # 设置neo4j图数据库的配置信息
- NEO4J_CONFIG = {
- "uri": "bolt://0.0.0.0:7687",
- "auth": ("neo4j", "Itcast2019"),
- "encrypted": False
- }
-
- # 设置句子相关服务的请求地址
- model_serve_url = "http://0.0.0.0:5001/v1/recognition/"
-
- # 设置服务的超时时间
- TIMEOUT = 2
-
- # 设置规则对话的模板加载路径
- reply_path = "./reply.json"
-
- # 用户对话信息保存的过期时间
- ex_time = 36000
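上面 config.py 中的 reply_path 指向规则对话模板 reply.json。结合前面 Handler 中 self.reply["2"] %res 和 self.reply["4"] 的用法, 模板至少需要包含键 "2"(其中含 %s 占位符, 用于填充疾病名称)和键 "4"(查询不到结果时的固定回复)。下面是生成该模板的一个最小示意, 具体话术内容为假设:
- # 示意: 生成一个最小化的reply.json模板, 话术内容为假设
- import json
-
- reply = {
-     "2": "根据您的描述, 您可能患有以下疾病: %s, 建议及时就医。",
-     "4": "抱歉, 暂时没有查询到相关的疾病信息, 请补充更多症状描述。"
- }
-
- with open("./reply.json", "w", encoding="utf8") as f:
-     json.dump(reply, f, ensure_ascii=False, indent=4)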
- # -*- coding: utf-8 -*-
- import json
- import random
- import requests
-
- # client_id 为官网获取的AK, client_secret 为官网获取的SK
- client_id = "xxxxxxx"
- client_secret = "xxxxxxx"
-
-
- def unit_chat(chat_input, user_id="88888"):
- """
- description:调用百度UNIT接口,回复聊天内容
- Parameters
- ----------
- chat_input : str
- 用户发送的聊天内容
- user_id : str
- 发起聊天用户ID,可任意定义
- Return
- ----------
- 返回unit回复内容
- """
- # 设置默认回复内容, 一旦接口出现异常, 回复该内容
- chat_reply = "不好意思,俺们正在学习中,随后回复你。"
- # 根据 client_id 与 client_secret 获取 token 的 url https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s
- url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s" % (client_id, client_secret)
- res = requests.get(url)
- access_token = eval(res.text)["access_token"]
- # 根据 access_token 获取聊天机器人接口数据 https://aip.baidubce.com/rpc/2.0/unit/service/chat?access_token=
- unit_chatbot_url = "https://aip.baidubce.com/rpc/2.0/unit/service/chat?access_token=" + access_token
- # 拼装聊天接口对应请求发送数据,主要是填充 query 值
- post_data = {
- "log_id": str(random.random()),
- "request": {
- "query": chat_input,
- "user_id": user_id
- },
- "session_id": "",
- "service_id": "S23245",
- "version": "2.0"
- }
- # 将聊天接口对应请求数据转为 json 数据
- #request_post_data = json.dumps(post_data)
- res = requests.post(url=unit_chatbot_url, json=post_data)
-
- # print(res.content)
- # 获取聊天接口返回数据
- unit_chat_obj = json.loads(res.content)
- # print(unit_chat_obj)
- # 打印返回的结果
- # 判断聊天接口返回数据是否出错 error_code == 0 则表示请求正确
- if unit_chat_obj["error_code"] != 0: return chat_reply
- # 解析聊天接口返回数据,找到返回文本内容 result -> response_list -> schema -> intent_confidence(>0) -> action_list -> say
- unit_chat_obj_result = unit_chat_obj["result"]
- unit_chat_response_list = unit_chat_obj_result["response_list"]
- # 随机选取一个"意图置信度"[+response_list[].schema.intent_confidence]不为0的技能作为回答
- unit_chat_response_obj = random.choice(
- [unit_chat_response for unit_chat_response in unit_chat_response_list if
- unit_chat_response["schema"]["intent_confidence"] > 0.0])
- unit_chat_response_action_list = unit_chat_response_obj["action_list"]
- unit_chat_response_action_obj = random.choice(unit_chat_response_action_list)
- unit_chat_response_say = unit_chat_response_action_obj["say"]
- return unit_chat_response_say
-
-
- if __name__ == "__main__":
- #chat_input = "今晚吃啥呢想想"
- #chat_reply = unit_chat(chat_input)
- #print("用户输入 >>>", chat_input)
- #print("Unit回复 >>>", chat_reply)
-
-
- while True:
- chat_input = input("请输入:")
- print(chat_input)
- chat_reply = unit_chat(chat_input)
- print("用户输入 >>>", chat_input)
- print("Unit回复 >>>", chat_reply)
- if chat_input == 'Q' or chat_input == 'q':
- break
-
-
- # 导入werobot和发送请求的requests
- import werobot
- import requests
-
- # 设定主要逻辑服务的请求URL
- url = "http://xxx.xxx.xxx.xxx:5000/v1/main_serve/"
-
- # 设定服务超时的时间
- TIMEOUT = 3
-
- # 声明微信访问的请求
- robot = werobot.WeRoBot(token="doctoraitoken")
-
- # 设置所有请求的入口
- @robot.handler
- def doctor(message, session):
- try:
- # 获取用户的Id
- uid = message.source
- try:
- # 检查session, 判断用户是否第一次发言
- if session.get(uid, None) != "1":
- # 将添加{uid: "1"}
- session[uid] = "1"
- # 返回用户一个打招呼的话
- return '您好, 我是智能客服小艾, 有什么需要帮忙的吗?'
- # 获取message中的用户发言内容
- text = message.content
- except:
- # 有时候会出现特殊情况, 用户很可能取消关注后来又再次关注
- # 直接通过session判断, 会发现该用户已经不是第一次发言, 执行message.content语句
- # 真实情况是该用户登录后并没有任何的发言, 获取message.content的时候就会报错
- # 在这种情况下, 我们也通过打招呼的话回复用户
- return '您好, 我是智能客服小艾, 有什么需要帮忙的吗?'
-
- # 向主逻辑服务发送请求, 获得发送的数据体
- data = {"uid": uid, "text": text}
- # 利用requests发送请求
- res = requests.post(url, data=data, timeout=TIMEOUT)
- # 将返回的文本内容返回给用户
- return res.text
- except Exception as e:
- print("出现异常:", e)
- return "对不起, 机器人客服正在休息..."
-
- # 让服务监听在0.0.0.0:80
- robot.config["HOST"] = "0.0.0.0"
- robot.config["PORT"] = 80
- robot.run()
-
- ; author: zhoumingzhen
- ; Sample supervisor config file.
- ;
- ; For more information on the config file, please see:
- ; http://supervisord.org/configuration.html
- ;
- ; Notes:
- ; - Shell expansion ("~" or "$HOME") is not supported. Environment
- ; variables can be expanded using this syntax: "%(ENV_HOME)s".
- ; - Quotes around values are not supported, except in the case of
- ; the environment= options as shown below.
- ; - Comments must have a leading space: "a=b ;comment" not "a=b;comment".
- ; - Command will be truncated if it looks like a config file comment, e.g.
- ; "command=bash -c 'foo ; bar'" will truncate to "command=bash -c 'foo ".
-
- [unix_http_server]
- file=/tmp/supervisor.sock ; the path to the socket file
- ;chmod=0700 ; socket file mode (default 0700)
- ;chown=nobody:nogroup ; socket file uid:gid owner
- ;username=user ; default is no username (open server)
- ;password=123 ; default is no password (open server)
-
- [inet_http_server] ; inet (TCP) server disabled by default
- port=0.0.0.0:9001 ; ip_address:port specifier, *:port for all iface
- ;username=user ; default is no username (open server)
- ;password=123 ; default is no password (open server)
-
- [supervisord]
- logfile=./log/supervisord.log ; main log file; default $CWD/supervisord.log
- logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB
- logfile_backups=10 ; # of main logfile backups; 0 means none, default 10
- loglevel=info ; log level; default info; others: debug,warn,trace
- pidfile=./log/supervisord.pid ; supervisord pidfile; default supervisord.pid
- ;nodaemon=true ; start in foreground if true; default false
- minfds=1024 ; min. avail startup file descriptors; default 1024
- minprocs=200 ; min. avail process descriptors;default 200
- ;umask=022 ; process file creation umask; default 022
- ;user=chrism ; default is current user, required if root
- ;identifier=supervisor ; supervisord identifier, default is 'supervisor'
- ;directory=/tmp ; default is not to cd during start
- ;nocleanup=true ; don't clean up tempfiles at start; default false
- ;childlogdir=/tmp ; 'AUTO' child log dir, default $TEMP
- ;environment=KEY="value" ; key value pairs to add to environment
- ;strip_ansi=false ; strip ansi escape codes in logs; def. false
- ; The rpcinterface:supervisor section must remain in the config file for
- ; RPC (supervisorctl/web interface) to work. Additional interfaces may be
- ; added by defining them in separate [rpcinterface:x] sections.
- [rpcinterface:supervisor]
- supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
- ; The supervisorctl section configures how supervisorctl will connect to
- ; supervisord. configure it match the settings in either the unix_http_server
- ; or inet_http_server section.
- [supervisorctl]
- ;serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket
- serverurl=http://0.0.0.0:9001 ; use an http:// url to specify an inet socket
- ;username=chris ; should be same as in [*_http_server] if set
- ;password=123 ; should be same as in [*_http_server] if set
- ;prompt=mysupervisor ; cmd line prompt (default "supervisor")
- ;history_file=~/.sc_history ; use readline history if available
- ; The sample program section below shows all possible program subsection values.
- ; Create one or more 'real' program: sections to be able to control them under
- ; supervisor.
- [program:main_server]
- command=gunicorn -w 1 -b 0.0.0.0:5000 app:app ; the program (relative uses PATH, can take args)
- ;process_name=%(program_name)s ; process_name expr (default %(program_name)s)
- ;numprocs=1 ; number of processes copies to start (def 1)
- ;directory=/tmp ; directory to cwd to before exec (def no cwd)
- ;umask=022 ; umask for process (default None)
- ;priority=999 ; the relative start priority (default 999)
- ;autostart=true ; start at supervisord start (default: true)
- ;startsecs=1 ; # of secs prog must stay up to be running (def. 1)
- ;startretries=3 ; max # of serial start failures when starting (default 3)
- ;autorestart=unexpected ; when to restart if exited after running (def: unexpected)
- ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2)
- stopsignal=QUIT ; signal used to kill process (default TERM)
- ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
- stopasgroup=false ; send stop signal to the UNIX process group (default false)
- killasgroup=false ; SIGKILL the UNIX process group (def false)
- ;user=chrism ; setuid to this UNIX account to run the program
- ;redirect_stderr=true ; redirect proc stderr to stdout (default false)
- stdout_logfile=./log/main_server_out ; stdout log path, NONE for none; default AUTO
- stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB)
- ;stdout_logfile_backups=10 ; # of stdout logfile backups (0 means none, default 10)
- ;stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
- ;stdout_events_enabled=false ; emit events on stdout writes (default false)
- stderr_logfile=./log/main_server_error ; stderr log path, NONE for none; default AUTO
- stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB)
- ;stderr_logfile_backups=10 ; # of stderr logfile backups (0 means none, default 10)
- ;stderr_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0)
- ;stderr_events_enabled=false ; emit events on stderr writes (default false)
- ;environment=A="1",B="2" ; process environment additions (def no adds)
- ;serverurl=AUTO ; override serverurl computation (childutils)
- [program:redis]
- command=redis-server