(2) Preprocessing: word segmentation, noise fields, whitespace, digits, case normalization, and stop-word filtering with a stop-word dictionary
Tencent pretrained static word vectors (≈8M entries): Tencent AI Lab Embedding Corpus for Chinese Words and Phrases
How to load the Tencent pretrained static vectors: see the companion post on using the Tencent word vectors
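As a reference, below is a minimal sketch of loading the Tencent vectors with gensim's KeyedVectors, using the file path and limit value that appear in the Config further down; vector_size and most_similar are standard gensim attributes/methods.
- from gensim.models import KeyedVectors
- # binary=False because the Tencent corpus ships as a plain-text word2vec file; limit caps how many vectors are read into memory.
- tencent_w2v = KeyedVectors.load_word2vec_format('./data/Tencent_AILab_ChineseEmbedding.txt', binary=False, limit=500000)
- print(tencent_w2v.vector_size)            # 200-dimensional vectors
- print(tencent_w2v.most_similar('手机'))    # nearest neighbours of a word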
- class Config():
- model_name = 'lstm_attention' # 可以使用的模型:"lstm_attention"、"lstm"
- learning_rate = 0.0006 # 学习率
- max_seq = 64 # LSTM 输入最长的序列长度,该长度不是模型训练时batch的真实长度,dataloader会截取batch真实数据的最长长度,也就是每一个batch的序列长度可能是不同的
- batch_size = 32 # batch size
- epochs = 200 # 训练的 epoch 次数
- embedding_dim = 200 # 词 embedding
- layer_num = 2 # LSTM 层数
- num_classes = 2 # label 类别数
- dropout = 0.1 # dropout 比例,神经元保留概率为 1 - dropout
- bidirectional = True # 是否使用双向LSTM
- hidden_dim = 200 # LSTM hidden_size
- vocab_most_common = 55000 # 取词频最高的 vocab_most_common 个词构建词汇表【完整词汇表大小为 64221】
- pretrain_w2v_limit = 500000 # 腾讯预训练词embedding加载个数
- w2v_grad = True # 词 embedding 是否参与训练
- focal_loss = False # 是否使用focal_loss
- num_workers = 4 # 进程数
- info_interval = 160 # 训练时多少个batch打印一次log
- stop_word_path = './data/stopword.txt' # 停止词文件
- pretrain_w2v = './data/Tencent_AILab_ChineseEmbedding.txt' # 腾讯800w词预训练静态词向量
- vocab_save_path = './word2vec/Vocab_MostCommon{}.txt'.format(vocab_most_common) # 保存经过过滤词并排序后的vocab,过滤的两个方向:① 停止词 ②低频词
- embedding_path = './word2vec/Embedding_MostCommon{}_PretrainLimit{}.txt'.format(vocab_most_common, pretrain_w2v_limit) # 保存按词汇表构建的预训练 embedding 表
- source_data = './data/online_shopping_10_cats.csv'
- train_data = './data/train.txt'
- val_data = './data/validation.txt'
- test_data = './data/test.txt'
- predict_data = './data/predict.txt' # 预测predict的数据
- checkpoint = './model/{}.ckpt'.format(model_name)

- class CreateModelData():
- """
- 给定一份 csv 原始数据,按 7:2:1 分割生成 train、val、test 数据,每行格式为:target\ttext
- """
- def __init__(self):
- pass
-
- def load_csv_data(self,csv_data):
- """
- 加载、去重、shuffle
- """
- source_df = pd.read_csv(csv_data)
- # 去除首尾有空格的行
- source_df.iloc[:,-1] = source_df.iloc[:,-1].str.strip()
- # 只要有空行就删除
- source_df = source_df.dropna(how='any')
- # 打乱顺序
- index_shuffle = np.random.permutation(len(source_df))
- source_df = source_df.iloc[index_shuffle,:]
- return source_df
-
- def split_data_to_train_eval_test(self,dataframe):
- """
- 对每个 cat 类别下的每个 label 分别分割为 train、eval、test,分割比例 7:2:1
- """
- cats = dataframe.loc[:,'cat'].unique()
- labels = dataframe.loc[:,'label'].unique()
- train_df = pd.DataFrame(columns=dataframe.columns[-2:])
- val_df = pd.DataFrame(columns=dataframe.columns[-2:])
- test_df = pd.DataFrame(columns=dataframe.columns[-2:])
- for cat in cats:
- dataframe_cat = dataframe[dataframe.loc[:,'cat'] == cat].loc[:,dataframe.columns[-2:]]
- for label in labels:
- dataframe_label = dataframe_cat[dataframe_cat.loc[:,'label'] == label]
- size = dataframe_label.shape[0]
- train_end_idx = int(size * 0.7)
- val_end_idx = int(size * 0.9)
- train_df = pd.concat([train_df,dataframe_label.iloc[:train_end_idx,:]],axis=0)
- val_df = pd.concat([val_df, dataframe_label.iloc[train_end_idx:val_end_idx, :]], axis=0)
- test_df = pd.concat([test_df, dataframe_label.iloc[val_end_idx:, :]], axis=0)
- return train_df,val_df,test_df
-
- def save_csv(self,dataframe,path):
- """
- 保存文件为 csv
- """
- dataframe.to_csv(path,sep='\t',header=None,index=None)
-
- def forward(self,source_data_path):
- """
- 执行函数
- """
- source_df = self.load_csv_data(csv_data = source_data_path)
- # 分割 7:2:1 为 train val test
- train_df,val_df,test_df = self.split_data_to_train_eval_test(dataframe=source_df)
- # 保存
- print("源数据一共:{}条,分割后train data:{} - eval data:{} - test data:{},保存至:'{}' - '{}' - '{}'".format(len(source_df),
- len(train_df),len(val_df),len(test_df),'./data/train.data','./data/val.data','./data/test.data'))
- self.save_csv(train_df,'./data/train.data')
- self.save_csv(val_df,'./data/val.data')
- self.save_csv(test_df,'./data/test.data')
-
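- A short usage sketch for the class above (assuming this module imports pandas as pd and numpy as np, which load_csv_data relies on):
- # Split the raw csv into ./data/train.data, ./data/val.data and ./data/test.data at a 7:2:1 ratio per cat/label.
- CreateModelData().forward(source_data_path='./data/online_shopping_10_cats.csv')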

- # 带有标签的数据
- class DataProcessWithTarget():
- """
- ************* 训练集、验证集、测试集 数据预处理(文件带有target) **************
- 数据做以下:
- ① jieba 分词
- ② 去除停止词(低频词在构建 vocab时去除)、原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
- ③ 保存分词结果
- """
- def __init__(self):
- pass
-
- def load_csv(self,path):
- data_df = pd.read_csv(path,sep='\t',header=None)
- target = data_df.iloc[:,-2]
- data = data_df.iloc[:,-1]
- return data,target
-
- def load_stopword(self, path):
- """
- 加载停止词
- """
- stop_word = []
- with open(path, 'r', encoding='utf-8-sig') as f:
- for line in f:
- line = line.strip()
- if line:
- stop_word.append(line)
- return stop_word
-
- def jieba_(self,text,stop_word):
- """
- jieba 分词的函数
- ① 这里进行停止词过滤
- ② 原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
- """
- words = jieba.lcut(text)
- words_list = []
- # 对单词的预处理:
- for word in words:
- if word not in stop_word:
- # 去除分词中的空格,并且将英文转化为小写
- word = word.strip()
- word = word.lower()
- if word:
- words_list.append(word)
- return words_list
-
- def save_file(self,target,data,path):
- if len(target) != len(data):
- raise Exception('长度不一致!')
- with open(path,'w',encoding='utf-8') as w:
- for idx in range(len(data)):
- word_str = ' '.join(data[idx])
- w.write(str(target[idx]))
- w.write('\t')
- w.write(word_str)
- w.write('\n')
-
- def forward(self,source_path,stop_word_path,report_path):
- """
- 主函数
- return 分词结果 X,标签 target
- """
- print('正在预处理:"{}"数据,处理后保存至:"{}",请稍等...'.format(source_path,report_path))
- # 加载csv
- data,target = self.load_csv(path=source_path)
- # 加载 stop word
- stop_word = self.load_stopword(stop_word_path)
- # 分词、停止词、原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
- data_list = []
- target_list = []
- for idx in range(len(target)):
- word_list = self.jieba_(data.iloc[idx],stop_word=stop_word)
- if word_list:
- data_list.append(word_list)
- target_list.append(target.iloc[idx])
- else:
- print('数据:"{}",行号:{}数据预处理后有空值,去除处理'.format(source_path,idx+1))
- # 保存
- self.save_file(target=target_list,data=data_list,path = report_path)
- return data_list,target_list
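- A usage sketch for the preprocessing class above, run once per split; the '.data' inputs are the files produced by CreateModelData and the '.txt' outputs match the Config paths (this wiring is my assumption, it is not shown explicitly in the original code):
- processor = DataProcessWithTarget()
- processor.forward(source_path='./data/train.data', stop_word_path='./data/stopword.txt', report_path='./data/train.txt')
- processor.forward(source_path='./data/val.data', stop_word_path='./data/stopword.txt', report_path='./data/validation.txt')
- processor.forward(source_path='./data/test.data', stop_word_path='./data/stopword.txt', report_path='./data/test.txt')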

- # 预测时无标签的数据,数据处理必须与train、val、test集相同
- class DataProcessNoTarget():
- """
- 模型predict的数据预处理(模型上线后数据预处理,需要与模型训练时预处理的方法完全相同)
- return predict 集的 X array
- """
- def __init__(self):
- pass
-
- def load_data(self,path):
- text_list = []
- with open(path,'r',encoding='utf-8') as f:
- for line in f:
- line = line.strip()
- if line:
- text_list.append(line)
- return text_list
-
- def load_stopword(self, path):
- """
- 加载停止词
- """
- stop_word = []
- with open(path, 'r', encoding='utf-8-sig') as f:
- for line in f:
- line = line.strip()
- if line:
- stop_word.append(line)
- return stop_word
-
- def jieba_(self,text,stop_word):
- """
- jieba 分词的函数
- ① 这里进行停止词过滤
- ② 原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
- ③ 映射为 id,并截取填充
- """
- words = jieba.lcut(text)
- words_list = []
- # 对单词的预处理:
- for word in words:
- if word not in stop_word:
- # 去除分词中的空格,并且将英文转化为小写
- word = word.strip()
- word = word.lower()
- if word:
- words_list.append(word)
- return words_list
-
- def data_2_id(self,vocab_2_id, max_seq, text):
- """
- 将 text 数据映射为 id 并填充,生成 model 的输入数据 X(predict 数据没有 label)。
- 通过 vocab 映射为 id
- ① 确定文本的最长长度,超过进行截取,不足的用 PAD 填充
- ② 由于vocab去除了低词频的词,所以也要用到 UNK 标签
- return: X 矩阵,2D numpy
- """
-
- def padding(max_seq, X):
- """ Pad 或 截取到相同长度,pad的值放在真实数据的前面 """
- if len(X) < max_seq:
- while len(X) < max_seq:
- X.insert(0,vocab_2_id['<PAD>'])
- else:
- X = X[:max_seq]
- return X
-
- X = []
- for line in text:
- # mapping 为 id,注意 UNK 标签
- line = [vocab_2_id[word] if word in vocab_2_id else vocab_2_id["<UNK>"] for word in line]
- # padding 或 截取 为 固定长度,pad的值放在真实数据的前面
- line = padding(max_seq=max_seq, X=line)
- # 保存 X
- X.append(line)
- return np.array(X)
-
- def forward(self,source_path,stop_word_path,vocab_2_id,max_seq):
- """
- 主函数
- return predict数据映射的id numpy 矩阵
- """
- print('正在预处理:"{}"数据,请稍等...'.format(source_path))
- # 加载csv
- data = self.load_data(path=source_path)
- # 加载 stop word
- stop_word = self.load_stopword(stop_word_path)
- # 分词、停止词、原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
- data_list = []
- for idx in range(len(data)):
- word_list = self.jieba_(data[idx],stop_word=stop_word)
- if word_list:
- data_list.append(word_list)
- else:
- print('数据:"{}",行号:{}数据预处理后有空值,去除处理'.format(source_path,idx+1))
-
- # 映射填充为id
- data = self.data_2_id(vocab_2_id=vocab_2_id,max_seq=max_seq,text=data_list)
- return data

- def build_vocab(train_data,val_data,save_path,most_common = None):
- """
- 使用 train data 和 val data 共同生成 vocab,添加 <PAD> <UNK> 标签,过滤低频词,并按词频从高到低排序
- ① 低频词去除【保留前 most_common 个词】
- """
- vocab_dict = {}
- paths = [train_data,val_data]
- for _path in paths:
- with open(_path,'r',encoding='utf-8-sig') as f:
- for line in f:
- line = line.strip()
- if line:
- word_list = line.split()[1:] # .split() 默认使用任意空白字符进行分隔
- for word in word_list:
- if word not in vocab_dict:
- vocab_dict[word] = 1
- else:
- vocab_dict[word] = vocab_dict[word] + 1
-
- # 取前 most_common 个词
- if most_common is not None:
- ordered_vocab = Counter(vocab_dict).most_common(most_common)
- else:
- ordered_vocab = Counter(vocab_dict).most_common(sys.maxsize)
-
- # 建立 vocab2id 字典,并加入 <PAD> <UNK> 标签
- vocab_dict = collections.OrderedDict()
- vocab_dict["<PAD>"] = 0
- vocab_dict["<UNK>"] = 1
- for word,counts in ordered_vocab:
- if word not in vocab_dict:
- vocab_dict[word] = len(vocab_dict)
-
- # 保存 vocab_2_id
- vocab_size = len(vocab_dict)
- with open(save_path,'w',encoding = 'utf-8') as w:
- for idx,(k,v) in enumerate(vocab_dict.items()):
- w.write('{}\t{}'.format(k,v))
- if idx + 1 < vocab_size:
- w.write('\n')
- return vocab_dict

- def build_embedding(vocab_2_id,pretrain_w2v,save_path):
- """
- 使用 腾讯 预训练的词向量构建预训练词向量表, 用 numpy 保存txt格式数组
- """
- # 加载腾讯词向量,limit 用于限制加载词向量的个数
- pretrain_w2v_model = KeyedVectors.load_word2vec_format(pretrain_w2v,binary=False,limit=config.pretrain_w2v_limit) # limit 用于限制加载词汇表大小
-
- # 初始化 embedding table
- vocab_dim = len(vocab_2_id)
- embed_dim = pretrain_w2v_model.vector_size
- embedding_table = np.random.uniform(-1.,1.,(vocab_dim,embed_dim))
-
- # 将 预训练词向量 对embedding表进行赋值
- for word,index in vocab_2_id.items():
- try:
- embedding_table[index] = pretrain_w2v_model[word]
- except KeyError:
- pass
-
- # 保存 embedding 表
- np.savetxt(save_path,embedding_table)
-
- return embedding_table
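- A sketch of how the two helpers above can be wired together with the Config values; this exact call order is my assumption and is not shown explicitly in the original code:
- vocab_2_id = build_vocab(train_data=Config.train_data, val_data=Config.val_data, save_path=Config.vocab_save_path, most_common=Config.vocab_most_common)
- embedding_table = build_embedding(vocab_2_id=vocab_2_id, pretrain_w2v=Config.pretrain_w2v, save_path=Config.embedding_path)
- print(len(vocab_2_id), embedding_table.shape)   # vocab size is most_common + 2 (<PAD> and <UNK>); embedding dim is 200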

- def data_2_id(vocab_2_id,max_seq,file_path):
- """
- 将 text 数据生成 model 输入数据 X 与 label。
- 通过 vocab 映射为 id
- ① 确定文本的最长长度,超过进行截取,不足的用 PAD 填充
- ② 由于vocab去除了低词频的词,所以也要用到 UNK 标签
- return: X矩阵,2D 维度 numpy,Y 向量 1D 维度 numpy
- """
- def padding(max_seq,X):
- """ Pad 或 截取到相同长度,pad的值放在真实数据的前面 """
- if len(X) < max_seq:
- while len(X) < max_seq:
- X.insert(0,vocab_2_id['<PAD>'])
- else:
- X = X[:max_seq]
- return X
-
- label = []
- X = []
- with open(file_path,'r',encoding='utf-8-sig') as f:
- for line in f:
- line = line.strip()
- if line:
- line_list = line.split() # .split() 默认使用任意个空格作为分隔符
- # 获取 label 标签
- label.append(int(line_list[0])) # 标签需要用 int 转化
- # 获取 X
- X_tmp = line_list[1:]
- # mapping 为 id,注意 UNK 标签
- X_tmp = [vocab_2_id[word] if word in vocab_2_id else vocab_2_id["<UNK>"] for word in X_tmp ]
- # padding 或 截取 为 固定长度,pad的值放在真实数据的前面
- X_tmp = padding(max_seq=max_seq,X=X_tmp)
- # 保存 X
- X.append(X_tmp)
- return np.array(X),np.array(label)
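- A tiny worked example of the id mapping and pre-padding described above (toy vocabulary, purely illustrative):
- toy_vocab = {'<PAD>': 0, '<UNK>': 1, '很': 2, '好': 3}
- line = ['很', '好', '用']                                   # '用' is not in the toy vocab
- ids = [toy_vocab.get(w, toy_vocab['<UNK>']) for w in line]  # -> [2, 3, 1]
- ids = [toy_vocab['<PAD>']] * (5 - len(ids)) + ids           # pre-pad to max_seq = 5, PADs go in front
- print(ids)                                                  # [0, 0, 2, 3, 1]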

- class Data_Set(Dataset):
- """
- 生成 dataset
- """
- def __init__(self,X,Label=None):
- """
- X: 2D numpy int64
- Label: 1D numpy int64
- """
- self.X = X
- self.Label = Label
-
- def __len__(self):
- return len(self.X)
-
- def __getitem__(self,idx):
- if self.Label is not None:
- X = torch.tensor(self.X[idx],dtype=torch.int64) # 使用torch默认的整形数据
- Label = torch.tensor(self.Label[idx],dtype=torch.int64)
- return X,Label
- # 考虑predict阶段没有label
- else:
- X = torch.tensor(self.X[idx],dtype=torch.int64)
- return X
-
- def collate_fn(batch):
- """
- 参数:batch 是 list 类型
- DataLoader 中定义的 collate_fn 函数,用于对一个batch的数据进行处理
- ① 将 batch 数据转化为 tensor
- ② 去除一个batch中多余的 PAD ,将数据最长长度调整为batch中最长样本的真实长度
- """
- def intercept(X):
- """
- X dim: [batch,T]
- 将tensor截取为真实值的最长度,要注意PAD必须为0才可执行
- """
- max_seq = torch.max(torch.sum(X>=1,dim=1))
- return X[:,-max_seq:]
-
- X_list = []
- label_list =[]
- for item in batch:
- if isinstance(item, tuple):
- X,target_label = item # X dim: [batch,T]
- if not (torch.is_tensor(X) and torch.is_tensor(target_label)):
- X = torch.tensor(X)
- target_label = torch.tensor(target_label)
- X_list.append(X)
- label_list.append(target_label)
- # 考虑到预测没有标签
- else:
- X = item
- if not torch.is_tensor(X):
- X = torch.tensor(X)
- X_list.append(X)
-
- if label_list:
- X = torch.stack(X_list,dim=0) # X dim: [batch,T]
- label = torch.stack(label_list,dim=0)
- return intercept(X), label
- else:
- X = torch.stack(X_list,dim=0) # X dim: [batch,T]
- return intercept(X)
-
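- A small illustrative sketch of what intercept does to a pre-padded batch (toy ids):
- import torch
- batch_X = torch.tensor([[0, 0, 5, 7], [0, 9, 4, 6]])   # real lengths 2 and 3, PAD (= 0) at the front
- max_seq = torch.max(torch.sum(batch_X >= 1, dim=1))    # tensor(3): longest real length in the batch
- print(batch_X[:, -max_seq:])                           # keeps only the last 3 columns: [[0, 5, 7], [9, 4, 6]]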
- def get_vocab(file_path):
- """
- 加载 vocab_2_id
- """
- vocab_dict = collections.OrderedDict()
- with open(file_path,'r',encoding='utf-8-sig') as f:
- for line in f:
- line = line.strip()
- if line:
- key,value = line.split()
- vocab_dict[key] = int(value)
- return vocab_dict
-
- def get_pretrain_embedding(file_path):
- """
- 加载 腾讯预训练 embedding
- """
- embedding = np.loadtxt(file_path)
- return embedding
-
- def sort_eval(X,Y=None):
- """
- X: 2D
- 接受验证集与测试集的 X Y array,按其真实长度从小到大进行排序
- return 验证集与测试集排序后的 X,Y
- """
- if Y is not None:
- seq_len = np.sum(X>0,axis=1)
- datas = list(zip(X,Y,seq_len))
- datas = sorted(datas,key=lambda i:i[-1])
- X,Y,_ = zip(*datas)
- return X,Y
- else:
- seq_len = np.sum(X > 0, axis=1)
- datas = list(zip(X, seq_len))
- datas = sorted(datas, key=lambda i: i[-1])
- X, _ = zip(*datas)
- return X
-
- if __name__ == '__main__':
- pass

- class LSTM_Model(nn.Module):
- def __init__(self,
- vocab_size,
- n_class,
- embedding_dim,
- hidden_dim,
- num_layers,
- dropout,
- bidirectional,
- embedding_weights=None,
- train_w2v=True,
- **kwargs):
- super(LSTM_Model, self).__init__()
- self.vocab_size = vocab_size
- self.n_class = n_class
- self.embedding_dim = embedding_dim
- self.hidden_dim = hidden_dim
- self.num_layers = num_layers
- self.dropout = dropout
- self.bidirectional = bidirectional
- self.embedding_weights = embedding_weights
- self.train_w2v = train_w2v
-
- # 构建 embedding 层
- if self.embedding_weights is not None:
- self.embedding_weights = torch.tensor(self.embedding_weights,
- dtype=torch.float32) # torch 不接受 numpy 64位的浮点型,这里必须转化为32位,否则报错
- self.embedding = nn.Embedding.from_pretrained(self.embedding_weights)
- self.embedding.weight.requires_grad = self.train_w2v
- else: # 保证预测的情况无需传入 预训练的embedding表
- self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
- self.embedding.weight.requires_grad = self.train_w2v
- nn.init.uniform_(self.embedding.weight, -1., 1.)
-
- # 构建 lstm
- self.lstm = nn.LSTM(input_size=self.embedding_dim,
- hidden_size=self.hidden_dim,
- num_layers=self.num_layers,
- dropout=self.dropout,
- bidirectional=self.bidirectional)
-
- # 双向
- if self.bidirectional:
-
- # FC[第一个时刻与最后一个时刻需要拼接]
- self.fc1 = nn.Linear(4 * self.hidden_dim, self.hidden_dim)
- self.fc2 = nn.Linear(self.hidden_dim, self.n_class)
-
- else:
- # FC
- self.fc1 = nn.Linear(self.hidden_dim, self.n_class)
-
- def forward(self, x):
- # 0、embedding
- embeddings = self.embedding(x) # (B,T) --> (B,T,D)
-
- # 1、LSTM
- outputs, states = self.lstm(embeddings.permute([1, 0, 2])) # lstm 默认 输入维度为 (seq,batch,dim),因此这里需要进行转换
-
- if self.bidirectional:
- input_tmp = torch.cat([outputs[0],outputs[-1]],dim=-1)
- outputs = F.relu(self.fc1(input_tmp))
- outputs = self.fc2(outputs)
- else:
- outputs = self.fc1(outputs[-1])
-
- return outputs
-
-
- class LSTM_Attention(nn.Module):
- def __init__(self,
- vocab_size,
- n_class,
- embedding_dim,
- hidden_dim,
- num_layers,
- dropout,
- bidirectional,
- embedding_weights = None,
- train_w2v=True,
- **kwargs):
- super(LSTM_Attention,self).__init__()
- self.vocab_size = vocab_size
- self.n_class = n_class
- self.embedding_dim = embedding_dim
- self.hidden_dim = hidden_dim
- self.num_layers = num_layers
- self.dropout = dropout
- self.bidirectional = bidirectional
- self.embedding_weights = embedding_weights
- self.train_w2v = train_w2v
-
- # 构建 embedding 层
- if self.embedding_weights is not None:
- self.embedding_weights = torch.tensor(self.embedding_weights,dtype=torch.float32) # torch 不接受 numpy 64位的浮点型,这里必须转化为32位,否则报错
- self.embedding = nn.Embedding.from_pretrained(self.embedding_weights)
- self.embedding.weight.requires_grad = self.train_w2v
- else: # 保证预测的情况无需传入 预训练的embedding表
- self.embedding = nn.Embedding(self.vocab_size,self.embedding_dim)
- self.embedding.weight.requires_grad = self.train_w2v
- nn.init.uniform_(self.embedding.weight,-1.,1.)
-
- # 构建 lstm
- self.lstm = nn.LSTM(input_size=self.embedding_dim,
- hidden_size=self.hidden_dim,
- num_layers=self.num_layers,
- dropout=self.dropout,
- bidirectional=self.bidirectional)
-
- # 双向
- if self.bidirectional:
- # attention
- self.attention1 = nn.Linear(2 * self.hidden_dim,2 * self.hidden_dim)
- self.attention2 = nn.Linear(2 * self.hidden_dim,1)
-
- # FC
- self.fc1 = nn.Linear(2 * self.hidden_dim, self.hidden_dim)
- self.fc2 = nn.Linear(self.hidden_dim,self.n_class)
-
- else:
- # attention
- self.attention1 = nn.Linear(self.hidden_dim, self.hidden_dim)
- self.attention2 = nn.Linear(self.hidden_dim,1)
-
- # FC
- self.fc1 = nn.Linear(self.hidden_dim, self.hidden_dim)
- self.fc2 = nn.Linear(self.hidden_dim,self.n_class)
-
- def forward(self,x):
- # 0、embedding
- embeddings = self.embedding(x) # (B,T) --> (B,T,D)
-
- # 1、LSTM
- outputs,states = self.lstm(embeddings.permute([1,0,2])) # lstm 默认 输入维度为 (seq,batch,dim),因此这里需要进行转换
- T,B,D = outputs.size() # D = 2 * hidden_dim
- outputs = outputs.permute([1,0,2])
- # attention
- u = torch.tanh(self.attention1(outputs))
- v = self.attention2(u)
- att_scores = F.softmax(v,dim=1)
- encoding = torch.sum(torch.mul(outputs,att_scores),dim=1)
- # FC
- outputs = F.relu6(self.fc1(encoding))
- outputs=self.fc2(outputs)
- return outputs
-
- if __name__ == '__main__':
- lstm_attention = LSTM_Attention(10000,2,200,256,2,0.2,bidirectional=True,embedding_weights=None,train_w2v=True)
- print(lstm_attention)
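- As a quick shape check, the demo above can be extended with a dummy forward pass (random ids, illustrative only):
- x = torch.randint(low=0, high=10000, size=(4, 30), dtype=torch.int64)   # a dummy batch of token ids, shape (B=4, T=30)
- logits = lstm_attention(x)
- print(logits.shape)   # torch.Size([4, 2]): one score per class for each sample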

- # 模型搭建
- if Config.model_name == 'lstm_attention':
- model = LSTM_Attention( vocab_size = len(vocab_2_id),
- n_class = Config.num_classes,
- embedding_dim = Config.embedding_dim,
- hidden_dim = Config.hidden_dim,
- num_layers = Config.layer_num,
- dropout = Config.dropout,
- bidirectional = Config.bidirectional,
- embedding_weights = embedding_table,
- train_w2v = Config.w2v_grad
- )
- # print(model.embedding.weight)
- else:
- model = LSTM_Model(vocab_size = len(vocab_2_id),
- n_class = Config.num_classes,
- embedding_dim = Config.embedding_dim,
- hidden_dim = Config.hidden_dim,
- num_layers = Config.layer_num,
- dropout = Config.dropout,
- bidirectional = Config.bidirectional,
- embedding_weights = embedding_table,
- train_w2v = Config.w2v_grad
- )
- print('Model-"{}" 细节:\n'.format(Config.model_name),model)
- view_will_trained_params(model,model_name=Config.model_name)

- # 优化器 分层学习率
- # 由于 embedding 由腾讯预训练词向量初始化,所以需要较小的学习率,一般取正常训练学习率的 1/10 左右
- special_layers = nn.ModuleList([model.embedding])
- # 获取特等层的参数列表的内存id列表
- special_layers_ids = list(map(lambda x: id(x), special_layers.parameters()))
- # 基础层的参数列表
- basic_params = filter(lambda x: id(x) not in special_layers_ids, model.parameters())
- optimizer = optim.Adam([{'params': filter(lambda p: p.requires_grad, basic_params)},
- {'params': filter(lambda p: p.requires_grad, special_layers.parameters()), 'lr': 8e-5}],
- lr=Config.learning_rate)
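- A quick sanity check (a sketch, assuming it runs right after the optimizer above is built) that the embedding layer really sits in its own parameter group with the smaller learning rate:
- print([group['lr'] for group in optimizer.param_groups])                                  # expected: [0.0006, 8e-05]
- print([sum(p.numel() for p in group['params']) for group in optimizer.param_groups])      # parameter counts per group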
- import torch
- import numpy as np
- import torch.nn.functional as F
- import math
- from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score
-
- def view_will_trained_params(model,model_name):
- """
- ********** 查看模型哪些层的参数参与训练,哪些层的参数被固定了 ************
- """
- train_params = []
- for name,param in model.named_parameters():
- if param.requires_grad == True:
- train_params.append((name,param.shape))
- print("\n{} 模型将要参与训练的层为:\n".format(model_name),train_params,end='\n\n\n')
-
- def get_device():
- dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
- device = torch.device(dev)
- return device
-
- def focal_loss(output, target, alpha=1.0, gamma=2.0, *args, **kwargs):
- """
- ********** 给定模型前向传播的输出[batch,class]与真实值target[class,],计算loss误差 ************
- 1. 仅仅在训练的时候使用 focal_loss ,验证时不使用 focal_loss
- 2. 默认情况下不进行聚合
- """
- assert np.ndim(output) == 2
- assert np.ndim(target) == 1
- assert len(output) == len(target)
- ce_loss = F.cross_entropy(input=output, target=target, reduction="none") # 这里必须使用 none 模式, ce_loss dim: [B,]
- pt = torch.exp(-ce_loss) # pt dim: [B,]
- # 构建 focal_loss
- focalloss = (alpha * (torch.tensor(1.0) - pt) ** gamma * ce_loss).mean()
- return focalloss
-
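- A small self-contained check of the focal loss above on dummy logits (values are made up; with gamma=0 and alpha=1 it reduces to plain cross-entropy):
- import torch
- import torch.nn.functional as F
- logits = torch.tensor([[2.0, 0.5], [0.2, 1.5], [0.1, 0.1]])   # dummy model outputs, shape (B=3, C=2)
- labels = torch.tensor([0, 1, 1])                              # shape (B,)
- print(focal_loss(logits, labels, alpha=1.0, gamma=2.0))       # easy samples are down-weighted
- print(focal_loss(logits, labels, alpha=1.0, gamma=0.0))       # with gamma=0 it equals plain cross-entropy
- print(F.cross_entropy(logits, labels))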
- def cross_entropy(output, target, *args, **kwargs):
- """
- 普通的交叉熵损失函数,返回整个 batch 的均值 loss
- """
- assert np.ndim(output) == 2
- assert np.ndim(target) == 1
- assert len(output) == len(target)
- ce_loss = F.cross_entropy(input=output, target=target, reduction="mean") # ce_loss 是一个均值
- return ce_loss
-
- class WarmupCosineLR():
- def __init__(self,optimizer,warmup_iter:int,lrs_min:list = [1e-5,],T_max:int = 10):
- """
- ******************* pytorch自定义学习率 预热warmup + Cosline 余弦衰减 **************************
- 具体可看文章:https://blog.csdn.net/qq_36560894/article/details/114004799?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-13.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-13.control
- Args:
- optimizer (Optimizer): pytotch 优化器
- warmup_iter: 预热的最大epoch
- lrs_min: list, optimizer 学习率一一对应的最小值
- T_max:余弦半周期,该值必须比 warmup_iter 大
- 特点:
- ① 支持分层学习率多组学习率衰减
- """
- self.optimizer = optimizer
- self.warmup_iter = warmup_iter
- self.lrs_min = lrs_min
- self.T_max = T_max
- self.base_lrs = [i['lr'] for i in optimizer.param_groups]
-
- def get_lr(self):
- if self.iter < self.warmup_iter:
- return [i * self.iter *1. / self.warmup_iter for i in self.base_lrs]
- else:
- return [self.lrs_min[idx] + 0.5*(i-self.lrs_min[idx])*(1.0+math.cos((self.iter-self.warmup_iter)/(self.T_max-self.warmup_iter)*math.pi)) \
- for idx,i in enumerate(self.base_lrs)]
-
- def step(self,iter:int):
- if iter == 0:
- iter = iter + 1
- self.iter = iter
- # 获取当前epoch学习率
- decay_lrs = self.get_lr()
-
- # 更新学习率
- for param_group, lr in zip(self.optimizer.param_groups, decay_lrs):
- param_group['lr'] = lr
-
- def get_score(target,predict):
- """
- 给定真实的变迁target 与 预测的标签predict ,计算 acc、recall、precision、F1
- """
- import warnings
- warnings.filterwarnings('ignore')
- assert np.ndim(target) == 1
- assert np.ndim(predict) == 1
- assert np.shape(target) == np.shape(predict)
- con_matrix = confusion_matrix(y_true=target,y_pred=predict)
- # 计算acc
- acc = accuracy_score(y_true=target,y_pred=predict)
- # 计算 macro recall
- recall = recall_score(y_true=target,y_pred=predict,average='macro')
- # 计算 macro precision
- precision = precision_score(y_true=target,y_pred=predict,average='macro')
- # 计算 macro F1
- F1 = f1_score(y_true=target,y_pred=predict,average='macro')
- return (acc,recall,precision,F1),con_matrix
-
- if __name__ == "__main__":
- # 0、WramUp + cosinelr 学习率变化曲线
- import torch.optim as optim
- import matplotlib.pyplot as plt
- optimizer = optim.Adam(params=[torch.ones((3,4),requires_grad=True)],lr=0.01)
- scheduler_ = WarmupCosineLR(optimizer,
- warmup_iter=5,
- lrs_min=[0.001,],
- T_max=50)
- lr = optimizer.param_groups[0]['lr']
- print(lr)
- y = []
- x = []
- for epoch in range(200):
- scheduler_.step(epoch+1)
- print(optimizer.param_groups[0]['lr'])
- y.append(optimizer.param_groups[0]['lr'])
- x.append(epoch+1)
- plt.plot(x,y)
- plt.show()
-
- # 计算分值
- y_t = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
- y_p = [1,1,1,0,0,1,1,0,1,0,2,2,1,1,1,1,0,1,1]
- print(get_score(y_t,y_p))

- Steps for using gradient clipping:
- 1. Compute the loss
- 2. Back-propagate the loss
- 3. Clip the gradients
- 4. Let the optimizer update the parameters
-
- optimizer.zero_grad()
- loss, hidden = model(data, hidden, targets)
- loss.backward()
- # gradient clipping (clip_grad_norm is deprecated; the in-place clip_grad_norm_ should be used)
- torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), args.clip)
- optimizer.step()
- from __future__ import print_function
- from __future__ import division
- from __future__ import absolute_import
- from __future__ import with_statement
- from os import lseek
- from model import LSTM_Attention,LSTM_Model
- from data_process import data_2_id
- from loader_utils import get_vocab,get_pretrain_embedding,Data_Set,collate_fn,sort_eval
- from model_utils import view_will_trained_params,focal_loss,cross_entropy,WarmupCosineLR,get_score
- from create_config import Config
- from torch.utils.data import DataLoader
- import torch
- import torch.nn as nn
- import torch.optim as optim
- import numpy as np
- import copy
- import os
-
- def train_one_epoch(model,device,optimizer,loss_fun,metric_fun,train_loader,current_epoch,info_interval:int=None):
- """
- ********** 一个epoch模型训练 ************
- 关于 model.eval() model.train() with torch.no_grad() with torch.set_grad_enabled(bool) 区别
- return:
- ① batch_losses:每个batch均值loss列表
- ② 整个epoch 的 acc,recall,precision,F1
- """
- print('Training ... ')
- model.train()
- model.to(device)
- LRs = [i['lr'] for i in optimizer.param_groups] # 获取当前epoch 优化器 optimizer 学习率组
- batch_losses = []
- batch_targets = []
- batch_predicts = []
- for idx, (input_x, target) in enumerate(train_loader):
- input_x, target = input_x.to(device), target.to(device)
- optimizer.zero_grad()
- output = model(input_x) # 前向传播
- loss = loss_fun(output, target, alpha=1.0, gamma=2.0)
- loss.backward() # 反向传播计算梯度
- optimizer.step() # 更新
- batch_losses.append(loss.item())
- # 计算score
- pre = torch.argmax(output, dim=1)
- pre = pre.cpu().numpy().reshape(-1).tolist()
- target = target.cpu().numpy().reshape(-1).tolist()
- (acc,recall,precision,F1),con_matrix = metric_fun(target=target,predict=pre)
- batch_targets.extend(target)
- batch_predicts.extend(pre)
-
- if info_interval is not None:
- if idx % info_interval == 0:
- print("Epoch:{}\t[{}\{}\t\t{:.2f}%]\tLoss:{:.8f}\tScores: < acc:{:.3f}%\t"\
- "macro_recall:{:.3f}%\tmacro_precision:{:.3f}%\tmacro_F1:{:.3f}%\t >\t\tBatch input_x shape:{}".format(
- current_epoch, idx * len(input_x),
- len(train_loader.dataset), 100. * (idx / len(train_loader)),loss.item(),
- 100. * acc,100. * recall,100. * precision,100. * F1,input_x.shape
- ))
- # 计算一个epoch的score
- (epoch_acc, epoch_recall, epoch_precision, epoch_F1), con_matrix = metric_fun(target=batch_targets, predict=batch_predicts)
- print("Epoch Info :\tLoss:{:.8f}\tScores: <\tacc:{:.3f}%\t "\
- "macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\tLRs:{}".format(
- np.mean(batch_losses),100. * epoch_acc,100. * epoch_recall,100. * epoch_precision,100. * epoch_F1,LRs
- ))
- return batch_losses,[epoch_acc, epoch_recall, epoch_precision, epoch_F1]
-
- def eval_one_epoch(model,device,loss_fun,metric_fun,eval_loader):
- """
- ********** 一个epoch模型验证 ************
- 关于 model.eval() model.train() with torch.no_grad() with torch.set_grad_enabled(bool) 区别
- return: batch_losses 每个batch均值loss列表,batch_scores 每个batch的 acc,recall,precision,F1
- """
- print('Evaling ... ')
- model.eval() # eval 模式会关闭 dropout、固定 BN 的统计量;它并不会阻止梯度的计算,因此 eval 时配合 with torch.no_grad() 还是很有必要的,可以加快计算并节省显存。
- model.to(device)
- batch_losses = []
- batch_targets = []
- batch_predicts = []
- with torch.no_grad():
- for idx, (input_x, target) in enumerate(eval_loader):
- input_x, target = input_x.to(device), target.to(device)
- output = model(input_x) # 前向传播
- loss = loss_fun(output, target, alpha=1.0, gamma=2.0)
- batch_losses.append(loss.item())
- # 计算score
- pre = torch.argmax(output, dim=1)
- pre = pre.cpu().numpy().reshape(-1).tolist()
- target = target.cpu().numpy().reshape(-1).tolist()
- (acc, recall, precision, F1), con_matrix = metric_fun(target=target, predict=pre)
- batch_targets.extend(target)
- batch_predicts.extend(pre)
- # 计算一个epoch的score
- (epoch_acc, epoch_recall, epoch_precision, epoch_F1), con_matrix = metric_fun(target=batch_targets, predict=batch_predicts)
- print(
- "Epoch Info :\tLoss:{:.8f}\tScores: Scores: <\tacc:{:.3f}%\t "\
- "macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>".format(
- np.mean(batch_losses), 100. * epoch_acc, 100. * epoch_recall,
- 100. * epoch_precision, 100. * epoch_F1
- ))
- return batch_losses,[epoch_acc, epoch_recall, epoch_precision, epoch_F1]
-
- def train(model,device,optimizer,scheduler_fun,loss_fun,epochs,metric_fun,info_interval,checkpoint,train_loader,eval_loader):
- """
- ********** 模型训练 ************
- return:
- ① train_losses,eval_losses: 2D list ,(epoch,batch_num)
- ② train_scores,eval_scores: 2D list,(epoch,4)acc,recall,precision,F1
- """
-
- # 判断加载已保留的最优的模型参数【支持断点续传】
- best_scores = [-0.000001,-0.000001,-0.000001,-0.000001] # 定义初始的acc,recall,precision,F1的值
- history_epoch,best_epoch = 0,0 # 定义历史训练模型epoch次数初始值、最优模型的epoch初始值
- best_params = copy.deepcopy(model.state_dict()) # 获取模型的最佳参数,OrderedDict 是可变对象,直接引用会随之变动,因此这里要用深拷贝
- best_optimizer = copy.deepcopy(optimizer.state_dict())
- LRs = [i['lr'] for i in optimizer.param_groups]
- if os.path.exists(checkpoint):
- """
- 为了保证 gpu/cpu 训练的模型参数可以相互加载,这里在load时使用 map_location=lambda storage, loc: storage 来控制,详情请看文章:
- https://blog.csdn.net/nospeakmoreact/article/details/89634039?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.withoutpai&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.withoutpai
- """
- if torch.cuda.is_available():
- ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage.cuda()) # 使用 gpu 读取 模型参数
- else:
- ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage) # 使用 cpu 读取模型参数
- best_scores = ck_dict['best_score']
- history_epoch,best_epoch = ck_dict['epochs'],ck_dict['best_epochs']
- model.load_state_dict(ck_dict['best_params'])
- # optimizer.load_state_dict(ck_dict['optimizer'])
-
- # if torch.cuda.is_available():
- # """
- # 重载optimizer的参数时将所有的tensor都放到cuda上(optimizer保存时默认放在cpu上了),详情见:
- # https://blog.csdn.net/weixin_41848012/article/details/105675735
- # """
- # for state in optimizer.state.values():
- # for k, v in state.items():
-
-
-
- # if torch.is_tensor(v):
- # state[k] = v.cuda()
-
- best_params = copy.deepcopy(model.state_dict()) # 获取模型的最佳参数,OrderedDict 是可变对象,直接引用会随之变动,因此这里要用深拷贝
- # best_optimizer = copy.deepcopy(optimizer.state_dict())
- LRs = [i['lr'] for i in optimizer.param_groups]
-
- print('From "{}" load history model params:\n\tTrained Epochs:{}\n\t'\
- 'Best Model Epoch:{}\n\t各层学习率 LRs 为:{}\n\tBest Score:<\tacc:{:.3f}%\t'\
- ' macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\n'.format(
- checkpoint, history_epoch,best_epoch,LRs
- , 100. * best_scores[0],100. * best_scores[1]
- ,100. * best_scores[2],100. * best_scores[3]))
- # print(best_params)
- # print(best_optimizer)
-
- # Train
- train_losses =[]
- eval_losses = []
- train_scores = []
- eval_scores = []
- for epoch in range(1,epochs + 1):
- # 获得本次训练的 lr 学习率
- scheduler_fun.step(history_epoch + epoch) # 这里需要使用历史的epoch,为了是LR变化符合 Warmup + cosine
- LRs = [i['lr'] for i in optimizer.param_groups]
- # train & eval
- train_batch_loss,train_score = train_one_epoch(model=model,
- device=device,
- optimizer=optimizer,
- loss_fun=loss_fun,
- metric_fun=metric_fun,
- train_loader=train_loader,
- current_epoch=history_epoch+epoch,
- info_interval=info_interval)
- print()
- eval_batch_loss,eval_score = eval_one_epoch(model=model,
- device=device,
- loss_fun=loss_fun,
- metric_fun=metric_fun,
- eval_loader=eval_loader)
- train_losses.append(train_batch_loss)
- eval_losses.append(eval_batch_loss)
- train_scores.append(train_score)
- eval_scores.append(eval_score)
-
- # 保存模型:当验证集的 F1 值大于历史最优 F1 时,保存模型参数
- if best_scores[3] < eval_score[3]:
- print('历史模型分值:{:.3f}%,更新分值{:.3f}%,优化器学习率:{},模型参数更新保存\n'.format(100.*best_scores[3],100.*eval_score[3],LRs))
- best_scores = eval_score
- best_params = copy.deepcopy(model.state_dict())
- best_optimizer = copy.deepcopy(optimizer.state_dict())
- best_epoch = history_epoch + epoch
- else:
- print("模型最优的epcoh为:{},模型验证集最高分值:{:.3f}%, model 效果未提升\n".format(best_epoch,100.* best_scores[3]))
- ck_dict = {
- "best_score":best_scores,
- "best_params":best_params,
- "optimizer":best_optimizer,
- 'epochs':history_epoch + epoch,
- 'best_epochs':best_epoch
- }
- torch.save(ck_dict,checkpoint)
-
- # 训练结束,将模型赋予最优的参数
- model.load_state_dict(best_params)
- return model,train_losses,eval_losses,train_scores,eval_scores
-
- if __name__ == '__main__':
- dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
- device = torch.device(dev)
- # 数据加载
- vocab_2_id = get_vocab(Config.vocab_save_path) # 词汇表 50002
- embedding_table = get_pretrain_embedding(Config.embedding_path) # (50002,200),numpy float默认为64位,torch需要32位,需要转化为float32的tensor
- # DataSet DataLoader
- X_train,target_train = data_2_id(vocab_2_id,Config.max_seq,Config.train_data)
- kwargs = {'num_workers':Config.num_workers,'pin_memory':True} if torch.cuda.is_available() else {'num_workers':Config.num_workers}
- train_dataset = Data_Set(X_train,target_train)
- train_loader = DataLoader(dataset=train_dataset,
- batch_size=Config.batch_size,
- shuffle=True,
- collate_fn = collate_fn,
- **kwargs
- )
- print('dataloader 第一个batch的情况如下:')
- print(next(iter(train_loader)),next(iter(train_loader))[0].shape)
- X_val,target_val = data_2_id(vocab_2_id,Config.max_seq,Config.val_data)
- # TODO 为了避免batch长短不齐形成过多的PAD,这里对 eval 数据 按照真实的长度从小到大排序
- X_val,target_val = sort_eval(X_val,target_val)
- val_dataset = Data_Set(X_val,target_val)
- val_loader = DataLoader(dataset=val_dataset,
- batch_size=Config.batch_size,
- shuffle=False,
- collate_fn = collate_fn,
- **kwargs
- )
-
- # 模型搭建
- if Config.model_name == 'lstm_attention':
- model = LSTM_Attention( vocab_size = len(vocab_2_id),
- n_class = Config.num_classes,
- embedding_dim = Config.embedding_dim,
- hidden_dim = Config.hidden_dim,
- num_layers = Config.layer_num,
- dropout = Config.dropout,
- bidirectional = Config.bidirectional,
- embedding_weights = embedding_table,
- train_w2v = Config.w2v_grad
- )
- # print(model.embedding.weight)
- else:
- model = LSTM_Model(vocab_size = len(vocab_2_id),
- n_class = Config.num_classes,
- embedding_dim = Config.embedding_dim,
- hidden_dim = Config.hidden_dim,
- num_layers = Config.layer_num,
- dropout = Config.dropout,
- bidirectional = Config.bidirectional,
- embedding_weights = embedding_table,
- train_w2v = Config.w2v_grad
- )
- print('Model-"{}" 细节:\n'.format(Config.model_name),model)
- view_will_trained_params(model,model_name=Config.model_name)
-
- # 优化器、学习率调整器、LOSS函数,设置分层学习率
- special_layers = nn.ModuleList([model.embedding])
- # 获取特等层的参数列表的内存id列表
- special_layers_ids = list(map(lambda x: id(x), special_layers.parameters()))
- # 基础层的参数列表
- basic_params = filter(lambda x: id(x) not in special_layers_ids, model.parameters())
-
- optimizer = optim.Adam([{'params': filter(lambda p: p.requires_grad, basic_params)},
- {'params': filter(lambda p: p.requires_grad, special_layers.parameters()), 'lr': 8e-5}],
- lr=Config.learning_rate)
- scheduler_fun = WarmupCosineLR(optimizer,warmup_iter=4,lrs_min=[5e-5,1e-6],T_max=40)
-
- # train
- if Config.focal_loss:
- loss_fun = focal_loss
- else:
- loss_fun = cross_entropy
- train(model=model,
- device=device,
- optimizer=optimizer,
- scheduler_fun=scheduler_fun,
- loss_fun=loss_fun,
- epochs=Config.epochs,
- metric_fun=get_score,
- info_interval=Config.info_interval,
- checkpoint=Config.checkpoint,
- train_loader=train_loader,
- eval_loader=val_loader)

- from __future__ import print_function
- from __future__ import division
- from __future__ import absolute_import
- from __future__ import with_statement
- from model import LSTM_Attention,LSTM_Model
- from data_process import data_2_id
- from loader_utils import get_vocab, Data_Set, collate_fn,sort_eval
- from model_utils import get_score
- from create_config import Config
- from torch.utils.data import DataLoader
- import torch
- import numpy as np
- import os
- import re
-
- def eval_one_epoch(model, device, metric_fun, eval_loader):
- """
- ********** 一个epoch模型验证 ************
- """
- print('Predict ... ')
- model.eval() # eval 模式会关闭 dropout、固定 BN 的统计量;它并不会阻止梯度的计算,因此 eval 时配合 with torch.no_grad() 还是很有必要的,可以加快计算并节省显存。
- model.to(device)
- batch_targets = []
- batch_predicts = []
- error_samples = []
- with torch.no_grad():
- for idx, (input_x, target) in enumerate(eval_loader):
- input_x, target = input_x.to(device), target.to(device)
- output = model(input_x) # 前向传播
- # 计算score
- pre = torch.argmax(output, dim=1)
- error_x = input_x[target != pre]
- error_target = pre[target != pre]
- pre = pre.cpu().numpy().reshape(-1).tolist()
- target = target.cpu().numpy().reshape(-1).tolist()
- error_x = error_x.cpu().numpy().tolist()
- error_target = error_target.cpu().numpy().tolist()
- batch_targets.extend(target)
- batch_predicts.extend(pre)
- error_samples.append((error_target,error_x))
- # 计算一个epoch的score
- (epoch_acc, epoch_recall, epoch_precision, epoch_F1), con_matrix = metric_fun(target=batch_targets,
- predict=batch_predicts)
- print(
- "Epoch Info :\tScores: Scores: <\tacc:{:.3f}%\t macro_recall:{:.3f}%\t"\
- " macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>".format(100. * epoch_acc, 100. * epoch_recall,
- 100. * epoch_precision, 100. * epoch_F1
- ))
- return [epoch_acc, epoch_recall, epoch_precision, epoch_F1],con_matrix,error_samples
-
- def predict(model,device, metric_fun,checkpoint,predict_loader):
- """
- ********** 模型测试 ************
- """
- # 判断加载已保留的最优的模型参数
- if os.path.exists(checkpoint):
- if torch.cuda.is_available():
- ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage.cuda()) # 使用 gpu 读取 模型参数
- else:
- ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage) # 使用 cpu 读取模型参数
- best_scores = ck_dict['best_score']
- history_epoch,best_epoch = ck_dict['epochs'],ck_dict['best_epochs']
- model.load_state_dict(ck_dict['best_params'])
- print(
- 'From "{}" load history model params:\n\tTrained Epochs:{}\n\tBest Model Epoch:{}\n'\
- '\tBest Score:<\tacc:{:.3f}%\t macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\n\t'.format(
- checkpoint, history_epoch,best_epoch, 100. * best_scores[0], 100. * best_scores[1], 100. * best_scores[2],
- 100. * best_scores[3]))
-
- # predict
- eval_score,con_matrix,error_samples = eval_one_epoch(model=model,
- device=device,
- metric_fun=metric_fun,
- eval_loader=predict_loader)
- else:
- print('Model not exists .... ')
- eval_score = None
- con_matrix = None
- error_samples = None
- exit()
- return eval_score,con_matrix,error_samples
-
- if __name__ == '__main__':
- dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
- device = torch.device(dev)
- # 数据加载;预测时无需加载预训练的 embedding 表,load model parameters 时会一并恢复 embedding 权重
- vocab_2_id = get_vocab(Config.vocab_save_path) # 词汇表 50002
- # DataSet DataLoader
- X_test, target_test = data_2_id(vocab_2_id, Config.max_seq, Config.test_data)
- # TODO 为了避免batch长短不齐形成过多的PAD,这里对 eval 数据 按照真实的长度从小到大排序
- X_test, target_test = sort_eval(X_test, target_test)
-
- kwargs = {'num_workers': Config.num_workers, 'pin_memory': True} if torch.cuda.is_available() else {
- 'num_workers': Config.num_workers}
- test_dataset = Data_Set(X_test, target_test)
-
- test_loader = DataLoader(dataset=test_dataset,
- batch_size=Config.batch_size,
- shuffle=False,
- collate_fn=collate_fn,
- **kwargs
- )
- print('dataloader 第一个batch的情况如下:')
- print(next(iter(test_loader)), next(iter(test_loader))[0].shape)
- # 模型搭建
- if Config.model_name == 'lstm_attention':
- model = LSTM_Attention(vocab_size=len(vocab_2_id),
- n_class=Config.num_classes,
- embedding_dim=Config.embedding_dim,
- hidden_dim=Config.hidden_dim,
- num_layers=Config.layer_num,
- dropout=Config.dropout,
- bidirectional=Config.bidirectional,
- embedding_weights=None, # 预测的情况下会加载
- train_w2v=Config.w2v_grad
- )
- # print(model.embedding.weight)
- else:
- model = LSTM_Model(vocab_size=len(vocab_2_id),
- n_class=Config.num_classes,
- embedding_dim=Config.embedding_dim,
- hidden_dim=Config.hidden_dim,
- num_layers=Config.layer_num,
- dropout=Config.dropout,
- bidirectional=Config.bidirectional,
- embedding_weights=None,
- train_w2v=Config.w2v_grad
- )
- print('Model-"{}" 细节:\n'.format(Config.model_name), model)
- # predict
- _,con_matrix,error_samples = predict(model=model,
- device=device,
- metric_fun=get_score,
- checkpoint=Config.checkpoint,
- predict_loader=test_loader)
- print('混淆矩阵:\n',con_matrix)
- # 保存 测试出错了样本
- print('保存测试集错误的样本:"{}"'.format('./data/test_error_sample.data'))
- error_target,error_x = zip(*error_samples)
- error_target_ = []
- error_x_ =[]
- for i in range(len(error_target)):
- for j in range(len(error_target[i])):
- error_target_.append(error_target[i][j])
- error_x_.append(error_x[i][j])
- print(len(error_target_),len(error_x_))
-
- vocab_keys = list(vocab_2_id.keys())
- error_x_ = [np.array(vocab_keys)[np.array(i)].tolist() for i in error_x_]
- with open('./data/test_error_sample.data','w',encoding='utf-8') as w:
- for idx in range(len(error_target_)):
- word_str = ''.join(error_x_[idx])
- word_str = re.sub('<PAD>\s*', '', word_str)
- w.write(str(error_target_[idx]))
- w.write('\t')
- w.write(word_str)
- w.write('\n')

- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # @Time : 2021/8/20 23:06
- # @Author :
- # @Site :
- # @File : predict.py
- # @Software: PyCharm
-
- from __future__ import print_function
- from __future__ import division
- from __future__ import absolute_import
- from __future__ import with_statement
- from os import lseek
- from model import LSTM_Attention, LSTM_Model
- from data_process import data_2_id, DataProcessNoTarget
- from loader_utils import get_vocab, Data_Set, collate_fn
- from create_config import Config
- from torch.utils.data import DataLoader
- import torch
- import torch.nn.functional as F
- import os
- import numpy as np
-
-
- def eval_one_epoch(model, device, eval_loader):
- """
- ********** 一个epoch模型验证 ************
- """
- print('Predict ... ')
- model.eval() # eval 模式会关闭 dropout、固定 BN 的统计量;它并不会阻止梯度的计算,因此 eval 时配合 with torch.no_grad() 还是很有必要的,可以加快计算并节省显存。
- model.to(device)
- batch_predicts = []
- batch_probs = []
- with torch.no_grad():
- for idx, input_x in enumerate(eval_loader):
- input_x = input_x.to(device)
- output = model(input_x) # 前向传播
- output = F.softmax(output,dim=-1)
- # 计算score
- prob,pre = torch.max(output,dim=-1)
- prob = prob.cpu().numpy().reshape(-1).tolist()
- pre = pre.cpu().numpy().reshape(-1).tolist()
- batch_predicts.extend(pre)
- batch_probs.extend(prob) # 用 extend 展平,使其与 batch_predicts 一一对应
- return np.array(batch_predicts),np.array(batch_probs)
-
-
- def predict(model, device, checkpoint, predict_loader):
- """
- ********** 模型测试 ************
- """
- # 判断加载已保留的最优的模型参数
- if os.path.exists(checkpoint):
- if torch.cuda.is_available():
- ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage.cuda()) # 使用 gpu 读取 模型参数
- else:
- ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage) # 使用 cpu 读取模型参数
- best_scores = ck_dict['best_score']
- history_epoch, best_epoch = ck_dict['epochs'], ck_dict['best_epochs']
- model.load_state_dict(ck_dict['best_params'])
- print(
- 'From "{}" load history model params:\n\tTrained Epochs:{}\n\tBest Model Epoch:{}\n' \
- '\tBest Score:<\tacc:{:.3f}%\t macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\n\t'.format(
- checkpoint, history_epoch, best_epoch, 100. * best_scores[0], 100. * best_scores[1],
- 100. * best_scores[2],
- 100. * best_scores[3]))
-
- # predict
- predict_array,probs_array = eval_one_epoch(model=model,
- device=device,
- eval_loader=predict_loader)
- else:
- print('Model not exists .... ')
- predict_array = None
- probs_array = None
- exit()
- return predict_array,probs_array
-
-
- if __name__ == '__main__':
- dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
- device = torch.device(dev)
- # 数据加载
- vocab_2_id = get_vocab(Config.vocab_save_path) # 词汇表 50002
- # DataSet DataLoader
- X_predict = DataProcessNoTarget().forward(Config.predict_data,Config.stop_word_path,vocab_2_id,Config.max_seq)
-
- kwargs = {'num_workers': Config.num_workers, 'pin_memory': True} if torch.cuda.is_available() else {
- 'num_workers': Config.num_workers}
- predict_dataset = Data_Set(X_predict)
-
- predict_loader = DataLoader(dataset=predict_dataset,
- batch_size=Config.batch_size,
- shuffle=False,
- collate_fn=collate_fn,
- **kwargs
- )
- print('dataloader 第一个batch的情况如下:')
- print(next(iter(predict_loader)), next(iter(predict_loader))[0].shape)
-
- # 模型搭建
- if Config.model_name == 'lstm_attention':
- model = LSTM_Attention(vocab_size=len(vocab_2_id),
- n_class=Config.num_classes,
- embedding_dim=Config.embedding_dim,
- hidden_dim=Config.hidden_dim,
- num_layers=Config.layer_num,
- dropout=Config.dropout,
- bidirectional=Config.bidirectional,
- embedding_weights=None, # 预测的情况下会加载
- train_w2v=Config.w2v_grad
- )
- # print(model.embedding.weight)
- else:
- model = LSTM_Model(vocab_size=len(vocab_2_id),
- n_class=Config.num_classes,
- embedding_dim=Config.embedding_dim,
- hidden_dim=Config.hidden_dim,
- num_layers=Config.layer_num,
- dropout=Config.dropout,
- bidirectional=Config.bidirectional,
- embedding_weights=None,
- train_w2v=Config.w2v_grad
- )
- print('Model-"{}" 细节:\n'.format(Config.model_name), model)
- # predict
- predict_array,probs_array = predict(model=model,
- device=device,
- checkpoint=Config.checkpoint,
- predict_loader=predict_loader)
- print('predict 结果:\n结果:{}\n置信度:{}'.format(np.array(['讨厌','喜欢'])[predict_array],probs_array))

- 1. First rule out problems with the model itself:
- 1.1 Swap in a simpler model; if overfitting still appears after training, the model architecture is not the cause.
- 1.2 How does a bidirectional LSTM separate the outputs of PAD positions from the outputs of real tokens?
- Forward direction: since the PAD tokens are placed at the front of the text, the output of the last timestep of the forward pass is the semantic representation of the whole sentence.
- Backward direction: in the backward pass of the BiLSTM, the leading timesteps are all PAD data, so their outputs should not be used; the sentence representation should instead be taken according to each sample's real length within the batch.
-
- Data problems:
- 1. Is the data imbalanced?
- 2. Part of the samples are noisy and should be cleaned with a machine-learning method such as OneClassSVM.
- 3. Was the raw data shuffled? Does train_dataloader shuffle, and are val_loader/test_loader sorted by real sequence length [to avoid adding too many PAD tokens when sample lengths differ within a batch]?
- 4. The stop-word dictionary and the removal of high-frequency words may filter out useful words [in sentiment analysis, words such as 哈, 吗, 呵 also carry subjective colour and should not be removed].
-
- Training problems:
- 5. The collate_fn defined for the dataloader trims every batch to its longest real length, so the sequence length may differ from batch to batch.
- 6. If a pretrained model is used, are layered learning rates applied? Too large a learning rate will make the pretrained weights oscillate.
- 7. The choice of learning rate, the loss function (focal loss), and the learning-rate decay strategy.
- focal loss: mainly addresses class imbalance as well as correctly-labelled but hard-to-learn samples; however, if the data contains a lot of noise it may lower the model's accuracy.

- 1. Why is the id of the <PAD> tag 0, rather than the id of <UNK>?
- Fixing the id of <PAD> at 0 makes it trivial to compute each sample's real length later on (simply count the non-zero entries in each row of a batch).
- 2. Why are the <PAD> tokens placed at the front?
- With <PAD> at the front, the last timestep of the forward pass is always real data, so there is no need to index the last real timestep of each sample by its true length, as illustrated in the short sketch below.
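- A tiny sketch of the two points above (toy ids, illustrative only):
- import torch
- x = torch.tensor([[0, 0, 12, 7, 3]])   # one pre-padded sample, PAD id = 0
- print((x > 0).sum(dim=1))              # tensor([3]): the real length, trivial to compute because PAD == 0
- # Because the PADs sit at the front, the last timestep of the forward LSTM output always corresponds to the
- # last real token, so outputs[-1] can be used directly as the sentence representation.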
- 3. The best acc, macro recall, macro precision and macro F1 reached on the validation and test sets are about 92.5%; training any further leads to overfitting.
- Cause: the raw data contains some noisy samples. Saving the misclassified test samples shows that:
- ① some of the original labels are simply annotated incorrectly;
- ② some samples carry an ambiguous sentiment.
-
- Tentative fix:
- Run outlier detection during preprocessing, before splitting the data into the train, val and test sets; machine-learning options include OneClassSVM and Isolation Forest, as sketched below.
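- A hedged sketch of that outlier-detection idea using scikit-learn's IsolationForest on TF-IDF features; the feature choice, toy texts and contamination value are assumptions for illustration, not something tuned in this post:
- from sklearn.ensemble import IsolationForest
- from sklearn.feature_extraction.text import TfidfVectorizer
- texts = ['质量 很 好 非常 满意', '太 差 了 不会 再 买', '今天 天气 不错 适合 出门']   # pre-tokenised, space-joined reviews (toy data)
- features = TfidfVectorizer().fit_transform(texts).toarray()
- flags = IsolationForest(contamination=0.1, random_state=0).fit_predict(features)
- print(flags)   # -1 marks suspected outliers, 1 marks inliers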