赞
踩
模型使用的是:uer/sbert-base-chinese-nli · Hugging Face
sentence_transformers 官网:SentenceTransformers Documentation — Sentence-Transformers documentation
sentence_transformer 使用自己训练数据微调模型的代码如下所示。(模型为计算句子相似度)
- from sentence_transformers import SentenceTransformer,SentenceDataset,InputExample,evaluation,losses,util
class sbert():
    """Fine-tune a SentenceTransformer model for sentence-similarity scoring.

    Positive pairs (standard question / similar question) are labeled 1.0,
    negative pairs 0.0; the model is trained with CosineSimilarityLoss and
    evaluated with an EmbeddingSimilarityEvaluator on a held-out split.

    NOTE(review): `get_act_data` / `get_neg_data` are assumed to be provided
    elsewhere (not visible in this snippet) — each returning two parallel
    lists of sentences. Verify against the full source.
    """

    def build_train_data(self, o1, o2, n1, n2, train_size):
        """Build the list of training InputExamples.

        o1/o2 are parallel lists of positive (similar) sentence pairs,
        n1/n2 parallel lists of negative (dissimilar) pairs; only the
        first `train_size` entries of each are used.
        """
        train_data = []
        for i in range(train_size):
            # Bug fix: the keyword is `texts`, not `tests`.
            train_data.append(InputExample(texts=[o1[i], o2[i]], label=1.0))
            # Bug fix: negative pairs must be labeled 0.0, not 1.0 —
            # otherwise the model is taught that dissimilar pairs match.
            train_data.append(InputExample(texts=[n1[i], n2[i]], label=0.0))
        return train_data

    # Bug fix: `self` was missing, but train() calls this as an instance
    # method with six arguments, which raised a TypeError.
    def build_evaluation_data(self, o1, o2, n1, n2, train_size, eval_size):
        """Build an EmbeddingSimilarityEvaluator from the held-out tail.

        Positive pairs get gold score 1.0, negative pairs 0.0.
        """
        s1 = list(o1[train_size:])
        s2 = list(o2[train_size:])
        s1.extend(list(n1[train_size:]))
        s2.extend(list(n2[train_size:]))
        score = [1.0] * eval_size + [0.0] * eval_size
        evaluator = evaluation.EmbeddingSimilarityEvaluator(s1, s2, score)
        return evaluator

    # Bug fix: the original `def` line was missing its trailing colon.
    def callback(self, score, epoch, steps):
        """Progress callback passed to model.fit: print eval score per step."""
        print('score:{},epoch:{},steps:{}'.format(score, epoch, steps))

    def train(self):
        """Run the full fine-tuning pipeline and save the best model."""
        # Local imports: neither name is imported at the top of this file
        # (the top-level import line also misspells SentencesDataset).
        from torch.utils.data import DataLoader
        from sentence_transformers import SentencesDataset

        # 1. Positive/negative samples; o1 holds the standard questions,
        #    o2 the similar questions.
        o1, o2 = self.get_act_data()
        n1, n2 = self.get_neg_data()

        # 2. 80/20 train/eval split + training data.
        train_size = int(len(o1) * 0.8)
        eval_size = len(o1) - train_size
        train_data = self.build_train_data(o1, o2, n1, n2, train_size)

        # 3. Evaluation data.
        evaluator = self.build_evaluation_data(o1, o2, n1, n2, train_size, eval_size)

        # 4. Model to fine-tune.
        #    Bug fix: was assigned to `mode` but used as `model` below.
        model = SentenceTransformer('模型地址')

        # 5. Dataset, dataloader and loss.
        #    Bug fixes: class is SentencesDataset (not SentenceDataset);
        #    Python's boolean literal is True (not true).
        train_dataset = SentencesDataset(train_data, model)
        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
        train_loss = losses.CosineSimilarityLoss(model)

        # 6. Fine-tune, evaluating every 100 steps and keeping the best model.
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=1,
            warmup_steps=100,
            evaluator=evaluator,
            evaluation_steps=100,
            output_path='存调试后模型的地址',
            save_best_model=True,
            callback=self.callback,
        )
sentence_transformer使用自己微调后的模型的代码如下所示:
# 1. Load the fine-tuned model from disk.
model = SentenceTransformer('模型地址')
# 2. Encode two parallel lists of sentences into embedding matrices.
o1_emb = model.encode(['数据list', '求求一定要好运啊'])
# Bug fix: the original line was missing its closing parenthesis.
o2_emb = model.encode(['一定要是列表', '我绝对可以好运'])
# 3. Compute the full pairwise cosine-similarity matrix, then keep only the
#    diagonal: similarity of o1[i] against its counterpart o2[i].
cosine_score0 = util.cos_sim(o1_emb, o2_emb)
cosine_score = []
for i in range(len(cosine_score0)):
    # Bug fix: the original append call was missing its closing parenthesis.
    cosine_score.append(cosine_score0[i][i].numpy().tolist())
-
增加模型层数
# Build a SentenceTransformer as an explicit module pipeline with one extra
# layer: Transformer encoder -> mean pooling -> Dense projection to 256 dims.
from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)

# Pooling collapses token embeddings into a single sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Extra trainable layer: project the pooled vector down to 256 features
# with a tanh non-linearity.
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。