赞
踩
"""Convert a multi-turn chat corpus into role-tagged token lists.

Each input line is a JSON object with an "instruction" field (the dialogue
history, with turns introduced by "Human:" and "Assistant:" markers) and an
"output" field (the final reply).  Every dialogue is rewritten as a flat list
of role tags and tokens, and one such list is written per line of the output
file.
"""
import json
import re

import numpy as np  # not used by this section any more; kept for any later code

try:
    from tqdm import tqdm
except ImportError:
    # Progress bars are cosmetic: fall back to plain iteration without tqdm.
    def tqdm(iterable, *args, **kwargs):
        return iterable


INPUT_PATH = "E:/data_sets/multiturn_chat_0.8M.json"
OUTPUT_PATH = "data_set.txt"

# Role tags alternate, starting with the user.
ROLE_TAGS = ("<|User|>", "<|Ash|>")

# Matches every character outside the CJK Unified Ideographs range; compiled
# once instead of on every call.
_NON_CHINESE = re.compile(r'[^\u4e00-\u9fff]')


def find_chinese_text(text):
    """Return *text* with every non-Chinese character removed."""
    return _NON_CHINESE.sub('', text)


def build_dialogue(instruction, output):
    """Interleave role tags with the turns of one dialogue.

    Args:
        instruction: Dialogue history containing "Human:" / "Assistant:"
            markers.
        output: Final reply, appended after the tag that follows the last
            history turn.

    Returns:
        A flat list ``[tag, turn, tag, turn, ..., tag, output]``.
    """
    pieces = [
        piece
        for chunk in instruction.split("Assistant:")
        for piece in chunk.split("Human:")
    ]
    # The first and last pieces are the text before the first marker and
    # after the last one; they are dropped, as in the original slice [1:-1].
    turns = pieces[1:-1]

    dialogue = []
    for index, turn in enumerate(turns):
        # Picking the tag by parity removes the former 200-entry lookup table
        # that limited how long a conversation could be.
        dialogue += [ROLE_TAGS[index % 2], turn]
    dialogue += [ROLE_TAGS[len(turns) % 2], output]
    return dialogue


def tokenize_dialogue(dialogue):
    """Split the text segments of a tagged dialogue into tokens.

    Role tags are kept whole.  A segment in which Chinese characters make up
    less than half of the text is split on whitespace; any other segment is
    split into single characters.  Non-string segments are skipped.

    Args:
        dialogue: List produced by :func:`build_dialogue`.

    Returns:
        The flat token list for the dialogue.
    """
    tokens = []
    for segment in dialogue:
        if segment in ROLE_TAGS:
            tokens.append(segment)
            continue
        if not isinstance(segment, str):
            # Previously a bare ``except`` hid this case behind a blank line.
            print(f"skipping non-text segment: {segment!r}")
            continue
        # The +1 keeps the ratio defined for an empty segment.
        chinese_ratio = len(find_chinese_text(segment)) / (len(segment) + 1)
        if chinese_ratio < 0.5:
            tokens += segment.split()
        else:
            tokens += list(segment)
    return tokens


if __name__ == "__main__":
    with open(INPUT_PATH, "r", encoding="utf-8") as f:
        json_list = f.readlines()
    # Blank lines (e.g. a trailing newline) are not JSON records; skip them.
    data = [json.loads(line) for line in tqdm(json_list) if line.strip()]

    data_list = [
        build_dialogue(one_data["instruction"], one_data["output"])
        for one_data in tqdm(data)
    ]

    voc_set = set()
    # Append mode is kept from the original: re-running adds to an existing file.
    with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
        for one_list in tqdm(data_list):
            one_data = tokenize_dialogue(one_list)
            voc_set |= set(one_data)
            f.write(str(one_data) + "\n")
函数 `find_chinese_text()` 用于提取文本中的中文字符(即删除所有非中文字符)。
第一个 `with` 打开文件 “E:/data_sets/multiturn_chat_0.8M.json”,并逐行读取文件,将每一行解析为 JSON 对象后转化为列表形式。
第二个 `with` 打开文件 “data_set.txt”,并迭代 data_list 中的每一段对话,将其中的每一个元素添加到 one_data 列表中:如果元素是 “<|User|>” 或者 “<|Ash|>”,则直接添加;否则,当元素中中文字符的占比低于 0.5 时,将元素按空格分开,分开后的每个单词作为列表元素添加到 one_data 列表中;当中文字符的占比不低于 0.5 时,则将元素拆分为单个字符后添加。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。