Running the preprocessing script prints the following warning:

> Token indices sequence length is longer than the specified maximum sequence length for this model (170835 > 32768). Running this sequence through the model will result in indexing errors

The fix is to pass `truncation=True` to the tokenizer call, so that over-long inputs are cut down to the model's maximum sequence length:

```python
sentence_ids = Encoder.tokenizer(sentence, truncation=True)['input_ids']
```
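To see the effect in isolation, here is a minimal sketch using a Hugging Face `AutoTokenizer`; the checkpoint name is an illustrative assumption, not taken from the script above:

```python
from transformers import AutoTokenizer

# illustrative checkpoint name (assumption)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")

long_text = "some " * 100000  # far longer than the model's context window

# without truncation: emits the length warning and returns every token id
ids_full = tokenizer(long_text)['input_ids']

# with truncation: output is capped at tokenizer.model_max_length
# (reported as 32768 in the warning above)
ids_trunc = tokenizer(long_text, truncation=True)['input_ids']
assert len(ids_trunc) <= tokenizer.model_max_length
```

Note that truncation silently discards everything past the cap, which is fine here because the script tokenizes sentence by sentence rather than whole documents.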
The full `encode` method, with `truncation=True` applied for the Qwen tokenizer path (a `self.args` bug and the bare `except` are also fixed):

```python
def encode(self, json_line):
    try:  # json.loads may raise an exception on malformed lines
        data = json.loads(json_line)
        ids = {}
        tokens = 0
        for key in self.args.json_keys:
            text = data[key]
            doc_ids = []
            for sentence in Encoder.splitter.tokenize(text):
                if self.args.tokenizer_type == "QwenTokenizer":
                    # truncation=True caps each sentence at the model's
                    # maximum length and silences the length warning
                    sentence_ids = Encoder.tokenizer(sentence, truncation=True)['input_ids']
                else:
                    sentence_ids = Encoder.tokenizer.tokenize(sentence)
                tokens += len(sentence_ids)
                if len(sentence_ids) > 0:
                    doc_ids.append(sentence_ids)
            if len(doc_ids) > 0 and self.args.append_eod:
                # end-of-document token: qwen1.5 uses eos_token_id,
                # qwen uses eod_id, llama uses eod
                doc_ids[-1].append(Encoder.tokenizer.eos_token_id)
            ids[key] = doc_ids
        return ids, len(json_line), tokens
    except Exception as e:
        print(f"error in token_raw_data_for_dsw_qwen.py, please check lines 81 to 100: {e}")
        return {}, 0, 0
```
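For context, `encode` is typically driven from a multiprocessing pool over the input JSONL file. The sketch below follows Megatron-style preprocessing scripts; the `initializer` method, `args.input`, and `args.workers` are assumptions about code not shown in this post:

```python
import multiprocessing

def process_file(args):
    encoder = Encoder(args)
    with open(args.input, 'r', encoding='utf-8') as fin:
        # initializer builds the tokenizer and splitter once per worker
        pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
        # each worker yields (ids, bytes_processed, token_count) per JSON line
        for ids, bytes_processed, tokens in pool.imap(encoder.encode, fin, 25):
            ...  # write doc_ids into the indexed dataset builders
```

Because the tokenizer is built in each worker's initializer rather than pickled with every task, the per-line `encode` calls stay cheap even with many workers.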