The following code is based on the tokenizer in keras-bert:
def word_piece_tokenize(word, token_dict):
    # Return the word directly if it is already in the vocabulary
    if word in token_dict:
        return [word]
    tokens = []
    start, stop = 0, 0
    while start < len(word):
        stop = len(word)
        # Greedy longest-match-first: shrink the candidate until it is in the vocabulary
        while stop > start:
            sub = word[start:stop]
            if start > 0:
                # Non-initial sub-words are prefixed with ##
                sub = '##' + sub
            if sub in token_dict:
                break
            stop -= 1
        # Nothing matched: fall back to a single character, even if it is out of vocabulary
        if start == stop:
            stop += 1
        tokens.append(sub)
        start = stop
    return tokens
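A minimal usage sketch (the tiny token_dict below is made up for illustration; a real vocabulary would be loaded from BERT's vocab.txt):

    # Toy vocabulary for illustration only; a real one comes from BERT's vocab.txt
    token_dict = {'un': 0, '##aff': 1, '##able': 2}

    print(word_piece_tokenize('unaffable', token_dict))  # ['un', '##aff', '##able']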
The following code is based on the BertTokenizer in the transformers package:
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first
        algorithm to perform tokenization using the given vocabulary.

        For example, :obj:`input = "unaffable"` will return as output
        :obj:`["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            # Overly long tokens are mapped to the unknown token outright
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                # Greedy longest-match-first: shrink the candidate until it is in the vocabulary
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                # No sub-word matched at this position: give up on the whole token
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            # Unmatchable tokens become the unknown token; otherwise keep the pieces
            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
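A usage sketch with a toy vocabulary; whitespace_tokenize lives in the same transformers module, so a minimal stand-in is included here to keep the snippet self-contained:

    # Minimal stand-in for transformers' whitespace_tokenize: strip, then split on whitespace
    def whitespace_tokenize(text):
        text = text.strip()
        return text.split() if text else []

    # Toy vocabulary for illustration only
    vocab = {'un', '##aff', '##able'}
    wp = WordpieceTokenizer(vocab=vocab, unk_token='[UNK]')
    print(wp.tokenize('unaffable'))  # ['un', '##aff', '##able']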
The difference between the two: in the WordpieceTokenizer used by transformers.BertTokenizer, a word that cannot be matched against the vocabulary is replaced with the unknown token (UNK); the keras_bert tokenizer does not do this, and instead falls back to splitting the word into single characters (with the ## prefix on non-initial pieces) even when those pieces are not in the vocabulary.
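To make the difference concrete, here is what the two implementations return for the same out-of-vocabulary word, using the toy vocabulary from the sketches above (illustrative only):

    oov_vocab = {'un': 0, '##aff': 1, '##able': 2}

    # keras-bert style: falls back to character pieces even though none are in the vocabulary
    print(word_piece_tokenize('xyz', oov_vocab))  # ['x', '##y', '##z']

    # transformers style: the whole word collapses to the unknown token
    print(WordpieceTokenizer(vocab=oov_vocab, unk_token='[UNK]').tokenize('xyz'))  # ['[UNK]']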