赞
踩
# Special tokens that occupy the first ids of the vocabulary.
TOKEN_PAD = ''  # Token for padding
TOKEN_UNK = '<UNK>'  # Token for unknown words
TOKEN_CLS = '<CLS>'  # Token for classification
TOKEN_SEP = '<SEP>'  # Token for separation
TOKEN_MASK = '<MASK>'  # Token for masking


def get_base_dict():
    """Return a fresh dict mapping the five special tokens to ids 0-4.

    The original snippet called ``get_base_dict()`` without defining or
    importing it. This local definition reproduces the mapping that the
    snippet documents as the printed result.
    NOTE(review): presumably this was meant to come from a library helper
    (e.g. keras_bert) -- confirm, and replace with the import if so.
    """
    return {
        TOKEN_PAD: 0,
        TOKEN_UNK: 1,
        TOKEN_CLS: 2,
        TOKEN_SEP: 3,
        TOKEN_MASK: 4,
    }


token_dict = get_base_dict()
print(token_dict)
# {'': 0, '<UNK>': 1, '<CLS>': 2, '<SEP>': 3, '<MASK>': 4}
# len(token_dict) = 5

# example01:
sentence_pairs = [
    [['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']],
    [['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']],
    [['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']],
]

# Extend the vocabulary with every token seen in either sentence of a pair.
for pairs in sentence_pairs:
    for token in pairs[0] + pairs[1]:
        if token not in token_dict:
            # len(token_dict) is evaluated before the insertion, i.e. it is
            # the size of the dict before this token is added, which is
            # exactly the next unused id.
            token_dict[token] = len(token_dict)

# Tokens in id order (dicts keep insertion order); the original note says
# this list is used for picking a word at random.
token_list = list(token_dict.keys())
print(token_list)
# ['', '<UNK>', '<CLS>', '<SEP>', '<MASK>', 'all', 'work', 'and', 'no',
#  'play', 'makes', 'jack', 'a', 'dull', 'boy', 'from', 'the', 'day',
#  'forth', 'my', 'arm', 'changed', 'voice', 'echoed', 'power', 'give',
#  'me', 'more']
print(list(token_dict.values()))
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
#  20, 21, 22, 23, 24, 25, 26, 27]
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。