赞
踩
# Cap on the number of vocabulary entries; passed to build_vocab as max_size.
MAX_VOCAB_SIZE = 10000
# Special-token strings. They are not referenced by the code visible in this
# file -- presumably the unknown-word and padding tokens; confirm with callers.
UNK, PAD = '<UNK>', '<PAD>'


def build_vocab(file_name, tokenize, max_size, min_freq):
    """Build a token -> index vocabulary from a tab-separated text file.

    Each non-blank line is stripped and split on tabs; only the first field
    is tokenized and counted. Tokens seen fewer than ``min_freq`` times are
    dropped, the rest are ordered by descending count (ties keep
    first-seen order because the sort is stable) and truncated to
    ``max_size`` entries. Indices are the positions in that ordering.

    The vocabulary and the sorted ``(token, count)`` list are printed, and
    the vocabulary is also returned so callers can use it. No special
    tokens are appended here.

    Args:
        file_name: Path of the UTF-8 text file to read.
        tokenize: Callable taking the text field and returning an iterable
            of hashable tokens (a returned ``str`` is iterated per character).
        max_size: Maximum number of vocabulary entries to keep.
        min_freq: Minimum occurrence count for a token to be kept.

    Returns:
        dict mapping each kept token to its integer index.

    Raises:
        OSError: If ``file_name`` cannot be opened.
    """
    token_counts = {}
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            lin = line.strip()
            if not lin:
                continue
            # Only the first tab-separated field is text; later fields
            # (if any) are ignored.
            content = lin.split('\t')[0]
            for word in tokenize(content):
                token_counts[word] = token_counts.get(word, 0) + 1

    frequent = [item for item in token_counts.items() if item[1] >= min_freq]
    vocab_list = sorted(frequent, key=lambda x: x[1], reverse=True)[:max_size]
    vocab_dic = {word: idx for idx, (word, _count) in enumerate(vocab_list)}
    print(vocab_dic)
    print(vocab_list)
    return vocab_dic


# Input corpus path, relative to the current working directory.
file_name = '../text/THUCNews/data/train.txt'


def tokenize(x):
    """Trim leading/trailing spaces from *x* and return the resulting str.

    Because a str is returned, build_vocab iterates it one character at a
    time, so the resulting vocabulary is character-level.
    """
    return x.strip(' ')


build_vocab(file_name, tokenize, max_size=MAX_VOCAB_SIZE, min_freq=1)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。