赞
踩
# Training parameters
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000

# Evaluation parameters
valid_size = 20
valid_window = 100
# Randomly pick `valid_size` distinct ids out of the first `valid_window`
# vocabulary ids; these words are used to validate the model.
eval_words = np.random.choice(valid_window, valid_size, replace=False)

# Word2Vec parameters
embedding_size = 200
max_vocabulary_size = 50000
min_occurrence = 10  # minimum number of occurrences for a word to stay in the vocabulary
skip_window = 3  # context window size
num_skips = 2  # number of context words picked per center word to generate samples
num_sampled = 64  # number of negative examples
def make_vocabulary(data, max_size=None, min_count=None):
    """Build a frequency-ranked vocabulary and encode ``data`` as word ids.

    Args:
        data: Flat list of tokens. Each element is either a single character
            or a segmented word; the list is produced by tokenizing the
            sentences and concatenating the results (character-level input
            needs no segmentation).
        max_size: Upper bound on the vocabulary size, the 'UNK' slot
            included. Defaults to the module-level ``max_vocabulary_size``.
        min_count: Tokens occurring fewer times than this are left out of
            the vocabulary. Defaults to the module-level ``min_occurrence``.

    Returns:
        A tuple ``(data_id, word2count, word2id, vocabulary_size)`` where
        ``data_id`` is ``data`` with every token replaced by its id (0 for
        out-of-vocabulary tokens), ``word2count`` is a list of
        ``(word, count)`` pairs ordered by id with ``('UNK', unk_count)`` at
        index 0, ``word2id`` maps each kept word to its id, and
        ``vocabulary_size`` is ``len(word2count)``.

        NOTE(review): the code this was restored from ended without a
        ``return`` statement; the tuple layout above should be confirmed
        against the callers.
    """
    if max_size is None:
        max_size = max_vocabulary_size
    if min_count is None:
        min_count = min_occurrence

    # Count whole tokens. Joining the tokens into one string first would
    # count individual characters, so word-level tokens would never match.
    counts = collections.Counter(data)
    # Keep id 0 unambiguous: a literal 'UNK' token is folded into the UNK slot
    # instead of getting a second vocabulary entry.
    counts.pop('UNK', None)

    # most_common() is sorted by descending count, so filtering on min_count
    # only trims the rare tail and can never remove the 'UNK' placeholder.
    word2count = [('UNK', -1)]
    word2count.extend(
        (word, count)
        for word, count in counts.most_common(max_size - 1)
        if count >= min_count
    )
    vocabulary_size = len(word2count)
    word2id = {word: i for i, (word, _) in enumerate(word2count)}

    # Replace every token by its id; anything outside the vocabulary is id 0.
    data_id = []
    unk_count = 0
    for word in data:
        index = word2id.get(word, 0)
        if index == 0:
            unk_count += 1
        data_id.append(index)
    word2count[0] = ('UNK', unk_count)

    return data_id, word2count, word2id, vocabulary_size
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。