赞
踩
import numpy as np

def reweight_distribution(original_distribution, temperature=0.5):
    """Reweight a probability distribution by a sampling temperature.

    original_distribution: 1-D numpy array of probabilities summing to 1.
    temperature: factor quantifying the entropy of the output
        distribution; higher values give a higher-entropy (more
        surprising) distribution, lower values a more predictable one.
    Returns the reweighted, renormalized distribution.
    """
    # Equivalent to original_distribution ** (1 / temperature).
    weighted = np.exp(np.log(original_distribution) / temperature)
    # The reweighted values may no longer sum to 1, so divide by the
    # sum to obtain a proper distribution again.
    return weighted / weighted.sum()
更高的温度得到的是熵更大的采样分布,会生成更加出人意料、更加无结构的生成数据,而更低的温度对应更小的随机性,以及更加可预测的生成数据(见图 8-2)。
import keras
import numpy as np

# Download and read the Nietzsche corpus used as training data.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Use a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 instead of the platform default.
with open(path, encoding='utf-8') as f:
    text = f.read().lower()  # Lowercase to shrink the character set.
print('Corpus length:', len(text))
# Listing 8-3: vectorize the character sequences.
maxlen = 60  # Extract sequences of 60 characters.
step = 3     # Sample a new sequence every 3 characters.
sentences = []   # Holds the extracted sequences.
next_chars = []  # Holds the targets (the character following each sequence).

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

print('Number of sequences:', len(sentences))

chars = sorted(set(text))  # List of unique characters in the corpus.
print('Unique characters:', len(chars))

# Dictionary mapping each unique character to its index in `chars`.
# enumerate() builds this in O(n) instead of O(n^2) via chars.index().
char_indices = {char: index for index, char in enumerate(chars)}

print('Vectorization...')
# np.bool was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `bool` is the supported spelling of the same dtype.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

# One-hot encode the characters into binary arrays.
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
2. 构建网络
# Listing 8-4: single-layer LSTM model for next-character prediction.
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
# Softmax over the full character set yields a probability distribution
# for the next character.
model.add(layers.Dense(len(chars), activation='softmax'))

# Listing 8-5: model compilation configuration.
# The `lr` argument was renamed `learning_rate` in Keras 2.3 / TF 2.x
# and the old alias has since been removed.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
# Targets are one-hot encoded, so categorical crossentropy is the loss.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
- # 代码清单 8-6 给定模型预测,采样下一个字符的函数
def sample(preds, temperature=1.0):
    """Sample a character index from the model's predictions.

    preds: 1-D array of probabilities from the softmax output.
    temperature: controls randomness; lower values sharpen the
        distribution toward its mode, higher values flatten it.
    Returns the sampled index as an int.
    """
    preds = np.asarray(preds).astype('float64')
    # Reweight: raise each probability to 1/temperature, then renormalize.
    exp_preds = np.exp(np.log(preds) / temperature)
    reweighted = exp_preds / np.sum(exp_preds)
    # Draw a single sample from the reweighted multinomial distribution
    # and return the index of the drawn category.
    draw = np.random.multinomial(1, reweighted, 1)
    return np.argmax(draw)
# Listing 8-7: text-generation loop.
# `random` and `sys` were used but never imported in the original,
# which raises NameError at runtime.
import random
import sys

for epoch in range(1, 60):
    print('epoch', epoch)
    # Train the model for one epoch on the vectorized data.
    model.fit(x, y, batch_size=128, epochs=1)
    # Pick a random seed of `maxlen` characters from the corpus.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + seed_text + '"')

    for temperature in [0.2, 0.5, 1.0, 1.2]:  # Try a range of sampling temperatures.
        print('------ temperature:', temperature)
        # Restart from the same seed for every temperature so the
        # outputs are comparable (the original let the window drift
        # from one temperature to the next — known errata).
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # Generate 400 characters, starting from the seed text.
        for i in range(400):
            # One-hot encode the current window of characters.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.
            # Sample the next character from the model's predictions.
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]
            # Slide the window: append the new character, drop the first.
            generated_text += next_char
            generated_text = generated_text[1:]
            sys.stdout.write(next_char)
- epoch 21
- 1565/1565 [==============================] - 55s 35ms/step - loss: 1.5468
- --- Generating with seed: "this our mode?--from german heart came this vexed ululating?"
- epoch 22
- 1565/1565 [==============================] - 56s 35ms/step - loss: 1.5346
- --- Generating with seed: " come to his fantastic consequent
- of the so called discretio"
- epoch 23
- 1565/1565 [==============================] - 56s 36ms/step - loss: 1.5224
- --- Generating with seed: "or
- introspection, and is accustomed to severe discipline and"
可见,较小的温度值会得到极端重复和可预测的文本,但局部结构是非常真实的,特别是所有单词都是真正的英文单词(单词就是字符的局部模式)。随着温度值越来越大,生成的文本也变得更有趣、更出人意料,甚至更有创造性,它有时会创造出全新的单词,听起来有几分可信(比如 eterned 和 troveration)。对于较大的温度值,局部模式开始分解,大部分单词看起来像是半随机的字符串。毫无疑问,在这个特定的设置下,0.5 的温度值生成的文本最为有趣。一定要尝试多种采样策略!在学到的结构与随机性之间,巧妙的平衡能够让生成的序列非常有趣。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。