1. jieba's cut() method returns a generator, not a list; iterate over it (or wrap it in list()) to inspect the tokens.
Precise mode (the default): jieba.cut(s)
Full mode: lists = [word for word in jieba.cut(s, cut_all=True)]
['我', '是', '张', '三', ',', '今天', '今天天气', '天天', '天气', '真好']
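For reference, both calls side by side; the sample sentence is reconstructed from the full-mode output above:

import jieba

s = '我是张三,今天天气真好'

# cut() returns a generator; materialize it to see the tokens
precise = list(jieba.cut(s))                           # precise mode (default)
full = [word for word in jieba.cut(s, cut_all=True)]   # full mode: all candidate words

# jieba.lcut(s) is a shortcut that returns a list directly
print(full)
# ['我', '是', '张', '三', ',', '今天', '今天天气', '天天', '天气', '真好']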
2. Runtime error with readlines():
UnicodeDecodeError: 'gbk' codec can't decode byte 0x86 in position 22: illegal multibyte sequence
Fix: change with open(file, 'r') as file_object: to with open(file, 'r', encoding='UTF-8') as file_object:
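In full, with 'data.txt' as a placeholder path:

# Without encoding=, open() falls back to the locale's default encoding
# (gbk on Chinese Windows), which cannot decode UTF-8 bytes.
with open('data.txt', 'r', encoding='UTF-8') as file_object:
    lines = file_object.readlines()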
1) Build the training set
Read the question and answer data, segment it with jieba, assign an id to every word, and store the word-to-id and id-to-word mappings in two dictionaries (a sketch follows this list).
Append '<EOS>' to every answer sentence so the decoder knows when to stop predicting.
The training set consists of (question vector, answer vector) pairs.
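A minimal sketch of this preprocessing, assuming the questions and answers arrive as plain string lists; the jieba segmentation, the two mappings, and the '<EOS>' marker come from the description above, everything else is illustrative:

import jieba

EOS = '<EOS>'
word2id = {EOS: 0}   # word -> id
id2word = {0: EOS}   # id -> word

def to_ids(sentence):
    """Segment a sentence with jieba and map every token to an integer id."""
    ids = []
    for word in jieba.cut(sentence):
        if word not in word2id:
            word2id[word] = len(word2id)
            id2word[word2id[word]] = word
        ids.append(word2id[word])
    return ids

def build_train_set(questions, answers):
    """Pair each question vector with its answer vector; '<EOS>' ends each answer."""
    return [(to_ids(q), to_ids(a) + [word2id[EOS]])
            for q, a in zip(questions, answers)]

Something along these lines is what get_train_set() in the code below would return.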
2) Build the model
Set the learning rate, the optimization method, and the other hyperparameters (see get_model() below).
3) Train the model
Training drives the loss down: train() below runs 10,000 steps, prints the loss every 100 steps, and decays the learning rate whenever the loss stops improving.
4) Save the model
Prediction steps:
1) Load the saved model
2) Read in a question and preprocess it
3) Output the result
The complete code:
import sys

import numpy as np
import tensorflow as tf
from tensorflow.contrib import legacy_seq2seq as seq2seq  # TF 1.x

# Hyperparameters and helpers such as init_learning_rate, input_seq_len,
# output_seq_len, size, num_encoder_symbols, num_decoder_symbols, output_dir,
# EOS_ID, wordToken, get_train_set, get_samples, get_id_list_from and
# seq_to_encoder are defined elsewhere in the project.


def get_model(feed_previous=False):
    """Build the seq2seq model."""

    learning_rate = tf.Variable(float(init_learning_rate), trainable=False, dtype=tf.float32)
    learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9)

    encoder_inputs = []
    decoder_inputs = []
    target_weights = []
    for i in range(input_seq_len):
        encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(output_seq_len + 1):
        decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    for i in range(output_seq_len):
        target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    # the targets are the decoder inputs shifted left by one time step
    targets = [decoder_inputs[i + 1] for i in range(output_seq_len)]

    cell = tf.contrib.rnn.BasicLSTMCell(size)

    # the final state is not needed here
    outputs, _ = seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs[:output_seq_len],
        cell,
        num_encoder_symbols=num_encoder_symbols,
        num_decoder_symbols=num_decoder_symbols,
        embedding_size=size,
        output_projection=None,
        feed_previous=feed_previous,
        dtype=tf.float32)

    # weighted cross-entropy loss
    loss = seq2seq.sequence_loss(outputs, targets, target_weights)
    # adaptive optimizer
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # training objective: minimize the loss
    update = opt.apply_gradients(opt.compute_gradients(loss))
    # model persistence
    saver = tf.train.Saver(tf.global_variables())

    return encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate

def train():
    """Training loop."""
    train_set = get_train_set()
    with tf.Session() as sess:

        encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate = get_model()

        # initialize all variables
        sess.run(tf.global_variables_initializer())

        # run many iterations, printing the loss every 100 steps;
        # stop with Ctrl+C whenever the loss looks good enough
        previous_losses = []
        for step in range(10000):
            sample_encoder_inputs, sample_decoder_inputs, sample_target_weights = get_samples(train_set, 1000)
            input_feed = {}
            for l in range(input_seq_len):
                input_feed[encoder_inputs[l].name] = sample_encoder_inputs[l]
            for l in range(output_seq_len):
                input_feed[decoder_inputs[l].name] = sample_decoder_inputs[l]
                input_feed[target_weights[l].name] = sample_target_weights[l]
            input_feed[decoder_inputs[output_seq_len].name] = np.zeros([len(sample_decoder_inputs[0])], dtype=np.int32)
            [loss_ret, _] = sess.run([loss, update], input_feed)
            if step % 100 == 0:
                print('step=', step, 'loss=', loss_ret, 'learning_rate=', learning_rate.eval())

                # decay the learning rate when the loss stops improving
                if len(previous_losses) > 5 and loss_ret > max(previous_losses[-5:]):
                    sess.run(learning_rate_decay_op)
                previous_losses.append(loss_ret)

        # persist the model
        saver.save(sess, output_dir)

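The helpers get_samples and seq_to_encoder are not shown in the post. A rough, hypothetical sketch of the shape seq_to_encoder has to return, assuming PAD_ID/GO_ID special symbols, fixed sequence lengths of 5, and a batch of one (the original may batch differently, as the hard-coded np.zeros([2]) in predict() suggests):

import numpy as np

PAD_ID, GO_ID = 0, 1                   # assumed special-symbol ids
input_seq_len, output_seq_len = 5, 5   # assumed fixed sequence lengths

def seq_to_encoder(input_seq):
    """Turn one space-separated id string into time-major feed arrays."""
    ids = [int(v) for v in input_seq.split()]
    # left-pad the encoder input to input_seq_len; the decoder starts with GO
    encoder = [PAD_ID] * (input_seq_len - len(ids)) + ids
    decoder = [GO_ID] + [PAD_ID] * (output_seq_len - 1)
    # one length-1 batch per time step, matching the shape=[None] placeholders
    encoder_inputs = [np.array([v], dtype=np.int32) for v in encoder]
    decoder_inputs = [np.array([v], dtype=np.int32) for v in decoder]
    target_weights = [np.array([1.0], dtype=np.float32) for _ in range(output_seq_len)]
    return encoder_inputs, decoder_inputs, target_weights

get_samples presumably applies the same padding to a random batch of training pairs, and get_id_list_from segments the raw console input with jieba and looks each word up in the word-to-id mapping.
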
def predict():
    """Interactive prediction loop."""
    with tf.Session() as sess:
        encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate = get_model(feed_previous=True)
        saver.restore(sess, output_dir)
        sys.stdout.write("> ")
        sys.stdout.flush()
        input_seq = input()
        while input_seq:
            input_seq = input_seq.strip()
            input_id_list = get_id_list_from(input_seq)
            if len(input_id_list):
                sample_encoder_inputs, sample_decoder_inputs, sample_target_weights = seq_to_encoder(' '.join([str(v) for v in input_id_list]))

                input_feed = {}
                for l in range(input_seq_len):
                    input_feed[encoder_inputs[l].name] = sample_encoder_inputs[l]
                for l in range(output_seq_len):
                    input_feed[decoder_inputs[l].name] = sample_decoder_inputs[l]
                    input_feed[target_weights[l].name] = sample_target_weights[l]
                input_feed[decoder_inputs[output_seq_len].name] = np.zeros([2], dtype=np.int32)

                # run the decoder to get the predicted logits
                outputs_seq = sess.run(outputs, input_feed)
                # each output is num_decoder_symbols-dimensional, so the index of the
                # largest value is the predicted word id; that is what argmax does here
                outputs_seq = [int(np.argmax(logit[0], axis=0)) for logit in outputs_seq]
                # stop printing at the end-of-sequence symbol
                if EOS_ID in outputs_seq:
                    outputs_seq = outputs_seq[:outputs_seq.index(EOS_ID)]
                outputs_seq = [wordToken.id2word(v) for v in outputs_seq]
                print(" ".join(outputs_seq))
            else:
                print("WARN: word not in vocabulary")

            sys.stdout.write("> ")
            sys.stdout.flush()
            input_seq = input()

if __name__ == "__main__":
    tf.reset_default_graph()
    train()
    tf.reset_default_graph()
    predict()