Building a Simple Chatbot with TensorFlow

Many online retailers now use chatbots as customer-service agents, and the big tech companies have all released their own chat assistants, such as Apple's Siri, Google Now, Amazon Alexa, and Microsoft's XiaoIce. A video published not long ago compared Google Now and Siri to see which is smarter; Google Now appeared to come out ahead.

This post builds a simple chatbot with TensorFlow. The bot is trained on a Chinese dialogue dataset (the dataset you train on determines the kind of conversations the bot can hold). The model is an RNN (seq2seq), similar to the earlier posts "Generating Classical Poetry with an RNN" and "Generating Music with an RNN".

Dataset used in this post: a corpus of film and TV dialogue.

After downloading the dataset, unzip it and extract the dgk_shooter_min.conv file.

1) Data preprocessing:

#coding=utf-8
import os
import random
from io import open

conv_path = 'dgk_shooter_min.conv'

# make sure the dataset file exists
if not os.path.exists(conv_path):
    print('Dataset not found')
    exit()

# Dataset format:
"""
E
M 畹/华/吾/侄/
M 你/接/到/这/封/信/的/时/候/
M 不/知/道/大/伯/还/在/不/在/人/世/了/
E
M 咱/们/梅/家/从/你/爷/爷/起/
M 就/一/直/小/心/翼/翼/地/唱/戏/
M 侍/奉/宫/廷/侍/奉/百/姓/
M 从/来/不/曾/遭/此/大/祸/
M 太/后/的/万/寿/节/谁/敢/不/穿/红/
M 就/你/胆/儿/大/
M 唉/这/我/舅/母/出/殡/
M 我/不/敢/穿/红/啊/
M 唉/呦/唉/呦/爷/
M 您/打/得/好/我/该/打/
M 就/因/为/没/穿/红/让/人/赏/咱/一/纸/枷/锁/
M 爷/您/别/给/我/戴/这/纸/枷/锁/呀/
E
M 您/多/打/我/几/下/不/就/得/了/吗/
M 走/
M 这/是/哪/一/出/啊/…/ / /这/是/
M 撕/破/一/点/就/弄/死/你/
M 唉/
M 记/着/唱/戏/的/再/红/
M 还/是/让/人/瞧/不/起/
M 大/伯/不/想/让/你/挨/了/打/
M 还/得/跟/人/家/说/打/得/好/
M 大/伯/不/想/让/你/再/戴/上/那/纸/枷/锁/
M 畹/华/开/开/门/哪/
E
...
"""

# I first converted dgk_shooter_min.conv to UTF-8 with the Sublime text editor, which saved a lot of trouble
convs = []  # all conversations
with open(conv_path, encoding="utf8") as f:
    one_conv = []  # one complete conversation
    for line in f:
        line = line.strip('\n').replace('/', '')  # drop the '/' separators between characters
        if line == '':
            continue
        if line[0] == 'E':        # 'E' marks the end of a conversation
            if one_conv:
                convs.append(one_conv)
            one_conv = []
        elif line[0] == 'M':      # 'M' marks one utterance
            parts = line.split(' ')
            if len(parts) > 1:    # skip 'M' lines that carry no text
                one_conv.append(parts[1])
# convs now holds the complete conversations
"""
print(convs[:3])  # the dialogue corpus is honestly a bit underwhelming
[ ['畹华吾侄', '你接到这封信的时候', '不知道大伯还在不在人世了'],
  ['咱们梅家从你爷爷起', '就一直小心翼翼地唱戏', '侍奉宫廷侍奉百姓', '从来不曾遭此大祸', '太后的万寿节谁敢不穿红', '就你胆儿大', '唉这我舅母出殡', '我不敢穿红啊', '唉呦唉呦爷', '您打得好我该打', '就因为没穿红让人赏咱一纸枷锁', '爷您别给我戴这纸枷锁呀'],
  ['您多打我几下不就得了吗', '走', '这是哪一出啊 ', '撕破一点就弄死你', '唉', '记着唱戏的再红', '还是让人瞧不起', '大伯不想让你挨了打', '还得跟人家说打得好', '大伯不想让你再戴上那纸枷锁', '畹华开开门哪'], ....]
"""

# split each conversation into questions and answers
ask = []       # questions
response = []  # answers
for conv in convs:
    if len(conv) == 1:
        continue
    if len(conv) % 2 != 0:  # odd number of turns: drop the last one to get an even number
        conv = conv[:-1]
    for i in range(len(conv)):
        if i % 2 == 0:
            ask.append(conv[i])       # even index -> question
        else:
            response.append(conv[i])  # odd index -> answer
"""
print(len(ask), len(response))
print(ask[:3])
print(response[:3])
['畹华吾侄', '咱们梅家从你爷爷起', '侍奉宫廷侍奉百姓']
['你接到这封信的时候', '就一直小心翼翼地唱戏', '从来不曾遭此大祸']
"""

def convert_seq2seq_files(questions, answers, TESTSET_SIZE=8000):
    # create the output files
    train_enc = open('train.enc', 'w')  # questions
    train_dec = open('train.dec', 'w')  # answers
    test_enc = open('test.enc', 'w')    # questions
    test_dec = open('test.dec', 'w')    # answers

    # randomly pick 8000 pairs as the test set
    test_index = random.sample([i for i in range(len(questions))], TESTSET_SIZE)

    for i in range(len(questions)):
        if i in test_index:   # write to the test files
            test_enc.write(questions[i] + '\n')
            test_dec.write(answers[i] + '\n')
        else:                 # write to the training files
            train_enc.write(questions[i] + '\n')
            train_dec.write(answers[i] + '\n')
        if i % 1000 == 0:     # report progress every 1000 pairs
            print(len(questions), 'pairs, progress:', i)

    train_enc.close()
    train_dec.close()
    test_enc.close()
    test_dec.close()

convert_seq2seq_files(ask, response)
# the generated *.enc files hold the questions
# the generated *.dec files hold the answers
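Before moving on, it can be worth printing a few question/answer pairs back out of the generated files to confirm the split looks right. This is only a minimal sanity-check sketch, assuming train.enc and train.dec were written as UTF-8 in the current directory:

#coding=utf-8
# Quick sanity check of the question/answer split (assumes train.enc and
# train.dec are UTF-8 files in the current directory).
from io import open

with open('train.enc', encoding='utf8') as enc, open('train.dec', encoding='utf8') as dec:
    for _ in range(3):
        print('Q:', enc.readline().strip())
        print('A:', dec.readline().strip())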
2) Building the vocabulary

  

#coding=utf-8
# paths of the question/answer files generated in the previous step
train_encode_file = 'train.enc'
train_decode_file = 'train.dec'
test_encode_file = 'test.enc'
test_decode_file = 'test.dec'

print('Building vocabularies...')

# special tokens used for padding and framing the dialogues
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of answer
UNK = "__UNK__"  # placeholder for characters not in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# see tensorflow.models.rnn.translate.data_utils
vocabulary_size = 5000

# build a vocabulary file from a text file
def gen_vocabulary_file(input_file, output_file):
    vocabulary = {}
    with open(input_file) as f:
        counter = 0
        for line in f:
            counter += 1
            tokens = [word for word in line.strip()]  # character-level tokens
            for word in tokens:
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # keep the 5000 most frequent characters, which should be roughly enough
    # (there are quite a few junk characters; ideally the list would be cleaned up, but I won't bother here)
    if len(vocabulary_list) > 5000:
        vocabulary_list = vocabulary_list[:5000]
    print(input_file + " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")

gen_vocabulary_file(train_encode_file, "train_encode_vocabulary")
gen_vocabulary_file(train_decode_file, "train_decode_vocabulary")

train_encode_vocabulary_file = 'train_encode_vocabulary'
train_decode_vocabulary_file = 'train_decode_vocabulary'

print('Converting dialogues to vectors...')

# convert each line of text into a sequence of vocabulary ids
def convert_to_vector(input_file, vocabulary_file, output_file):
    tmp_vocab = []
    with open(vocabulary_file, "r") as f:
        tmp_vocab.extend(f.readlines())
    tmp_vocab = [line.strip() for line in tmp_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])  # character -> id
    # e.g. {'硕': 3142, 'v': 577, 'I': 4789, '\ue796': 4515, '拖': 1333, '疤': 2201 ...}
    output_f = open(output_file, 'w')
    with open(input_file, 'r') as f:
        for line in f:
            line_vec = []
            for words in line.strip():
                line_vec.append(vocab.get(words, UNK_ID))  # unknown characters map to UNK_ID
            output_f.write(" ".join([str(num) for num in line_vec]) + "\n")
    output_f.close()

convert_to_vector(train_encode_file, train_encode_vocabulary_file, 'train_encode.vec')
convert_to_vector(train_decode_file, train_decode_vocabulary_file, 'train_decode.vec')
convert_to_vector(test_encode_file, train_encode_vocabulary_file, 'test_encode.vec')
convert_to_vector(test_decode_file, train_decode_vocabulary_file, 'test_decode.vec')
The generated train_encode.vec and train_decode.vec files are used for training; the corresponding vocabularies are train_encode_vocabulary and train_decode_vocabulary.
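It is also worth checking that the id sequences round-trip back to readable text. A minimal sketch, assuming the vocabulary and .vec files produced above are UTF-8 and sit in the current directory:

#coding=utf-8
# Round-trip check: decode the first line of train_encode.vec back into text
# using train_encode_vocabulary.
from io import open

with open('train_encode_vocabulary', encoding='utf8') as f:
    id_to_char = [line.strip() for line in f]   # line number in the file == character id

with open('train_encode.vec') as f:
    ids = f.readline().split()                  # space-separated character ids

print(''.join(id_to_char[int(i)] for i in ids))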

3) Training

#coding=utf-8
import tensorflow as tf  # 0.12
from tensorflow.models.rnn.translate import seq2seq_model
import os
import numpy as np
import math

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

train_encode_vec = 'train_encode.vec'
train_decode_vec = 'train_decode.vec'
test_encode_vec = 'test_encode.vec'
test_decode_vec = 'test_decode.vec'

# vocabulary size is 5000
vocabulary_encode_size = 5000
vocabulary_decode_size = 5000

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
layer_size = 256  # units per layer
num_layers = 3    # number of layers
batch_size = 64

# read the *encode.vec and *decode.vec data (the dataset is small enough to load into memory in one go)
def read_data(source_path, target_path, max_size=None):
    data_set = [[] for _ in buckets]  # one empty list per bucket: [[], [], [], []]
    with tf.gfile.GFile(source_path, mode="r") as source_file:    # open the source file for reading
        with tf.gfile.GFile(target_path, mode="r") as target_file:  # open the target file for reading
            source, target = source_file.readline(), target_file.readline()  # read one line from each file
            counter = 0
            while source and target and (not max_size or counter < max_size):  # loop while both files still have lines
                counter += 1
                source_ids = [int(x) for x in source.split()]  # ids of the source sentence (space-separated)
                target_ids = [int(x) for x in target.split()]  # ids of the target sentence
                target_ids.append(EOS_ID)                      # append the end-of-sequence id
                for bucket_id, (source_size, target_size) in enumerate(buckets):  # find the first bucket large enough
                    if len(source_ids) < source_size and len(target_ids) < target_size:
                        data_set[bucket_id].append([source_ids, target_ids])  # store the pair in that bucket
                        break  # each pair goes into exactly one bucket
                source, target = source_file.readline(), target_file.readline()  # read the next pair of lines
    return data_set

model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_encode_size, target_vocab_size=vocabulary_decode_size,
                                   buckets=buckets, size=layer_size, num_layers=num_layers, max_gradient_norm=5.0,
                                   batch_size=batch_size, learning_rate=0.5, learning_rate_decay_factor=0.97,
                                   forward_only=False)

config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'  # use the BFC allocator to help avoid out-of-memory errors

with tf.Session(config=config) as sess:
    # restore a previous training run if a checkpoint exists
    ckpt = tf.train.get_checkpoint_state('.')
    if ckpt != None:
        print(ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    train_set = read_data(train_encode_vec, train_decode_vec)
    test_set = read_data(test_encode_vec, test_decode_vec)

    train_bucket_sizes = [len(train_set[b]) for b in range(len(buckets))]  # number of training pairs per bucket
    train_total_size = float(sum(train_bucket_sizes))                      # total number of training pairs
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))]  # cumulative fraction per bucket, used for sampling

    loss = 0.0
    total_step = 0
    previous_losses = []
    # train indefinitely, saving the model every 500 steps
    while True:
        random_number_01 = np.random.random_sample()  # draw a random number in [0, 1)
        # pick the first bucket whose cumulative fraction exceeds the random number
        bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01])

        encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
        # get_batch() pads the inputs to the bucket's encoder/decoder sizes
        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)  # one training step

        loss += step_loss / 500
        total_step += 1
        print(total_step)
        if total_step % 500 == 0:
            print(model.global_step.eval(), model.learning_rate.eval(), loss)
            # if the loss has not improved over the last three checkpoints, decay the learning rate
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(loss)
            # save the model
            checkpoint_path = "chatbot_seq2seq.ckpt"
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            # the saved file is named "%s-%s" % (checkpoint_path, global_step)
            loss = 0.0  # reset the running loss
            # evaluate the model on the test data
            for bucket_id in range(len(buckets)):
                if len(test_set[bucket_id]) == 0:
                    continue
                # get encoder_inputs, decoder_inputs, target_weights for this bucket
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(test_set, bucket_id)
                # evaluate the loss on one batch from this bucket (forward only)
                _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
                eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                print(bucket_id, eval_ppx)  # print the bucket id and its perplexity

This stage is best run on a GPU; otherwise training takes a very long time.
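If you are unsure whether TensorFlow can actually see your GPU, one quick check (a throwaway diagnostic, not part of the chatbot code) is to run a tiny op with device placement logging enabled and look for gpu:0 in the log:

#coding=utf-8
import tensorflow as tf

# Run a small matmul with device placement logging; ops reported on
# "/gpu:0" (or "/device:GPU:0" in newer versions) confirm the GPU is used.
a = tf.constant([[1.0, 2.0]], name='a')
b = tf.constant([[3.0], [4.0]], name='b')
c = tf.matmul(a, b)

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(sess.run(c))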

4) Using the trained model

                 

#coding=utf-8
import tensorflow as tf  # 0.12
from tensorflow.models.rnn.translate import seq2seq_model  # same seq2seq model as used for training
import os
import numpy as np

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# vocabulary file paths
train_encode_vocabulary = 'train_encode_vocabulary'
train_decode_vocabulary = 'train_decode_vocabulary'

# read a vocabulary file
def read_vocabulary(input_file):
    tmp_vocab = []
    with open(input_file, "r") as f:
        tmp_vocab.extend(f.readlines())            # read all lines of the vocabulary file
    tmp_vocab = [line.strip() for line in tmp_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    return vocab, tmp_vocab                        # return (character -> id dict, id -> character list)

vocab_en, _ = read_vocabulary(train_encode_vocabulary)  # encoder vocabulary: character -> id
_, vocab_de = read_vocabulary(train_decode_vocabulary)  # decoder vocabulary: id -> character

# vocabulary size is 5000
vocabulary_encode_size = 5000
vocabulary_decode_size = 5000

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
layer_size = 256  # units per layer
num_layers = 3    # number of layers
batch_size = 1

model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_encode_size, target_vocab_size=vocabulary_decode_size,
                                   buckets=buckets, size=layer_size, num_layers=num_layers, max_gradient_norm=5.0,
                                   batch_size=batch_size, learning_rate=0.5, learning_rate_decay_factor=0.99,
                                   forward_only=True)
# model notes: source/target vocabulary sizes = vocabulary_encode_size / vocabulary_decode_size;
# batch_size is the batch size used per step; forward_only=True runs only the forward pass, no gradients
model.batch_size = 1

with tf.Session() as sess:
    # restore the trained model
    ckpt = tf.train.get_checkpoint_state('.')  # look up the checkpoint state in the current directory
    # if a checkpoint exists, print its path and load it
    if ckpt != None:
        print(ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)  # load the saved parameters
    else:
        print('Model not found')

    # interactive loop to try out the model
    while True:
        input_string = input('me > ')
        # quit the loop
        if input_string == 'quit':
            exit()

        # convert the input string to a list of vocabulary ids
        input_string_vec = []
        for words in input_string.strip():
            input_string_vec.append(vocab_en.get(words, UNK_ID))  # characters not in the vocabulary map to UNK_ID

        # pick the smallest bucket whose encoder size can hold the input
        bucket_id = min([b for b in range(len(buckets)) if buckets[b][0] > len(input_string_vec)])
        encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(input_string_vec, [])]}, bucket_id)
        # get_batch(A, B): A is a dict keyed by bucket id; it returns encoder_inputs, decoder_inputs, target_weights for bucket B
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)

        # greedy decoding: take the argmax over the vocabulary at each output position
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # truncate the output at the first end-of-sequence token
        if EOS_ID in outputs:
            outputs = outputs[:outputs.index(EOS_ID)]
        # map ids back to characters with the decoder vocabulary and join them into the reply
        response = "".join([tf.compat.as_str(vocab_de[output]) for output in outputs])
        print('AI > ' + response)
Results: the sample conversation output is not included in this repost; see the original post linked below.

Reposted from: http://blog.topspeedsnail.com/archives/10735
