
TextCNN multi-class classification with Keras: a worked example

Training TextCNN for multi-class classification

Source code: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py

This is a classic example; I have added some comments and code that saves the model during training.

Both the pre-trained word vectors and the dataset are publicly available (you can find them by searching for their names; download links are given below).

Pre-trained word vectors (GloVe): http://nlp.stanford.edu/data/glove.6B.zip

Dataset (20 Newsgroups): http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
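If you prefer to fetch the data from a script, the following is a minimal sketch (my own addition, not part of the original example) that downloads and unzips the GloVe archive using only the Python standard library. The 20 Newsgroups archive linked on the page above has to be downloaded and extracted into a 20_newsgroup/ directory in the same way.

import os
import urllib.request
import zipfile

GLOVE_URL = 'http://nlp.stanford.edu/data/glove.6B.zip'

# download the large (~800 MB) GloVe archive once and unzip it into glove.6B/,
# which is the directory the training script below expects
if not os.path.exists('glove.6B'):
    urllib.request.urlretrieve(GLOVE_URL, 'glove.6B.zip')
    with zipfile.ZipFile('glove.6B.zip') as z:
        z.extractall('glove.6B')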

from __future__ import print_function

import os
import sys

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint

BASE_DIR = ''
# path to the GloVe vectors
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
# path to the text corpus
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# first, build index mapping words in the embeddings set
# to their embedding vector
# 1. load the GloVe vectors and build a word -> vector dictionary
print('Indexing word vectors.')

# parse every line of the GloVe file into a word and its vector and store them in a dict
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
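# (optional sanity check added here, not part of the original example)
# every value in embeddings_index is a 100-dimensional float32 vector,
# so any lowercase English word can be looked up directly:
vec = embeddings_index.get('computer')   # assumed to be in the 400k-word GloVe vocabulary
if vec is not None:
    print(vec.shape)   # (100,) for the glove.6B.100d vectors
    print(vec[:5])     # first few components of the vector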
# second, prepare text samples and their labels
print('Processing text dataset')

# walk through every sub-directory of the corpus; each directory is one news
# category, so we collect its texts and assign it a numeric class label
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)
# print(texts[0], labels[0])

print('Found %s texts.' % len(texts))
# next, turn the news samples into the tensors the network is trained on, using
# keras.preprocessing.text.Tokenizer and keras.preprocessing.sequence.pad_sequences
# finally, vectorize the text samples into a 2D integer tensor
# A network cannot work with raw text: the Tokenizer maps every word (for Chinese
# text, a single character or a segmented word counts as one word) to a positive
# integer id, so each document becomes a sequence of integers.
# See also: https://www.jianshu.com/p/ac721387fe48
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# For simplicity, Keras only accepts batches of sequences that all have the same
# length, so pad_sequences() pads or truncates every sequence to MAX_SEQUENCE_LENGTH.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)
# data now has shape (number of texts, MAX_SEQUENCE_LENGTH)

# to_categorical converts the integer class vector into a one-hot (0/1) matrix
labels = to_categorical(np.asarray(labels))
print(labels)
# labels now has shape (number of texts, number of classes)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
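To make the three preprocessing calls above more concrete, here is a tiny standalone example on toy sentences of my own (not from the dataset), separate from the script, showing how Tokenizer, pad_sequences and to_categorical transform a handful of documents:

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

toy_texts = ['the cat sat on the mat', 'the dog barked', 'cats and dogs']
toy_labels = [0, 0, 1]

tok = Tokenizer(num_words=50)
tok.fit_on_texts(toy_texts)
# every word gets an integer id (most frequent word = 1), e.g.
# [[1, 2, 3, 4, 1, 5], [1, 6, 7], [8, 9, 10]]
print(tok.texts_to_sequences(toy_texts))
# zero-pad on the left so that every row has length 6
print(pad_sequences(tok.texts_to_sequences(toy_texts), maxlen=6))
# one-hot encode the integer labels into a (3, 2) matrix
print(to_categorical(np.asarray(toy_labels)))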
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
# print(indices)
# np.random.shuffle(x) shuffles the array in place, like shuffling a deck of cards
np.random.shuffle(indices)
data = data[indices]
print(data)
labels = labels[indices]
print(labels)
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
print(data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
# use the GloVe dictionary to build the embedding matrix for our own vocabulary
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        # copy the pre-trained vector into the row belonging to this word
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
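# (optional check added here, not part of the original example)
# words that GloVe does not know keep an all-zero row in embedding_matrix,
# so counting the non-zero rows shows how much of our vocabulary is covered
# (row 0 is never used because Tokenizer indices start at 1):
covered = int(np.count_nonzero(np.any(embedding_matrix != 0, axis=1)))
print('%d / %d words have a pre-trained vector' % (covered, num_words))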
print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# checkpoint: every time validation accuracy improves, save the model to a file
filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                             save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# note: the callback list must be passed to fit(), otherwise no checkpoints are written
model.fit(x_train, y_train,
          batch_size=128,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=callbacks_list)
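After training, the ModelCheckpoint callback leaves the best model in an .hdf5 file. A minimal sketch for loading that file and classifying a new document might look like the following; it reuses the tokenizer, labels_index and MAX_SEQUENCE_LENGTH defined above, and the checkpoint filename and sample sentence are placeholders of mine, not output of the original script.

from keras.models import load_model

# reload the best checkpoint written during training (substitute the real filename)
best_model = load_model('weights-improvement-02-0.70.hdf5')

# new raw text must go through the same tokenizer and padding as the training data
new_doc = ['The graphics card renders 3D scenes much faster than the old one.']
new_seq = pad_sequences(tokenizer.texts_to_sequences(new_doc), maxlen=MAX_SEQUENCE_LENGTH)

probs = best_model.predict(new_seq)          # shape: (1, number of classes)
pred_id = int(np.argmax(probs, axis=1)[0])
# map the numeric id back to the newsgroup directory name
id_to_label = {v: k for k, v in labels_index.items()}
print(id_to_label[pred_id], probs[0][pred_id])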

 
