
Sentiment Analysis with Keras (Part 1)

TextRNN

The TextRNN model embeds each word, runs the sequence through an LSTM, and classifies from the final hidden state with a single sigmoid unit.

# -*- coding: utf-8 -*-
# @Time : 2019/7/4 9:56
# @Author : hejipei
# @File : keras_sentiment_TextRNN.py
from keras import Input, Model
from keras.layers import Embedding, Dense, CuDNNLSTM


class TextRNN(object):
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        input = Input((self.maxlen,))
        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
        x = CuDNNLSTM(128)(embedding)  # LSTM or GRU; CuDNNLSTM needs a GPU, use LSTM(128) on CPU
        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=input, outputs=output)
        return model
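
# Training script: load the IMDB reviews (already encoded as integer word
# indices), pad every review to maxlen tokens, and train with early stopping
# on validation accuracy.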
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
# from text_rnn import TextRNN

max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = TextRNN(maxlen, max_features, embedding_dims).get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)
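
Since the network ends in a single sigmoid unit, result holds per-review probabilities of the positive class. A minimal sketch (not part of the original script) of turning them into hard labels and checking accuracy by hand:

import numpy as np

y_pred = (result > 0.5).astype(int).ravel()  # threshold the sigmoid outputs at 0.5
print('Manual test accuracy:', (y_pred == y_test).mean())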

FastText

FastText averages the embeddings of all tokens with GlobalAveragePooling1D and classifies with a single dense layer; appending n-gram ids to each sequence restores some word-order information that the averaging otherwise discards.

# -*- coding: utf-8 -*-
# @Time : 2019/7/2 18:48
# @Author : hejipei
# @File : keras_sentiment_FastText.py
import numpy as np
from keras.preprocessing import sequence
from keras import Input, Model
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from keras.callbacks import EarlyStopping
from keras.datasets import imdb


class FastText(object):
    def __init__(self, maxlen, max_features, embedding_dims, class_num=1, last_activation='sigmoid'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation  # was hard-coded to 'sigmoid', ignoring the argument

    def get_model(self):
        input = Input((self.maxlen,))
        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
        x = GlobalAveragePooling1D()(embedding)
        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=input, outputs=output)
        return model
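
# n-gram helpers: create_ngram_set extracts the set of n-grams occurring in
# one sequence; add_ngram appends the integer id of every known n-gram to each
# sequence, so the embedding layer learns a vector per n-gram as well as per word.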
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(1, 4), (4, 1), (4, 9), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram ids.

    Example: adding bi-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences
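
# Training script. With ngram_range = 1 (set below) no n-gram features are
# added and the model is a plain bag-of-words average; setting ngram_range = 2
# appends bi-gram ids, enlarging the vocabulary but usually improving accuracy.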
# Set parameters:
# ngram_range = 2 will add bi-gram features
ngram_range = 1
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create a set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping each n-gram to a unique integer. Integer values are
    # greater than max_features in order to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is now the highest integer that can occur in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augment x_train and x_test with n-gram features.
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = FastText(maxlen, max_features, embedding_dims).get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)
# print('Test...')
# result = model.predict(x_test)
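
To score raw text with the trained model, a review must be encoded with the same word index that imdb.load_data uses. A minimal sketch, assuming the default index offset of 3 (0 = padding, 1 = start marker, 2 = out-of-vocabulary) and ngram_range = 1; the sample sentence is only an illustration:

word_index = imdb.get_word_index()

def encode_sentence(text, num_words=5000):
    # Shift each word's IMDB index by 3, as imdb.load_data does by default;
    # unknown or out-of-range words map to 2 (OOV). With ngram_range > 1 the
    # sequence would also need add_ngram applied before padding.
    ids = [1]  # start marker
    for w in text.lower().split():
        idx = word_index.get(w, -1) + 3
        ids.append(idx if 2 < idx < num_words else 2)
    return ids

sample = sequence.pad_sequences([encode_sentence('this movie was great fun')], maxlen=maxlen)
print('Positive probability:', model.predict(sample)[0][0])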
