
NLP with word2vec + TensorFlow

Implementing word2vec with TensorFlow

Contents

Introduction:

Building context/target-word pairs to learn word vectors

Model 1:

Model 2:

Prediction:

Introduction:

Word2Vec is a technique for turning text into vector representations. It is a neural-network model proposed by a Google team in 2013. Word2Vec represents each word as a vector in a high-dimensional space such that words with similar meanings end up close to one another. These vectors can be used for many natural language processing tasks, such as semantic-similarity computation, text classification, and named-entity recognition. The core idea is to learn word vectors by predicting a word from its context, or the context from a word. Concretely, Word2Vec trains a neural network with either the continuous bag-of-words (CBOW) model or the skip-gram model and reads the word vectors off the trained weights. The resulting vectors capture semantic and syntactic relationships between words, which makes them easy to process and compare computationally. Word2Vec has become a standard tool in NLP and is widely used in text analysis and semantic-understanding tasks.
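Before building everything by hand, note that the same skip-gram idea is available off the shelf. Purely for reference, here is a minimal sketch using the gensim library (an assumption here: gensim >= 4.0 is installed; the rest of this article does not use gensim and instead builds the model manually in TensorFlow):

# Illustrative only; not part of the TensorFlow walkthrough below.
from gensim.models import Word2Vec

toy_sentences = [['king', 'strong', 'man'],
                 ['queen', 'wise', 'woman']]
model = Word2Vec(toy_sentences, vector_size=2, window=2, min_count=1, sg=1)  # sg=1 selects skip-gram
print(model.wv['king'])                          # 2-dimensional vector for 'king'
print(model.wv.most_similar('king', topn=2))     # nearest neighbors by cosine similarity

The remainder of the article constructs the training pairs, the one-hot encodings, and a two-dimensional embedding explicitly, first with the low-level TensorFlow 1.x API and then with Keras.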

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Dataset: 10 sentences to create word vectors
corpus = ['king is a strong man',
          'queen is a wise woman',
          'boy is a young man',
          'girl is a young woman',
          'prince is a young king',
          'princess is a young queen',
          'man is strong',
          'woman is pretty',
          'prince is a boy will be king',
          'princess is a girl will be queen']

# Remove stop words
def remove_stop_words(corpus):
    stop_words = ['is', 'a', 'will', 'be']
    results = []
    for text in corpus:
        tmp = text.split(' ')
        for stop_word in stop_words:
            if stop_word in tmp:
                tmp.remove(stop_word)
        results.append(" ".join(tmp))
    return results

corpus = remove_stop_words(corpus)
corpus
'''Result:
['king strong man',
 'queen wise woman',
 'boy young man',
 'girl young woman',
 'prince young king',
 'princess young queen',
 'man strong',
 'woman pretty',
 'prince boy king',
 'princess girl queen']
'''

Building context/target-word pairs to learn word vectors

words = []
for text in corpus:
    for word in text.split(' '):
        words.append(word)
words = set(words)

word2int = {}
for i, word in enumerate(words):
    word2int[word] = i
print(word2int)
'''Result:
{'strong': 0,
 'wise': 1,
 'man': 2,
 'boy': 3,
 'queen': 4,
 'king': 5,
 'princess': 6,
 'young': 7,
 'woman': 8,
 'pretty': 9,
 'prince': 10,
 'girl': 11}
'''

sentences = []
for sentence in corpus:
    sentences.append(sentence.split())
print(sentences)

WINDOW_SIZE = 2  # context window: up to 2 words on each side of the center word

data = []
for sentence in sentences:
    for idx, word in enumerate(sentence):
        for neighbor in sentence[max(idx - WINDOW_SIZE, 0) : min(idx + WINDOW_SIZE, len(sentence)) + 1]:
            if neighbor != word:
                data.append([word, neighbor])
print(data)
data
'''Result:
[['king', 'strong'],
 ['king', 'man'],
 ['strong', 'king'],
 ['strong', 'man'],
 ['man', 'king'],
 ['man', 'strong'],
 ['queen', 'wise'],
 ['queen', 'woman'],
 ['wise', 'queen'],
 ['wise', 'woman'],
 ['woman', 'queen'],
 ['woman', 'wise'],
 ['boy', 'young'],
 ['boy', 'man'],
 ['young', 'boy'],
 ['young', 'man'],
 ['man', 'boy'],
 ['man', 'young'],
 ['girl', 'young'],
 ['girl', 'woman'],
 ['young', 'girl'],
 ['young', 'woman'],
 ['woman', 'girl'],
 ['woman', 'young'],
 ['prince', 'young'],
 ['prince', 'king'],
 ['young', 'prince'],
 ['young', 'king'],
 ['king', 'prince'],
 ['king', 'young'],
 ['princess', 'young'],
 ['princess', 'queen'],
 ['young', 'princess'],
 ['young', 'queen'],
 ['queen', 'princess'],
 ['queen', 'young'],
 ['man', 'strong'],
 ['strong', 'man'],
 ['woman', 'pretty'],
 ['pretty', 'woman'],
 ['prince', 'boy'],
 ['prince', 'king'],
 ['boy', 'prince'],
 ['boy', 'king'],
 ['king', 'prince'],
 ['king', 'boy'],
 ['princess', 'girl'],
 ['princess', 'queen'],
 ['girl', 'princess'],
 ['girl', 'queen'],
 ['queen', 'princess'],
 ['queen', 'girl']]
'''

Building X and Y

import pandas as pd

for text in corpus:
    print(text)

df = pd.DataFrame(data, columns=['input', 'label'])
word2int

# Define the TensorFlow graph
import tensorflow as tf
import numpy as np

ONE_HOT_DIM = len(words)

# function to convert a word index to a one-hot vector
def to_one_hot_encoding(data_point_index):
    one_hot_encoding = np.zeros(ONE_HOT_DIM)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding

X = []  # input word
Y = []  # target word
for x, y in zip(df['input'], df['label']):
    X.append(to_one_hot_encoding(word2int[x]))
    Y.append(to_one_hot_encoding(word2int[y]))

# convert them to numpy arrays
X_train = np.asarray(X)
Y_train = np.asarray(Y)

Model 1:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# placeholders for X_train and Y_train
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

# word embeddings will be 2-dimensional so they can be visualized in 2D
EMBEDDING_DIM = 2

# hidden layer: this will eventually hold the word vectors
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1]))  # bias
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# output layer
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_layer, W2), b2))

# loss function: cross entropy
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=[1]))

# training operation
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

iteration = 20000
for i in range(iteration):
    # input is X_train, the one-hot encoded center word
    # label is Y_train, the one-hot encoded neighbor word
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    if i % 3000 == 0:
        print('iteration ' + str(i) + ' loss is : ',
              sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

# Now the hidden layer (W1 + b1) is effectively the word lookup table
vectors = sess.run(W1 + b1)
print(vectors)

import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (10, 10)
fig, ax = plt.subplots()
# row i of `vectors` corresponds to the word with index i in word2int,
# which is also the iteration order of the `words` set
for word, x1, x2 in zip(words, vectors[:, 0], vectors[:, 1]):
    ax.annotate(word, (x1, x2))

PADDING = 1.0
x_axis_min = np.amin(vectors, axis=0)[0] - PADDING
y_axis_min = np.amin(vectors, axis=0)[1] - PADDING
x_axis_max = np.amax(vectors, axis=0)[0] + PADDING
y_axis_max = np.amax(vectors, axis=0)[1] + PADDING
plt.xlim(x_axis_min, x_axis_max)
plt.ylim(y_axis_min, y_axis_max)
plt.show()

Model 2:

# Deep learning version with Keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Defining the size of the embedding
embed_size = 2

# Defining the neural network
xx = Input(shape=(X_train.shape[1],))
yy = Dense(units=embed_size, activation='linear')(xx)
yy = Dense(units=Y_train.shape[1], activation='softmax')(yy)
model = Model(inputs=xx, outputs=yy)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=256,
    epochs=1000
)

# Obtaining the weights of the first Dense layer from the network.
# These are the so-called word embeddings.
weights = model.get_weights()[0]

# Creating a dictionary to store the embeddings in. The key is a unique word
# and the value is its numeric vector.
embedding_dict = {}
for word in words:
    embedding_dict.update({
        word: weights[word2int[word]]
    })

import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (10, 10)
fig, ax = plt.subplots()
# row i of `weights` corresponds to the word whose one-hot index is i
for word, x1, x2 in zip(words, weights[:, 0], weights[:, 1]):
    ax.annotate(word, (x1, x2))

PADDING = 1.0
x_axis_min = np.amin(weights, axis=0)[0] - PADDING
y_axis_min = np.amin(weights, axis=0)[1] - PADDING
x_axis_max = np.amax(weights, axis=0)[0] + PADDING
y_axis_max = np.amax(weights, axis=0)[1] + PADDING
plt.xlim(x_axis_min, x_axis_max)
plt.ylim(y_axis_min, y_axis_max)
plt.show()

Prediction:

X_train[2]
# Result: array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) -> the one-hot vector for 'strong'
model.predict(X_train)[2]
'''Result:
array([0.07919139, 0.0019384 , 0.48794392, 0.05578128, 0.00650001,
       0.10083131, 0.02451131, 0.03198219, 0.04424168, 0.0013569 ,
       0.16189449, 0.00382716], dtype=float32)
The highest probability is at index 2, which maps to 'man', so the predicted neighbor is 'man'.
'''
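Since the point of the embeddings is that semantically related words end up close together, a natural follow-up is to query nearest neighbors with cosine similarity. A minimal sketch using the embedding_dict built above (the helpers cosine_similarity and most_similar are written here for illustration, not part of any library):

import numpy as np

def cosine_similarity(a, b):
    # cosine of the angle between two embedding vectors
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def most_similar(query, embedding_dict, topn=3):
    # rank all other words by cosine similarity to the query word
    q = embedding_dict[query]
    scores = [(w, cosine_similarity(q, v))
              for w, v in embedding_dict.items() if w != query]
    return sorted(scores, key=lambda item: item[1], reverse=True)[:topn]

print(most_similar('king', embedding_dict))

With only ten toy sentences and 2-dimensional vectors the neighbors will be noisy, but related words such as 'king', 'prince', and 'man' should tend to rank near each other.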
