Word2Vec is a technique for turning text into vector representations. It is a neural network model proposed by a Google team in 2013. Word2Vec represents each word as a vector in a high-dimensional space so that words with similar meanings end up close to each other. These vectors can be used for many natural language processing tasks, such as semantic similarity computation, text classification, and named entity recognition. The core idea is to learn word vectors by predicting a word's context, or by predicting a target word from its context. Concretely, Word2Vec trains a neural network with either the continuous bag-of-words (CBOW) model or the skip-gram model to obtain the vector representations. The resulting vectors capture semantic and syntactic relationships between words, which makes words much easier for a computer to process and compare. Word2Vec has become a standard tool in NLP and is widely used in text analysis and semantic understanding tasks.
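Before building one by hand, it helps to see what the same idea looks like with an off-the-shelf library. Below is a minimal sketch using gensim (an assumption on my part: gensim 4.x is installed; older versions used size= instead of vector_size=). The sg flag switches between the two training modes mentioned above.

# Minimal sketch, assuming gensim 4.x is available (pip install gensim);
# this is not part of the hand-built model that follows.
from gensim.models import Word2Vec

toy_sentences = [['king', 'strong', 'man'],
                 ['queen', 'wise', 'woman']]

# sg=1 selects skip-gram, sg=0 selects CBOW
w2v = Word2Vec(toy_sentences, vector_size=10, window=2, min_count=1, sg=1)

print(w2v.wv['king'])                      # the learned vector for 'king'
print(w2v.wv.similarity('king', 'queen'))  # cosine similarity of two words

The rest of this post builds the skip-gram version from scratch, so every step of that pipeline is visible.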
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Dataset: 10 sentences to create word vectors from

corpus = ['king is a strong man',
          'queen is a wise woman',
          'boy is a young man',
          'girl is a young woman',
          'prince is a young king',
          'princess is a young queen',
          'man is strong',
          'woman is pretty',
          'prince is a boy will be king',
          'princess is a girl will be queen']

# Remove stop words

def remove_stop_words(corpus):
    stop_words = ['is', 'a', 'will', 'be']
    results = []
    for text in corpus:
        tmp = text.split(' ')
        for stop_word in stop_words:
            if stop_word in tmp:
                tmp.remove(stop_word)
        results.append(" ".join(tmp))

    return results

corpus = remove_stop_words(corpus)

corpus

'''Result:
['king strong man',
 'queen wise woman',
 'boy young man',
 'girl young woman',
 'prince young king',
 'princess young queen',
 'man strong',
 'woman pretty',
 'prince boy king',
 'princess girl queen']
'''
words = []
for text in corpus:
    for word in text.split(' '):
        words.append(word)

words = set(words)

word2int = {}

for i, word in enumerate(words):
    word2int[word] = i
print(word2int)
'''Result (set iteration order can differ between runs):
{'strong': 0,
 'wise': 1,
 'man': 2,
 'boy': 3,
 'queen': 4,
 'king': 5,
 'princess': 6,
 'young': 7,
 'woman': 8,
 'pretty': 9,
 'prince': 10,
 'girl': 11}
'''
sentences = []
for sentence in corpus:
    sentences.append(sentence.split())
print(sentences)


WINDOW_SIZE = 2  # words up to 2 positions away count as neighbors

data = []
for sentence in sentences:
    for idx, word in enumerate(sentence):
        for neighbor in sentence[max(idx - WINDOW_SIZE, 0): min(idx + WINDOW_SIZE, len(sentence)) + 1]:
            if neighbor != word:
                data.append([word, neighbor])

data
'''Result:
[['king', 'strong'],
 ['king', 'man'],
 ['strong', 'king'],
 ['strong', 'man'],
 ['man', 'king'],
 ['man', 'strong'],
 ['queen', 'wise'],
 ['queen', 'woman'],
 ['wise', 'queen'],
 ['wise', 'woman'],
 ['woman', 'queen'],
 ['woman', 'wise'],
 ['boy', 'young'],
 ['boy', 'man'],
 ['young', 'boy'],
 ['young', 'man'],
 ['man', 'boy'],
 ['man', 'young'],
 ['girl', 'young'],
 ['girl', 'woman'],
 ['young', 'girl'],
 ['young', 'woman'],
 ['woman', 'girl'],
 ['woman', 'young'],
 ['prince', 'young'],
 ['prince', 'king'],
 ['young', 'prince'],
 ['young', 'king'],
 ['king', 'prince'],
 ['king', 'young'],
 ['princess', 'young'],
 ['princess', 'queen'],
 ['young', 'princess'],
 ['young', 'queen'],
 ['queen', 'princess'],
 ['queen', 'young'],
 ['man', 'strong'],
 ['strong', 'man'],
 ['woman', 'pretty'],
 ['pretty', 'woman'],
 ['prince', 'boy'],
 ['prince', 'king'],
 ['boy', 'prince'],
 ['boy', 'king'],
 ['king', 'prince'],
 ['king', 'boy'],
 ['princess', 'girl'],
 ['princess', 'queen'],
 ['girl', 'princess'],
 ['girl', 'queen'],
 ['queen', 'princess'],
 ['queen', 'girl']]
'''
Build the training inputs X and labels Y
import pandas as pd
for text in corpus:
    print(text)

df = pd.DataFrame(data, columns=['input', 'label'])


word2int

# Define the TensorFlow graph (using the TF1-style compat.v1 API)
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

ONE_HOT_DIM = len(words)


# Function to convert a word index into a one-hot vector
def to_one_hot_encoding(data_point_index):
    one_hot_encoding = np.zeros(ONE_HOT_DIM)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding

X = []  # input word
Y = []  # target word

for x, y in zip(df['input'], df['label']):
    X.append(to_one_hot_encoding(word2int[x]))
    Y.append(to_one_hot_encoding(word2int[y]))

# Convert them to numpy arrays
X_train = np.asarray(X)
Y_train = np.asarray(Y)
# Placeholders for X_train and Y_train
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

# The word embedding will be 2-dimensional, for easy 2-d visualization
EMBEDDING_DIM = 2

# Hidden layer: this eventually becomes the word-vector lookup table
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1]))  # bias
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# Output layer
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_layer, W2), b2))

# Loss function: cross entropy
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=[1]))

# Training operation
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)


sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

iteration = 20000
for i in range(iteration):
    # input is X_train, the one-hot encoded word;
    # label is Y_train, the one-hot encoded neighbor word
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    if i % 3000 == 0:
        print('iteration ' + str(i) + ' loss is : ',
              sess.run(loss, feed_dict={x: X_train, y_label: Y_train}))

# Now the hidden layer (W1 + b1) is actually the word lookup table
vectors = sess.run(W1 + b1)
print(vectors)
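Since each row of vectors is the 2-d embedding of the word with the matching word2int index, the table can already be queried for similar words. A minimal sketch, not part of the original code (with only 2 dimensions and 10 sentences, the neighbors are rough at best):

# Sketch: rank all words by cosine similarity to 'king' using the learned vectors
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

king_vec = vectors[word2int['king']]
ranked = sorted(words,
                key=lambda w: cosine_similarity(vectors[word2int[w]], king_vec),
                reverse=True)
print(ranked[:4])  # 'king' itself ranks first; its nearest neighbors follow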
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))

# Row i of `vectors` belongs to the word with index i in word2int,
# and word2int was built by enumerating `words`, so the orders match.
for word, x1, x2 in zip(words, vectors[:, 0], vectors[:, 1]):
    ax.annotate(word, (x1, x2))

PADDING = 1.0
x_axis_min = np.amin(vectors, axis=0)[0] - PADDING
y_axis_min = np.amin(vectors, axis=0)[1] - PADDING
x_axis_max = np.amax(vectors, axis=0)[0] + PADDING
y_axis_max = np.amax(vectors, axis=0)[1] + PADDING

plt.xlim(x_axis_min, x_axis_max)
plt.ylim(y_axis_min, y_axis_max)

plt.show()
# Deep learning: the same model, built with Keras
# (Note: tf.disable_v2_behavior() above puts TensorFlow into graph mode;
#  if this part fails because of that, run it in a fresh session instead.)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Defining the size of the embedding
embed_size = 2

# Defining the neural network: one-hot input -> linear embedding -> softmax output
xx = Input(shape=(X_train.shape[1],))
yy = Dense(units=embed_size, activation='linear')(xx)
yy = Dense(units=Y_train.shape[1], activation='softmax')(yy)
model = Model(inputs=xx, outputs=yy)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=256,
    epochs=1000
)

# Obtaining the weights of the input layer from the trained network.
# These are the so-called word embeddings.
weights = model.get_weights()[0]

# Creating a dictionary to store the embeddings in. The key is a unique word
# and the value is the numeric vector.
embedding_dict = {}
for word in words:
    embedding_dict.update({
        word: weights[word2int[word]]
    })
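With embedding_dict built, one can also try the classic analogy check king - man + woman ≈ queen. Treat the sketch below as a demo only: a 2-d embedding trained on 10 sentences gives no guarantee that 'queen' actually wins. It reuses the cosine_similarity helper defined earlier.

# Sketch: find the word closest to the analogy vector king - man + woman
target = embedding_dict['king'] - embedding_dict['man'] + embedding_dict['woman']

candidates = [w for w in words if w not in ('king', 'man', 'woman')]
best = max(candidates, key=lambda w: cosine_similarity(embedding_dict[w], target))
print(best)  # ideally 'queen', but not guaranteed at this scale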
fig, ax = plt.subplots(figsize=(10, 10))

for word, x1, x2 in zip(words, weights[:, 0], weights[:, 1]):
    ax.annotate(word, (x1, x2))

PADDING = 1.0
x_axis_min = np.amin(weights, axis=0)[0] - PADDING
y_axis_min = np.amin(weights, axis=0)[1] - PADDING
x_axis_max = np.amax(weights, axis=0)[0] + PADDING
y_axis_max = np.amax(weights, axis=0)[1] + PADDING

plt.xlim(x_axis_min, x_axis_max)
plt.ylim(y_axis_min, y_axis_max)

plt.show()
X_train[2]
# Result: array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) -- the one-hot vector for 'strong'

model.predict(X_train)[2]
'''Result:
array([0.07919139, 0.0019384 , 0.48794392, 0.05578128, 0.00650001,
       0.10083131, 0.02451131, 0.03198219, 0.04424168, 0.0013569 ,
       0.16189449, 0.00382716], dtype=float32)
The highest probability is at index 2, i.e. the predicted neighbor is 'man'.
'''
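Rather than eyeballing the probability array, the prediction can be mapped back to a word with an inverse of word2int. A small sketch:

# Sketch: invert word2int and read off the most probable neighbor word
int2word = {i: w for w, i in word2int.items()}
pred_idx = int(np.argmax(model.predict(X_train)[2]))
print(int2word[pred_idx])  # 'man' for the run shown above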