赞
踩
用 Python 编写代码,将 HarryPorter 电子书作为语料库,分别使用词袋模型、TF-IDF 模型和 Word2Vec 模型进行文本向量化。
1. 首先对数据进行预处理(语料库文件地址为 /Users/zhengyawen/Downloads/HarryPorter.txt);Word2Vec 训练时要求考虑每个单词前后的五个词汇作为其上下文,生成的向量维度为 50 维
2. 分别搜索与 courtroom 和 wizard 这两个词语义最近的 5 个单词
3.对wizard 和witch 这两个单词在二维平面上进行可视化
# Vectorize the HarryPorter e-book with three models: bag-of-words, TF-IDF
# and Word2Vec. The script then prints the 5 nearest neighbours of
# "courtroom" and "wizard" and plots "wizard" and "witch" on a 2-D plane.
import matplotlib.pyplot as plt
import nltk
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizer models and the stop-word list must be present before first use.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases this request simply reports an unknown resource.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words, kept in a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# Load the whole corpus into memory.
corpus_file = '/Users/zhengyawen/Downloads/HarryPorter.txt'
with open(corpus_file, 'r', encoding='utf-8') as file:
    data = file.read()

# Pre-processing.
# Split into sentences with NLTK's sentence tokenizer rather than on '.':
# a plain split breaks on abbreviations such as "Mr." / "Mrs." and ignores
# '?' and '!'. Segmentation runs on the original casing; lower-casing is
# applied afterwards, per sentence.
sentences = [word_tokenize(sentence.lower()) for sentence in sent_tokenize(data)]
preprocessed_sentences = []
for sentence in sentences:
    # Keep purely alphabetic tokens that are not stop words.
    valid_words = [
        word for word in sentence
        if word.isalpha() and word not in stop_words
    ]
    # Skip sentences that end up empty so they do not become empty documents.
    if valid_words:
        preprocessed_sentences.append(valid_words)

# Word2Vec model: 50-dimensional vectors, a context window of 5 words on each
# side, CBOW (sg=0), and min_count=1 so that every word gets a vector.
w2v_model = Word2Vec(
    sentences=preprocessed_sentences,
    vector_size=50,
    window=5,
    min_count=1,
    sg=0,
)

# Look up the word vectors (raises KeyError if a word is not in the corpus).
vector_courtroom = w2v_model.wv['courtroom']
vector_wizard = w2v_model.wv['wizard']

# The 5 most similar words to "courtroom" and "wizard".
similar_words_courtroom = w2v_model.wv.most_similar('courtroom', topn=5)
similar_words_wizard = w2v_model.wv.most_similar('wizard', topn=5)

print("Word2Vec模型:")
print("单词 courtroom 的向量:", vector_courtroom)
print("单词 wizard 的向量:", vector_wizard)
print("语义最近的5个单词 (courtroom):")
for word, similarity in similar_words_courtroom:
    print(f"{word}: {similarity}")

print("\n语义最近的5个单词 (wizard):")
for word, similarity in similar_words_wizard:
    print(f"{word}: {similarity}")

# Bag-of-words model: each sentence becomes a list of (token_id, count) pairs.
dictionary = Dictionary(preprocessed_sentences)
corpus = [dictionary.doc2bow(sentence) for sentence in preprocessed_sentences]
# TF-IDF model: re-weights the bag-of-words counts.
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

# Visualize "wizard" and "witch" from the Word2Vec model in two dimensions.
words_to_plot = ['wizard', 'witch']

# Project the 50-D vectors onto the two leading principal components of the
# whole vocabulary (PCA via SVD). Plotting the first two raw coordinates would
# show an arbitrary slice of the embedding space.
all_vectors = w2v_model.wv.vectors
mean_vector = all_vectors.mean(axis=0)
_, _, components = np.linalg.svd(all_vectors - mean_vector, full_matrices=False)
projection = components[:2].T  # shape: (vector_size, 2)
word_vectors = [
    (w2v_model.wv[word] - mean_vector) @ projection for word in words_to_plot
]

# Plot one labelled point per word.
plt.figure(figsize=(10, 6))
for word, vector in zip(words_to_plot, word_vectors):
    plt.scatter(vector[0], vector[1], label=word)

plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('Visualization of Word Vectors')
plt.legend()
plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。