Perplexity calculation and topic keyword extraction. Three files are needed: the unsegmented dataset, the user dictionary, and the stopword list. In file paths it is best to use / instead of \.
```python
import numpy as np
import pandas as pd
import re
import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

data = pd.read_table("数据分析师测试.txt")  # the first line of the file is the header "content"
dic_file = "dict.txt"       # user dictionary with POS tags, one entry per line, e.g. "牛马 n", "摸鱼 v"
stop_file = "stopword.txt"  # stopword list, one word per line


def chinese_word_cut(mytext):
    jieba.load_userdict(dic_file)
    jieba.initialize()
    try:
        stopword_list = open(stop_file, encoding='utf-8')
    except:
        stopword_list = []
        print("error in stop_file")

    stop_list = []
    flag_list = ['n', 'nz', 'vn']  # keep only nouns, proper nouns and verbal nouns
    for line in stopword_list:
        line = re.sub(u'\n|\\r', '', line)
        stop_list.append(line)

    word_list = []
    # segment the text with jieba (POS-tagged)
    seg_list = psg.cut(mytext)
    for seg_word in seg_list:
        word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)  # keep Chinese characters only
        # word = seg_word.word  # for English text, comment out the line above and enable this one
        find = 0
        for stop_word in stop_list:
            if stop_word == word or len(word) < 2:  # the word is a stopword or too short
                find = 1
                break
        if find == 0 and seg_word.flag in flag_list:
            word_list.append(word)
    return (" ").join(word_list)


data["content_cutted"] = data.content.apply(chinese_word_cut)
```
```python
def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword


n_features = 1000  # extract at most 1000 feature words
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(data.content_cutted)

n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                # doc_topic_prior=0.1,
                                # topic_word_prior=0.01,
                                random_state=0)
lda.fit(tf)

n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)
```
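pyLDAvis and pyLDAvis.sklearn are imported at the top but never used. With the fitted `lda`, the document-term matrix `tf` and `tf_vectorizer` already in scope, an interactive topic map can be produced roughly as follows; this is a sketch assuming pyLDAvis < 3.4.0, where the sklearn submodule still exists (newer releases renamed it to `pyLDAvis.lda_model`), and the output filename is my own choice:

```python
# Build the interactive pyLDAvis page from the fitted sklearn model
# and write it to an HTML file that can be opened in a browser.
vis = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(vis, "lda_sklearn.html")
```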
```python
# Assign each document to its most probable topic and export the result
topics = lda.transform(tf)
topic = []
for t in topics:
    topic.append("Topic #" + str(list(t).index(np.max(t))))
data['概率最大的主题序号'] = topic        # index of the most probable topic
data['每个主题对应概率'] = list(topics)   # full topic-probability vector
data.to_excel("data_topic.xlsx", index=False)

# Perplexity and log-likelihood for different numbers of topics
plexs = []
scores = []
n_max_topics = 16
for i in range(1, n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50, random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))
    scores.append(lda.score(tf))

n_t = 15  # right end of the plotted range; must not exceed n_max_topics - 1
x = list(range(1, n_t + 1))
plt.plot(x, plexs[0:n_t])
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.savefig('困惑度' + '.png')
print("perplexity plot saved")
plt.show()
```
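The loop above also collects the log-likelihood returned by `lda.score`, but only perplexity is plotted. A small sketch that plots both curves side by side, reusing `x`, `plexs`, `scores` and `n_t` from the code above (the layout and output filename are my own choice), can make the elbow easier to spot:

```python
# Plot log-likelihood next to perplexity; lower perplexity and higher
# log-likelihood both indicate a better fit on the training data.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(x, plexs[0:n_t])
ax1.set_xlabel("number of topics")
ax1.set_ylabel("perplexity")
ax2.plot(x, scores[0:n_t])
ax2.set_xlabel("number of topics")
ax2.set_ylabel("log-likelihood")
fig.tight_layout()
fig.savefig("topic_number_selection.png")
```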
Text segmentation plus word cloud code. The word cloud can be used to pick out meaningless words, which can then be added to the stopword list.
```python
# Note: the input file must be saved in UTF-8 encoding!
# Text segmentation
import jieba as jb
# import docx
import re

# stopWordsFile: path to the stopword list, fileName: raw text to process,
# writeFile: new file that will hold the preprocessed (segmented) text

# Load the stopwords and return them as a list
def loadStopWords(fileName):
    with open(fileName, encoding='utf-8', errors='ignore') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    custom_stopwords_list.extend(['二'])
    return custom_stopwords_list


# Segment a piece of text and remove stopwords, returning a list of words
def wordsCut(words, stopWordsFile):
    result = jb.cut(words)
    newWords = []
    stopWords = loadStopWords(stopWordsFile)
    for s in result:
        if s not in stopWords:
            newWords.append(s)
    return newWords


# Filter out empty strings and whitespace-only tokens
def not_empty(s):
    return s and s.strip()


# Segment the sample file line by line and write the result to writeFile
def fileCut(fileName, writeFile, stopWordsFile):
    dataMat = []
    fr = open(fileName, encoding='utf-8', errors='ignore')    # or gbk / gb18030
    frW = open(writeFile, 'w', encoding='utf-8', errors='ignore')
    for line in fr.readlines():          # read the file line by line
        curLine = line.strip()           # strip leading/trailing whitespace
        curLine1 = curLine.upper()       # convert English letters to upper case
        cutWords = wordsCut(curLine1, stopWordsFile)  # segment and remove stopwords for one line
        cutWords = list(filter(not_empty, cutWords))
        for i in range(len(cutWords)):
            if re.match(r'^\d.*', cutWords[i]):  # drop tokens that start with a digit
                cutWords[i] = ''
            else:
                frW.write(cutWords[i])
                frW.write(' ')
        frW.write('\n')
        dataMat.append(cutWords)
    frW.close()
    fr.close()


# stopWordsFile: stopword list, fileName: raw text, writeFile: output file for the segmented text
fileName = r'D:/Project/database/数据分析师测试.txt'
writeFile = r'D:/Project/database/数据分析师测试已分词.txt'
stopWordsFile = r'D:/Project/stopword.txt'
fileCut(fileName, writeFile, stopWordsFile)
```
```python
# Word cloud: complete code
import numpy as np
import wordcloud as wc
import matplotlib.pyplot as plt
from PIL import Image
import jieba

# 1. Read the segmented text into a single string
with open("D:/Project/岗位职责已分词.txt", mode="r", encoding='utf-8') as fp:
    text = fp.read()

# 2. Load the mask image that defines the shape of the cloud
mask = np.array(Image.open('D:/Project/小猫咪3.png'))

# 3. Specify the stopwords
stopwords = ['的', '了', '熟悉', '负责', '产品', '供应商', '新', '工程师', '岁', '关键字', '岗位职责',
             '以上学历', '专业', '年龄', '年', '熟练', '流程', '质量', '需求', '客户', '系统', '具备',
             '软件', '管理', '开发', '公司', '相关', '项目', '数据', '设计', '经验', '技术', '工作',
             '合作', '就是', '上班', '地址', '查看', '地图', '优先', 'AND', '职能', '类别', '任职']

# 4. Create the WordCloud object; stopwords are the words to exclude
word_cloud = wc.WordCloud(mask=mask,
                          font_path="C:/Windows/Fonts/SIMHEI.ttf",
                          stopwords=stopwords,
                          background_color='white')

# 5. Generate the word cloud from the text
word_cloud.generate(text)

# 6. Display and save the word cloud
plt.imshow(word_cloud)
plt.show()
word_cloud.to_file('小猫咪' + '.png')
print("Word cloud saved to 小猫咪.png")
```
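Rather than judging only by eye, the raw word frequencies behind the cloud can be listed to decide which high-frequency but uninformative words to move into the stopword list. A small sketch using `collections.Counter` on the same segmented file (the limit of 30 words is arbitrary):

```python
from collections import Counter

# Count token frequencies in the segmented file and print the most common ones;
# frequent but meaningless words are candidates for the stopword list.
with open("D:/Project/岗位职责已分词.txt", encoding="utf-8") as fp:
    tokens = fp.read().split()

for word, freq in Counter(tokens).most_common(30):
    print(word, freq)
```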
LDA visualization. Note that the dataset used here has already been segmented. When the code finishes running it prints a local link; open it to see the interactive LDA visualization page. num_topics is the number of topics.
```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import codecs
import pyLDAvis.gensim

if __name__ == '__main__':
    doc1 = codecs.open('D:/Project/database/华东已分词.txt', mode='r', encoding='utf-8')
    lines = doc1.readlines()
    all_set = []

    # Extra words to filter out on top of the stopword list
    listword = ['数据', '数据分析', '工作', '年', '强', '良好', '相关', '企业', '公司', '要求', '优先', '各类',
                '良好', '具备', '熟练', '熟悉', ' - ', '类', '提供', '优先', '具有', '进行', '要求', '能力',
                '分析', '负责', '经验', '任职', '完成', '专业', '活动', '问题']
    for line in lines:
        line = line.replace(' \n', '')
        line = line.replace('\r\n', '')
        line = line.split()
        all_set.append([w for w in line if w not in listword])
    # print(all_set)  # nested list, one inner list per line: [['xx', 'xx'], ['xx', 'xx'], ...]

    # Build the training corpus, fit the LDA model and visualize it
    dictionary = Dictionary(all_set)
    corpus = [dictionary.doc2bow(text) for text in all_set]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4)
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.show(vis_data, open_browser=False)
```
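`pyLDAvis.show` blocks while it serves the page over the local link. As an alternative, the visualization can be written to a standalone HTML file and the topic words printed directly from the gensim model. A brief sketch reusing `lda` and `vis_data` from above (the output filename is my own choice; on pyLDAvis >= 3.4 the module is `pyLDAvis.gensim_models` rather than `pyLDAvis.gensim`):

```python
# Save the interactive page to a file that can be opened without the local server
pyLDAvis.save_html(vis_data, 'lda_vis.html')

# Print the top 10 words of each topic straight from the gensim model
for topic_id, words in lda.print_topics(num_topics=4, num_words=10):
    print(topic_id, words)
```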