【Python】自然语言处理——实现批量文本的分词、提取关键词、制作词云思路_自然语言处理文本分词与虚词过滤详细代码

作者：天景科技苑 | 2024-07-25 05:37:49
踩
自然语言处理文本分词与虚词过滤详细代码

#coding=utf-8
import jieba
from string import punctuation
import re
import jieba.analyse
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import os
import json
 
 
# Strip punctuation and whitespace so they do not distort word segmentation.
def clean(tian_char):
    """Replace every run of punctuation or whitespace with a single space.

    ASCII punctuation (``string.punctuation``), the full-width and Chinese
    marks listed below, and any whitespace are all treated as separators.

    Args:
        tian_char: Raw text to clean.

    Returns:
        The text with each run of separator characters collapsed into one
        space.
    """
    extra_marks = '.,;《》？！“”‘’@#￥%…&×（）——+【】{};；●，。&～、|:：'
    # Escape the characters before placing them inside the character class.
    # Unescaped, the backslash in string.punctuation merely escaped the
    # following ']' and was therefore never removed from the text.
    separators = re.escape(punctuation + extra_marks)
    return re.sub(r"[{}\s]+".format(separators), " ", tian_char)
 
 
# Segment the text with jieba and store the resulting words in a file.
def jiebacut(stopword_list, tian_char_clean, cut_file_path):
    """Segment text with jieba, drop stop words, and append the words to a file.

    Args:
        stopword_list: Collection of stop words to remove, or None to keep
            every word.
        tian_char_clean: Text to segment (``flow_path`` passes the output of
            ``clean``).
        cut_file_path: File the words are appended to, one per line, encoded
            as gb18030.

    Returns:
        The list of remaining words, in segmentation order.
    """
    word_list = []
    for token in jieba.cut(tian_char_clean, cut_all=False):
        # Spaces are removed from each token and '/' is treated as a
        # separator; empty pieces are discarded.
        word_list.extend(
            piece for piece in token.replace(' ', '').split('/') if piece
        )

    # Remove stop words; a set keeps each membership test O(1).
    if stopword_list is not None:
        stopwords = frozenset(stopword_list)
        word_list = [word for word in word_list if word not in stopwords]

    # The context manager closes the file even if a write fails.
    with open(cut_file_path, 'a', encoding='gb18030') as file:
        file.writelines(word + '\n' for word in word_list)
    print(cut_file_path+"分词文件保存成功")

    return word_list
 
 
# Extract the article's keywords with TF-IDF.
def keywords(word_list, keywords_path):
    """Extract weighted keywords per part of speech and append them to a file.

    ``jieba.analyse.extract_tags`` is called three times with ``topK=200``
    and the part-of-speech filters ('n', 'nr', 'ns'), ('v', 'vd', 'vi', 'vn')
    and ('a', 'an', 'ad'); the file headings label these groups as nouns,
    verbs and adjectives.

    Args:
        word_list: Segmented words; they are joined with spaces before
            extraction.
        keywords_path: File the results are appended to. It is written as
            gb18030, the encoding used for the other files this script reads
            and writes, instead of the platform default encoding.

    Returns:
        A tuple ``(keywords_n, keywords_v, keywords_a)`` of the three
        ``extract_tags`` results (word/weight pairs).
    """
    tian_content = " ".join(word_list)
    keywords_n = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('n', 'nr', 'ns'))
    keywords_v = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('v', 'vd', 'vi', 'vn'))
    keywords_a = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('a', 'an', 'ad'))

    sections = (
        ("名词关键词-前200个\n", keywords_n),
        ("动词关键词-前200个\n", keywords_v),
        ("形容词关键词-前200个\n", keywords_a),
    )
    # The context manager closes the file even if a write fails.
    with open(keywords_path, 'a', encoding='gb18030') as f:
        for heading, items in sections:
            f.write(heading)
            for item in items:
                f.write(str(item[0]) + ' ' + str(item[1]) + '\n')
    print(keywords_path+"关键词保存成功")

    return keywords_n, keywords_v, keywords_a
 
 
# Render a word cloud image from weighted keywords.
def wordcloud(im_path, keywords, type, font_path="C:/Windows/Fonts/STFANGSO.ttf"):
    """Draw a word cloud from weighted keywords and save it as a JPEG.

    Args:
        im_path: Prefix of the output file name.
        keywords: Word/weight pairs (anything ``dict()`` accepts).
        type: Suffix appended to ``im_path`` before the '.jpg' extension;
            ``flow_path`` passes 'n', 'v' or 'a'.
        font_path: Font file used to render the words. A font covering CJK
            glyphs is needed to avoid empty boxes; the default is a Windows
            system font, so pass another path on other platforms.

    Returns:
        None. The image is written to ``im_path + type + '.jpg'``. When there
        are no keywords nothing is drawn and no file is written.
    """
    path = im_path + type + '.jpg'
    keywords_dict = dict(keywords)
    if not keywords_dict:
        # WordCloud.generate_from_frequencies rejects an empty mapping with
        # ValueError; skip so one empty category does not abort a batch run.
        print(path+"没有关键词，跳过词云图")
        return

    wc = WordCloud(background_color='black',  # background colour
                   max_words=2000,  # maximum number of words drawn
                   max_font_size=100,  # largest font size used
                   font_path=font_path,  # font able to render the keywords
                   random_state=42,  # fixed seed for the random generator
                   width=1000,  # image width
                   height=860  # image height
                   )
    wc.generate_from_frequencies(keywords_dict)
    plt.imshow(wc)  # draw the cloud on the current matplotlib axes
    plt.axis('off')
    wc.to_file(path)
    print(path+"词云图保存成功")
 
 
def stopword(stopword_path):
    """Load the stop-word file into a list, one entry per line.

    The file is read as gb18030. Only the newline is stripped from each
    line; other whitespace and blank lines are kept as they are.

    Args:
        stopword_path: Path of the stop-word file.

    Returns:
        A list containing one string per line of the file.
    """
    with open(stopword_path, "r", encoding="gb18030") as handle:
        return [raw_line.strip('\n') for raw_line in handle]
 
 
def get_all_path(open_file_path):
    """Recursively collect the paths of all files below a directory.

    Entries are visited in ``os.listdir`` order; a sub-directory's files are
    inserted at the position where that sub-directory was listed.

    Args:
        open_file_path: Directory to scan.

    Returns:
        A list of file paths, each prefixed with ``open_file_path``.
    """
    collected = []
    for entry in os.listdir(open_file_path):
        full_path = os.path.join(open_file_path, entry)
        if os.path.isfile(full_path):
            collected.append(full_path)
        elif os.path.isdir(full_path):
            # Descend into the sub-directory and splice its files in here.
            collected += get_all_path(full_path)
    return collected
 
 
def im_com_path(im_path, open_file_path):
    """Build the image-name prefix for an input file.

    The prefix is ``<im_path>/<file name without extension>.`` (the trailing
    dot is kept); ``wordcloud`` appends a suffix and '.jpg' to it.

    The extension is removed with ``os.path.splitext`` rather than by cutting
    a fixed three characters, so names such as 'a.json', 'a.py' or 'a' give
    a correct prefix. For three-character extensions the result is unchanged.

    Args:
        im_path: Directory the images are written to.
        open_file_path: Path of the input file.

    Returns:
        The prefix string described above.
    """
    stem, _extension = os.path.splitext(os.path.basename(open_file_path))
    return os.path.join(im_path, stem + '.')
 
 
# Make sure an output directory exists, creating it when it is missing.
def judge_path(File_Path):
    """Create ``File_Path`` (including parents) if nothing exists there yet.

    Args:
        File_Path: Path of the directory to ensure.

    Returns:
        ``File_Path`` unchanged.
    """
    if os.path.exists(File_Path):
        return File_Path
    os.makedirs(File_Path)
    return File_Path
 
def flow_path(open_file_path, stopword_path, cut_file_name, im_name, keywords_name):
    """Run the whole pipeline for one input file.

    The file is read as gb18030, cleaned, segmented (optionally filtering
    stop words), its keywords are extracted, and one word cloud is drawn for
    each of the three keyword groups.

    Args:
        open_file_path: Input text file.
        stopword_path: Stop-word file, or None to skip stop-word filtering.
        cut_file_name: File the segmented words are appended to.
        im_name: Image-name prefix handed to ``wordcloud``.
        keywords_name: File the keywords are appended to.
    """
    # The context manager closes the input even if reading or decoding fails.
    with open(open_file_path, encoding='gb18030') as f:
        tian_char = f.read()
    tian_char_clean = clean(tian_char)

    stopword_list = None if stopword_path is None else stopword(stopword_path)

    word_list = jiebacut(stopword_list, tian_char_clean, cut_file_name)
    keywords_n, keywords_v, keywords_a = keywords(word_list, keywords_name)
    for suffix, weighted in (('n', keywords_n), ('v', keywords_v), ('a', keywords_a)):
        wordcloud(im_name, weighted, suffix)
 
 
# Entry point: takes the input path, the stop-word file and the three output
# directories, and produces segmentation files, keyword files and word clouds.
def main(open_file_path, stopword_path, cut_file_path, im_path, keywords_path):
    """Process a single file, or every file found below a directory.

    Nothing happens when ``open_file_path`` is neither a file nor a
    directory. Output files are named after the input file's base name.

    Args:
        open_file_path: Input file or directory.
        stopword_path: Stop-word file, or None.
        cut_file_path: Directory for the segmentation results.
        im_path: Directory for the word cloud images.
        keywords_path: Directory for the keyword files.
    """
    if os.path.isfile(open_file_path):
        sources = [open_file_path]
    elif os.path.isdir(open_file_path):
        sources = get_all_path(open_file_path)
    else:
        return

    # Output directories are created only after the inputs have been listed.
    cut_dir = judge_path(cut_file_path)
    image_dir = judge_path(im_path)
    keyword_dir = judge_path(keywords_path)

    for source in sources:
        base_name = os.path.basename(source)
        image_prefix = im_com_path(image_dir, source)
        flow_path(
            source,
            stopword_path,
            os.path.join(cut_dir, base_name),
            image_prefix,
            os.path.join(keyword_dir, base_name),
        )
 
 
 
if __name__ == '__main__':
    # main() has five required parameters, so calling it with no arguments
    # always raised TypeError; read the paths from the command line instead.
    import argparse

    parser = argparse.ArgumentParser(
        description="Segment text files, extract keywords and draw word clouds.")
    parser.add_argument('open_file_path', help="input text file or directory")
    parser.add_argument('cut_file_path', help="directory for segmentation results")
    parser.add_argument('im_path', help="directory for word cloud images")
    parser.add_argument('keywords_path', help="directory for keyword files")
    parser.add_argument('--stopword_path', default=None,
                        help="optional stop-word file (gb18030, one word per line)")
    args = parser.parse_args()

    main(args.open_file_path, args.stopword_path, args.cut_file_path,
         args.im_path, args.keywords_path)
声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/天景科技苑/article/detail/878625