import os
import re
import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud      # not used in this excerpt
import matplotlib.pyplot as plt      # not used in this excerpt


"""
Tokenize sentences and remove stopwords
"""
def fclist(sentences, stoplist, fchcxfilepath):
    # POS tags to discard (verbs, adjectives, pronouns, numerals, auxiliaries, etc.)
    skip_pos = ["zg","z","y","x","uv","ul","uj","ug","ud","vi","v","t","tg","rz","rr","r","q","o","mq","m","i","k","h","f","e","a","ad","ag","an"]
    # jieba POS segmentation (accurate mode)
    cutsentence = psg.lcut(sentences)
    lastsentences = ""
    cx = ""
    for word, flag in cutsentence:
        # Keep a word only if its POS tag survives and it is not a stopword
        if flag not in skip_pos and word not in stoplist:
            if word != '\t' and len(word) != 1:
                lastsentences += word + " "
                cx += word + " " + flag + "\n"
    # Open in append mode: fclist runs once per input line, and mode 'w'
    # would overwrite the POS output of every earlier line
    with open(fchcxfilepath, encoding='utf-8', mode='a') as outcx:
        outcx.write(cx)
    return lastsentences

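# A minimal usage sketch (the sentence, empty stopword list, and output path
# below are invented for illustration): only words whose POS tag is absent
# from skip_pos survive, so a call like
#   fclist("深度学习模型需要大量语料", [], "demo_pos.txt")
# returns the surviving words joined by spaces and appends each word/tag
# pair to demo_pos.txt.
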
- """
- 添加新词至jiaba
- """
- def add_word_jieba(path):
- f = open(path,encoding='utf-8')
- iter_f = iter(f)
- for word in iter_f:
- jieba.add_word(list[0])
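
# The custom-word file is assumed to hold one word per line (the file
# contents are an assumption, not shown in the source); a line such as
# "人工智能" keeps jieba from splitting that term during segmentation:
#   add_word_jieba("F:/bysj/data_src/自定义词.txt")
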
def processing(pathname):
    path = "F:/bysj/data_src/" + pathname
    files = os.listdir(path)
    # Path for the POS output of the segmented, stopword-filtered words
    fchcxfilepath = "F:/bysj/data_src_fc/" + pathname + "fchqtcx.txt"
    # Load the stopword list
    with open("F:/vscode/python/stop.txt", encoding='UTF-8') as sf:
        stoplists = [line.strip() for line in sf]
    # Register the custom words once, before any segmentation
    add_word_jieba("F:/bysj/data_src/自定义词.txt")
    # Walk the source folder
    for file in files:
        if not os.path.isdir(os.path.join(path, file)):
            ff = path + "/" + file
            ff1 = "F:/bysj/data_src_fc/" + pathname + "/" + file
            with open(ff, encoding='UTF-8') as f, \
                 open(ff1, encoding='utf-8', mode='w') as file1:
                for line in f:
                    # Strip Latin letters, digits, punctuation, and whitespace
                    l = re.sub(r'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\]^_`{|}~\s]+', "", line)
                    lastline = fclist(l, stoplists, fchcxfilepath)
                    # Write the segmented words to the output file
                    file1.write(lastline)

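# processing reads raw files from F:/bysj/data_src/<pathname> and writes the
# segmented copies to F:/bysj/data_src_fc/<pathname>, so that output folder
# must already exist, e.g. (the category name is hypothetical):
#   processing("新闻")
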
- """
- 特征提取
- """
def featureselect(pathname):
    # Output paths
    featurePath = "F:/bysj/data_src_fc/" + pathname + "特征.txt"
    dataPath = "F:/bysj/data_src_fc/" + pathname + "数据.txt"
    path = "F:/bysj/data_src_fc/" + pathname
    files = os.listdir(path)
    corpus = []
    # Load each segmented file as one document
    for file in files:
        if not os.path.isdir(os.path.join(path, file)):
            filepath = path + "/" + file
            filestr = ""
            with open(filepath, encoding='UTF-8') as f:
                for line in f:
                    filestr = filestr + line + " "
            corpus.append(filestr)
    # Convert text to TF-IDF features: ignore terms that appear in fewer
    # than 3.05% or in more than 70% of the documents
    transfer = TfidfVectorizer(min_df=0.0305, max_df=0.7)
    new_data = transfer.fit_transform(corpus)
    # Feature (vocabulary) names; on scikit-learn < 1.0 use get_feature_names()
    names = transfer.get_feature_names_out()

    # Write the TF-IDF matrix and the feature names to disk
    with open(dataPath, encoding='utf-8', mode='w') as datafile:
        datafile.write(str(new_data))
    with open(featurePath, encoding='UTF-8', mode='w') as gg:
        gg.write(" ".join(names))

    print(len(names))
    print(pathname + " feature names:\n", names)
    print(new_data.toarray())
    print(new_data)
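
# End-to-end sketch (the category name "新闻" is hypothetical, and every
# F:/bysj path above is assumed to exist):
if __name__ == "__main__":
    processing("新闻")       # segment, clean, and POS-tag the raw corpus
    featureselect("新闻")    # fit TF-IDF and dump the features and matrix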