
Feature Extraction with the TF-IDF Algorithm: Python Code

The Python implementation of TF-IDF below has three parts: sentence segmentation with stop-word removal (jieba), loading a custom dictionary, and TF-IDF feature extraction with scikit-learn's TfidfVectorizer.
import os
import re

import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer
# Imported for visualization in other parts of the project; not used in this snippet.
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def fclist(sentences, stoplist, fchcxfilepath):
    """Tokenize a sentence with POS tags, drop stop words, and log word/POS pairs."""
    # Part-of-speech tags to discard (function words, numerals, pronouns, etc.)
    pos_filter = ["zg", "z", "y", "x", "uv", "ul", "uj", "ug", "ud", "vi", "v",
                  "t", "tg", "rz", "rr", "r", "q", "o", "mq", "m", "i", "k",
                  "h", "f", "e", "a", "ad", "ag", "an"]
    outcx = open(fchcxfilepath, encoding='utf-8', mode='w')
    # jieba segmentation with POS tagging (accurate mode)
    cutsentence = psg.lcut(sentences)
    lastsentences = ""
    cx = ""
    for word, flag in cutsentence:
        # Keep a word only if its POS tag is allowed, it is not a stop word,
        # and it is longer than a single character
        if flag not in pos_filter:
            if word not in stoplist:
                if word != '\t' and len(word) != 1:
                    lastsentences += word + " "
                    cx += word + " " + flag + "\n"
    outcx.write(cx)
    outcx.close()
    return lastsentences
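For reference, here is a minimal standalone sketch of what psg.lcut returns: a list of pairs that unpack into (word, POS tag), where the tag is what the filter above checks. The sample sentence is illustrative, and the exact tags may vary across jieba versions.

import jieba.posseg as psg

# psg.lcut returns pair objects that unpack into (word, POS tag)
for word, flag in psg.lcut("我爱自然语言处理"):
    print(word, flag)
# Typical output:
#   我 r
#   爱 v
#   自然语言 l
#   处理 v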
  32. """
  33. 添加新词至jiaba
  34. """
  35. def add_word_jieba(path):
  36. f = open(path,encoding='utf-8')
  37. iter_f = iter(f)
  38. for word in iter_f:
  39. jieba.add_word(list[0])
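The custom-dictionary file is assumed to hold one word per line. As a brief sketch of the mechanism: jieba.add_word forces a token to be kept whole during segmentation, and it also accepts an optional frequency and POS tag (the values below are made up for illustration).

import jieba

jieba.add_word("自然语言处理")                   # segment as a single token
jieba.add_word("深度学习", freq=20000, tag="n")  # hypothetical freq/tag values
print(jieba.lcut("自然语言处理和深度学习"))
# ['自然语言处理', '和', '深度学习']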
def processing(pathname):
    path = "F:/bysj/data_src/" + pathname
    files = os.listdir(path)
    # Output path for the POS-tagged tokens kept after segmentation and
    # stop-word removal
    fchcxfilepath = "F:/bysj/data_src_fc/" + pathname + "fchqtcx.txt"
    # Load the stop-word list
    stoplists = [line.strip() for line in open("F:/vscode/python/stop.txt", encoding='UTF-8').readlines()]
    # Load the custom dictionary once, before processing any file
    add_word_jieba("F:/bysj/data_src/自定义词.txt")
    # Walk the folder
    for file in files:
        if not os.path.isdir(file):
            ff = path + "/" + file
            ff1 = "F:/bysj/data_src_fc/" + pathname + "/" + file
            file1 = open(ff1, encoding='utf-8', mode='w')
            f = open(ff, encoding='UTF-8')
            for line in f:
                # Strip letters, digits, whitespace, and Chinese/English punctuation
                l = re.sub('[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~\s]+', "", line)
                lastline = fclist(l, stoplists, fchcxfilepath)
                # Write the segmented words to the output file
                file1.write(lastline)
            f.close()
            file1.close()
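The re.sub pattern in processing removes ASCII letters, digits, whitespace, and both English and Chinese punctuation in one pass. A quick check on a made-up input line:

import re

pattern = '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~\s]+'
line = "2023年,Python版本3.11发布了!性能提升约10%~60%。"
print(re.sub(pattern, "", line))
# 年版本发布了性能提升约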
  68. """
  69. 特征提取
  70. """
  71. def featureselect(pathname):
  72. """
  73. 路径
  74. """
  75. featurePath = "F:/bysj/data_src_fc/"+pathname+"特征.txt"
  76. dataPath = "F:/bysj/data_src_fc/"+pathname+"数据.txt"
  77. path = "F:/bysj/data_src_fc/"+pathname
  78. files = os.listdir(path)
  79. list = []
  80. """
  81. 加载文件
  82. """
  83. for file in files:
  84. if not os.path.isdir(file):
  85. filepath = path+"/"+file
  86. f = open(filepath,encoding='UTF-8')
  87. #迭代
  88. filestr = ""
  89. iter_f = iter(f)
  90. for line in iter_f:
  91. filestr = filestr+line+" "
  92. list.append(filestr)
  93. # 文本特征转换
  94. # 实例化+转化
  95. transfer = TfidfVectorizer(min_df=0.0305,max_df=0.7)
  96. new_data = transfer.fit_transform(list)
  97. # 查看特征名字
  98. names = transfer.get_feature_names()
  99. gg = open(featurePath,encoding='UTF-8',mode='w')
  100. datafile = open(dataPath,encoding='utf-8',mode='w')
  101. datafile.writelines(str(new_data))
  102. datafile.close()
  103. s = ""
  104. for yuansu in names:
  105. s = s + yuansu
  106. s=s+" "
  107. # 特征写入文本
  108. gg.write(s)
  109. gg.close()
  110. print(len(names))
  111. print(pathname+"特征名字是:\n", names)
  112. print(new_data.toarray())
  113. print(new_data)
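For context on what TfidfVectorizer computes: with its defaults (smooth_idf=True, norm='l2'), each entry is tf-idf(t, d) = tf(t, d) * idf(t), where idf(t) = ln((1 + n) / (1 + df(t))) + 1, n is the number of documents, and df(t) is the number of documents containing term t; each row is then L2-normalized. min_df=0.0305 drops terms appearing in fewer than 3.05% of the documents, and max_df=0.7 drops terms appearing in more than 70%. Below is a self-contained sketch on a toy corpus: the documents are space-separated tokens, matching fclist's output format, and the defaults are used since the thresholds above are specific to the author's corpus.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "特征 提取 算法",
    "特征 选择 算法",
    "文本 分类 模型",
]
transfer = TfidfVectorizer()
tfidf = transfer.fit_transform(corpus)
print(transfer.get_feature_names_out())  # sorted vocabulary (scikit-learn >= 1.0)
print(tfidf.toarray())                   # dense TF-IDF matrix, one row per document

Calling the two steps end to end would look like processing("corpus_folder") followed by featureselect("corpus_folder"), where "corpus_folder" is a hypothetical subdirectory name under F:/bysj/data_src/.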
