当前位置:   article > 正文

关键字提取:TF-IDF算法和TextRank算法 —— 用TF-IDF模型处理txt文本

tfidf模型处理txt

关键字提取-TF-IDF算法和TextRank算法

  1. import pandas as pd
  2. raw = pd.read_table('../data/金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding="GBK")
  3. # 章节判断用变量预处理
  4. def m_head(tmpstr):
  5. return tmpstr[:1]
  6. def m_mid(tmpstr):
  7. return tmpstr.find("回 ")
  8. raw['head'] = raw.txt.apply(m_head)
  9. raw['mid'] = raw.txt.apply(m_mid)
  10. raw['len'] = raw.txt.apply(len)
  11. # 章节判断
  12. chapnum = 0
  13. for i in range(len(raw)):
  14. if raw['head'][i] == "第" and raw['mid'][i] > 0 and raw['len'][i] < 30:
  15. chapnum += 1
  16. if chapnum >= 40 and raw['txt'][i] == "附录一:成吉思汗家族":
  17. chapnum = 0
  18. raw.loc[i, 'chap'] = chapnum
  19. # 删除临时变量
  20. del raw['head']
  21. del raw['mid']
  22. del raw['len']
  23. rawgrp = raw.groupby('chap')
  24. chapter = rawgrp.agg(sum) # 只有字符串的情况下,sum函数自动转为合并字符串
  25. chapter = chapter[chapter.index != 0]
  26. chapter
  1. import jieba
  2. import jieba.analyse
  3. # 注意:函数在使用默认的TFIDF模型进行分析
  4. jieba.analyse.extract_tags(chapter.txt[1])

  1. # 要求返回权重值
  2. jieba.analyse.extract_tags(chapter.txt[1], withWeight=True)

  1. # 应用自定义词典改善分词效果
  2. jieba.load_userdict('../data/金庸小说词库.txt') # dict为自定义词典的路径
  3. # 在TFIDF计算中直接应用停用词表
  4. jieba.analyse.set_stop_words('../data/停用词.txt')
  5. TFres = jieba.analyse.extract_tags(chapter.txt[1], withWeight=True)
  6. TFres

  1. # 使用自定义TF-IDF频率文件
  2. jieba.analyse.set_idf_path('../data/idf.txt.big')
  3. TFres = jieba.analyse.extract_tags(chapter.txt[1], withWeight=True)
  4. TFres

  1. from sklearn.feature_extraction.text import TfidfTransformer
  2. txtlist = [" ".join(m_cut(w)) for w in chapter.txt.iloc[:5]]
  3. vectorizer = CountVectorizer()
  4. x = vectorizer.fit_transform(txtlist) # 将文本中的词语转换成词频矩阵
  5. transformer = TfidfTransformer()
  6. tfidf = transformer.fit_transform(x) # 基于词频矩阵x计算TF-IDF
  7. tfidf
  1. # 转换成数组
  2. tfidf.toarray()
  1. # 转换成矩阵
  2. tfidf.todense()
tfidf.todense().shape
  1. print("字典长度:", len(vectorizer.vocabulary_))
  2. vectorizer.vocabulary_

  1. # 文档分词及预处理
  2. chaplist = [m_cut(w) for w in chapter.txt.iloc[:5]]
  3. chaplist

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/花生_TL007/article/detail/342896
推荐阅读
相关标签
  

闽ICP备14008679号