当前位置:   article > 正文

推荐系统--基于TF-IDF算法实现商品标题的关键词提取

推荐系统--基于TF-IDF算法实现商品标题的关键词提取
  1. import jieba
  2. import math
  3. import jieba.analyse
  4. class TF_IDF:
  5. def __init__(self, file, stop_file):
  6. self.file = file
  7. self.stop_file = stop_file
  8. self.stop_words = self.getStopWords()
  9. # 获取停用词列表
  10. def getStopWords(self):
  11. swlist = list()
  12. for line in open(self.stop_file, "r", encoding="utf-8").readlines():
  13. swlist.append(line.strip())
  14. print("加载停用词完成...")
  15. return swlist
  16. # 加载商品和其对应的短标题,使用jieba进行分词并去除停用词
  17. def loadData(self):
  18. dMap = dict()
  19. for line in open(self.file, "r", encoding="utf-8").readlines():
  20. id, title = line.strip().split("\t")
  21. dMap.setdefault(id, [])
  22. for word in list(jieba.cut(str(title).replace(" ", ""), cut_all=False)):
  23. if word not in self.stop_words:
  24. dMap[id].append(word)
  25. print("加载商品和对应的短标题,并使用jieba分词和去除停用词完成...")
  26. return dMap
  27. # 获取一个短标题中的词频
  28. def getFreqWord(self, words):
  29. freqWord = dict()
  30. for word in words:
  31. freqWord.setdefault(word, 0)
  32. freqWord[word] += 1
  33. return freqWord
  34. # 统计单词在所有短标题中出现的次数
  35. def getCountWordInFile(self, word, dMap):
  36. count = 0
  37. for key in dMap.keys():
  38. if word in dMap[key]:
  39. count += 1
  40. return count
  41. # 计算TFIDF值
  42. def getTFIDF(self, words, dMap):
  43. # 记录单词关键词和对应的tfidf值
  44. outDic = dict()
  45. freqWord = self.getFreqWord(words)
  46. for word in words:
  47. # 计算TF值,即单个word在整句中出现的次数
  48. tf = freqWord[word] * 1.0 / len(words)
  49. # 计算IDF值,即log(所有的标题数/(包含单个word的标题数+1))
  50. idf = math.log(len(dMap) / (self.getCountWordInFile(word, dMap) + 1))
  51. tfidf = tf * idf
  52. outDic[word] = tfidf
  53. # 给字典排序
  54. orderDic = sorted(outDic.items(), key=lambda x: x[1], reverse=True)
  55. return orderDic
  56. def getTag(self, words):
  57. # withWeight 用来设置是否打印权重
  58. print(jieba.analyse.extract_tags(words, topK=20, withWeight=True))
  59. if __name__ == "__main__":
  60. # 数据集
  61. file = "id_title.txt"
  62. # 停用词文件
  63. stop_file = "stop_words.txt"
  64. tfidf = TF_IDF(file, stop_file)
  65. # tfidf.getTag("小米 红米6Pro 异形全面屏, 后置1200万双摄, 4000mAh超大电池")
  66. # dMap 中key为商品id,value为去除停用词后的词
  67. dMap = tfidf.loadData()
  68. for id in dMap.keys():
  69. tfIdfDic = tfidf.getTFIDF(dMap[id],dMap)
  70. print(id,tfIdfDic)

部分结果展示:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小丑西瓜9/article/detail/493356
推荐阅读
相关标签
  

闽ICP备14008679号