These are notes from my own learning process; questions and discussion are welcome.
Advantages:
Good interpretability: the keywords are clearly visible, and even when a prediction is wrong it is easy to trace the cause.
Fast computation: word segmentation accounts for most of the running time; the rest is simple counting.
Low dependence on labeled data: part of the work can be done with unlabeled corpora alone.
Combines well with many other algorithms: the TF-IDF scores can be used directly as word weights.
Disadvantages:
1. Heavily affected by the quality of word segmentation.
2. Carries no semantic similarity between words.
3. Carries no word-order information (it is a bag-of-words model); see the short sketch after this list.
4. Limited in scope: it cannot handle complex tasks such as machine translation or entity extraction.
5. Class imbalance in the samples strongly skews the results.
6. The distribution of samples within a class is not taken into account.
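To make point 3 concrete, here is a minimal sketch in plain Python (the two toy sentences are invented for illustration): under a bag-of-words representation, which TF-IDF is built on, two sentences containing the same words in a different order get exactly the same term counts, so word order is lost.

from collections import Counter

# Two hypothetical "documents" with the same words in a different order.
doc_a = ["the", "dog", "bites", "the", "man"]
doc_b = ["the", "man", "bites", "the", "dog"]

# Their bag-of-words counts (and therefore their TF-IDF vectors, given the
# same corpus) are identical, so the ordering information is gone.
print(Counter(doc_a) == Counter(doc_b))  # True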
Demo 1: Implementing TF-IDF by hand
""" 实现一个 TFIDF """ import jieba import math import os import json from collections import defaultdict def build_tf_idf_dict(corpus): tf_dict = defaultdict(dict) # key:文档序号,value:dict,文档中每个词出现的频率 idf_dict = defaultdict(set) # key:词, value:set,文档序号,最终用于计算每个词在多少篇文档中出现过 for text_index, text_words in enumerate(corpus): for word in text_words: if word not in tf_dict[text_index]: tf_dict[text_index][word] = 0 tf_dict[text_index][word] += 1 idf_dict[word].add(text_index) idf_dict = dict([(key, len(value)) for key, value in idf_dict.items()]) return tf_dict, idf_dict # 根据tf值和idf值计算tfidf def calculate_tf_idf(tf_dict, idf_dict): tf_idf_dict = defaultdict(dict) for text_index, word_tf_count_dict in tf_dict.items(): for word, tf_count in word_tf_count_dict.items(): tf = tf_count / sum(word_tf_count_dict.values()) # tf-idf = tf * log(D/(idf + 1)) tf_idf_dict[text_index][word] = tf * math.log(len(tf_dict) / (idf_dict[word] + 1)) return tf_idf_dict # 计算样本的 tfidf def calculate_tfidf(corpus): corpus = [jieba.cut(text) for text in corpus] tf_dict, idf_dict = build_tf_idf_dict(corpus) tf_idf_dict = calculate_tf_idf(tf_dict, idf_dict) return tf_idf_dict # 取出前k个 tfidf最大的数据 def tf_idf_topk(tfidf_dict, paths=[], top=10, print_word=True): topk_dict = {} for text_index, text_tfidf_dict in tfidf_dict.items(): # idf 逆序 word_list = sorted(text_tfidf_dict.items(), reverse=True, key=lambda x: x[1]) # 去排序后的前top个 topk_dict[text_index] = word_list[:top] if print_word: print(text_index, paths[text_index]) for i in range(top): print(word_list[i]) print("----------") return topk_dict def main(): dir_path = r"week4/category_corpus/" corpus = [] paths = [] for path in os.listdir(dir_path): path = os.path.join(dir_path, path) if path.endswith("txt"): corpus.append(open(path, encoding="utf8").read()) paths.append(os.path.basename(path)) tf_idf_dict = calculate_tfidf(corpus) tf_idf_topk(tf_idf_dict, paths) if __name__ == "__main__": main()
Demo 2: A simple search engine based on TF-IDF
""" 利用 tfidf 实现简单搜索引擎功能 """ import jieba import math import os import json from collections import defaultdict # 加载文档数据(可以想象成网页数据),计算每个网页的tfidf字典 from day0429_1 import calculate_tfidf def load_data(path): # path = "/week4/news.json" corpus = [] with open(path, encoding="utf8") as f: documents = json.loads(f.read()) for document in documents: corpus.append(document['title'] + "\n" + document["content"]) tf_idf_dict = calculate_tfidf(corpus) return tf_idf_dict, corpus def search_engine(query_str, tf_idf_dict, corpus, top=3): query_words = jieba.lcut(query_str) res = [] for doc_id, tf_idf in tf_idf_dict.items(): score = 0 for word in query_words: # 搜到关键词了 score++ score += tf_idf.get(word, 0) res.append([doc_id, score]) res = sorted(res, reverse=True, key=lambda x: x[1]) for i in range(top): doc_id = res[i][0] print(corpus[doc_id]) print("--------------") return res if __name__ == "__main__": path = "C:\\Users\\Administrator\\Desktop\\LearnPython\\week4\\news.json" tf_idf_dict, corpus = load_data(path) while True: query = input("请输入您要搜索的内容:") search_engine(query, tf_idf_dict, corpus)
Demo 3: Simple extractive text summarization based on TF-IDF
"""
Simple extractive text summarization based on tf-idf.
"""
import jieba
import math
import os
import random
import re
import json
from collections import defaultdict

from day0429_1 import calculate_tfidf


# Load the documents (think of them as web pages) and compute a tf-idf dict per page
def load_data(file_path):
    corpus = []
    with open(file_path, encoding="utf8") as f:
        documents = json.loads(f.read())
    for document in documents:
        assert "\n" not in document["title"]
        assert "\n" not in document["content"]
        corpus.append(document["title"] + "\n" + document["content"])
    tf_idf_dict = calculate_tfidf(corpus)
    return tf_idf_dict, corpus


# Build the abstract for a single article.
# Input: the article's tf-idf dict and its content.
# top is the number of sentences to keep.
# Articles whose content is too short are skipped, since summarizing them is pointless.
def generate_document_abstract(document_tf_idf, document, top=3):
    # split on Chinese (and ASCII) sentence-ending punctuation
    sentences = re.split(r"[??!!。]", document)
    if len(sentences) < 5:
        return None
    res = []
    for index, sentence in enumerate(sentences):
        sentence_score = 0
        words = jieba.lcut(sentence)
        for word in words:
            sentence_score += document_tf_idf.get(word, 0)
        # record each sentence's score together with its position
        res.append([sentence_score, index])
    res = sorted(res, reverse=True, key=lambda x: x[0])
    # the highest-weighted sentences might be, say, the 10th, 6th and 3rd;
    # re-sort them into their original order (3, 6, 10) so the abstract reads naturally
    important_sentence_indexs = sorted([x[1] for x in res[:top]])
    return "。".join([sentences[index] for index in important_sentence_indexs])


# Generate abstracts for the whole corpus
def generate_abstract(tf_idf_dict, corpus):
    res = []
    for index, document_tf_idf in tf_idf_dict.items():
        title, content = corpus[index].split("\n")
        abstract = generate_document_abstract(document_tf_idf, content)
        if abstract is None:
            continue
        # append the abstract to the stored document
        corpus[index] += "\n" + abstract
        res.append({"title": title, "content": content, "abstract": abstract})
    return res


if __name__ == "__main__":
    path = "C:\\Users\\Administrator\\Desktop\\LearnPython\\week4\\news.json"
    tf_idf_dict, corpus = load_data(path)
    res = generate_abstract(tf_idf_dict, corpus)
    with open("abstract.json", "w", encoding="utf8") as writer:
        writer.write(json.dumps(res, ensure_ascii=False, indent=2))
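To watch the sentence-scoring step in isolation, you can call generate_document_abstract with a hand-made tf-idf dict instead of one computed from news.json. The weights and the five-sentence document below are invented purely for illustration:

# Hypothetical example: a hand-made tf-idf dict and a made-up five-sentence document.
toy_tfidf = {"夺冠": 0.9, "比赛": 0.5, "球迷": 0.4, "天气": 0.1}
toy_document = (
    "昨晚的比赛非常精彩。"
    "主队在最后一分钟进球夺冠。"
    "球迷们在场外庆祝到深夜。"
    "今天的天气不错。"
    "街上的商店照常营业。"
)
# Keep the two highest-scoring sentences, re-ordered by their original position.
print(generate_document_abstract(toy_tfidf, toy_document, top=2))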