In many cases, the English characters and punctuation inside an article are segmented in ways that do not match expectations, producing unwanted tokens. When that happens, you can use the following function to supply your own stop-word list and have those tokens removed.
jieba.analyse.set_stop_words("stop_words.txt")
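The stop-word file is plain text with one token per line. Here is a minimal sketch of the before/after effect on extract_tags; the file name my_stop_words.txt and its contents are made up for illustration.

import jieba.analyse

# Hypothetical stop-word file: one token per line
with open("my_stop_words.txt", "w", encoding="utf-8") as f:
    f.write("的\n了\n2024\n")

text = "2024 年的深度学习框架发展得非常快"
print(jieba.analyse.extract_tags(text, topK=5))    # "2024" may show up as a keyword

jieba.analyse.set_stop_words("my_stop_words.txt")  # register the custom stop-word list
print(jieba.analyse.extract_tags(text, topK=5))    # "2024" is now filtered out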
jieba assigns every word an IDF weight, but you will often want particular keywords in an article to stand out (or be played down). By giving those words a higher (or lower) IDF score, you can raise (or lower) their prominence in the extraction results.
jieba.analyse.set_idf_path("idf.txt")  # load the custom IDF weight table
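The IDF file uses the same layout as jieba's bundled idf.txt: one word and its IDF value per line, separated by a space. Below is a minimal sketch; my_idf.txt and its scores are made up. Note that words missing from a custom table fall back to the table's median IDF, so in practice you usually copy jieba's full idf.txt and adjust only the entries you care about.

import jieba.analyse

# Hypothetical IDF table: "word idf_score" per line
with open("my_idf.txt", "w", encoding="utf-8") as f:
    f.write("效率 15.0\n")  # a large IDF pushes "效率" up the ranking
    f.write("烦躁 0.1\n")   # a small IDF pushes "烦躁" down

jieba.analyse.set_idf_path("my_idf.txt")  # load the custom IDF table

for word, weight in jieba.analyse.extract_tags("今天学习好烦躁,还没有效率", topK=5, withWeight=True):
    print(word, weight)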
A demo:
import sys
from os import path
import jieba
import jieba.analyse

d = path.dirname(__file__)

# Load a user dictionary so domain-specific words are kept as single tokens
jieba.load_userdict(path.join(d, r"C:\Users\nsy\Desktop\userdict.txt.txt"))

text = "今天学习好烦躁,还没有效率"
content = text

# Top-10 keywords without weights
extracted_tags = jieba.analyse.extract_tags(content, topK=10, withWeight=False)
print(" ,".join(extracted_tags))

# Filter out stop words, then extract again with weights,
# keeping only place names, nouns, verb-nouns and verbs
jieba.analyse.set_stop_words(path.join(d, r"C:\Users\nsy\Desktop\stop_words.txt.txt"))
weighted_tags = jieba.analyse.extract_tags(content, topK=10, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
for keyword, weight in weighted_tags:
    print(f"Keyword: {keyword}, weight: {weight}")

import sys
from os import path
import jieba
import jieba.analyse

d = path.dirname(__file__)

# Open the file according to the Python version
if sys.version_info > (3, 0):
    text = open(path.join(d, r"C:\Users\nsy\Desktop\test.txt"), 'r', encoding='utf-8').read()
else:
    text = open(path.join(d, r"C:\Users\nsy\Desktop\test.txt"), 'r').read()

text = text.replace('\n', '')

# Set the stop-word file path; make sure the file name is correct
jieba.analyse.set_stop_words(r"C:\Users\nsy\Desktop\stop_words.txt.txt")
# Print the segmentation result
print(" ".join(jieba.cut(text)))

# Print a separator line
print("-" * 10)

# Use the custom user dictionary
jieba.load_userdict(path.join(d, r"C:\Users\nsy\Desktop\userdict.txt.txt"))

# Count word frequencies in a dictionary
dic = {}
for ele in jieba.cut(text):
    if ele not in dic:
        dic[ele] = 1
    else:
        dic[ele] += 1

# Sort by frequency (descending) and print
for w in sorted(dic, key=dic.get, reverse=True):
    print("%s %d" % (w, dic[w]))

import sys
import jieba
import jieba.analyse
import urllib.request as httplib

contents = ""

# Fetch the article, with exception handling around the network request
try:
    # URL of the online article
    url = "https://csdnnews.blog.csdn.net/article/details/140678511?spm=1000.2115.3001.5928"
    # Build the request
    req = httplib.Request(url)
    # Open the page
    response = httplib.urlopen(req)
    # Connection succeeded (HTTP 200)
    if response.status == 200:
        # Python 3 and above
        if sys.version_info > (3, 0):
            # Read the page body and decode it with the declared charset
            charset = response.headers.get_content_charset() or "utf-8"
            contents = response.read().decode(charset)
        else:
            # Python 2 is no longer in use, so there is no fallback branch
            raise Exception("Python 2 is not supported")
except Exception as e:
    print("Error during HTTP request:", e)
    contents = ""

# Remove unwanted words
jieba.analyse.set_stop_words("C:\\Users\\nsy\\Desktop\\stop_words.txt.txt")

# Keep only place names, nouns and verb-nouns
keywords = jieba.analyse.extract_tags(contents, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn'))

# Print each keyword with its weight
for item in keywords:
    print("%s=%f" % (item[0], item[1]))

print("*" * 40)

# Dictionary of word -> count
dic = {}

# Segment the article
words = jieba.cut(contents)

# Count every token (no POS filtering here)
for word in words:
    if word not in dic:
        dic[word] = 1   # first occurrence
    else:
        dic[word] += 1  # increment the count

# Sort by count (descending) and print
for w in sorted(dic.items(), key=lambda x: x[1], reverse=True):
    print("%s: %d" % w)

# Note: exception handling should wrap the specific operation rather than sit at the end of the script