赞
踩
import jieba
content = "工信部女干事每月经过下属科室都要亲口交待24口交换机等技术性器件的安装工作"
#将返回一个生成器对象
jieba.cut(content,cut_all=False)
<generator object Tokenizer.cut at 0x000001DE14573660>
#直接返回列表内容,使用jieba.lcut即可
jieba.lcut(content,cut_all=False)
['工信部',
'女干事',
'每月',
'经过',
'下属',
'科室',
'都',
'要',
'亲口',
'交待',
'24',
'口',
'交换机',
'等',
'技术性',
'器件',
'的',
'安装',
'工作']
jieba.lcut(content,cut_all=True)#cut_all默认为False
['工',
'信',
'部',
'女干事',
'干事',
'每月',
'月经',
'经过',
'下属',
'科室',
'都',
'要',
'亲口',
'口交',
'交待',
'24',
'口交',
'交换',
'交换机',
'换机',
'等',
'技术',
'技术性',
'性器',
'器件',
'的',
'安装',
'安装工',
'装工',
'工作']
jieba.lcut_for_search(content)
['工信部',
'干事',
'女干事',
'每月',
'经过',
'下属',
'科室',
'都',
'要',
'亲口',
'交待',
'24',
'口',
'交换',
'换机',
'交换机',
'等',
'技术',
'技术性',
'器件',
'的',
'安装',
'工作']
#对"女干事","交换机"等长词汇都进行了再次分词
import jieba
content2="煩惱即是菩提,我暫且不提"
jieba.lcut(content2)
[‘工信部’,
‘干事’,
‘女干事’,
‘每月’,
‘经过’,
‘下属’,
‘科室’,
‘都’,
‘要’,
‘亲口’,
‘交待’,
‘24’,
‘口’,
‘交换’,
‘换机’,
‘交换机’,
‘等’,
‘技术’,
‘技术性’,
‘器件’,
‘的’,
‘安装’,
‘工作’]
'''
云计算 5 n
easy_install 3 eng
好用 300
韩玉赏鉴 3 nz
八一双鹿 3 nz
'''
jieba.lcut('八一双鹿更名为八一南昌篮球队!')
#没有使用用户自定义词典的结果:
['八', '一双', '鹿', '更名', '为', '八一', '南昌', '篮球队', '!']
jieba.load_userdict(r'D:\NLP\user_dict.txt')
#加载用户自定义词典后,重新分词的结果:
jieba.lcut('八一双鹿更名为八一南昌篮球队!')
['八一双鹿', '更名', '为', '八一', '南昌', '篮球队', '!']
import hanlp
HanLP=hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # 世界最大中文语料库
doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
Native API的输入单位为句子,需使用多语种分句模型或基于规则的分句函数先行分句。特别地,Python HanLPClient也支持当作函数调用,在语义上与Native API完全一致。简洁的接口也支持灵活的参数,常用的技巧有:通过tasks参数只执行所需的任务(例如HanLP(sentences, tasks='tok')只做分词),任务越少,速度越快。
输出结果是一个可以json化的dict,键为NLP任务名,值为分析结果。调用doc.pretty_print()可以在等宽字体环境中得到可视化结果;需要关闭编辑器或终端的自动换行,可视化结果才能对齐(我们准备发布HTML环境的可视化)。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。