There is a tutorial at Models & Languages · spaCy Usage Documentation.
pip install spacy
Install the English model:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
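Alternatively, the spaCy usage documentation linked above also describes downloading the model through spaCy's own command line (this assumes spaCy is already installed and an internet connection is available):

# download the small English model via the spaCy CLI (equivalent to the pip install above)
python -m spacy download en_core_web_sm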
import spacy
import en_core_web_sm
# the following two lines are equivalent ways to load the model
nlp = spacy.load("en_core_web_sm")
nlp = en_core_web_sm.load()
doc = nlp("This is a sentence.")
# Tokenization
for token in doc:
    print(token)
# Sentence segmentation
for sent in doc.sents:
    print(sent)
# Part-of-speech tagging (print each token with its POS tag)
for token in doc:
    print('{}--{}'.format(token, token.pos_))
# Named entity recognition (label_ is the entity type)
doc_2 = nlp("I went to Paris where I met my old friend Jack from uni.")
for ent in doc_2.ents:
    print('{}-{}'.format(ent, ent.label_))
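If it is unclear what an entity label such as PERSON or GPE stands for, spacy.explain can print a short description. A minimal sketch, using labels that also appear later in this tutorial:

# look up human-readable descriptions of entity labels (works for POS tags too)
print(spacy.explain('PERSON'))  # e.g. "People, including fictional"
print(spacy.explain('GPE'))     # e.g. "Countries, cities, states"
print(spacy.explain('ORG'))     # e.g. "Companies, agencies, institutions, etc."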
from spacy import displacy  # the same named entity recognition, but rendered with colour highlighting
doc1 = nlp('I went to Paris where I met my old friend Jack from uni.')
displacy.render(doc1, style='ent', jupyter=True)
'''Test text'''
def read_file(file_name):
    with open(file_name, 'r') as file:
        return file.read()
# Load the text data
text = read_file('path to your file')
processed_text = nlp(text)  # create the Doc object
sentences = [s for s in processed_text.sents]  # sentence segmentation
print(len(sentences))
from collections import Counter, defaultdict  # import the counters
def find_person(doc):
    c = Counter()
    for ent in doc.ents:            # named entities
        if ent.label_ == 'PERSON':  # person entities
            c[ent.lemma_] += 1      # count every person found
    return c.most_common(10)        # the top 10
print(find_person(processed_text))
'''Statistics'''
articles = []  # placeholder: a list of article texts (strings)
articles_nlp = [nlp(art) for art in articles]  # build a Doc for each article
loca_dict = defaultdict(Counter)  # a default dictionary of counters
list_ren = ['define the set of organizations or people of interest here']
list_di = ['define the set of locations of interest here']
for articl in articles_nlp:
    # find people or organizations
    articl_ren = [ent.lemma_ for ent in articl.ents if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
    # find locations
    articl_di_dian = [ent.lemma_ for ent in articl.ents if ent.label_ == 'GPE']
    # keep only the people/organizations from the set defined above
    ren = [ent for ent in articl_ren if ent in list_ren]
    # keep only the locations from the set defined above
    di_dian = [ent for ent in articl_di_dian if ent in list_di]
    for i in ren:
        for r in di_dian:
            loca_dict[i][r] += 1  # person : location : number of co-occurrences
import pandas as pd
location_df = pd.DataFrame.from_dict(dict(loca_dict), dtype=int)  # convert the nested dict to a DataFrame
location_df = location_df.fillna(value=0)
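As a standalone sketch of what the final table looks like, the names and counts below are made-up assumptions, not results from any real text:

from collections import Counter, defaultdict
import pandas as pd

# hypothetical person -> location co-occurrence counts (purely illustrative)
loca_dict = defaultdict(Counter)
loca_dict['Jack']['Paris'] += 2
loca_dict['Jack']['London'] += 1
loca_dict['Emma']['Paris'] += 3

location_df = pd.DataFrame.from_dict(dict(loca_dict))  # columns = people, rows = locations
location_df = location_df.fillna(value=0)              # missing combinations become 0
print(location_df)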
#pip install jieba
import jieba
# Full mode: cut_all=True
# Precise mode: cut_all=False (the default)
seg_list = jieba.cut('我来到北京清华大学', cut_all=False)
print(list(seg_list))  # jieba.cut returns a generator; list() consumes it
print(type(seg_list))
# re-cut in full mode before joining, since the generator above is already exhausted
seg_list = jieba.cut('我来到北京清华大学', cut_all=True)
print('Full mode: ' + '/'.join(seg_list))
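As a side note not in the original, jieba also offers lcut, which returns a list directly instead of a one-shot generator, so the result can be reused; a minimal sketch:

import jieba
print(jieba.lcut('我来到北京清华大学'))                 # precise mode, returns a list
print(jieba.lcut('我来到北京清华大学', cut_all=True))   # full mode, returns a list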
'''Adding your own keywords for segmentation'''
jieba.load_userdict('path to your .txt file')  # the file must be saved as UTF-8
jieba.add_word('word to add')  # or add a single word directly in code
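As an illustrative sketch (the sample term and sentence are assumptions, not from the original), registering a custom word changes how a sentence containing it is segmented:

import jieba
sentence = '我们在用云计算平台做分析'     # hypothetical example sentence
print(jieba.lcut(sentence))              # the term may be split into pieces by the default dictionary
jieba.add_word('云计算平台')             # register the term as a single word
print(jieba.lcut(sentence))              # the term should now stay in one piece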
'''Keyword extraction'''
import jieba.analyse
text = 'your text here'
# topK=5: return the top 5 keywords; withWeight=True: also return each keyword's weight
tags = jieba.analyse.extract_tags(text, topK=5, withWeight=True)
for word, weight in tags:
    print(word, weight)  # keyword and its weight
'''Part-of-speech tagging'''
import jieba.posseg as pseg
words = pseg.cut(text)
for word, flag in words:
    print(word, flag)