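- # The tokenizers and corpora used below rely on NLTK data packages; this download
- # block is an assumption about the local setup (nothing installed yet) and is safe
- # to re-run, since nltk.download() skips packages that are already present.
- import nltk
- for pkg in ['punkt', 'stopwords', 'wordnet', 'webtext', 'gutenberg']:
-     nltk.download(pkg)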
- para="hello world. it's good to see you. thanks for buying this book"
- from nltk.tokenize import sent_tokenize
- print(sent_tokenize(para))
-
- print("----------------------------")
-
- from nltk.tokenize import word_tokenize
- print(word_tokenize('hello world'))
-
- print("----------------------------")
-
- from nltk.tokenize import word_tokenize
- print(word_tokenize('你好,我是 自然 语言 处理'))
- # without the spaces here it cannot split the Chinese text; in other words, whitespace is what it splits on
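- # A quick check of the note above: with no spaces at all, the whole string should
- # come back as a single token, since word_tokenize has no Chinese word segmenter.
- print(word_tokenize('你好我是自然语言处理'))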
-
- print("----------------------------")
- import nltk
- text="hello, this is my world"
- pattern=r"\w+|[^\w\s]+"
- # r: raw-string prefix for the regular expression; double quotes "" and single quotes '' are interchangeable;
- # \w matches a word character, equivalent to the character class [a-zA-Z0-9_]; + means one or more times, equivalent to {1,}, i.e. c+ and c{1,} mean the same thing;
- # "|" is alternation, the regex "or"; [...] is a character class that matches any single character listed in it,
- # e.g. a[bcd]e matches abe, ace and ade; ^ at the start of a class negates it, so [^\w\s] matches anything that is neither a word character nor whitespace; \s matches any whitespace character, equivalent to [ \f\n\r\t\v].
- print(nltk.tokenize.regexp_tokenize(text,pattern))
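- # The same pattern can be wrapped in the RegexpTokenizer class, which keeps the
- # compiled expression around for reuse; a minimal sketch using the pattern above.
- from nltk.tokenize import RegexpTokenizer
- tokenizer=RegexpTokenizer(pattern)
- print(tokenizer.tokenize(text))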
-
- print("--------------以上均为切词的手段--------------")
-
- from nltk.tokenize import PunktSentenceTokenizer
- from nltk.corpus import webtext
- text=webtext.raw('overheard.txt')
- sent_tokenizer=PunktSentenceTokenizer(text)
- # PunktSentenceTokenizer learns sentence boundaries from the raw text it is given (unsupervised training)
-
- sents1=sent_tokenizer.tokenize(text)
- print(sents1[0])
-
- from nltk.tokenize import sent_tokenize
- sents2=sent_tokenize(text)
- print(sents2[1])
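- # Training on webtext is optional: NLTK also ships a pre-trained English Punkt model
- # that can be loaded directly (the pickle path below is the one used in the NLTK book).
- import nltk.data
- pretrained_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
- print(pretrained_tokenizer.tokenize(text)[0])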
-
- print("--------------去除停用词--------------")
- from nltk.corpus import stopwords
- english_stops=set(stopwords.words('english'))
- words=["cant","is","a","constraction"]
- sets=[]
- for word in words:
-     if word not in english_stops:
-         sets.append(word)
- print(sets)
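- # The same filter as a list comprehension; stopwords.fileids() lists the other
- # languages a stopword list is bundled for.
- print([word for word in words if word not in english_stops])
- print(stopwords.fileids())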
-
- print("--------------在WORDnet上找同义词--------------")
- # way1:
- from nltk.corpus import wordnet
- syn=wordnet.synsets('cookbook')[0]
- print(syn.name())
- print(syn.definition())
-
- # way2: without parentheses these are bound methods in NLTK 3, so the printout is the
- # method object rather than its value (name and definition were plain attributes in NLTK 2)
- print(syn.name)
- print(syn.definition)
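- # Every lemma name of a synset is a synonym; listing them for the 'cookbook' synset above.
- print([lemma.name() for lemma in syn.lemmas()])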
-
-
- print("----------------------------")
-
- from nltk.corpus import wordnet as wn
- motorcar=wn.synset('car.n.01')
- types_of_motorcar=motorcar.hyponyms()
- print(types_of_motorcar)
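- # Going up instead of down the hierarchy: hypernyms() gives the more general
- # concepts and root_hypernyms() the top of the tree.
- print(motorcar.hypernyms())
- print(motorcar.root_hypernyms())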
-
- print("-----------------部分与整体的关系-----------")
-
- print(wn.synset('computer.n.01').part_meronyms())
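- # A few more part-whole relations (examples from the NLTK book): what a tree is
- # made of, and what a tree is a member of.
- print(wn.synset('tree.n.01').substance_meronyms())
- print(wn.synset('tree.n.01').member_holonyms())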
-
- print("-------------反义词关系---------------")
- print(wn.lemma('good.a.01.good').antonyms())
-
- print("---------查看词汇关系和同义词集上定义的其他方法-------------------")
- print(dir(wn.synset('beautiful.a.01')))
-
-
- print("------------pos----------------")
- syn=wordnet.synsets('hello')[0]
- print(syn.pos())
-
- print("------------查看复数形式和同义词----------------")
- print(wn.synset('car.n.01').lemma_names())
-
- print("------------计算同义词的相似度----------------")
- # way1: path_similarity 基于上位词层次结构中相互连接的概念之间的最短路径,
- # 其值为0-1之间,如果没有路径返回-1
- right=wn.synset('right_whale.n.01')
- minke=wn.synset('minke_whale.n.01')
- print(right.path_similarity(minke))
- # way2: wup_similarity (Wu-Palmer) scores by how deep the two synsets and their common ancestor sit in the hypernym tree
- print(right.wup_similarity(minke))
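- # The same measures on a much less related pair (example pair borrowed from the
- # NLTK book) should score noticeably lower.
- novel=wn.synset('novel.n.01')
- print(right.path_similarity(novel))
- print(right.wup_similarity(novel))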
-
-
- print("------------相对于n-gram----------------")
- from nltk import bigrams
- a=r"I'm a girl"
- tokens=a.split()
- # bigrams() returns a generator, so list() is needed here or nothing useful is printed
- print(list(bigrams(tokens)))
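- # trigrams and the generic ngrams helper work the same way and also return
- # generators, hence the list() calls.
- from nltk import trigrams, ngrams
- print(list(trigrams(tokens)))
- print(list(ngrams(tokens,2)))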
-
-
- print("----------------词频统计--------------------")
-
- from nltk import FreqDist
- # passing a raw string makes FreqDist count single characters, spaces included
- fdist1=FreqDist("a ni n nn n t t m")
- print(fdist1)
- print(fdist1.most_common(3))
-
- import matplotlib    # only needed if the plot line below is uncommented
- # fdist1.plot(3,cumulative=True)
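- # Counting words instead of characters: pass a list of tokens, for example the
- # ones split off for the bigram example above.
- fdist2=FreqDist(tokens)
- print(fdist2.most_common(2))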
-
-
- print("----------------词干词语--------------------")
- # 单个词干 Poter是一种词干提取的算法
- from nltk.stem import PorterStemmer
- stemmer=PorterStemmer()
- print(stemmer.stem('coding'))
-
- # stemming several words at once
- verbs=['appears', 'appear', 'appeared', 'calling', 'called']
- stems=[]
- for verb in verbs:
-     stemmed_verb=stemmer.stem(verb)
-     stems.append(stemmed_verb)
- print(sorted((set(stems))))
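- # Other stemmers can give different (often more aggressive) stems; a quick
- # comparison with LancasterStemmer and the English SnowballStemmer.
- from nltk.stem import LancasterStemmer, SnowballStemmer
- print(LancasterStemmer().stem('appeared'))
- print(SnowballStemmer('english').stem('appeared'))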
-
- print("----------------词形还原-------------------")
- from nltk.stem import WordNetLemmatizer
- lemmatizer=WordNetLemmatizer()
- print(lemmatizer.lemmatize('coding'))
- print(lemmatizer.lemmatize('codes'))
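- # lemmatize() treats words as nouns by default; passing a part of speech changes
- # the result, e.g. 'coding' as a verb lemmatizes to 'code'.
- print(lemmatizer.lemmatize('coding', pos='v'))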
-
-
- print("----------------利用正则表达式进行词语替换词语--------------------")
-
- import re
- # the replacement strings are raw strings so that \g<1> (a backreference to group 1)
- # is not read as a string escape; matching is case-sensitive, so an upper-case "I'm" is not expanded
- replacement_patterns = [
-     (r'won\'t', 'will not'),
-     (r'can\'t', 'cannot'),
-     (r'i\'m', 'i am'),
-     (r'ain\'t', 'is not'),
-     (r'(\w+)\'ll', r'\g<1> will'),
-     (r'(\w+)n\'t', r'\g<1> not'),
-     (r'(\w+)\'ve', r'\g<1> have'),
-     (r'(\w+)\'s', r'\g<1> is'),
-     (r'(\w+)\'re', r'\g<1> are'),
-     (r'(\w+)\'d', r'\g<1> would')
- ]
-
- class RegexReplacer(object):
-     def __init__(self,patterns=replacement_patterns):
-         self.patterns=[(re.compile(regex),repl) for (regex,repl) in patterns]
-
-     def replace(self,text):
-         s=text
-         for (pattern,repl) in self.patterns:
-             s=re.sub(pattern,repl,s)
-         return s
-
- replacer=RegexReplacer()
- print(replacer.replace("You're the world, I'm a girl"))    # "I'm" stays as-is because the patterns only match lower-case "i'm"
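- # Expanding contractions before tokenization keeps forms like "cannot" intact;
- # a small sketch chaining the replacer with word_tokenize.
- print(word_tokenize(replacer.replace("can't is a contraction")))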
-
- print("----------------获取语料-------------------")
- # 语料库的文件名,平均字长,平均句长,每个词平均出现的次数
- from nltk.corpus import gutenberg
- for filename in gutenberg.fileids():
-     r=gutenberg.raw(filename)
-     w=gutenberg.words(filename)
-     s=gutenberg.sents(filename)
-     v=set(w)
-     print(filename,len(r)/len(w),len(w)/len(s),len(w)/len(v))
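- # Other bundled corpora expose the same reader interface; the webtext corpus
- # imported earlier, for example, also has fileids(), raw(), words() and sents().
- print(webtext.fileids())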
-
-
- # a plain local text file can be read directly (assumes a hello.txt exists next to this script)
- with open('hello.txt') as f:
-     print(f.read())
-
- print("----------------建立语料库,并进行检索-------------------")
- # step1:
- corps_root='E:/JustForNLP/nltkEx'    # local directory holding the plain-text files; adjust to your own path
- from nltk.corpus import PlaintextCorpusReader
- wordlist=PlaintextCorpusReader(corps_root,'walden.txt')
- print(wordlist.fileids())
-
- wordlists=PlaintextCorpusReader(corps_root,'.*')
- print(wordlists.fileids())
- import nltk
- # step2:
- n=nltk.word_tokenize(wordlists.raw(fileids="walden.txt"))
- complete_Walden=nltk.Text(n)
- complete_Walden.concordance("walden")    # concordance() prints its matches itself and returns None, so no outer print() is needed
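- # Other nltk.Text helpers work on the same object, e.g. words used in similar
- # contexts and a plain token count ("pond" is assumed to occur in walden.txt).
- complete_Walden.similar("pond")
- print(complete_Walden.count("pond"))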
-
- print("----------------获取网络文本-------------------")
- from urllib.request import urlopen
- url='https://blog.csdn.net/u011001084/article/details/78980299'
- html=urlopen(url).read()
- print(html[:20])
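- # urlopen returns raw bytes; decoding to str (assuming the page is UTF-8 encoded)
- # makes it usable with the tokenizers and string tools above.
- raw=html.decode('utf-8', errors='ignore')
- print(raw[:60])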
-
-
- print("----------------tag-------------------")
- import nltk
- nltk.download('averaged_perceptron_tagger')
-
- text=nltk.word_tokenize("I'm a small girl but the world is big")
- print(nltk.pos_tag(text))
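- # pos_tag returns (word, tag) pairs using the Penn Treebank tag set; pulling out
- # just the tags is a plain list comprehension. nltk.help.upenn_tagset('JJ') would
- # describe a tag, but needs nltk.download('tagsets') first.
- print([tag for (word, tag) in nltk.pos_tag(text)])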