
Useful code from Python Text Processing with NLTK 2.0 Cookbook


If anyone has this book, could you share the latest edition?

para = "hello world. it's good to see you. thanks for buying this book"
from nltk.tokenize import sent_tokenize
print(sent_tokenize(para))
print("----------------------------")
from nltk.tokenize import word_tokenize
print(word_tokenize('hello world'))
print("----------------------------")
from nltk.tokenize import word_tokenize
print(word_tokenize('你好,我是 自然 语言 处理'))
# Without the spaces this Chinese string would not be split; word_tokenize falls back on whitespace here.
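# Additional sketch (not in the original post): for English text, word_tokenize also
# separates punctuation and splits contractions into their own tokens.
print(word_tokenize("Hello, world! It's good to see you."))
# expected: ['Hello', ',', 'world', '!', 'It', "'s", 'good', 'to', 'see', 'you', '.']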
print("----------------------------")
import nltk
text = "hello, this is my world"
pattern = r"\w+|[^\w\s]+"
# r: raw string for a regular expression; double quotes "" could equally be single quotes ''.
# \w matches a word character, equivalent to the character class [a-zA-Z0-9_]; + means one or
# more occurrences, i.e. c+ and c{1,} mean the same thing.
# "|" is the regex "or" (match either alternative). [...] is a character class: that position
# can be any character in the set, e.g. a[bcd]e matches abe, ace and ade. Inside a character
# class, a leading ^ negates it, so [^\w\s] matches anything that is neither a word character
# nor whitespace. \s matches a single whitespace character, equivalent to [ \f\n\r\t\v].
print(nltk.tokenize.regexp_tokenize(text, pattern))
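# Additional sketch: the same pattern can be wrapped in a reusable RegexpTokenizer object,
# which is what regexp_tokenize() constructs internally.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\w+|[^\w\s]+")
print(tokenizer.tokenize(text))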
print("-------------- all of the above are tokenization methods --------------")
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)
# print(sent_tokenizer=PunktSentenceTokenizer(text))
sents1 = sent_tokenizer.tokenize(text)
print(sents1[0])
from nltk.tokenize import sent_tokenize
sents2 = sent_tokenize(text)
print(sents2[1])
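# Additional sketch: rather than training PunktSentenceTokenizer on your own text, you can
# load the pretrained English model that ships with the punkt data package.
import nltk.data
pretrained_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(pretrained_tokenizer.tokenize(text)[0])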
print("-------------- removing stopwords --------------")
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = ["cant", "is", "a", "constraction"]
sets = []
for word in words:
    if word not in english_stops:
        sets.append(word)
print(sets)
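# Additional sketch: the same filtering as a list comprehension; stopword lists are also
# available for other languages (see stopwords.fileids()).
print([word for word in words if word not in english_stops])
print(stopwords.fileids()[:5])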
print("-------------- looking up synonyms in WordNet --------------")
# way1:
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())
# way2: without the parentheses (in NLTK 3 name and definition are methods, so this
# prints the bound method objects rather than the strings)
print(syn.name)
print(syn.definition)
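# Additional sketch: a synset also exposes usage examples and its hypernyms (more general concepts).
print(syn.examples())
print(syn.hypernyms())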
print("----------------------------")
from nltk.corpus import wordnet as wn
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(types_of_motorcar)
print("----------------- part-whole relations -----------")
print(wn.synset('computer.n.01').part_meronyms())
print("------------- antonyms ---------------")
print(wn.lemma('good.a.01.good').antonyms())
print("--------- other lexical relations and methods defined on a synset -------------------")
print(dir(wn.synset('beautiful.a.01')))
print("------------ pos ----------------")
syn = wordnet.synsets('hello')[0]
print(syn.pos())
print("------------ lemma names (synonyms) in a synset ----------------")
print(wn.synset('car.n.01').lemma_names())
print("------------ computing synset similarity ----------------")
# way1: path_similarity scores are based on the shortest path between the two concepts in the
# hypernym hierarchy; the value lies between 0 and 1, and None is returned if there is no path
# (older NLTK versions returned -1).
right = wn.synset('right_whale.n.01')
minke = wn.synset('minke_whale.n.01')
print(right.path_similarity(minke))
# way2: wup_similarity is based on where the synsets sit in the hypernym tree
print(right.wup_similarity(minke))
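# Additional sketch (assumes 'tortoise.n.01' exists in WordNet, as in the NLTK book examples):
# the scores drop for concepts that sit farther apart in the hierarchy.
tortoise = wn.synset('tortoise.n.01')
print(right.path_similarity(tortoise))
print(right.wup_similarity(tortoise))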
print("------------ n-grams ----------------")
from nltk import bigrams
a = r"I'm a girl"
tokens = a.split()
# bigrams() returns a generator, so wrap it in list() or nothing useful is printed
print(list(bigrams(tokens)))
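# Additional sketch: trigrams and arbitrary n-grams work the same way.
from nltk import trigrams, ngrams
more_tokens = "I am a small girl".split()
print(list(trigrams(more_tokens)))
print(list(ngrams(more_tokens, 4)))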
print("---------------- frequency counts --------------------")
from nltk import FreqDist
# FreqDist over a string counts characters, so the spaces are counted too
fdist1 = FreqDist("a ni n nn n t t m")
print(fdist1)
print(fdist1.most_common(3))
import matplotlib
# fdist1.plot(3,cumulative=True)
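# Additional sketch: FreqDist over a token list counts words rather than characters.
tokens_for_freq = word_tokenize("the quick brown fox jumps over the lazy dog and the cat")
fdist2 = FreqDist(tokens_for_freq)
print(fdist2.most_common(2))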
print("---------------- stemming --------------------")
# stemming a single word; Porter is one stemming algorithm
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem('coding'))
# stemming several words
verbs = ['appears', 'appear', 'appeared', 'calling', 'called']
stems = []
for verb in verbs:
    stemmed_verb = stemmer.stem(verb)
    stems.append(stemmed_verb)
print(sorted(set(stems)))
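# Additional sketch: NLTK ships several stemmers; Lancaster is more aggressive than Porter,
# and Snowball supports multiple languages.
from nltk.stem import LancasterStemmer, SnowballStemmer
print(LancasterStemmer().stem('appeared'))
print(SnowballStemmer('english').stem('appeared'))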
print("---------------- lemmatization -------------------")
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('coding'))
print(lemmatizer.lemmatize('codes'))
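# Additional sketch: lemmatize() treats words as nouns by default; passing the part of
# speech changes the result (the cookbook's example: 'cooking' as a verb becomes 'cook').
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', pos='v'))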
print("---------------- replacing words with regular expressions --------------------")
import re

replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]

class RegexReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s

replacer = RegexReplacer()
# note that the patterns are case-sensitive, so "I'm" below is left unchanged
print(replacer.replace("You're the world, I'm a girl"))
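# Additional sketch: the point of this recipe is to run the replacer before tokenization,
# so that contractions are expanded instead of being split into odd tokens.
print(word_tokenize(replacer.replace("can't is a contraction")))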
print("---------------- loading corpora -------------------")
# for each Gutenberg file: file name, average word length, average sentence length,
# and how many times each word appears on average (number of words / vocabulary size)
from nltk.corpus import gutenberg
for filename in gutenberg.fileids():
    r = gutenberg.raw(filename)
    w = gutenberg.words(filename)
    s = gutenberg.sents(filename)
    v = set(w)
    print(filename, len(r) / len(w), len(w) / len(s), len(w) / len(v))
f = open('hello.txt')
print(f.read())
print("---------------- building a corpus and searching it -------------------")
# step1:
corps_root = 'E:/JustForNLP/nltkEx'
from nltk.corpus import PlaintextCorpusReader
wordlist = PlaintextCorpusReader(corps_root, 'walden.txt')
print(wordlist.fileids())
wordlists = PlaintextCorpusReader(corps_root, '.*')
print(wordlists.fileids())
import nltk
# step2:
n = nltk.word_tokenize(wordlists.raw(fileids="walden.txt"))
complete_Walden = nltk.Text(n)
print(complete_Walden.concordance("walden"))
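# Additional sketch: an nltk.Text object also supports count() and similar() lookups
# (similar() prints words that appear in similar contexts, assuming "pond" occurs in the text).
print(complete_Walden.count("walden"))
complete_Walden.similar("pond")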
print("---------------- fetching text from the web -------------------")
from urllib.request import urlopen
url = 'https://blog.csdn.net/u011001084/article/details/78980299'
html = urlopen(url).read()
print(html[:20])
print("---------------- tag -------------------")
import nltk
nltk.download('averaged_perceptron_tagger')
text = nltk.word_tokenize("I'm a small girl but the world is big")
print(nltk.pos_tag(text))
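# Additional sketch (the helper name is my own): mapping Penn Treebank tags onto WordNet POS
# constants lets the lemmatizer use the right part of speech for each token.
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

print([(word, lemmatizer.lemmatize(word, penn_to_wordnet(tag)))
       for word, tag in nltk.pos_tag(text)])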

 
