Sentence segmentation: we use the NLTK framework, which provides several interfaces for splitting text into sentences, including sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer, and pre-trained sentence tokenization models.
- import nltk
- from pprint import pprint  # pprint works like print, but prints data structures more completely, one item per line
- sample_text='We will discuss briefly about the basic syntax,structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'
- # Method 1
- sample_sentences=nltk.sent_tokenize(text=sample_text)
- # Method 2
- punkt_st=nltk.tokenize.PunktSentenceTokenizer()
- sample_sentences=punkt_st.tokenize(sample_text)
- pprint(sample_sentences)
-
- >>>>
- ['We will discuss briefly about the basic syntax,structure and design '
- 'philosophies.',
- 'There is a defined hierarchical syntax for Python code which you should '
- 'remember when writing code!',
- 'Python is a really powerful programming language!']
Note: when using the sentence tokenizers above I ran into a problem with nltk.download('punkt'). I solved it by following a blogger's post on fixing nltk.download() failures; if you hit a similar issue it is worth a look: https://blog.csdn.net/lcf0000/article/details/121849782?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~aggregatepage~first_rank_ecpm_v1~rank_v31_ecpm-4-121849782.pc_agg_new_rank&utm_term=nltk.download%28punkt%29&spm=1000.2123.3001.4430
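The remaining interfaces mentioned above work the same way. A minimal sketch, assuming an illustrative regular expression that splits on whitespace following sentence-ending punctuation, plus loading the pre-trained English Punkt model directly from the NLTK data:
- # Method 3 (sketch): RegexpTokenizer with gaps=True treats the matched pattern as the gap between sentences
- regex_st=nltk.tokenize.RegexpTokenizer(pattern=r'(?<=[.!?])\s+',gaps=True)
- pprint(regex_st.tokenize(sample_text))
- # Method 4 (sketch): load the pre-trained English Punkt model shipped with nltk_data
- punkt_model=nltk.data.load('tokenizers/punkt/english.pickle')
- pprint(punkt_model.tokenize(sample_text))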
Still within the NLTK framework, the main word tokenization interfaces are word_tokenize, TreebankWordTokenizer, RegexpTokenizer, and the tokenizers that inherit from RegexpTokenizer.
- import nltk
- sentence="The brown for wasn't that quick and he couldn't win the race"
- words=nltk.word_tokenize(sentence)
- print(words)
- treebank_wk=nltk.TreebankWordTokenizer()
- words=treebank_wk.tokenize(sentence)
- print(words)
-
- >>>>
- ['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']
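RegexpTokenizer and the tokenizers derived from it are used the same way; a minimal sketch (the r'\w+' pattern is an illustrative assumption):
- # RegexpTokenizer: keep runs of word characters as tokens
- regex_wt=nltk.tokenize.RegexpTokenizer(pattern=r'\w+',gaps=False)
- print(regex_wt.tokenize(sentence))
- # WordPunctTokenizer and WhitespaceTokenizer both inherit from RegexpTokenizer
- print(nltk.tokenize.WordPunctTokenizer().tokenize(sentence))
- print(nltk.tokenize.WhitespaceTokenizer().tokenize(sentence))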
The following code loads the basic dependencies and the corpus that will be used.
- import nltk
- import re
- import string
- from pprint import pprint
- corpus=["The brown fox wasn't that quick and couldn't win the race","Hey that's a great deal! I just bought a phone for $199", "@@You'll(learn) a **lot** in the book . Python is amazing language!@@"]
-
- # Text tokenization: split each document into sentences, then each sentence into words
- def tokenize_text(text):
-     sentences=nltk.sent_tokenize(text)
-     word_tokens=[nltk.word_tokenize(sentence) for sentence in sentences]
-     return word_tokens
- token_list=[tokenize_text(text) for text in corpus]
- pprint(token_list)
-
- >>>>
- [[['The',
- 'brown',
- 'fox',
- 'was',
- "n't",
- 'that',
- 'quick',
- 'and',
- 'could',
- "n't",
- 'win',
- 'the',
- 'race']],
- [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
- ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
- [['@',
- '@',
- 'You',
- "'ll",
- '(',
- 'learn',
- ')',
- 'a',
- '*',
- '*',
- 'lot',
- '*',
- '*',
- 'in',
- 'the',
- 'book',
- '.'],
- ['Python', 'is', 'amazing', 'language', '!'],
- ['@', '@']]]
- def remove_characters_after_tokenization(tokens):
-     pattern=re.compile('[{}]'.format(re.escape(string.punctuation)))  # matches any punctuation character
-     filtered_tokens=list(filter(None,[pattern.sub('',token) for token in tokens]))  # strip punctuation, drop empty tokens
-     return filtered_tokens
- # The book's code below did not give me the expected result
- # (in Python 3, filter() returns a lazy iterator, so printing it only shows filter objects
- # unless each one is wrapped in list())
- filtered_list_1=[filter(None,[remove_characters_after_tokenization(tokens) for tokens in sentence_tokens]) for sentence_tokens in token_list]
- print(filtered_list_1)
-
- # This is my modified version
- sentence_list=[]
- for sentence_tokens in token_list:
-     for tokens in sentence_tokens:
-         print(tokens)
-         sentence_list.append(remove_characters_after_tokenization(tokens))
- >>>>>
- # The results no longer contain any special characters
- [['The',
- 'brown',
- 'fox',
- 'was',
- 'nt',
- 'that',
- 'quick',
- 'and',
- 'could',
- 'nt',
- 'win',
- 'the',
- 'race'],
- ['Hey', 'that', 's', 'a', 'great', 'deal'],
- ['I', 'just', 'bought', 'a', 'phone', 'for', '199'],
- ['You', 'll', 'learn', 'a', 'lot', 'in', 'the', 'book'],
- ['Python', 'is', 'amazing', 'language'],
- []]
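As a sketch, the book's one-liner can also be kept while still producing printable results by dropping filter() in favour of a nested list comprehension, which mirrors the loop above and preserves the per-document grouping (filtered_list_2 is just an illustrative name):
- filtered_list_2=[[remove_characters_after_tokenization(tokens)
-                   for tokens in sentence_tokens]
-                  for sentence_tokens in token_list]
- pprint(filtered_list_2)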
- from contractions import CONTRACTION_MAP  # dictionary mapping contractions to their expanded forms
- def expand_contractions(sentence,contraction_mapping):
-     contractions_pattern=re.compile('({})'.format('|'.join(contraction_mapping.keys())),flags=re.IGNORECASE|re.DOTALL)
-     def expand_match(contraction):
-         match=contraction.group(0)
-         first_char=match[0]
-         expanded_contraction=contraction_mapping.get(match)\
-                              if contraction_mapping.get(match)\
-                              else contraction_mapping.get(match.lower())
-         expanded_contraction=first_char+expanded_contraction[1:]  # keep the original capitalization of the first character
-         return expanded_contraction
-     expanded_sentence=contractions_pattern.sub(expand_match,sentence)
-     return expanded_sentence
-
- # expand_contractions expects a string, so apply it to the raw corpus sentences rather than to the token lists
- expanded_corpus=[expand_contractions(sentence,CONTRACTION_MAP) for sentence in corpus]
- print(expanded_corpus)
- print(corpus[0].lower())
- print(corpus[0].upper())
-
- >>>>>
- the brown fox wasn't that quick and couldn't win the race
- THE BROWN FOX WASN'T THAT QUICK AND COULDN'T WIN THE RACE
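The expanded_corpus output is not shown above; as a quick sanity check on a single string (assuming CONTRACTION_MAP contains entries such as "wasn't" and "couldn't"):
- print(expand_contractions("The brown fox wasn't that quick and couldn't win the race",CONTRACTION_MAP))
- # expected: The brown fox was not that quick and could not win the race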
- def remove_stopwords(tokens):
-     stopword_list=nltk.corpus.stopwords.words('english')
-     filtered_tokens=[token.lower() for token in tokens if token.lower() not in stopword_list]
-     return filtered_tokens
-
- corpus_tokens=[tokenize_text(text) for text in corpus]  # first split the corpus with the tokenize_text function defined earlier
- filted_list_3=[[remove_stopwords(tokens) for tokens in sentence_tokens] for sentence_tokens in corpus_tokens]
-
- >>>>>> Compare the results below
- stopword_list  # all stopwords are lowercase
- Out[69]:
- ['i',
- 'me',
- 'my',
- 'myself',
- 'we',
- 'our',
- 'ours',
- 'ourselves',
- 'you',
- "you're",
- "you've",
- "you'll",
- corpus_tokens
- Out[68]:
- [[['The',
- 'brown',
- 'fox',
- 'was',
- "n't",
- 'that',
- 'quick',
- 'and',
- 'could',
- "n't",
- 'win',
- 'the',
- 'race']],
- [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
- ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
- [['@',
- '@',
- 'You',
- "'ll",
- '(',
- 'learn',
- ')',
- 'a',
- '*',
- '*',
- 'lot',
- '*',
- '*',
- 'in',
- 'the',
- 'book',
- '.'],
- ['Python', 'is', 'amazing', 'language', '!'],
- ['@', '@']]]
-
- filted_list_3
- Out[67]:
- [[['brown', 'fox', "n't", 'quick', 'could', "n't", 'win', 'race']],
- [['hey', "'s", 'great', 'deal', '!'], ['bought', 'phone', '$', '199']],
- [['@', '@', "'ll", '(', 'learn', ')', '*', '*', 'lot', '*', '*', 'book', '.'],
- ['python', 'amazing', 'language', '!'],
- ['@', '@']]]
Informal English often contains words with repeated characters.
- # Correct repeated characters
- import nltk
- import re
- from nltk.corpus import wordnet
- def remove_repeated_characters(tokens):
-     repeat_pattern=re.compile(r'(\w*)(\w)\2(\w*)')  # matches a doubled character anywhere inside a word
-     match_substitution=r'\1\2\3'  # the substitution drops one of the two repeated characters
-     def replace(old_word):
-         if wordnet.synsets(old_word):
-             return old_word  # if the word exists in WordNet, keep it as-is
-         new_word=repeat_pattern.sub(match_substitution,old_word)
-         return replace(new_word) if new_word!=old_word else new_word  # keep removing repeats until nothing changes
-     correct_tokens=[replace(word) for word in tokens]
-     return correct_tokens
-
- sample_sentences="My school is reallllly amaaazningggg"
- sample_sentence=tokenize_text(sample_sentences)
- print(remove_repeated_characters(sample_sentence[0]))
- >>>>>>>>
-
- sample_sentence
- Out[24]: [['My', 'school', 'is', 'reallllly', 'amaaazningggg']]
-
- print(remove_repeated_characters(sample_sentence[0]))
- ['My', 'school', 'is', 'really', 'amazning']
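'reallllly' is fully corrected, but 'amazning' is not: because of the misspelling it never matches a WordNet entry at any step, so the recursion only stops once no doubled characters remain. A quick check:
- print(bool(wordnet.synsets('amazning')))  # False: not a valid word, so repeats keep being stripped
- print(bool(wordnet.synsets('amazing')))   # True: a correctly spelled word would be kept immediately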
A stem is the base form of a word; new words are created by attaching affixes to the stem, and a stem is not necessarily a valid dictionary word. The nltk package provides several stemmers, each implementing a different algorithm: PorterStemmer, LancasterStemmer, RegexpStemmer, and SnowballStemmer.
- # Stemming with PorterStemmer
- from nltk.stem import PorterStemmer
- ps=PorterStemmer()
- print(ps.stem('jumping'),ps.stem('jumps'),ps.stem('jumped'),ps.stem('lying'),ps.stem('strange'))
- >>>>>>
- jump jump jump lie strang
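A minimal sketch of the other stemmers listed above (the RegexpStemmer suffix pattern and minimum length are illustrative assumptions):
- from nltk.stem import LancasterStemmer,RegexpStemmer,SnowballStemmer
- ls=LancasterStemmer()                   # more aggressive than Porter
- print(ls.stem('jumping'),ls.stem('lying'),ls.stem('strange'))
- rs=RegexpStemmer('ing$|s$|ed$',min=4)   # strips any suffix matching the regex
- print(rs.stem('jumping'),rs.stem('lying'),rs.stem('strange'))
- ss=SnowballStemmer('english')           # Porter2; also supports other languages
- print(ss.stem('jumping'),ss.stem('lying'),ss.stem('strange'))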
- # Lemmatization: a lemma is always a word found in the dictionary
- from nltk.stem import WordNetLemmatizer
- wnl=WordNetLemmatizer()
- print(wnl.lemmatize('cars','n'))
- print(wnl.lemmatize('running','v'))
- >>>>>>
- car
- run
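The part-of-speech argument matters; without it, lemmatize() treats every word as a noun:
- print(wnl.lemmatize('running'))  # 'running' -- treated as a noun, so it is left unchanged
- print(wnl.lemmatize('ate','v'))  # 'eat'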
That covers processing, normalizing, and standardizing text.