
Python Text Analysis (Intensive Reading Notes 1)


I. Text Tokenization

1. Sentence tokenization: the process of splitting a text corpus into sentences.

Sentence tokenization here uses the NLTK framework, which provides several interfaces for the task: sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer, and pretrained sentence-tokenization models (a sketch of the regex-based approach follows the example below).

  import nltk
  from pprint import pprint  # pprint works like print, but renders nested data structures more readably, one element per line
  sample_text='We will discuss briefly about the basic syntax,structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'
  # Method 1
  sample_sentences=nltk.sent_tokenize(text=sample_text)
  # Method 2
  punkt_st=nltk.tokenize.PunktSentenceTokenizer()
  sample_sentences=punkt_st.tokenize(sample_text)
  pprint(sample_sentences)
  >>>>
  ['We will discuss briefly about the basic syntax,structure and design '
   'philosophies.',
   'There is a defined hierarchical syntax for Python code which you should '
   'remember when writing code!',
   'Python is a really powerful programming language!']
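
The RegexpTokenizer interface mentioned above is not shown in the notes; here is a minimal sketch, continuing from the snippet above. The gap pattern is my own illustration (split on whitespace that follows sentence-ending punctuation), not the book's exact pattern:

  # Method 3 (sketch): with gaps=True the pattern describes the separators between sentences
  sentence_pattern=r'(?<=[.!?])\s+'
  regex_st=nltk.tokenize.RegexpTokenizer(pattern=sentence_pattern,gaps=True)
  sample_sentences=regex_st.tokenize(sample_text)
  pprint(sample_sentences)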

Note: when using the sentence tokenizers above I hit a problem with nltk.download('punkt'). I followed a CSDN post on fixing nltk.download() failures, which solved it; if you run into something similar it is worth a look: https://blog.csdn.net/lcf0000/article/details/121849782?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~aggregatepage~first_rank_ecpm_v1~rank_v31_ecpm-4-121849782.pc_agg_new_rank&utm_term=nltk.download%28punkt%29&spm=1000.2123.3001.4430
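
Independently of that fix, the NLTK data packages these notes rely on can be fetched explicitly once in an interpreter session (standard NLTK downloads; mirror/proxy workarounds are covered in the linked post):

  import nltk
  nltk.download('punkt')      # sentence/word tokenizer models used in this section
  nltk.download('stopwords')  # used in the stopword-removal section below
  nltk.download('wordnet')    # used by the repeated-character check and the lemmatizer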

2. Word tokenization: the process of splitting a sentence into its constituent words.

Still within the NLTK framework, the main interfaces are word_tokenize, TreebankWordTokenizer, RegexpTokenizer, and the tokenizers that inherit from RegexpTokenizer (a RegexpTokenizer sketch follows the example below).

  import nltk
  sentence="The brown fox wasn't that quick and he couldn't win the race"
  words=nltk.word_tokenize(sentence)
  print(words)
  treebank_wk=nltk.TreebankWordTokenizer()
  words=treebank_wk.tokenize(sentence)
  print(words)
  >>>>
  ['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']
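
The RegexpTokenizer mentioned above is pattern-driven; a minimal sketch follows (the \w+ pattern is a common choice of my own for illustration, and WordPunctTokenizer / WhitespaceTokenizer are the NLTK tokenizers derived from RegexpTokenizer):

  from nltk.tokenize import RegexpTokenizer, WordPunctTokenizer, WhitespaceTokenizer
  # pattern=r'\w+' keeps runs of word characters and drops punctuation
  regex_wt=RegexpTokenizer(pattern=r'\w+',gaps=False)
  print(regex_wt.tokenize(sentence))
  # subclasses of RegexpTokenizer with fixed built-in patterns
  print(WordPunctTokenizer().tokenize(sentence))
  print(WhitespaceTokenizer().tokenize(sentence))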

II. Text Normalization

The following code loads the basic dependencies and the corpus that will be used:

  import nltk
  import re
  import string
  from pprint import pprint
  corpus=["The brown fox wasn't that quick and couldn't win the race","Hey that's a great deal! I just bought a phone for $199", "@@You'll(learn) a **lot** in the book . Python is amazing language!@@"]

1. Text cleaning: removing irrelevant and unnecessary markup and characters (a minimal sketch is shown below).
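
The notes give no code for this step, so here is a minimal sketch, assuming HTML-like tags and extra whitespace are the noise to strip; the function name and regexes are my own illustration:

  def clean_text(text):
      text=re.sub(r'<[^>]+>',' ',text)  # drop HTML-like tags
      text=re.sub(r'\s+',' ',text)      # collapse runs of whitespace
      return text.strip()
  print(clean_text("<p>Hey   that's a <b>great</b> deal!</p>"))
  # Hey that's a great deal!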

2. Text tokenization:

  import nltk
  import re
  import string
  from pprint import pprint
  corpus=["The brown fox wasn't that quick and couldn't win the race","Hey that's a great deal! I just bought a phone for $199", "@@You'll(learn) a **lot** in the book . Python is amazing language!@@"]
  # text tokenization: split each document into sentences, then each sentence into words
  def tokenize_text(text):
      sentences=nltk.sent_tokenize(text)
      word_tokens=[nltk.word_tokenize(sentence) for sentence in sentences]
      return word_tokens
  token_list=[tokenize_text(text) for text in corpus]
  pprint(token_list)
  >>>>
  [[['The',
     'brown',
     'fox',
     'was',
     "n't",
     'that',
     'quick',
     'and',
     'could',
     "n't",
     'win',
     'the',
     'race']],
   [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
    ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
   [['@',
     '@',
     'You',
     "'ll",
     '(',
     'learn',
     ')',
     'a',
     '*',
     '*',
     'lot',
     '*',
     '*',
     'in',
     'the',
     'book',
     '.'],
    ['Python', 'is', 'amazing', 'language', '!'],
    ['@', '@']]]

3. Removing special characters

  def remove_characters_after_tokenization(tokens):
      # pattern matching any punctuation character; strip it from each token
      pattern=re.compile('[{}]'.format(re.escape(string.punctuation)))
      filtered_tokens=list(filter(None,[pattern.sub('',token) for token in tokens]))
      return filtered_tokens
  # I could not get usable output from the book's code below
  filtered_list_1=[filter(None,[remove_characters_after_tokenization(tokens) for tokens in sentence_tokens]) for sentence_tokens in token_list]
  print(filtered_list_1)
  # This is my modified version
  sentence_list=[]
  for sentence_tokens in token_list:
      for tokens in sentence_tokens:
          print(tokens)
          sentence_list.append(remove_characters_after_tokenization(tokens))
  >>>>
  # the results no longer contain special characters
  [['The',
    'brown',
    'fox',
    'was',
    'nt',
    'that',
    'quick',
    'and',
    'could',
    'nt',
    'win',
    'the',
    'race'],
   ['Hey', 'that', 's', 'a', 'great', 'deal'],
   ['I', 'just', 'bought', 'a', 'phone', 'for', '199'],
   ['You', 'll', 'learn', 'a', 'lot', 'in', 'the', 'book'],
   ['Python', 'is', 'amazing', 'language'],
   []]
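
A likely reason the book's one-liner did not produce usable output here: under Python 3, filter() returns a lazy iterator, so printing the outer list shows <filter object ...> entries instead of tokens. Wrapping each filter in list() keeps the book's structure intact (a small sketch using the same names as above):

  filtered_list_1=[list(filter(None,[remove_characters_after_tokenization(tokens) for tokens in sentence_tokens])) for sentence_tokens in token_list]
  print(filtered_list_1)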

4. Expanding contractions

  import contractions
  from contractions import CONTRACTION_MAP
  def expand_contractions(sentence,contraction_mapping):
      contractions_pattern=re.compile('({})'.format('|'.join(contraction_mapping.keys())),flags=re.IGNORECASE|re.DOTALL)
      def expand_match(contraction):
          match=contraction.group(0)
          first_char=match[0]
          expanded_contraction=contraction_mapping.get(match)\
                               if contraction_mapping.get(match)\
                               else contraction_mapping.get(match.lower())
          # keep the original casing of the first character
          expanded_contraction=first_char+expanded_contraction[1:]
          return expanded_contraction
      expanded_sentence=contractions_pattern.sub(expand_match,sentence)
      return expanded_sentence
  # expand_contractions works on raw sentence strings (the apostrophes must still be present),
  # so apply it to the original corpus rather than the already-stripped token lists
  expanded_corpus=[expand_contractions(sentence,CONTRACTION_MAP) for sentence in corpus]
  print(expanded_corpus)
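
CONTRACTION_MAP here is supplied by the contractions.py helper module that accompanies the book, not by the pip "contractions" package. If you do not have that file, a small stand-in dictionary is enough to try the function; this sample map and its entries are my own illustration, not the full mapping:

  CONTRACTION_MAP={
      "wasn't": "was not",
      "couldn't": "could not",
      "that's": "that is",
      "you'll": "you will",
  }
  print(expand_contractions(corpus[0],CONTRACTION_MAP))
  # The brown fox was not that quick and could not win the race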

5. Case conversion

  print(corpus[0].lower())
  print(corpus[0].upper())
  >>>>
  the brown fox wasn't that quick and couldn't win the race
  THE BROWN FOX WASN'T THAT QUICK AND COULDN'T WIN THE RACE

6. Removing stopwords (words that carry little or no meaning)

  def remove_stopwords(tokens):
      stopword_list=nltk.corpus.stopwords.words('english')
      filtered_tokens=[token.lower() for token in tokens if token.lower() not in stopword_list]
      return filtered_tokens
  # first split the corpus with the tokenize_text function defined earlier
  corpus_tokens=[tokenize_text(text) for text in corpus]
  filtered_list_3=[[remove_stopwords(tokens) for tokens in sentence_tokens] for sentence_tokens in corpus_tokens]
  >>>> compare the results below
  stopword_list  # all entries are lowercase
  Out[69]:
  ['i',
   'me',
   'my',
   'myself',
   'we',
   'our',
   'ours',
   'ourselves',
   'you',
   "you're",
   "you've",
   "you'll",
   ...]
  corpus_tokens
  Out[68]:
  [[['The',
     'brown',
     'fox',
     'was',
     "n't",
     'that',
     'quick',
     'and',
     'could',
     "n't",
     'win',
     'the',
     'race']],
   [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
    ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
   [['@',
     '@',
     'You',
     "'ll",
     '(',
     'learn',
     ')',
     'a',
     '*',
     '*',
     'lot',
     '*',
     '*',
     'in',
     'the',
     'book',
     '.'],
    ['Python', 'is', 'amazing', 'language', '!'],
    ['@', '@']]]
  filtered_list_3
  Out[67]:
  [[['brown', 'fox', "n't", 'quick', 'could', "n't", 'win', 'race']],
   [['hey', "'s", 'great', 'deal', '!'], ['bought', 'phone', '$', '199']],
   [['@', '@', "'ll", '(', 'learn', ')', '*', '*', 'lot', '*', '*', 'book', '.'],
    ['python', 'amazing', 'language', '!'],
    ['@', '@']]]

7. Correcting repeated characters

Informal English text often contains words with repeated characters.

  # correcting repeated characters
  import nltk
  import re
  from nltk.corpus import wordnet
  def remove_repeated_characters(tokens):
      # pattern that captures a character repeated back-to-back inside a word
      repeat_pattern=re.compile(r'(\w*)(\w)\2(\w*)')
      # substitution that drops one occurrence of the repeated character
      match_substitution=r'\1\2\3'
      def replace(old_word):
          # if the word already exists in WordNet, keep it as is
          if wordnet.synsets(old_word):
              return old_word
          new_word=repeat_pattern.sub(match_substitution,old_word)
          # keep stripping one repetition at a time until nothing changes
          return replace(new_word) if new_word!=old_word else new_word
      correct_tokens=[replace(word) for word in tokens]
      return correct_tokens
  sample_sentences="My school is reallllly amaaazningggg"
  sample_sentence=tokenize_text(sample_sentences)
  print(remove_repeated_characters(sample_sentence[0]))
  >>>>
  sample_sentence
  Out[24]: [['My', 'school', 'is', 'reallllly', 'amaaazningggg']]
  print(remove_repeated_characters(sample_sentence[0]))
  ['My', 'school', 'is', 'really', 'amazning']

8. Stemming

A stem is the base form of a word; new words are created by attaching affixes to a stem, and a stem is not necessarily a valid dictionary word. NLTK provides several implementations, each with a different algorithm: PorterStemmer, LancasterStemmer, RegexpStemmer, and SnowballStemmer (the latter three are sketched after the Porter example below).

  # stemming with PorterStemmer
  from nltk.stem import PorterStemmer
  ps=PorterStemmer()
  print(ps.stem('jumping'),ps.stem('jumps'),ps.stem('jumped'),ps.stem('lying'),ps.stem('strange'))
  >>>>
  jump jump jump lie strang
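
The other stemmers listed above are used the same way; a minimal sketch (outputs omitted; Lancaster in particular is more aggressive and often truncates harder than Porter):

  from nltk.stem import LancasterStemmer, RegexpStemmer, SnowballStemmer
  ls=LancasterStemmer()                   # aggressive rule-based stemmer
  print(ls.stem('jumping'),ls.stem('lying'))
  rs=RegexpStemmer('ing$|s$|ed$',min=4)   # strips only the suffixes you list, for words of at least min characters
  print(rs.stem('jumping'),rs.stem('lying'))
  ss=SnowballStemmer('english')           # supports several languages; 'english' selects the English variant
  print(ss.stem('jumping'),ss.stem('lying'))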

9. Lemmatization

  # lemmatization: the lemma is always a valid dictionary word
  from nltk.stem import WordNetLemmatizer
  wnl=WordNetLemmatizer()
  print(wnl.lemmatize('cars','n'))
  print(wnl.lemmatize('running','v'))
  >>>>
  car
  run
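
The second argument is the part-of-speech tag, and it matters: lemmatize() treats the word as a noun by default, so verbs and adjectives are only reduced correctly when the right tag is passed. A small sketch:

  print(wnl.lemmatize('running'))       # no pos given -> treated as a noun, stays 'running'
  print(wnl.lemmatize('running','v'))   # as a verb -> 'run'
  print(wnl.lemmatize('fancier','a'))   # as an adjective -> 'fancy'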

That covers cleaning, normalizing, and standardizing text.
