赞
踩
import nltk
import re
import string
from pprint import pprint
# Sample corpus: three short English sentences used by every demo below.
# NOTE(review): the strings contain OCR noise carried over from the book
# scan (e.g. "ο1earn", "languagel@@", the truncated "win the ra") — kept
# as-is because the pasted outputs further down were produced from them.
sample_corpus = ["The brown fox wasn't that quick and he couldn't win the ra",
"Hey that's a great deal! 1 just bought a phone for $199",
"Ill@You'l1 (ο1earn) in the book. Python is an amazing languagel@@"]
我们要使用或分析的文本数据都包含大量无关和不必要的标识和字符,在进行其他操作(如切分和其他规范化操作)之前,应该先删除它们。这包括从如 HTML 之类的数据源中提取有意义的文本,数据源中可能包含不必要的 HTML 标记,甚至是来自 XML 和 JSON feed 的数据。解析并清洗这些数据的方法很多,以删除不必要的标签:你可以使用 nltk 的 clean_html() 函数,甚至是 BeautifulSoup 库来解析 HTML 数据。你还可以使用自定义的逻辑,包括正则表达式、xpath 和 lxml 库来解析 XML 数据。从 JSON 获取数据较为容易,因为它具有明确的键值对结构。
通常完成文本清洗之后,进行文本切分操作
文本切分和删除多余字符的顺序取决于你要解决的问题和你正在处理的数据
def tokenize_text(text):
    """Split *text* into sentences, then each sentence into word tokens.

    Returns a list of token lists, one inner list per sentence.
    """
    return [
        nltk.word_tokenize(sent)
        for sent in nltk.sent_tokenize(text)
    ]
# Tokenize every document in the corpus and pretty-print the nested result
# (documents -> sentences -> tokens).
token_list = [tokenize_text(document) for document in sample_corpus]
pprint(token_list)
[[['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'ra']], [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'], ['1', 'just', 'bought', 'a', 'phone', 'for', '$', '199']], [['Ill', '@', "You'l1", '(', 'ο1earn', ')', 'in', 'the', 'book', '.'], ['Python', 'is', 'an', 'amazing', 'languagel', '@', '@']]]
以下代码段演示了如何在切分之后删除特殊字符:
# 这里使用的是 string.punctuation 属性,它由所有可能的特殊字符/符号组成,并从中创建一个正则表达式模式.
# 我们使用它来匹配并删除符号和字符标识
# Compiled once at module level instead of on every call: a character class
# matching every ASCII punctuation symbol (string.punctuation), with regex
# metacharacters escaped.
_PUNCT_PATTERN = re.compile('[{}]'.format(re.escape(string.punctuation)))


def remove_characters_after_tokenization(tokens):
    """Strip ASCII punctuation characters from each token.

    Tokens that become empty after stripping (e.g. a lone "!") are dropped.

    Args:
        tokens: iterable of string tokens.
    Returns:
        list of non-empty tokens with punctuation removed.
    """
    return [stripped
            for stripped in (_PUNCT_PATTERN.sub('', token) for token in tokens)
            if stripped]
# Remove punctuation from every sentence of every document, then drop
# any sentence that ends up empty.
filtered_list_l = [
    list(filter(None, (remove_characters_after_tokenization(tokens)
                       for tokens in sentence_tokens)))
    for sentence_tokens in token_list
]
pprint(filtered_list_l)
[[['The', 'brown', 'fox', 'was', 'nt', 'that', 'quick', 'and', 'he', 'could', 'nt', 'win', 'the', 'ra']], [['Hey', 'that', 's', 'a', 'great', 'deal'], ['1', 'just', 'bought', 'a', 'phone', 'for', '199']], [['Ill', 'Youl1', 'ο1earn', 'in', 'the', 'book'], ['Python', 'is', 'an', 'amazing', 'languagel']]]
一种是删除所有特殊字符(通过只保留空格、数字、字母),一种是保留撇号和句号(通过删除一些特殊字符)
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    """Remove special characters from *sentence* before tokenization.

    Args:
        sentence: raw input string (leading/trailing whitespace is stripped).
        keep_apostrophes: when True, only a small set of symbols
            (? $ & * % @ ( ) -) is removed, so apostrophes and sentence
            punctuation survive; when False, everything except
            alphanumerics and spaces is stripped.
    Returns:
        the filtered string.
    """
    sentence = sentence.strip()
    if keep_apostrophes:
        # BUG FIX: the original pattern r'[?|$|&|*|%|@|(|)|-]' placed '|'
        # inside a character class, where it matches a literal pipe — so
        # '|' characters were silently removed as well. '-' is escaped to
        # keep it a literal hyphen, not a range.
        pattern = r'[?$&*%@()\-]'  # extend this class to drop more symbols
    else:
        pattern = r'[^a-zA-Z0-9 ]'  # keep only alphanumerics and spaces
    return re.sub(pattern, '', sentence)
# Clean the raw corpus in strict mode (alphanumerics and spaces only).
filtered_list_2 = [
    remove_characters_before_tokenization(doc, keep_apostrophes=False)
    for doc in sample_corpus
]
pprint(filtered_list_2)
['The brown fox wasnt that quick and he couldnt win the ra',
'Hey thats a great deal 1 just bought a phone for 199',
'IllYoul1 1earn in the book Python is an amazing languagel']
# 小写: str.lower()
# 大写: str.upper()
每个领域或语言可能都有一系列独有的停用词 。
# Demo: collapse runs of repeated letters one character at a time.
# (\w*)(\w)\2(\w*) matches a doubled character: \2 is a backreference to
# group 2, so group2 + \2 is the same letter twice. The substitution
# \1\2\3 keeps the three captured groups and drops the duplicate letter.
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
old_word = 'finalllyyy'
step = 1
while True:
    new_word = repeat_pattern.sub(match_substitution, old_word)
    if new_word == old_word:
        # No doubled letter left — done.
        print("Final word: {}".format(new_word))
        break
    print("Step: {}, word: {}".format(step, new_word))
    step += 1
    old_word = new_word
Step: 1, word: finalllyy
Step: 2, word: finallly
Step: 3, word: finally
Step: 4, word: finaly
Final word: finaly
from nltk.corpus import wordnet


def remove_repeated_characters(tokens):
    """Collapse repeated characters in each token until it is a known word.

    A token is accepted as soon as WordNet has a synset for it
    (i.e. it carries some meaning); otherwise one duplicated character is
    removed and the check repeats recursively. A token with no duplicates
    left is returned unchanged.
    """
    doubled = re.compile(r'(\w*)(\w)\2(\w*)')
    keep_one = r'\1\2\3'

    def replace(word):
        # wordnet.synsets returns all senses of a word (including
        # inflected forms); non-empty means the word is meaningful.
        if wordnet.synsets(word):
            return word
        shorter = doubled.sub(keep_one, word)
        if shorter == word:
            # Nothing more to collapse — give up and return as-is.
            return word
        return replace(shorter)

    return [replace(word) for word in tokens]
# Demo: fix repeated letters in a noisy sentence, e.g. 'myy' -> 'my'.
# tokenize_text returns one token list per sentence; [0] takes the
# single sentence here.
sample_sentence= 'myy schooool is reallllyyy amaaazingggg'
sample_sentence_tokens = tokenize_text(sample_sentence)[0]
remove_repeated_characters(sample_sentence_tokens)
# Spell checking with PyEnchant (third-party): check() tests whether the
# word exists in the en_US dictionary, suggest() proposes corrections.
import enchant
d = enchant.Dict("en_US")
print(d.check("folat"))
print(d.suggest("folat"))
False
['float', 'flat', 'fol at']
# 波特词干提取器
# Porter stemmer: the classic rule-based stemmer from NLTK.
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# BUG FIX: the last call originally used `ss`, which is only defined in
# the Snowball section further down — a NameError when the script runs
# top to bottom. Use this section's Porter stemmer `ps` instead.
print(ps.stem("jumping"), ps.stem("jumps"), ps.stem("lying"), ps.stem("strange"), ps.stem('ate'))
jump jump lie strang ate
# 兰卡斯特词干提取器
# Lancaster stemmer: more aggressive than Porter.
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
# BUG FIX: the last call originally used `ss` (defined only later, in the
# Snowball section) — a NameError when run top to bottom. Stem 'ate' with
# this section's Lancaster stemmer `ls` like the other words on the line.
print(ls.stem("jumping"), ls.stem("jumps"), ls.stem("lying"), ls.stem("strange"), ls.stem('ate'))
jump jump lying strange ate
# SnowballStemmer提取器
# Snowball ("Porter2") stemmer: an improved Porter variant that also
# supports languages other than English.
from nltk.stem import SnowballStemmer
ss = SnowballStemmer(language='english')
print(ss.stem("jumping"), ss.stem("jumps"), ss.stem("lying"), ss.stem("strange"), ss.stem('ate'))
jump jump lie strang ate
# Lemmatization with WordNet: unlike stemming it returns real dictionary
# words, but it needs the correct part-of-speech tag:
# 'n' = noun, 'v' = verb, 'a' = adjective.
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
print(wnl.lemmatize('cars', 'n'), wnl.lemmatize('men', 'n'))
print(wnl.lemmatize('running' , 'v'), wnl.lemmatize('ate', 'v'))
print(wnl.lemmatize('saddest', 'a'), wnl.lemmatize('fancier', 'a'))
car men
run eat
sad fancy
词性特别重要,如果词性给错则词形还原会失效,如下
# With the wrong POS tag ('n' instead of 'v') lemmatization fails and the
# words come back unchanged: "running ate".
print(wnl.lemmatize('running' , 'n'), wnl.lemmatize('ate', 'n'))
running ate
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。