Common preprocessing tasks include case conversion, removal of stop words and punctuation, stripping of URLs, mentions, hashtags, emojis, and non-ASCII characters, spelling correction, tokenization, lemmatization, and stemming.
Libraries such as NLTK and SpaCy provide built-in text preprocessing functionality, as the short sketch below illustrates.
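For example, here is a minimal sketch of spaCy's built-in per-token preprocessing attributes. It assumes the small English model has been installed with "python -m spacy download en_core_web_sm"; any installed pipeline would work the same way.

import spacy

# Load the small English pipeline (assumes the model is installed)
nlp = spacy.load('en_core_web_sm')
doc = nlp("Shoppers are stocking up on supplies!")
for token in doc:
    # Each token exposes its lemma, a stop-word flag, and a punctuation flag
    print(token.text, token.lemma_, token.is_stop, token.is_punct)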
Download the dataset to your computer and load it into a pandas dataframe. When using read_csv(), pass encoding='latin-1'. The dataset has many columns, but for this article on text preprocessing we only need the raw tweet column (OriginalTweet).
# Read the dataset into a dataframe
import pandas as pd
train_data = pd.read_csv('Corona_NLP_train.csv', encoding='latin-1')
train_data.head()

# Remove the columns not relevant to the text-preprocessing task
train_data = train_data.drop(['UserName', 'ScreenName', 'Location', 'TweetAt', 'Sentiment'], axis=1)
train_data.columns
# 1. Case conversion to lower case
train_data['OriginalTweet'] = train_data['OriginalTweet'].str.lower()
train_data.head()
# Remove stop words and punctuation marks
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')  # fetch the stop-word list if not already present
stop_words = stopwords.words('english')
stopwordsandpunct = stop_words + list(string.punctuation)

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(
    lambda tweet: ' '.join(w for w in tweet.split() if w not in stopwordsandpunct))
train_data['OriginalTweet']
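Note that splitting on whitespace only drops tokens that are exactly a stop word or a single punctuation character; punctuation glued to a word (e.g. "supplies!") survives. A minimal alternative sketch using NLTK's word_tokenize, which splits punctuation into separate tokens first (it assumes the tokenizer data has been downloaded):

from nltk import word_tokenize
nltk.download('punkt')  # tokenizer models (newer NLTK versions may ask for 'punkt_tab')

def drop_stopwords(tweet):
    # word_tokenize separates attached punctuation into its own tokens
    return ' '.join(w for w in word_tokenize(tweet) if w not in stopwordsandpunct)

# e.g. train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(drop_stopwords)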
# Remove URLs from all the tweets
import re
def remove_url(tweet):
    tweet = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
    return tweet

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(remove_url)
train_data['OriginalTweet'].head()
# Remove mentions and hashtags
def remove_mentions_hashs(tweet):
    tweet = re.sub("@[A-Za-z0-9_]+", "", tweet)  # Remove mentions
    tweet = re.sub("#[A-Za-z0-9_]+", "", tweet)  # Remove hashtags
    return tweet

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(remove_mentions_hashs)
train_data['OriginalTweet'].head()
# Removing emojis from tweets
# Source credit: https://stackoverflow.com/a/49146722/330558
def remove_emojis(tweet):
    pat = re.compile("["
                     u"\U0001F600-\U0001F64F"  # emoticons
                     u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                     u"\U0001F680-\U0001F6FF"  # transport & map symbols
                     u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                     u"\U00002702-\U000027B0"  # dingbats
                     u"\U000024C2-\U0001F251"  # enclosed characters and other symbols
                     "]+", flags=re.UNICODE)
    return pat.sub(r'', tweet)

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(remove_emojis)
train_data.head()
# Remove non-ASCII characters
# https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
import unicodedata
def remove_nonascii(text):
    # Apply compatibility decomposition, then drop anything outside ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return text

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(remove_nonascii)
train_data.head()
# Drop tweets that became empty (or whitespace-only) after cleaning
train_data['OriginalTweet'] = train_data['OriginalTweet'].str.strip()
train_data = train_data[train_data['OriginalTweet'] != '']

# Now reset the index of the dataframe
train_data = train_data.reset_index(drop=True)
# Spelling correction
# Note: TextBlob's correct() is slow, so expect this step to take a while on the full dataset
import warnings
warnings.filterwarnings("ignore")
from textblob import TextBlob
train_data['SpellCorrectedTweet'] = train_data['OriginalTweet'].apply(lambda x: str(TextBlob(x).correct()))
train_data.head()
# Now we will perform tokenization on whitespace
import nltk
tokenizer = nltk.tokenize.WhitespaceTokenizer()
def tokenize(text):
    return tokenizer.tokenize(text)

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(tokenize)
train_data['OriginalTweet'].head()
# Lemmatization
nltk.download('wordnet')  # fetch the WordNet data used by the lemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(text):
    return [lemmatizer.lemmatize(w) for w in text]

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(lemmatize)
train_data.head()
# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stemming(text):
    return [stemmer.stem(w) for w in text]

train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(stemming)
train_data.head()
# Counting the most frequent words in tweets
# https://docs.python.org/3/library/itertools.html#itertools.chain
import itertools
import collections
all_tweets = list(train_data["OriginalTweet"])
all_tokens = list(itertools.chain(*all_tweets))
token_counts = collections.Counter(all_tokens)

# Print the 10 most common words with their frequencies
print(token_counts.most_common(10))

# Convert the words and frequencies above to a dataframe
df = pd.DataFrame(token_counts.most_common(20), columns=['Token', 'Count'])
df.head()

# Plotting frequencies using a Matplotlib bar plot
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12, 8)
df.sort_values(by='Count').plot.bar(x='Token', y='Count')
plt.title('Most Used Words')
plt.show()
This article has covered most of the common text preprocessing methods. In a real text-processing workflow, selected steps can be extracted and combined into a single pipeline (a minimal sketch follows), and more unusual inputs can be handled with additional task-specific steps.
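For example, one such combined pipeline could reuse the helper functions defined above; the step ordering shown here is one reasonable choice, not the only one:

def preprocess(tweet):
    # Chain the cleaning steps from this article into one function
    tweet = tweet.lower()
    tweet = remove_url(tweet)
    tweet = remove_mentions_hashs(tweet)
    tweet = remove_emojis(tweet)
    tweet = remove_nonascii(tweet).strip()
    # Drop stop words and stand-alone punctuation, then tokenize
    tweet = ' '.join(w for w in tweet.split() if w not in stopwordsandpunct)
    return tokenize(tweet)

# Applied to the raw text column of a fresh copy of the data, e.g.:
# train_data['CleanTweet'] = train_data['OriginalTweet'].apply(preprocess)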
The next article will introduce text representation techniques.