赞
踩
# Load the review spreadsheet and pick the 'review_revs' column.
import pandas as pd

data = pd.read_excel(r'D:\python\zxzy\amazon_asin\review.xlsx')
# Drop missing cells up front: the str() conversion applied during sentence
# splitting would otherwise turn every NaN into the literal token "nan" and
# pollute the word statistics.
title = data['review_revs'].dropna()
data.head(1)  # notebook-style preview of the first row; no effect in a plain script
# Sentence splitting: build one list of sentences per review.
import nltk
from nltk.tokenize import sent_tokenize

# str() lets non-string cells pass through the tokenizer as text.
sent = [sent_tokenize(str(review)) for review in title]

sent[:3]  # notebook-style preview of the first three reviews
# Word tokenization: flatten every sentence of every review into one token list.
from nltk.tokenize import word_tokenize

words = [
    token
    for review_sentences in sent
    for sentence in review_sentences
    for token in word_tokenize(sentence)
]

words[:5]  # notebook-style preview of the first five tokens
# Lowercase every token so later matching and counting ignore case.
words_lower = list(map(str.lower, words))
# Remove punctuation tokens and English stop words.
from nltk.corpus import stopwords

english_stopwords = stopwords.words("english")
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*', '...', "'s"]
# The two names above stay plain lists; membership is tested against a single
# combined set so each token costs one hash lookup instead of two list scans.
excluded_tokens = set(english_stopwords).union(english_punctuations)
words_clear = [token for token in words_lower if token not in excluded_tokens]
print("/".join(words_clear[0:10]))
# Stemming: reduce each remaining token to its Porter stem.
from nltk.stem.porter import PorterStemmer

st = PorterStemmer()
words_stem = list(map(st.stem, words_clear))
print("/".join(words_stem[:10]))
# Corpus exploration on the stemmed tokens.
from collections import Counter
from nltk.text import Text

# Wrap the stems in an nltk Text object to use its exploration helpers.
word_text = Text(words_stem)

# Print the fixed word pairs (collocations) that recur in the review text.
word_text.collocations(num=50, window_size=2)

# Tally token frequencies with Counter and look at the 20 most frequent words.
words_counter = Counter(word_text)
words_counter.most_common(20)  # notebook-style display; no effect in a plain script

# Save the words_counter result (disabled).
# fo = open(r'D:\python\zxzy\amazon_asin\tshirt.txt', "w")
# for i in words_counter:
#     fo.write(i + ":" + str(words_counter[i]) + "\n")
# fo.close()

# Viewing a high-frequency word in its surrounding context shows what the
# reviews actually say about it. To read an original review directly, look it
# up by index instead.
word_text.concordance("dress", lines=10)
# Part-of-speech tagging: collect adjectives (ADJ) and nouns (NOUN).
from nltk.tag import pos_tag

ADJ = []
NOUN = []
# Tag the unstemmed tokens rather than the stems: Porter stems are often not
# real words, which degrades tagging accuracy. words_stem holds exactly one
# stem per entry of words_clear, in the same order, so each tag is paired with
# its stem and ADJ/NOUN still contain stems.
tagged_tokens = pos_tag(words_clear, tagset="universal")
for stem, (_, tag) in zip(words_stem, tagged_tokens):
    if tag == "ADJ":
        ADJ.append(stem)
    elif tag == "NOUN":
        NOUN.append(stem)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。