Case 1:
Using Counter for word-frequency statistics is quite concise. The code is as follows:
Data: movie_comments.csv, a file of roughly 230,000 movie reviews.
- # -*- coding:utf-8 -*-
- import jieba
- import re
- import pandas as pd
- from collections import Counter
-
- content = pd.read_csv(r"movie_comments.csv")
- # print(content.head())
- articles = content['comment'].tolist()
- print(len(articles))
-
- def token(string):
-     # keep only word characters (strips punctuation and whitespace)
-     return re.findall(r'\w+', string)
- articles_clean = [''.join(token(str(s))) for s in articles]
- print(articles_clean[100])
-
- def cut_word(string):
-     return list(jieba.cut(string))
-
- articles_words = [cut_word(string) for string in articles_clean]
- list_set = []
- for i in articles_words:
-     list_set.extend(i)
-
- words_count = Counter(list_set)
- statics = words_count.most_common()[:50]
- print(statics)
Result:
[('的', 328262), ('了', 102420), ('是', 73106), ('我', 50338), ('都', 36255), ('很', 34712), ('看', 34022), ('电影', 33675), ('也', 32065), ('和', 31290), ('在', 31245), ('不', 28435), ('有', 27939), ('就', 25685), ('人', 23909), ('好', 22858), ('啊', 20803), ('这', 17484), ('还', 17449), ('一个', 17343), ('你', 17282), ('还是', 16425), ('但', 15578), ('故事', 15010), ('没有', 14343), ('就是', 14007), ('喜欢', 13566), ('让', 13304), ('太', 12676), ('又', 11566), ('剧情', 11359), ('没', 10858), ('说', 10764), ('吧', 10747), ('他', 10675), ('不错', 10416), ('得', 10349), ('到', 10341), ('给', 10300), ('这个', 10058), ('上', 10054), ('被', 9939), ('对', 9824), ('最后', 9694), ('一部', 9693), ('片子', 9590), ('什么', 9571), ('能', 9532), ('与', 9168), ('多', 8977)]
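The top of this list is dominated by function words such as 的, 了 and 是. A minimal sketch of filtering them out of the Counter result, assuming a stopwords.txt file (one stopword per line, like the one used in the later examples) and keeping only tokens longer than one character:
- stopword = set(line.rstrip() for line in open("stopwords.txt", encoding='utf-8'))
- # drop stopwords and single-character tokens before taking the top 50
- filtered_count = Counter({w: c for w, c in words_count.items()
-                           if w not in stopword and len(w) > 1})
- print(filtered_count.most_common(50))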
Case 2:
However, if the corpus is very large and your machine is underpowered, it is better to preprocess the text and save it to disk first, and only then count word frequencies. The code is as follows:
First, preprocess and save to disk:
- # -*- coding:utf-8 -*-
- import re
- import pandas as pd
-
- content = pd.read_csv(r"movie_comments.csv")
- # print(content.head())
- articles = content['comment'].tolist()
- print(len(articles))
-
- def token(string):
-     return re.findall(r'\w+', string)
-
- articles_clean = [''.join(token(str(s))) for s in articles]
- #print(articles_clean[600])
-
- with open('writedic.txt','w',encoding='utf-8') as f:
-     for line in articles_clean:
-         f.write(line+'\n')
Then tokenize and count word frequencies:
- import jieba
- from collections import Counter
-
- Token = []
- for i,lines in enumerate(open('writedic.txt',encoding='utf-8')):
-     if i%10000 == 0:
-         print(i)              # progress indicator every 10,000 lines
-     Token += jieba.cut(lines)
-
- print(Token[:10])
- words_count = Counter(Token)
- statics = words_count.most_common()[:50]
- print(statics)
The result is the same as above.
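If even the accumulated Token list is too large to keep in memory, one possible variant (a sketch, not part of the original code) is to update a Counter line by line so the tokens themselves are never stored:
- import jieba
- from collections import Counter
-
- words_count = Counter()
- with open('writedic.txt', encoding='utf-8') as f:
-     for i, line in enumerate(f):
-         if i % 10000 == 0:
-             print(i)                            # progress indicator
-         words_count.update(jieba.cut(line))     # count tokens without keeping the full list
-
- print(words_count.most_common(50))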
1. Read the file
2. Tokenize, loading a custom dictionary, removing digits and stopwords
3. Count word frequencies and sort them
The code is as follows:
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
-
- import pandas as pd
- import re
- import jieba
-
- """
- 第一步:读取文件
- """
- f1=open("write.txt", "w", encoding='utf-8')
- reviews=pd.read_excel('1.xlsx',usecols=[0],skiprow=[0],sheetname='Sheet1')#skiprow=[0]去除第一行.usecols的第一列是B列
- #打印前三行
- # for i in range(3):
- # print("Review #", i + 1)
- # print(reviews.answer[i])
- # print()
- # ## 数据预处理
- """
- 第二步:分词,加载有用词典,去数字,去停用词
- """
- # 清除不想要的单词(频率比较低的),停用词
- jieba.load_userdict("jiebauserdict.txt")
- stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]
- def clean_text(text):
-     newtext = []
-     text = re.sub(r'\d+', ' ', text)   # remove digits
-     text = re.sub(r'\n', '', text)     # remove newlines
-     text = jieba.lcut(text)            # tokenize
-     for word in text:
-         if word not in stopword:       # remove stopwords
-             newtext.append(word)
-     lineswords = ' '.join(newtext)
-     # print(lineswords)
-     return lineswords
-
- # Preprocess the content: clean every answer
- clean_content = []
- for summary in reviews.answer:
-     clean_content.append(clean_text(summary))
- print("Content cleaning is complete.")
-
- # Print the first three cleaned reviews
- # for i in range(3):
- #     print("Clean Review #", i + 1)
- #     print(clean_content[i])
- #     print()
-
- """
- Step 3: count how often each word appears in the text and sort the counts
- """
- # Count how often each word appears across all sentences
- def count_words(count_dict, text):
-     for sentence in text:
-         for word in sentence.split():
-             if word not in count_dict:
-                 count_dict[word] = 1   # first occurrence: add it to the dict
-             else:
-                 count_dict[word] += 1
-
-     # print(count_dict)
-     result = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)  # sort by frequency, descending
-     print(result)
-     for key in result:
-         # print(key[:500])
-         f1.write(str(key) + '\n')    # key is a (word, count) tuple
-         # print(key, count_dict[key])
-         # f1.write(key + ' ' + str(count_dict[key]) + '\n')
-     f1.flush()
-
- # Count the frequency of every word, e.g. {'hello': 7, 'good': 3}, and the vocabulary size
- word_counts = {}
- count_words(word_counts, clean_content)
- print("Vocabulary size:", len(word_counts))
- f1.close()
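As an aside, the hand-written count_words above can also be expressed with collections.Counter, which the earlier examples already use. A minimal sketch under the same assumptions (clean_content is the list of cleaned, space-separated strings built above):
- from collections import Counter
-
- word_counts = Counter()
- for sentence in clean_content:
-     word_counts.update(sentence.split())    # count the words of each cleaned review
-
- print("Vocabulary size:", len(word_counts))
- print(word_counts.most_common(50))          # 50 most frequent words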
A three-step routine: read the file, clean and tokenize the text, and count word frequencies:
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
-
- import re
- import jieba
- from collections import Counter
-
- stopword = [line.rstrip() for line in open("stopwords.txt", 'r', encoding='utf-8')]
-
- # Text cleaning
- def clean_text(text):
-     newtext = []
-     text = re.sub(r'\d+', ' ', text)   # remove digits
-     text = jieba.lcut(text)            # tokenize
-     for word in text:
-         if word not in stopword:       # remove stopwords
-             newtext.append(word)
-     lineswords = ' '.join(newtext)
-     return lineswords
-
-
- # Count word frequencies
- def counter_word(data):
-     lines = ''
-     for line in data:
-         lines += line + ' '        # add a separator so words at line boundaries are not merged
-     data_list = lines.split()      # split on whitespace, dropping empty strings
-     words_count = Counter(data_list)
-     # print(words_count)
-     count_res = words_count.most_common()[:50]
-     return count_res
-
-
- # Read the txt file
- def read_content():
-     data = []
-     contents = [line.strip() for line in open("langchao.txt", 'r', encoding='utf-8-sig')]
-     for content in contents:
-         text = clean_text(content)
-         data.append(text)
-     result = counter_word(data)
-     return result
-
-
- if __name__ == '__main__':
-     result = read_content()
-     print(result)
The document that is read:
- #encoding: utf-8
- from collections import Counter
- f = open("userdic.txt",'w',encoding='utf-8')
- result = []
- for line in open("writedic.txt", "r", encoding='utf-8'):
- result.append(list(line.strip().split(" ")))
-
- def get_keys(d, value):
- return [k for k, v in d.items() if v == value]
- # print(result)
- for words in result:
- words_count = Counter(words)
- for word in words_count:
- if words_count[word] > 1:
- print(word)
- f.write(word+' ')
- f.write('\n')
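Note that the file written above puts several space-separated words on each line. If the intention is to load it back with jieba.load_userdict, as in the earlier example, jieba expects one entry per line (optionally followed by a frequency and a part-of-speech tag). A minimal sketch of writing the dictionary in that format instead, using the same hypothetical file names as above:
- #encoding: utf-8
- from collections import Counter
- import jieba
-
- unique_words = set()
- for line in open("writedic.txt", "r", encoding='utf-8'):
-     counts = Counter(line.strip().split(" "))
-     unique_words.update(w for w, c in counts.items() if c > 1 and w)
-
- # one word per line: the format jieba.load_userdict expects
- with open("userdic.txt", 'w', encoding='utf-8') as f:
-     for word in unique_words:
-         f.write(word + '\n')
-
- jieba.load_userdict("userdic.txt")   # the collected words are now known to the tokenizer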
The document that is saved: