赞
踩
关注公众号『AI学习星球』,回复“扒一扒蔡徐坤”即可获取数据下载。
如需论文辅导或算法学习,可以通过公众号联系我。
本项目随机抓取了蔡徐坤一条转发量超过100万的微博《再见,“任性的”千千…》的10万条转发数据,分析其中真假转发流量的比例以及真假粉丝的用户画像,并据此说明真假流量各占多少、假流量粉丝是如何被生产出来的。
导入模块
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from pandas.io.json import json_normalize
plt.style.use('ggplot')
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['PingFang'] #解决seaborn中文字体显示问题
plt.rc('figure', figsize=(10, 10)) #把plt默认的图片size调大一点
plt.rcParams["figure.dpi"] =mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
%matplotlib inline
data = pd.read_csv('caixukun.csv')
data.to_csv('caixukun.csv', index=False)
data.info()
# Gender split over all collected reposts.
fans_num = data['user.gender'].value_counts()
fans_num
from pyecharts import Bar
# value_counts() orders by frequency, so the axis labels are derived from the
# index instead of being typed in a fixed order that may not match the values.
gender_label = {'m': '男', 'f': '女'}
gender_names = [gender_label.get(code, code) for code in fans_num.index]
bar = Bar("蔡徐坤粉丝性别比例初探", width=600, height=500)
# The legend reports the actual number of rows rather than a hard-coded count.
bar.add("(总数据{}条)".format(data.shape[0]), gender_names, fans_num.values, is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar
# Same split expressed as percentages, rounded to two decimals.
np.round(fans_num/fans_num.sum()*100, 2)
# Inspect a few rows whose gender is 'm'.
data[data['user.gender']=='m'].sample(5)
# Rule-based selection of suspicious reposts. Missing profile descriptions are
# replaced by the sentinel 'wu' so they can be matched with a plain equality test.
data = data.fillna({'user.description': 'wu'})

# Follows at most 5 accounts, or has at most 5 followers.
few_connections = (data['user.follow_count'] <= 5) | (data['user.followers_count'] <= 5)
# No profile description (sentinel inserted above).
no_description = data['user.description'] == 'wu'
# The repost itself received no comments, likes or further reposts.
no_engagement = (
    (data['comments_count'] == 0)
    & (data['attitudes_count'] == 0)
    & (data['reposts_count'] == 0)
)
# 'user.mbrank' equals 0.
zero_mbrank = data['user.mbrank'] == 0

data_fake = data[few_connections & no_description & no_engagement & zero_mbrank]
data_fake.sample(5)
data_fake.shape
# Accounts whose nickname contains "用户" are also treated as fake. Only rows
# with more than 5 followees and more than 5 followers are considered here.
# na=False makes a missing nickname count as "no match" instead of putting
# NaN into the boolean mask.
data_fake2_index = data[(data['user.follow_count'] > 5) &
                        (data['user.followers_count'] > 5) &
                        (data['user.screen_name'].str.contains('用户', na=False))].index
# Combine both groups of fake reposts. The index holds labels, so rows are
# selected with .loc; .iloc only gave the same rows while the index happened
# to be the default 0..n-1 range.
data_fake = pd.concat([data_fake, data.loc[data_fake2_index]])
data_fake.shape
# Real-fan reposts: every row of `data` that was not flagged as fake.
data_true = data.drop(index=data_fake.index)
data_true.shape
# Share of real and fake reposts among all reposts, in percent (two decimals).
true_share = np.round(len(data_true) / len(data) * 100, 2)
fake_share = np.round(len(data_fake) / len(data) * 100, 2)
print('真粉丝转发数占总转发数的{}%'.format(true_share))
print('假粉丝转发数占总转发数的{}%'.format(fake_share))
# Bar charts comparing total, fake and real repost volume.
# The row counts are read from the frames so that the legend and title always
# agree with the plotted data instead of repeating a hard-coded total.
total_reposts = data.shape[0]
fake_reposts = data_fake.shape[0]
true_reposts = data_true.shape[0]

bar = Bar("蔡徐坤真假流量的转发量", width=600, height=500)
bar.add("(总数据{}条)".format(total_reposts), ['总转发量', '假粉丝转发量', '真粉丝转发量'],
        [total_reposts, fake_reposts, true_reposts], is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

# Number of distinct accounts (by 'user.id') behind the real-fan reposts.
real_fans_num = data_true.drop_duplicates(subset='user.id').shape[0]
bar = Bar("蔡徐坤真假流量的转发量与真实转发粉丝量(总数据{}条)".format(total_reposts),
          width=600, height=500)
bar.add('', ['总转发量', '假粉丝转发量', '真粉丝转发量', '真实转发粉丝量'],
        [total_reposts, fake_reposts, true_reposts, real_fans_num], is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True, xaxis_rotate=20)
bar
print('真实转发粉丝量占总转发数的{}%'.format(np.round(real_fans_num/total_reposts*100, 2)))
真实转发粉丝量占总转发数的3.84%
data.sample(5)
# Gender split of the fake accounts, counting each 'user.id' once.
data_fake_gender = data_fake.drop_duplicates(subset='user.id')['user.gender'].value_counts()
data_fake_gender
# Inspect a few fake reposts whose gender is 'f'.
data_fake[data_fake['user.gender']=='f'].sample(5)
# Labels follow the value_counts() index so they cannot be swapped against the
# values; the legend count is the number of accounts actually plotted.
gender_label = {'m': '男', 'f': '女'}
fake_gender_names = [gender_label.get(code, code) for code in data_fake_gender.index]
bar = Bar("蔡徐坤假粉丝性别比例", width=600, height=500)
bar.add("(假粉丝总数为{})".format(data_fake_gender.sum()), fake_gender_names,
        data_fake_gender.values, is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar
# Share of the most common gender among the fake accounts, computed from the
# counts instead of from hand-typed numbers.
data_fake_gender.max() / data_fake_gender.sum()
0.954233801851217
# Frequency of each repost text among the fake reposts.
data_fake['raw_text'].value_counts()
# Ten most frequent values of 'source' among the fake reposts.
fake_source = data_fake['source'].value_counts().head(10)
fake_source_style = dict(
    is_stack=True,
    xaxis_label_textsize=11,
    yaxis_label_textsize=14,
    is_label_show=True,
    xaxis_rotate=30,
)
bar = Bar("蔡徐坤假粉丝Top10转发设备", width=600, height=600)
bar.add("", fake_source.index, fake_source.values, **fake_source_style)
bar
# Summary statistics for the reposts flagged as fake. The bare numbers below
# are presumably the recorded notebook output of the expression above each.
# Mean of 'user.follow_count' over the fake reposts.
data_fake['user.follow_count'].mean()
3.4412612555950397
# Mean of 'user.followers_count' over the fake reposts.
data_fake['user.followers_count'].mean()
1.04576663836389
# Draw five random fake reposts and look at their nicknames and avatar URLs.
data_fake_sample = data_fake.sample(5)
data_fake_sample['user.screen_name']
data_fake_sample['user.profile_image_url'].values
# A second, independent random draw of five nicknames.
data_fake.sample(5)['user.screen_name']
# Number of fake reposts whose nickname matches the regex '蔡|坤|葵|kun'.
data_fake['user.screen_name'].str.contains('蔡|坤|葵|kun').sum()
41766
# Total number of fake reposts, for comparison with the count above.
data_fake.shape[0]
95397
# Mean of 'user.statuses_count' over the fake reposts.
data_fake['user.statuses_count'].mean()
72.4942503433022
data_true.sample(5)
# Gender split of the real-fan accounts, counting each 'user.id' once.
data_true_gender = data_true.drop_duplicates(subset='user.id')['user.gender'].value_counts()
data_true_gender
# Labels follow the value_counts() index so they always line up with the
# values; the legend count is the number of accounts actually plotted.
gender_label = {'m': '男', 'f': '女'}
true_gender_names = [gender_label.get(code, code) for code in data_true_gender.index]
bar = Bar("蔡徐坤真粉丝性别比例", width=600, height=500)
bar.add("(真粉丝总数为{})".format(data_true_gender.sum()), true_gender_names,
        data_true_gender.values, is_stack=True,
        xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar
# Frequency of each repost text among the real-fan reposts.
data_true['raw_text'].value_counts()
# Ten most frequent values of 'source' among the real-fan reposts.
true_source = data_true['source'].value_counts().head(10)
true_source_style = dict(
    is_stack=True,
    xaxis_label_textsize=11,
    yaxis_label_textsize=14,
    is_label_show=True,
    xaxis_rotate=30,
)
bar = Bar("蔡徐坤真粉丝Top10转发设备", width=600, height=600)
bar.add("", true_source.index, true_source.values, **true_source_style)
bar
# Summary statistics for the reposts kept as real. The bare numbers below are
# presumably the recorded notebook output of the expression above each.
# Mean of 'user.follow_count' over the real-fan reposts.
data_true['user.follow_count'].mean()
222.0597165991903
# Mean of 'user.followers_count' over the real-fan reposts.
data_true['user.followers_count'].mean()
178.9480913823019
# Five randomly drawn nicknames from the real-fan reposts.
data_true.sample(5)['user.screen_name']
# Number of real-fan reposts whose nickname matches the regex '蔡|坤|葵|kun'.
data_true['user.screen_name'].str.contains('蔡|坤|葵|kun').sum()
3153
# Total number of real-fan reposts, for comparison with the count above.
data_true.shape[0]
6916
# Prerequisites for the word clouds of the real fans' texts.
import jieba
from collections import Counter
from pyecharts import WordCloud
# Add the name to jieba's dictionary before segmenting.
jieba.add_word('蔡徐坤')
# Load the stop words, one per line. The file is opened in a `with` block so
# the handle is closed once the list is built.
# NOTE(review): the file is read with the platform default encoding, as
# before - confirm the encoding of stopwords.txt if this runs elsewhere.
with open('stopwords.txt') as stopword_file:
    swords = [x.strip() for x in stopword_file]
def plot_word_cloud(data, swords, columns):
    """Build a word cloud of the most frequent words in one text column.

    Every non-null entry of ``data[columns]`` is segmented with jieba.
    Tokens of length one and tokens listed in ``swords`` are discarded, and
    the 100 most frequent remaining tokens are plotted.

    Args:
        data: DataFrame that contains the text column.
        swords: Iterable of stop words to exclude.
        columns: Name of the column to analyse.

    Returns:
        A pyecharts ``WordCloud`` holding up to 100 words sized by frequency.
    """
    # Set membership is O(1); the stop-word list is checked once per token.
    stop_words = set(swords)
    # Missing values are skipped so they cannot break the segmentation, and
    # each text is cut on its own so that no token spans two neighbouring texts.
    texts = data[columns].dropna().astype(str)
    counts = Counter(
        word
        for text in texts
        for word in jieba.cut(text)
        if len(word) > 1 and word not in stop_words
    )
    top_words = counts.most_common(100)
    words = [word for word, _ in top_words]
    freqs = [freq for _, freq in top_words]
    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("", words, freqs, word_size_range=[20, 100])
    return wordcloud
# Word cloud of the profile descriptions in the real-fan reposts.
plot_word_cloud(data_true, swords, 'user.description')
# Word cloud of the repost texts in the real-fan reposts.
plot_word_cloud(data_true, swords, 'raw_text')
关注公众号『AI学习星球』,回复“扒一扒蔡徐坤”即可获取数据下载。
如需论文辅导或算法学习,可以通过公众号联系我。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。