This analysis uses "Mate30" as the search keyword.
import requests
import time

# Search for the keyword "Mate30" and build the paged request URL
target_url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3DMate30&page_type=searchall&page="
# Total number of result pages to crawl
total_page = 400
# Containers for post records and user records
mblog = list()
user = list()
for page in range(total_page):
    print('Crawling page:%d/%d' % (page + 1, total_page))
    cur_url = target_url + str(page + 1)
    source_json = requests.get(cur_url).json()
    time.sleep(1)  # throttle requests to avoid being blocked
    source_data = source_json['data']
    cards = source_data['cards'] if 'cards' in source_data else []
    for card in cards:
        if 'mblog' in card:
            cur_blog = card['mblog']
            is_long_text = 0
            # Long posts keep their full content in a separate longText field
            if cur_blog['isLongText']:
                cur_text = cur_blog['longText']['longTextContent']
                is_long_text = 1
            else:
                cur_text = cur_blog['text']
            verified_reason = cur_blog['user']['verified_reason'] if 'verified_reason' in cur_blog['user'] else None
            # Post record
            mblog.append({'mid': cur_blog['mid'],
                          'raw_text': cur_text,
                          'isLongText': is_long_text,
                          'like_count': cur_blog['attitudes_count'],
                          'comments_count': cur_blog['comments_count'],
                          'reposts_count': cur_blog['reposts_count'],
                          'created_at': cur_blog['created_at'],
                          'source': cur_blog['source'],
                          'user_id': cur_blog['user']['id']})
            # User record
            user.append({'user_id': cur_blog['user']['id'],
                         'screen_name': cur_blog['user']['screen_name'],
                         'description': cur_blog['user']['description'],
                         'follow_count': cur_blog['user']['follow_count'],
                         'followers_count': cur_blog['user']['followers_count'],
                         'gender': cur_blog['user']['gender'],
                         'verified': cur_blog['user']['verified'],
                         'verified_reason': verified_reason,
                         'profile_url': cur_blog['user']['profile_url']})
print('Crawl finished')
print('Total records crawled:', len(mblog))
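Note that the containerid parameter in target_url is itself URL-encoded: 100103type%3D1%26q%3DMate30 decodes to 100103type=1&q=Mate30. To reuse the script for another keyword, it is enough to re-encode that string. A minimal sketch (build_search_url is a hypothetical helper, not part of the original script):

from urllib.parse import quote

def build_search_url(keyword):
    # quote() with safe='' also escapes '=' and '&', reproducing the
    # same percent-encoding seen in target_url for any keyword.
    containerid = quote('100103type=1&q=' + keyword, safe='')
    return ('https://m.weibo.cn/api/container/getIndex?containerid='
            + containerid + '&page_type=searchall&page=')

# e.g. build_search_url('Mate30') reproduces target_url above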
Convert the crawled data to pandas DataFrames, which makes analysis and saving easier.
import pandas as pd
mblog_frame = pd.DataFrame(mblog)
user_frame = pd.DataFrame(user)
# Remove duplicate records
mblog_frame.drop_duplicates(subset=['mid'],inplace=True)
user_frame.drop_duplicates(subset=['user_id'],inplace=True)
# Save the data to CSV files; utf_8_sig writes a BOM so Excel
# displays the Chinese text correctly
mblog_frame.to_csv('mblog.csv', index=False, encoding='utf_8_sig')
user_frame.to_csv('users.csv', index=False, encoding='utf_8_sig')
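Since these CSVs are the persisted output, a later analysis session can start from them instead of re-crawling. A minimal sketch, assuming mblog.csv and users.csv sit in the current working directory:

import pandas as pd

# Reload the saved data; pandas strips the BOM on read,
# so utf_8_sig files round-trip cleanly
mblog_frame = pd.read_csv('mblog.csv')
user_frame = pd.read_csv('users.csv')
print(mblog_frame.shape, user_frame.shape)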
Fetch each user's location. (The mobile Weibo site does not expose location in the basic user info returned by the search API, so this part took considerable time; the final approach is to jump to each user's profile page and scrape the location from there.)
def get_containerid(uid):
    """Look up a user's containerid from their uid."""
    containerid_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + uid
    containerid_json = requests.get(containerid_url).json()
    containerid_data = containerid_json.get('data')
    if not containerid_data:
        return None
    tabsInfo = containerid_data.get('tabsInfo')
    if not tabsInfo:
        return None
    tabs = tabsInfo.get('tabs')
    if not tabs:
        return None
    return tabs[0].get('containerid')

def get_user_location(raw):
    """Fetch a user's location from the first card of their profile page."""
    profile_url = raw['profile_url']
    left, right = profile_url.split('?')
    uid = left.split('/')[-1]
    pre = 'https://m.weibo.cn/api/container/getIndex?'
    mid = '&type=uid&value='
    containerid = get_containerid(uid)
    if not containerid:
        return None
    final_url = pre + right + mid + uid + '&containerid=' + containerid
    local_json = requests.get(final_url).json()
    data = local_json.get('data')
    if not data:
        return None
    cards = data.get('cards')
    if not cards:
        return None
    card_group = cards[0].get('card_group')
    if not card_group:
        return None
    return card_group[0].get('item_content')

# Fetch every user's location in bulk, then update the saved file
user_frame['location'] = user_frame.apply(get_user_location, axis=1)
user_frame.to_csv('users.csv', index=False, encoding='utf_8_sig')

# To persist the data in MySQL instead, run the code below
import pymysql  # MySQL driver used by SQLAlchemy via the mysql+pymysql URL
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:password@localhost:3306/weibo_keyword?charset=utf8mb4', echo=False)
mblog_frame.to_sql(name='ori_mblog', con=engine, if_exists='append', index=False)
user_frame.to_sql(name='ori_users', con=engine, if_exists='append', index=False)
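With the location column filled in, a quick way to sanity-check the result is to count users per region. A minimal sketch:

# Rows where the lookup returned None show up as NaN
# and are ignored by value_counts
location_counts = user_frame['location'].value_counts()
print(location_counts.head(10))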