A certain author serializes their novel on Weibo, and scrolling back and forth through the timeline to find the chapters is a hassle, so on a whim I decided to scrape them with Python, with the goal of eventually saving each chapter as a txt file.
For now, the script below collects the web-version URLs of the articles and saves them to a JSON file.
# -*- coding: utf-8 -*-
# author-Svv 18.08.16
# Scrape article links from a user's profile page on mobile Weibo (m.weibo.cn)
# Adapted from https://blog.csdn.net/d1240673769/article/details/74278547

import urllib.request
import json
import re
from bs4 import BeautifulSoup

# Weibo ID of the blogger to crawl (a string of digits)
id = 'replace-with-weibo-id'
# Title of the novel, used to filter posts
art_name = 'novel-title'
# Proxy IP to route requests through
proxy_addr = "122.241.72.191:808"


title_list = []

# Open a page through the proxy and return the decoded response body
def use_proxy(url, proxy_addr):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36")
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', 'ignore')
    return data

# Get the containerid of the user's Weibo tab; it is needed when fetching the posts
def get_containerid(url):
    data = use_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    for tab in content.get('tabsInfo').get('tabs'):
        if tab.get('tab_type') == 'weibo':
            return tab.get('containerid')

# Fetch the posts page by page, keep the ones whose article card mentions art_name,
# and collect the article title, post time, and article URL
def get_weibo(id):
    i = 1
    while True:
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
        weibo_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id + '&containerid=' + get_containerid(url) + '&page=' + str(i)
        try:
            data = use_proxy(weibo_url, proxy_addr)
            content = json.loads(data).get('data')
            cards = content.get('cards')  # list of feed cards on this page
            # print(cards)
            if i <= 13:  # only crawl the first 13 pages
                for j in range(len(cards)):
                    print("-----crawling page " + str(i) + ", post " + str(j) + "------")
                    card_type = cards[j].get('card_type')
                    if card_type == 9:  # card_type 9 is an ordinary post
                        mblog = cards[j].get('mblog')
                        page_info = mblog.get('page_info')
                        # print(page_info)
                        if page_info is not None:
                            content1 = page_info.get('content1')
                            # print(content1)
                            if content1 and re.findall(art_name, content1) == [art_name]:
                                # print('**' + content1 + '**')
                                created_at = mblog.get('created_at')
                                text = mblog.get('text')
                                soup = BeautifulSoup(text, "lxml")
                                for link in soup.find_all('a'):
                                    if link.get('data-url') is not None:
                                        art_url = link.get('data-url')
                                        title_list.append({
                                            'content': content1,
                                            'time': str(created_at),
                                            'art-url': art_url
                                        })
                i += 1
            else:
                break
        except Exception as e:
            print(e)
    # print(title_list)

def main():
    get_weibo(id)
    with open('title_url.json', 'w', encoding='utf-8') as fp:
        # json.dump() writes the list straight to the file;
        # ensure_ascii=False keeps the Chinese text readable
        json.dump(title_list, fp, ensure_ascii=False)

if __name__ == "__main__":
    main()
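To check what actually got written, a short sketch like the one below can read title_url.json back in and list the collected links. This is only a sanity check I use, not part of the scraper; it assumes the script above has run and the file sits in the working directory.

import json

# Read back the results written by main(); keys match what title_list stores
with open('title_url.json', 'r', encoding='utf-8') as fp:
    entries = json.load(fp)

for entry in entries:
    print(entry['time'], entry['content'])
    print('  ->', entry['art-url'])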
Below is the structure of one entry in cards; once you understand this structure you can locate whatever fields you need (a small sketch of how to walk it follows after the dump).
{
    'card_type': 9,
    'itemid': '**********',
    'scheme': '**********',
    'mblog': {
        'created_at': '07-22', 'id': '**********', 'idstr': '4264686415012792', 'mid': '4264686415012792',
        'can_edit': False,
        'text': '<a href="**********" data-hide=""><span class="surl-text">**********
                 </span></a>**********<a data-url="**********"
                 href="**********"
                 data-hide="">',
        'textLength': 234,
        'source': '微博 weibo.com',
        'favorited': False,
        'is_paid': False, 'mblog_vip_type': 0,
        'user': {
            'id': **********, 'screen_name': '**********',
            'profile_image_url': '**********g',
            'profile_url': '**********',
            'statuses_count': 3071, 'verified': False, 'verified_type': -1, 'close_blue_v': False,
            'description': '**********', 'gender': 'f',
            'mbtype': 11, 'urank': 34, 'mbrank': 4, 'follow_me': False, 'following': False, 'followers_count': 15337,
            'follow_count': 114, 'cover_image_phone': '**********',
            'avatar_hd': '**********', 'like': False, 'like_me': False,
            'badge': {'bind_taobao': 1, 'zongyiji': 1, 'follow_whitelist_video': 1, 'user_name_certificate': 1}
        },
        'reposts_count': 20, 'comments_count': 65, 'attitudes_count': 144, 'pending_approval_count': 0, 'isLongText': False, 'hide_flag': 0,
        'visible': {'type': 0, 'list_id': 0},
        'mblogtype': 0, 'more_info_type': 0, 'content_auth': 0,
        'edit_config': {'edited': False},
        'weibo_position': 1,
        'page_info': {
            'page_pic': {'url': '**********'},
            'page_url': '**********',
            'page_title': '**********',
            'content1': '**********',
            'content2': '',
            'icon': '**********',
            'type': 'article', 'object_id': '1022:2309404264686416735534'
        },
        'bid': 'Gr8MVl23u'
    },
    'show_type': 0
}
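For reference, here is a small sketch of pulling the interesting fields out of one card of this shape. The function name extract_article is my own, not part of the original script; it assumes `card` is a single element of the `cards` list returned by the getIndex API.

from bs4 import BeautifulSoup

def extract_article(card):
    # Only card_type 9 entries carry an mblog (an ordinary post)
    if card.get('card_type') != 9:
        return None
    mblog = card.get('mblog') or {}
    page_info = mblog.get('page_info') or {}
    title = page_info.get('content1')  # article title shown in the card
    # The article link sits in the data-url attribute of an <a> tag in the post text
    soup = BeautifulSoup(mblog.get('text', ''), 'lxml')
    urls = [a.get('data-url') for a in soup.find_all('a') if a.get('data-url')]
    return {'title': title, 'time': mblog.get('created_at'), 'urls': urls}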
In the next post I plan to scrape the full text of the articles.