Web crawler: a program that automatically harvests information from the internet, pulling out the data that is valuable to us.
Goal: fetch hot-post data from Weibo's Society (社会) category.
First, open the page you want to scrape and press F12 to bring up the developer console. In the network request list, use the Preview tab to locate the specific request that carries the data you need, then note down the request URL (Headers tab), the parameter list (Payload tab), and your Cookie and User-Agent (Headers tab).
Take scraping Weibo's hot-topic data as an example:
With this information in hand, we can write a requests call to fetch the raw data, as follows:
```python
import requests
import csv
import os
from datetime import datetime

articalUrl = 'https://weibo.com/ajax/feed/hottimeline'

headers = {
    'Cookie': '...',        # your Weibo cookie, copied from the request headers
    'User-Agent': '...'     # your browser's User-Agent string
}

# Request parameters for the Society hot-topic feed
params = {
    'group_id': '1028034188',
    'containerid': '102803_ctg1_4188_-_ctg1_4188',
    'max_id': 0,
    'count': 20,
    'extparam': 'discover|new_feed'
}

# response holds the raw payload returned by the GET request
response = requests.get(articalUrl, headers=headers, params=params)
```
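Before slicing anything out, it can help to confirm where the data sits in the response. Continuing from the snippet above, this quick sanity check (not part of the original post) prints the status code and the top-level JSON keys, among which we expect statuses:

```python
# Quick sanity check on the response (illustrative addition):
print(response.status_code)              # 200 means the GET request succeeded
if response.status_code == 200:
    print(list(response.json().keys()))  # 'statuses' should appear here
```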
You can see that every field we need is grouped inside the statuses array, so we just keep slicing out the pieces we want and writing them to the target file:
```python
def get_data(url, params):
    headers = {
        'Cookie': '...',
        'User-Agent': '...'
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:          # status code 200 means the GET request succeeded
        return response.json()['statuses']   # extract the statuses array directly
    else:
        print('Request failed:', response.status_code)
        return None

# Slice the fields we need out of each post
def parse_json(response):
    for article in response:
        id = article['id']
        LikeNum = article['attitudes_count']
        commentsLen = article['comments_count']
        reposts_count = article['reposts_count']
        try:
            # region_name looks like '发布于 北京'; strip the '发布于 ' ("posted in") prefix
            region = article['region_name'].replace('发布于 ', '')
        except KeyError:
            region = 'N/A'
        content = article['text_raw']
        contentLen = article['textLength']
        # e.g. 'Fri May 17 10:30:00 +0800 2024' -> '2024-05-17 10:30:00'
        created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d %H:%M:%S')
        try:
            detailUrl = 'https://weibo.com/' + str(article['id']) + '/' + str(article['mblogid'])
        except KeyError:
            detailUrl = 'N/A'
        authorName = article['user']['screen_name']
        authorAvatar = article['user']['avatar_large']
        authorDetail = 'https://weibo.com/u/' + str(article['user']['id'])
        writerRow([
            # id,
            # LikeNum,
            # commentsLen,
            # reposts_count,
            # region,
            # contentLen,
            # created_at,
            # detailUrl,
            # authorName,
            # authorAvatar,
            # authorDetail,
            content  # only the post text is saved here; uncomment fields as needed
        ])
```
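For reference, this is roughly what one element of statuses looks like, trimmed to just the fields parse_json reads. The field names follow the code above; the values are invented purely for illustration:

```python
# Hypothetical, heavily trimmed statuses element (values are made up):
sample_status = {
    'id': 5012345678901234,
    'mblogid': 'OaBcDeFgH',
    'attitudes_count': 1024,             # likes
    'comments_count': 256,
    'reposts_count': 128,
    'region_name': '发布于 北京',          # parse_json strips the '发布于 ' prefix
    'text_raw': '...',                   # post text
    'textLength': 140,
    'created_at': 'Fri May 17 10:30:00 +0800 2024',
    'user': {
        'id': 1234567890,
        'screen_name': '...',
        'avatar_large': 'https://...jpg'
    }
}
```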
Putting it all together, the complete script, including CSV initialization and the page loop:

```python
import requests
import csv
import os
from datetime import datetime

file_path = './articaleData.csv'

def init():
    # Create the CSV file with a header row if it does not exist yet
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow([
                'id',
                'likeNum',
                'commentsLen',
                'reposts_count',
                'region',
                'content',
                'contentLen',
                'created_at',
                'detailUrl',
                'authorAvatar',
                'authorName',
                'authorDetail',
            ])

def writerRow(row):
    # Append one row to the CSV file
    with open(file_path, 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)

def get_data(url, params):
    headers = {
        'Cookie': '...',
        'User-Agent': '...'
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json()['statuses']
    else:
        print('Request failed:', response.status_code)
        return None

def parse_json(response):
    for article in response:
        id = article['id']
        LikeNum = article['attitudes_count']
        commentsLen = article['comments_count']
        reposts_count = article['reposts_count']
        try:
            region = article['region_name'].replace('发布于 ', '')
        except KeyError:
            region = 'N/A'
        content = article['text_raw']
        contentLen = article['textLength']
        created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d %H:%M:%S')
        try:
            detailUrl = 'https://weibo.com/' + str(article['id']) + '/' + str(article['mblogid'])
        except KeyError:
            detailUrl = 'N/A'
        authorName = article['user']['screen_name']
        authorAvatar = article['user']['avatar_large']
        authorDetail = 'https://weibo.com/u/' + str(article['user']['id'])
        writerRow([
            # id,
            # LikeNum,
            # commentsLen,
            # reposts_count,
            # region,
            # contentLen,
            # created_at,
            # detailUrl,
            # authorName,
            # authorAvatar,
            # authorDetail,
            content
        ])

def start(pageNum=10):
    articalUrl = 'https://weibo.com/ajax/feed/hottimeline'
    init()
    for page in range(0, pageNum):
        print('Crawling category %s, page %s' % ('Society', page + 1))
        params = {
            'group_id': '1028034188',
            'containerid': '102803_ctg1_4188_-_ctg1_4188',
            'max_id': page,
            'count': 20,
            'extparam': 'discover|new_feed'
        }
        response = get_data(articalUrl, params)
        if response:  # skip the page if the request failed
            parse_json(response)
    print('Done!')

if __name__ == "__main__":
    start()
```
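One practical refinement, offered as a suggestion rather than part of the original script: pause briefly between pages so the endpoint is not hit with back-to-back requests. A minimal variation of start(), where the delay parameter is a hypothetical addition:

```python
import time

def start(pageNum=10, delay=1.0):  # delay (seconds) is an assumed, added parameter
    articalUrl = 'https://weibo.com/ajax/feed/hottimeline'
    init()
    for page in range(0, pageNum):
        print('Crawling category %s, page %s' % ('Society', page + 1))
        params = {
            'group_id': '1028034188',
            'containerid': '102803_ctg1_4188_-_ctg1_4188',
            'max_id': page,
            'count': 20,
            'extparam': 'discover|new_feed'
        }
        response = get_data(articalUrl, params)
        if response:
            parse_json(response)
        time.sleep(delay)  # be gentle on the server between pages
    print('Done!')
```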