当前位置:   article > 正文

python3 协程爬虫,爬取豆瓣排行榜json数据

python3 协程爬虫,爬取豆瓣排行榜json数据

python协程爬取豆瓣电影排行榜json接口数据。仅学习使用,如有侵权、请联系删除
不说废话直接贴代码,大体方法一样,解析方式不同

#python3 jupyter
import asyncio
import aiohttp
import json
import ssl
import random
from time import time
import os
# Douban "new search subjects" JSON endpoint; sort/range/tags/genres are
# pre-filled query parameters, pagination is added later via &start=N.
url='https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&genres=%E6%83%85%E8%89%B2'

# Browser-like request headers so the endpoint treats us as an AJAX call
# from the site itself (Referer + X-Requested-With are what it checks).
headers = { 
          'Host':'movie.douban.com',
          'Connection':'keep-alive',
          'Cache-Control':'max-age=0',
          'Accept': 'text/html, */*; q=0.01',
          'X-Requested-With': 'XMLHttpRequest',
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
          'DNT':'1',
          'Referer': 'https://movie.douban.com/typerank?type_name=%E6%83%85%E8%89%B2&type=6&interval_id=100:90&action=',
          'Accept-Encoding': 'gzip, deflate, sdch',
          'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
    }

async def fetch_content(url,semaphore):
    """Fetch *url* and return its parsed JSON body.

    Concurrency is bounded by *semaphore*; a random 1-9 second pause is
    inserted before each request so the crawler is not rate-limited.
    Certificate verification is disabled (ssl=False) on the connector.
    """
    async with semaphore:
        stats_time=random.randrange(1,10)
        await asyncio.sleep(stats_time) # wait a random number of seconds so requests are not too fast; together with the semaphore cap this avoids getting blocked
        async with aiohttp.ClientSession(
            headers=headers, connector=aiohttp.TCPConnector(ssl=False)
        ) as session:
            async with session.get(url,timeout=60) as response:
                return await response.json()

async def param(url):
    """Fan out paginated requests for *url* and write each movie to CSV.

    Builds 10 page URLs (start=0, 20, ..., 180 — the first 200 entries),
    fetches them concurrently (at most 2 in flight), then appends one CSV
    line per movie via write_fo_file.
    """
    semaphore = asyncio.Semaphore(2)  # limit concurrency to 2 requests
    # NOTE: the original rebound the name `url` in both the comprehension
    # and the result loop, shadowing the parameter; distinct names here.
    page_urls = [url + '&start=' + str(offset) for offset in range(0, 200, 20)]
    tasks = [fetch_content(page_url, semaphore) for page_url in page_urls]
    pages = await asyncio.gather(*tasks)

    for page in pages:
        for movie in page['data']:
            title = movie['title']
            directors = '/'.join(movie['directors'])  # directors
            rate = movie['rate']                      # rating
            casts = '/'.join(movie['casts'])          # cast members
            movie_url = movie['url']                  # detail-page URL
            await write_fo_file(title, directors, rate, casts, movie_url)

async def write_fo_file(title,directors, rate, casts, url):
    """Append one movie record as a comma-separated line to movie_top250.csv.

    Bug fixed: the original ended with `f.closed` — an attribute access,
    not a call — so the file handle was never closed (resource leak).
    A `with` block now guarantees closure even if the write raises.

    NOTE(review): fields are not quoted/escaped, so a comma inside a
    title or name will break the column layout (behavior kept as-is).
    """
    with open('movie_top250.csv', 'a', encoding='utf8') as f:
        f.write(f'{title},{directors},{rate},{casts},{url}\n')
    
async def main():
    """Delete any stale output CSV, run the crawl, and report elapsed time."""
    if os.path.exists('movie_top250.csv'):
        os.remove('movie_top250.csv')  # start from a clean file each run
    start = time()
    await param(url)
    end = time()
    # Bug fixed: the original printed (end - start) / 5 while claiming
    # "seconds", under-reporting the real elapsed time by a factor of 5.
    print("Cost {} seconds".format(end - start))
    
if __name__ =='__main__':
    # Bug fixed: bare `await main()` at module level is a SyntaxError when
    # this file runs as a script — top-level await only works in notebook
    # or REPL environments (e.g. Jupyter). asyncio.run() creates an event
    # loop, runs the coroutine to completion, and closes the loop.
    asyncio.run(main())
    
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/笔触狂放9/article/detail/939788
推荐阅读
相关标签
  

闽ICP备14008679号