Crawling the Douban movie ranking JSON API with Python coroutines. For learning purposes only; if this infringes any rights, please contact me for removal.
No fluff, straight to the code. The overall approach is the same as before; only the parsing differs.
# python3, run in Jupyter
import asyncio
import aiohttp
import random
import os
from time import time

url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&genres=%E6%83%85%E8%89%B2'

headers = {
    'Host': 'movie.douban.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    'DNT': '1',
    'Referer': 'https://movie.douban.com/typerank?type_name=%E6%83%85%E8%89%B2&type=6&interval_id=100:90&action=',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
}

async def fetch_content(url, semaphore):
    async with semaphore:
        # Sleep a random 1-9 seconds before each request; together with the
        # semaphore this throttles the crawl to avoid getting blocked.
        await asyncio.sleep(random.randrange(1, 10))
        async with aiohttp.ClientSession(
                headers=headers,
                connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.get(url, timeout=60) as response:
                return await response.json()

async def param(url):
    semaphore = asyncio.Semaphore(2)  # limit concurrency to 2
    # The API pages in steps of 20; build the URLs for the first 200 results.
    fetch_list = [url + '&start=' + str(i) for i in range(0, 200, 20)]
    tasks = [fetch_content(page_url, semaphore) for page_url in fetch_list]
    jsons = await asyncio.gather(*tasks)
    for page in jsons:
        for e in page['data']:
            title = e['title']
            directors = '/'.join(e['directors'])  # directors
            rate = e['rate']                      # rating
            casts = '/'.join(e['casts'])          # cast
            movie_url = e['url']                  # detail-page URL
            await write_to_file(title, directors, rate, casts, movie_url)

async def write_to_file(title, directors, rate, casts, url):
    # A with-block closes the file reliably after each append.
    with open('movie_top250.csv', 'a', encoding='utf8') as f:
        f.write(f'{title},{directors},{rate},{casts},{url}\n')

async def main():
    if os.path.exists('movie_top250.csv'):
        os.remove('movie_top250.csv')
    start = time()
    await param(url)
    end = time()
    print('Cost {} seconds'.format(end - start))

if __name__ == '__main__':
    # Bare top-level await works here because Jupyter cells run inside
    # an already-running event loop.
    await main()
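One caveat about the entry point: the bare `await main()` only works because Jupyter executes cells inside a running event loop with top-level await enabled. As a minimal sketch of running the same crawler as a standalone script (assuming Python 3.7+, where `asyncio.run` is available, and the same `main` defined above), the last lines would instead be:

# python3, plain script instead of Jupyter
import asyncio

if __name__ == '__main__':
    # No event loop is running yet in a plain script, so let asyncio.run
    # create one, execute main(), and close it afterwards.
    asyncio.run(main())

Either way, the output movie_top250.csv ends up with five comma-separated columns (title, directors, rate, casts, url) and no header row.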