赞
踩
目标:爬取京东前三页商品数据,利用协程
思路:
源码:
# 请求链接 # page2: https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t=1720405956855&body=%7B%22keyword%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22qrst%22%3A%221%22%2C%22wq%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22stock%22%3A%221%22%2C%22pvid%22%3A%2231f843077aa2407b95785a72340fe7e7%22%2C%22page%22%3A%222%22%2C%22s%22%3A%2227%22%2C%22scrolling%22%3A%22y%22%2C%22log_id%22%3A%221720405812225.4544%22%2C%22tpl%22%3A%221_M%22%2C%22isList%22%3A0%2C%22show_items%22%3A%22%22%7D&loginType=3&uuid=143920055.1720405515970186801854.1720405516.1720405516.1720405516.1&area=5_148_0_0&h5st=20240708103236857%3B5t9mgnyi65z65t97%3Bf06cc%3Btk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV%3Bd9b05f319cfb3214b43322b0819b6e4f6400b1a94fa5f756059946f426746e39%3B4.7%3B1720405956857%3BVSTdEx_T1kXZarkswckNeGNuhxfvcx7qwVbjbfZbJCw-qtP330LH_RLHo3Rkwl9CoWL9PUmTlTAUXvDuzdc5HevP3O0_FOX9xfOSI0mNNg4T_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSAV7CNYAAAAADCQFPXFHNCSEDUX # page3: 
https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t=1720407258079&body=%7B%22keyword%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22qrst%22%3A%221%22%2C%22wq%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22stock%22%3A%221%22%2C%22pvid%22%3A%2231f843077aa2407b95785a72340fe7e7%22%2C%22isList%22%3A0%2C%22page%22%3A%223%22%2C%22s%22%3A%2257%22%2C%22click%22%3A%220%22%2C%22log_id%22%3A%221720405955645.6707%22%2C%22show_items%22%3A%22%22%7D&loginType=3&uuid=143920055.1720405515970186801854.1720405516.1720405516.1720405516.1&area=5_148_0_0&h5st=20240708105418081%3B5t9mgnyi65z65t97%3Bf06cc%3Btk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV%3B5eb4d823b3d8dd2ff5859add39c06b452fb6dbcb07647f9b117418f6c7671564%3B4.7%3B1720407258081%3BVe3XdZE4uu81PFR3UERrJ8gt477o6yrRoFtAToB_YYSYld0tTDkgFG8h888NDDAnXYAup31SarNKXQ0_y4sWdI533d6lNV-w7tIjZOWlBOcz_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSAV7CNYAAAAADCQFPXFHNCSEDUX # page4: 
https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t=1720407349236&body=%7B%22keyword%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22qrst%22%3A%221%22%2C%22wq%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22stock%22%3A%221%22%2C%22pvid%22%3A%2231f843077aa2407b95785a72340fe7e7%22%2C%22page%22%3A%224%22%2C%22s%22%3A%2286%22%2C%22scrolling%22%3A%22y%22%2C%22log_id%22%3A%221720407256984.9129%22%2C%22tpl%22%3A%221_M%22%2C%22isList%22%3A0%2C%22show_items%22%3A%22%22%7D&loginType=3&uuid=143920055.1720405515970186801854.1720405516.1720405516.1720405516.1&area=5_148_0_0&h5st=20240708105549238%3B5t9mgnyi65z65t97%3Bf06cc%3Btk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV%3B53a1df01312ad6e415be12f3c271e5fde0371759f24f6df9d931cb7400793dda%3B4.7%3B1720407349238%3BVW26R-pcaaZF5vMsSl1AT2KBeBy2Mrpwr3LyiC87YKTO9t9-5eogEsbfN2l2-FvE8ladWvhvaXYyZJgUs5nY8C52izSYXnyiFNimDBz0Y0j8_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSAV7CNYAAAAADCQFPXFHNCSEDUX # page5: 
https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t=1720407374827&body=%7B%22keyword%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22qrst%22%3A%221%22%2C%22wq%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22stock%22%3A%221%22%2C%22pvid%22%3A%2231f843077aa2407b95785a72340fe7e7%22%2C%22isList%22%3A0%2C%22page%22%3A%225%22%2C%22s%22%3A%22116%22%2C%22click%22%3A%220%22%2C%22log_id%22%3A%221720407348116.1660%22%2C%22show_items%22%3A%22%22%7D&loginType=3&uuid=143920055.1720405515970186801854.1720405516.1720405516.1720405516.1&area=5_148_0_0&h5st=20240708105614829%3B5t9mgnyi65z65t97%3Bf06cc%3Btk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV%3B5d0169127b27592f8f4ed0c02afa60c6bd6f44f914f34b24623b173f52315e88%3B4.7%3B1720407374829%3BV2txYCUHGzP6Mdbj6ef1WEHd94GBw4BqN0XuAobRNPZRJ9h6goijEKMPd90k_-yy0h23f4vPUy24MZEFy-5Dkbl30-8KTuYJGRkWhbQHJagZ_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSAV7CNYAAAAADCQFPXFHNCSEDUX # page6: 
https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t=1720407399435&body=%7B%22keyword%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22qrst%22%3A%221%22%2C%22wq%22%3A%22%E5%A4%A7%E5%9C%B0%E7%93%9C%22%2C%22stock%22%3A%221%22%2C%22pvid%22%3A%2231f843077aa2407b95785a72340fe7e7%22%2C%22page%22%3A%226%22%2C%22s%22%3A%22146%22%2C%22scrolling%22%3A%22y%22%2C%22log_id%22%3A%221720407373638.7150%22%2C%22tpl%22%3A%221_M%22%2C%22isList%22%3A0%2C%22show_items%22%3A%22%22%7D&loginType=3&uuid=143920055.1720405515970186801854.1720405516.1720405516.1720405516.1&area=5_148_0_0&h5st=20240708105639436%3B5t9mgnyi65z65t97%3Bf06cc%3Btk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV%3B084f60286a2ba65bc260d4c7d3f7bdaa938a94c061fb3bda340d4e211057e575%3B4.7%3B1720407399436%3BVW3lXgDJ-h5kW0QIYbrMUU6F9AS1a8dAr6OGtN5S3KhBefMf4_l2V5bCBt-RPuEDdkv6E_Sti0yZeINs6J-Zs5Aq7rBLAmLrbvW3Jo6x3IqP_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSAV7CNYAAAAADCQFPXFHNCSEDUX # URL解码: # 
https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t=1720407890290&body={"keyword":"大地瓜","qrst":"1","wq":"大地瓜","stock":"1","pvid":"31f843077aa2407b95785a72340fe7e7","page":"10","s":"266","scrolling":"y","log_id":"1720407864619.8300","tpl":"1_M","isList":0,"show_items":""}&loginType=3&uuid=143920055.1720405515970186801854.1720405516.1720405516.1720407835.2&area=5_148_0_0&h5st=20240708110450292;5t9mgnyi65z65t97;f06cc;tk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV;704bed4f68c2ee25664115650234dcd9720fc69e33bbb0be7c2e4aa8b90d84db;4.7;1720407890292;VWQ64Dsf8LGqkpzRCzOQ0lP6zovZ-d9nI1SPesbrg2R6Xc9xh2X2O7UnaBZj7fFoer7TANkKb3zj0YV_8UO5MSpLxS9WfllrVxwRuBd53r2P_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSBGDWHIAAAAAC6MCL4YILLLO3QX import asyncio import logging import time from multiprocessing import Pool import aiohttp import aiomysql import requests from aiohttp import ContentTypeError from lxml import etree # 请求参数 # appid: search-pc-java # functionId: pc_search_s_new # client: pc # clientVersion: 1.0.0 # t: 1720419837554 # body: {"keyword":"大地瓜","qrst":"1","wq":"大地瓜","stock":"1","pvid":"31f843077aa2407b95785a72340fe7e7","isList":0,"page":"3","s":"56","click":"0","log_id":"1720407889073.8949","show_items":""} # loginType: 3 # uuid: 143920055.1720405515970186801854.1720405516.1720405516.1720407835.2 # area: 5_148_0_0 # h5st: 
# 20240708142357556;5t9mgnyi65z65t97;f06cc;tk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV;76b0218808aa807ffb62c60238f484ba51bf66b05c4c975a3e5cf9d7aa316ea4;4.7;1720419837556;VelsisygYUGRe31NktYIIIyGn7JfAPpt-GDNwiz9Lgfc7TrTDDGgsXxz8ylQfb4N-lRHExGRcKi0OHa4W3Nvj8WhEbsZ2IL-0lzWJKmZEC_1_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3
# x-api-eid-token: jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSBGDWHIAAAAAC6MCL4YILLLO3QX

# Maximum number of requests allowed in flight at the same time.
CONCURRENCY = 2

# Number of API pages requested by Spider.get_urls().
# NOTE(review): the stated goal is "the first three result pages"; presumably
# each visible page is served by two API pages -- confirm against the site.
API_PAGE_COUNT = 6

# Search request template. ``t`` and ``log_id`` receive the request timestamp,
# ``page`` the API page number and ``s`` the item offset. The h5st signature,
# uuid and x-api-eid-token are values captured from a browser session.
_SEARCH_URL_TEMPLATE = (
    'https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new'
    '&client=pc&clientVersion=1.0.0&t={t}'
    '&body={{"keyword":"大地瓜","qrst":"1","suggest":"1.his.0.0","wq":"大地瓜",'
    '"stock":"1","pvid":"6557c5ae68374b69b120ffd848006147",'
    '"page":"{page}","s":"{s}","scrolling":"y","log_id":"{t}",'
    '"tpl":"1_M","isList":0,"show_items":""}}'
    '&loginType=3'
    '&uuid=143920055.1720405515970186801854.1720405516.1720407835.1720426267.3'
    '&area=5_148_172_34120'
    '&h5st=20240708170416741;5t9mgnyi65z65t97;f06cc;'
    'tk03wb0d01bf518n32iEIGovakaKKTQNowDzR6awWYSGFUuBxwxdqI7rI5zMMEIKtbV1MlLFKx401IvhD1hSA4wpk2GV;'
    '428a1e9616a201717b6d47b2b3722bd452df43902900ff626a995e2227171c41;4.7;1720429456741;'
    'V_yUY6sbEL-CaIp_dxeIZ5pfbYaWIcva9sozBqHoAg6ZdawBS6lfLQF9JijQSlEpXWZqA93RjP5gIyjMqyv5EMtQXRKP83UlcFUcKqeFCVpE_W44FIX_aU0T-VVaCT05fg_EXF-1PNNZBqaXHYx0BYl8zZ19Td0vLFIjaL01RMbD07ZObz8tLp2Jn3DvJml1suhfTZ89y7tuV7ItBhR7lbKKWByWlB8XRWGSg9XsxxB0HAZMcIUdNrylVgeML9uj9b5qTJ29X23MbI-a8LskjhzRdae8sApBdXTmGY5EmOB90339yeyK6rAKBoUnwtPtsr41wX1NpBGgHFQ9KUYCwLL8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3'
    '&x-api-eid-token=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSGLBYGYAAAAADLIMW7XCPAQPEIX'
)

# Logging configuration.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s : %(message)s')


class Spider(object):
    """Fetch JD search API pages concurrently and store the prices in MySQL."""

    def __init__(self):
        # aiohttp client session; created in main().
        self.session = None
        # Limits the number of concurrent requests. Created lazily so that it
        # is always constructed while the event loop that will use it is
        # running (required on Python < 3.10 when asyncio.run() is used).
        self.semaphore = None
        # aiomysql connection pool; created by init_pool().
        self.pool = None

    def _get_semaphore(self):
        """Return the request semaphore, creating it on first use."""
        if self.semaphore is None:
            self.semaphore = asyncio.Semaphore(CONCURRENCY)
        return self.semaphore

    async def init_pool(self):
        """Create the MySQL connection pool and store it on ``self.pool``."""
        # NOTE(review): credentials are hard-coded; consider loading them from
        # configuration or the environment.
        # Original author's note: no explicit ``loop`` argument is passed;
        # aiomysql uses the current event loop.
        self.pool = await aiomysql.create_pool(
            host="127.0.0.1",
            port=3306,
            user="root",
            password="123456",
            db="jingdong",
            autocommit=True,
        )

    async def close_pool(self):
        """Close the MySQL connection pool if one was created."""
        if self.pool:
            self.pool.close()
            await self.pool.wait_closed()
            self.pool = None

    async def scrape_api(self, url):
        """Download ``url`` and return the response body as text.

        Returns ``None`` when the request fails; the failure is logged with
        its traceback instead of being raised.
        """
        # The semaphore caps how many requests run at once.
        async with self._get_semaphore():
            try:
                logging.info('scraping %s', url)
                async with self.session.get(url) as response:
                    text = await response.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                logging.error('error ocurred while scraping %s', url,
                              exc_info=True)
                return None
            # Throttle the crawl rate: keep the semaphore slot for one more
            # second before the next request may start.
            await asyncio.sleep(1)
            return text

    def get_urls(self):
        """Return the list of search API URLs, one per API page."""
        # The captured requests carry a 13-digit millisecond timestamp in
        # ``t``, so convert from float seconds.
        t = int(time.time() * 1000)
        urls = []
        for page in range(1, API_PAGE_COUNT + 1):
            # Item offset: 1 for the first page, then 26, 56, 86, ...
            s = 1 if page == 1 else 26 + 30 * (page - 2)
            urls.append(_SEARCH_URL_TEMPLATE.format(t=t, page=page, s=s))
        return urls

    async def main(self):
        """Crawl every URL from get_urls() and insert the prices into MySQL."""
        # NOTE(review): the Cookie below is a captured login session (it
        # includes auth tokens); it should not be committed to source control.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
            'Origin': 'https://search.jd.com',
            'Cookie': (
                '__jdv=76161171|www.google.com|-|referral|-|1720405515971; '
                '__jdu=1720405515970186801854; '
                'areaId=5; '
                'ipLoc-djd=5-148-0-0; '
                'PCSYCityID=CN_130000_130400_0; '
                'shshshfpa=b91bb682-fc61-e92f-d4fa-6f2ab9016096-1720405517; '
                'shshshfpx=b91bb682-fc61-e92f-d4fa-6f2ab9016096-1720405517; '
                '_pst=jd_JEmItmZNMVjZ; '
                'unick=jd_19k0x9pj4djw61; '
                'pin=jd_JEmItmZNMVjZ; '
                '_tp=x6GXi%2BJW3x6n83VWqaBHsA%3D%3D; '
                'pinId=4zoL-jyJrsYADYzmUaUUbg; '
                'jsavif=1; '
                'jcap_dvzw_fp=D98Z_LJ6_qpQNHV4YAypinlsUHSWxNmfTaynt0Zdv_UsqMZp3i5Qpdsn0w4iLfi6Puo7XtYZf66yNMh-MoPnwg==; '
                'unpl=JF8EAJ5nNSttDEhXUh4BT0UUQ1tUW1sPTR9Ra2JRVQ1eTFVSSAIYQkR7XlVdWBRKER9ubhRUWlNLVg4aBisSEHtdVV9eDkIQAmthNWRVUCVXSBtsGHwQBhAZbl4IexcCX2cDV1xdSlABGwYTFBFLVFNXXAhCEwZfZjVUW2h7ZAQrAysTIAAzVRNdDkgWBm5jAVRZUE1VBRIFEhMQQllRblw4SA; '
                '3AB9D23F7A4B3CSS=jdd03V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4AAAAMQSFTVBHQAAAAAC4B6QWJAGOWDJ4X; '
                '_gia_d=1; '
                'mba_muid=1720405515970186801854; '
                'mba_sid=17204263891663834439004533139.1; '
                'wlfstk_smdl=cvb04qz3roc5w60cs5zyx6jif17m2plf; '
                'logintype=wx; '
                'npin=jd_JEmItmZNMVjZ; '
                'thor=60B98CF932ADA526FE25802592C19B87D6CE84D9F3BE42CF89AA8311226A742764C1877F1DF3A80F050EC47B4A640838398AACBCEF5DAB9D905EE6050D9280A3DAF6B0C4C1046A9CF34BC2A309838E2D3EF1A02A02BAC5B5D059837D42CE1D0EBB6A877CB19B0142334418A26D7781EB79EE6AF17DB9031460A688639BF795EBA9A05B0BA147282060EF0C8CD1A49E53640ABCC266E9EAA802150EEAB7097923; '
                'flash=2_2XfyufGunTsB2yv6828kN5vMWLQatIHqpk1syw5siK-0q3Jn3XMiJ0mUCKH3K-YnrxBL0OKhvHK_nTbXbhP5DfKN0yEorcr34im6PSWjEjl9pstRvn3LitJI2FykXYej-Qgtus5ZXnCFuSowFYxceVWG0GdRbiqx9dVtieXi41P*; '
                '__jda=143920055.1720405515970186801854.1720405516.1720407835.1720426267.3; '
                '__jdc=143920055; '
                'shshshfpb=BApXcuRtvkvVA7RiSDiYHm2YzBl0Cx9FMBmIBUjho9xJ1MqB1loC2; '
                '3AB9D23F7A4B3C9B=V5VWE343CHG6ZQ5SWNXMF26W56JIB22JTI3USWGZ62J6GCA43ERHGX2X3VC7A2CC52WLPNXLZGK6XH6BBTWT7G3ET4; '
                '__jdb=143920055.19.1720405515970186801854|3.1720426267'
            ),
        }

        # Build the semaphore inside the running loop (see __init__).
        self.semaphore = asyncio.Semaphore(CONCURRENCY)
        # The session carries the headers/cookie for every request.
        self.session = aiohttp.ClientSession(headers=headers)
        try:
            # One coroutine per URL; gather() keeps the results in URL order.
            results = await asyncio.gather(
                *(self.get_prices(url) for url in self.get_urls())
            )
        finally:
            # Always release the HTTP session, even if a task raised.
            await self.session.close()

        # Store every price as its own row.
        await self.init_pool()
        try:
            for page_result in results:
                for price in page_result:
                    await self.save_to_mysql('prices', 'price', (price,))
        finally:
            await self.close_pool()

    @staticmethod
    def _build_insert(table_name, table_column_str, row):
        """Return ``(sql, params)`` for a parameterised single-row INSERT.

        ``table_column_str`` is a comma-separated list of column names and
        ``row`` is a tuple/list with one value per column (a bare scalar is
        treated as a one-column row). Raises ``ValueError`` if a table or
        column name is not a plain identifier or the value count does not
        match the column count.
        """
        columns = [column.strip() for column in table_column_str.split(',')]
        # Identifiers cannot be bound as query parameters, so validate them
        # before interpolating them into the statement.
        for identifier in [table_name] + columns:
            if not identifier.isidentifier():
                raise ValueError(f'invalid SQL identifier: {identifier!r}')
        if not isinstance(row, (tuple, list)):
            row = (row,)
        if len(row) != len(columns):
            raise ValueError(
                f'expected {len(columns)} value(s), got {len(row)}')
        placeholders = ', '.join(['%s'] * len(columns))
        sql = (f'insert into {table_name}({", ".join(columns)}) '
               f'values ({placeholders})')
        return sql, tuple(row)

    async def save_to_mysql(self, table_name, table_column_str, table_info_str):
        """Insert one row into ``table_name``.

        ``table_column_str`` names the target column(s); ``table_info_str``
        holds the value(s) for them (tuple/list, or a single scalar). Values
        are passed as query parameters rather than formatted into the SQL.
        """
        sql, params = self._build_insert(table_name, table_column_str,
                                         table_info_str)
        async with self.pool.acquire() as conn:
            async with conn.cursor() as cursor:
                await cursor.execute(sql, params)
                await conn.commit()

    @staticmethod
    def _parse_prices(source):
        """Extract the price strings from a page body; ``[]`` if none."""
        if not source:
            return []
        tree = etree.HTML(source)
        if tree is None:
            return []
        return [str(price)
                for price in tree.xpath('//div[@class="p-price"]/i/text()')]

    async def get_prices(self, url):
        """Download ``url`` and return the list of price strings found in it.

        Returns an empty list when the download failed or nothing matched.
        """
        source = await self.scrape_api(url)
        return self._parse_prices(source)


if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs the crawl and closes the loop.
    asyncio.run(Spider().main())
异步方案(asyncio 和 aiohttp)主要是针对IO密集型任务设计的。在这种情况下,多个网络请求可以并发执行,但每个请求本身在CPU上的计算量通常较小,因为大部分时间都是在等待响应返回。对于这类任务,协程(asyncio)会更适合,因为它可以在单个线程内高效地管理大量并发的IO操作,而不需要引入多进程的复杂性。因此,对于本文这样的爬虫,异步方案(asyncio 和 aiohttp)能够提供很好的性能,因为它可以在单个线程内高效地管理并发的IO操作。
后续发布爬虫更多精致内容(按某培训机构爬虫课程顺序发布,欢迎关注后续发布)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。