赞
踩
安装豆果美食(QQ分享至模拟器)→打开Fiddler并remove all sessions→启动豆果美食→点击菜谱分类→点击红烧肉→点击学做多→查看数据包
问题:提示net::ERR_PROXY_CONNECTION_FAILED
解决方法:重启Fiddler
打开Fiddler→点击find→输入api.douguo.net→点击Find Sessions进行高亮显示→保留菜谱分类数据包、土豆学做多前40条数据包
# Collect every dish name of the Douguo recipe catalogue into a work queue.
import json
from queue import Queue

import requests

# Work queue holding one search payload per catalogue dish. Nothing here
# crosses a process boundary, so the in-process queue.Queue is used; unlike
# multiprocessing.Queue its qsize() is available on every platform.
queue_list = Queue()


def handle_request(url, data, timeout=10):
    """POST ``data`` as a form body to ``url`` using the captured app headers.

    Every request of this scraper shares the same headers, so only the URL
    and the form payload vary.

    Args:
        url: Endpoint to post to.
        data: Dict sent as the form-encoded request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    # Headers were converted from the raw capture with the regex replace
    # (.*?):(.*)  ->  "$1":"$2",
    header = {
        "client": "4",
        "version": "7109.2",
        "device": "M2007J22C",
        "sdk": "25,7.1.2",
        "imei": "351564354264020",
        "channel": "qqkp",
        "resolution": "1600*900",
        "dpi": "2.0",
        "brand": "Xiaomi",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        # NOTE: the captured request carried no "carrier" header, so it is omitted.
        "User-Agent": "Mozilla/5.0(Linux;Android 7.1.2;M2007J22C Build/QP1A.190711.020;wv) AppleWebKit/537.36(KHTML,like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "1",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # NOTE(review): Host names logs.douguo.net while the URLs used by this
        # script target api.douguo.net; kept exactly as captured - confirm.
        "Host": "logs.douguo.net",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response


def handle_index():
    """Fetch the flat recipe catalogue and enqueue one search payload per dish.

    The response is walked three ``cs`` levels deep below ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of a payload
    that is put on ``queue_list``.
    """
    # Form body converted from the capture: '&' replaced by newlines, values quoted.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_vs": "2305",
    }
    response = handle_request(url=url, data=data)
    catalog_response_dict = json.loads(response.text)
    for catalog_list in catalog_response_dict['result']['cs']:
        for catalog in catalog_list['cs']:
            for dishes in catalog['cs']:
                queue_list.put({
                    "client": "4",
                    "keyword": dishes['name'],
                    "order": "3",
                    "_vs": "400",
                })


handle_index()
print(queue_list.qsize())
spider_duogou_0.py运行结果如下:
# -*- coding: utf-8 -*-
# @Author : 袁天琪
# @Time : 2022/3/14 16:53
# Build the recipe records for one ingredient (extends spider_duogou_0.py).
import json
from queue import Queue

import requests

# Work queue holding one search payload per catalogue dish. Nothing here
# crosses a process boundary, so the in-process queue.Queue is sufficient.
queue_list = Queue()


def handle_request(url, data, timeout=10):
    """POST ``data`` as a form body to ``url`` using the captured app headers.

    Every request of this scraper shares the same headers, so only the URL
    and the form payload vary.

    Args:
        url: Endpoint to post to.
        data: Dict sent as the form-encoded request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    # Headers were converted from the raw capture with the regex replace
    # (.*?):(.*)  ->  "$1":"$2",
    header = {
        "client": "4",
        "version": "7109.2",
        "device": "M2007J22C",
        "sdk": "25,7.1.2",
        "imei": "351564354264020",
        "channel": "qqkp",
        "resolution": "1600*900",
        "dpi": "2.0",
        "brand": "Xiaomi",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        # NOTE: the captured request carried no "carrier" header, so it is omitted.
        "User-Agent": "Mozilla/5.0(Linux;Android 7.1.2;M2007J22C Build/QP1A.190711.020;wv) AppleWebKit/537.36(KHTML,like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "1",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # NOTE(review): Host names logs.douguo.net while the URLs used by this
        # script target api.douguo.net; kept exactly as captured - confirm.
        "Host": "logs.douguo.net",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response


def handle_index():
    """Fetch the flat recipe catalogue and enqueue one search payload per dish.

    The response is walked three ``cs`` levels deep below ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of a payload
    that is put on ``queue_list``.
    """
    # Form body converted from the capture: '&' replaced by newlines, values quoted.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_vs": "2305",
    }
    response = handle_request(url=url, data=data)
    catalog_response_dict = json.loads(response.text)
    for catalog_list in catalog_response_dict['result']['cs']:
        for catalog in catalog_list['cs']:
            for dishes in catalog['cs']:
                queue_list.put({
                    "client": "4",
                    "keyword": dishes['name'],
                    "order": "3",
                    "_vs": "400",
                })


def handle_recipe_list(data):
    """Search recipes for one ingredient and print each one with its details.

    Args:
        data: Search payload as produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.

    For every search hit whose ``type`` is 13 a second request fetches the
    recipe detail, and the merged record is printed as one JSON line.
    """
    print("当前处理的食材是:", data['keyword'])
    # NOTE(review): the trailing /0/20 looks like offset/page-size - confirm.
    recipe_list_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    recipe_list_response = handle_request(url=recipe_list_url, data=data)
    recipe_list_response_dict = json.loads(recipe_list_response.text)
    for recipe_list in recipe_list_response_dict['result']['list']:
        # Only hits of type 13 are processed; every other entry is skipped.
        if recipe_list['type'] != 13:
            continue
        recipe = recipe_list['r']
        recipe_id = recipe['id']
        recipe_info = {
            'main_ingredient': data['keyword'],
            'username': recipe['an'],
            'ingredients_id': recipe_id,
            'describe': recipe['cookstory'].replace('\n', '').replace(' ', ''),
            'dishname': recipe['n'],
            'ingredients': recipe['major'],
        }
        detail_url = 'http://api.douguo.net/recipe/detail/' + str(recipe_id)
        detail_data = {
            "client": "4",
            "author_id": "0",
            "_vs": "2803",
            # Serialised with json.dumps so the keyword is quoted and the
            # members are comma-separated; the hand-concatenated string it
            # replaces was not valid JSON.
            "_ext": json.dumps(
                {
                    "query": {
                        "id": recipe_id,
                        "kw": data['keyword'],
                        "idx": "4",
                        "src": "2803",
                        "type": "13",
                    }
                },
                ensure_ascii=False,
                separators=(",", ":"),
            ),
        }
        detail_response = handle_request(url=detail_url, data=detail_data)
        detail_recipe = json.loads(detail_response.text)['result']['recipe']
        recipe_info['tips'] = detail_recipe['tips']
        recipe_info['cookstep'] = detail_recipe['cookstep']
        print(json.dumps(recipe_info))


handle_index()
handle_recipe_list(queue_list.get())
spider_douguo_1.py运行结果如下:
出现的问题
# -*- coding: utf-8 -*-
# @Author : 袁天琪
# @Time : 2022/3/15 16:01
import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    """Small wrapper around a local MongoDB connection for scraped items."""

    def __init__(self):
        """Connect to MongoDB on 127.0.0.1:27017 and select the 'douguo' database."""
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db_data = self.client['douguo']  # database name

    def insert_item(self, item):
        """Store ``item`` in the 'douhuo_item' collection.

        Args:
            item: A document (dict), or a list of documents.

        ``Collection.insert`` was deprecated in PyMongo 3 and removed in
        PyMongo 4, so the explicit ``insert_one`` / ``insert_many`` calls are
        used; a list is still accepted, as ``insert`` allowed.
        """
        # NOTE: the collection name is spelled 'douhuo_item' (not 'douguo');
        # it is kept unchanged so previously stored data stays reachable.
        db_collection = Collection(self.db_data, 'douhuo_item')
        if isinstance(item, list):
            db_collection.insert_many(item)
        else:
            db_collection.insert_one(item)


# Shared instance imported by the spider scripts.
mongo_info = Connect_mongo()
# -*- coding: utf-8 -*-
# @Author : 袁天琪
# @Time : 2022/3/14 21:18
# Save the scraped recipes to MongoDB.
import json
from queue import Queue

import requests

from handle_mongo import mongo_info

# Work queue holding one search payload per catalogue dish. Nothing here
# crosses a process boundary, so the in-process queue.Queue is sufficient.
queue_list = Queue()


def handle_request(url, data, timeout=10):
    """POST ``data`` as a form body to ``url`` using the captured app headers.

    Every request of this scraper shares the same headers, so only the URL
    and the form payload vary.

    Args:
        url: Endpoint to post to.
        data: Dict sent as the form-encoded request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    # Headers were converted from the raw capture with the regex replace
    # (.*?):(.*)  ->  "$1":"$2",
    header = {
        "client": "4",
        "version": "7109.2",
        "device": "M2007J22C",
        "sdk": "25,7.1.2",
        "imei": "351564354264020",
        "channel": "qqkp",
        "resolution": "1600*900",
        "dpi": "2.0",
        "brand": "Xiaomi",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        # NOTE: the captured request carried no "carrier" header, so it is omitted.
        "User-Agent": "Mozilla/5.0(Linux;Android 7.1.2;M2007J22C Build/QP1A.190711.020;wv) AppleWebKit/537.36(KHTML,like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "1",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # NOTE(review): Host names logs.douguo.net while the URLs used by this
        # script target api.douguo.net; kept exactly as captured - confirm.
        "Host": "logs.douguo.net",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response


def handle_index():
    """Fetch the flat recipe catalogue and enqueue one search payload per dish.

    The response is walked three ``cs`` levels deep below ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of a payload
    that is put on ``queue_list``.
    """
    # Form body converted from the capture: '&' replaced by newlines, values quoted.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_vs": "2305",
    }
    response = handle_request(url=url, data=data)
    catalog_response_dict = json.loads(response.text)
    for catalog_list in catalog_response_dict['result']['cs']:
        for catalog in catalog_list['cs']:
            for dishes in catalog['cs']:
                queue_list.put({
                    "client": "4",
                    "keyword": dishes['name'],
                    "order": "3",
                    "_vs": "400",
                })


def handle_recipe_list(data):
    """Search recipes for one ingredient and store each one in MongoDB.

    Args:
        data: Search payload as produced by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.

    For every search hit whose ``type`` is 13 a second request fetches the
    recipe detail, and the merged record is handed to
    ``mongo_info.insert_item``.
    """
    print("当前处理的食材是:", data['keyword'])
    # NOTE(review): the trailing /0/20 looks like offset/page-size - confirm.
    recipe_list_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    recipe_list_response = handle_request(url=recipe_list_url, data=data)
    recipe_list_response_dict = json.loads(recipe_list_response.text)
    for recipe_list in recipe_list_response_dict['result']['list']:
        # Only hits of type 13 are processed; every other entry is skipped.
        if recipe_list['type'] != 13:
            continue
        recipe = recipe_list['r']
        recipe_id = recipe['id']
        recipe_info = {
            'main_ingredient': data['keyword'],
            'username': recipe['an'],
            'ingredients_id': recipe_id,
            'describe': recipe['cookstory'].replace('\n', '').replace(' ', ''),
            'dishname': recipe['n'],
            'ingredients': recipe['major'],
        }
        detail_url = 'http://api.douguo.net/recipe/detail/' + str(recipe_id)
        detail_data = {
            "client": "4",
            "author_id": "0",
            "_vs": "2803",
            # Serialised with json.dumps so the keyword is quoted and the
            # members are comma-separated; the hand-concatenated string it
            # replaces was not valid JSON.
            "_ext": json.dumps(
                {
                    "query": {
                        "id": recipe_id,
                        "kw": data['keyword'],
                        "idx": "4",
                        "src": "2803",
                        "type": "13",
                    }
                },
                ensure_ascii=False,
                separators=(",", ":"),
            ),
        }
        detail_response = handle_request(url=detail_url, data=detail_data)
        detail_recipe = json.loads(detail_response.text)['result']['recipe']
        recipe_info['tips'] = detail_recipe['tips']
        recipe_info['cookstep'] = detail_recipe['cookstep']
        print('当前入库的菜谱是:', recipe_info['dishname'])
        mongo_info.insert_item(recipe_info)


handle_index()
handle_recipe_list(queue_list.get())
spider_douguo_2.py运行结果如下:
# -*- coding: utf-8 -*-
# @Author : 袁天琪
# @Time : 2022/3/14 21:18
# Scrape the recipes concurrently with a thread pool.
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue

import requests

from handle_mongo import mongo_info

# Work queue holding one search payload per catalogue dish. The workers are
# threads of this process, so the in-process queue.Queue is sufficient.
queue_list = Queue()


def handle_request(url, data, timeout=10):
    """POST ``data`` as a form body to ``url`` using the captured app headers.

    Every request of this scraper shares the same headers, so only the URL
    and the form payload vary.

    Args:
        url: Endpoint to post to.
        data: Dict sent as the form-encoded request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    # Headers were converted from the raw capture with the regex replace
    # (.*?):(.*)  ->  "$1":"$2",
    header = {
        "client": "4",
        "version": "7109.2",
        "device": "M2007J22C",
        "sdk": "25,7.1.2",
        "imei": "351564354264020",
        "channel": "qqkp",
        "resolution": "1600*900",
        "dpi": "2.0",
        "brand": "Xiaomi",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        # NOTE: the captured request carried no "carrier" header, so it is omitted.
        "User-Agent": "Mozilla/5.0(Linux;Android 7.1.2;M2007J22C Build/QP1A.190711.020;wv) AppleWebKit/537.36(KHTML,like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "1",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # NOTE(review): Host names logs.douguo.net while the URLs used by this
        # script target api.douguo.net; kept exactly as captured - confirm.
        "Host": "logs.douguo.net",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response


def handle_index():
    """Fetch the flat recipe catalogue and enqueue one search payload per dish.

    The response is walked three ``cs`` levels deep below ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of a payload
    that is put on ``queue_list``.
    """
    # Form body converted from the capture: '&' replaced by newlines, values quoted.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_vs": "2305",
    }
    response = handle_request(url=url, data=data)
    catalog_response_dict = json.loads(response.text)
    for catalog_list in catalog_response_dict['result']['cs']:
        for catalog in catalog_list['cs']:
            for dishes in catalog['cs']:
                queue_list.put({
                    "client": "4",
                    "keyword": dishes['name'],
                    "order": "3",
                    "_vs": "400",
                })


def handle_recipe_list(data):
    """Thread worker: search recipes for one ingredient and store them.

    Args:
        data: Search payload taken from ``queue_list``; its ``keyword`` entry
            is the ingredient being searched.

    The first request fetches the recipe list; for every hit whose ``type``
    is 13 a second request fetches the recipe detail, and the merged record
    is handed to ``mongo_info.insert_item``.
    """
    print("当前处理的食材是:", data['keyword'])
    # NOTE(review): the trailing /0/20 looks like offset/page-size - confirm.
    recipe_list_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    # First request: the recipe list for this ingredient.
    recipe_list_response = handle_request(url=recipe_list_url, data=data)
    recipe_list_response_dict = json.loads(recipe_list_response.text)
    for recipe_list in recipe_list_response_dict['result']['list']:
        # Only hits of type 13 are processed; every other entry is skipped.
        if recipe_list['type'] != 13:
            continue
        recipe = recipe_list['r']
        recipe_id = recipe['id']
        recipe_info = {
            'main_ingredient': data['keyword'],
            'username': recipe['an'],
            'ingredients_id': recipe_id,
            'describe': recipe['cookstory'].replace('\n', '').replace(' ', ''),
            'dishname': recipe['n'],
            'ingredients': recipe['major'],
        }
        detail_url = 'http://api.douguo.net/recipe/detail/' + str(recipe_id)
        detail_data = {
            "client": "4",
            "author_id": "0",
            "_vs": "2803",
            # Serialised with json.dumps so the keyword is quoted and the
            # members are comma-separated; the hand-concatenated string it
            # replaces was not valid JSON.
            "_ext": json.dumps(
                {
                    "query": {
                        "id": recipe_id,
                        "kw": data['keyword'],
                        "idx": "4",
                        "src": "2803",
                        "type": "13",
                    }
                },
                ensure_ascii=False,
                separators=(",", ":"),
            ),
        }
        # Second request: the detail page of this recipe.
        detail_response = handle_request(url=detail_url, data=detail_data)
        detail_recipe = json.loads(detail_response.text)['result']['recipe']
        recipe_info['tips'] = detail_recipe['tips']
        recipe_info['cookstep'] = detail_recipe['cookstep']
        print('当前入库的菜谱是:', recipe_info['dishname'])
        # Persist the merged record to MongoDB.
        mongo_info.insert_item(recipe_info)


handle_index()

# Fan the queued payloads out over a thread pool. The with-block waits for
# all workers and shuts the pool down; each future is inspected so that a
# failing worker is reported instead of vanishing silently.
with ThreadPoolExecutor(max_workers=20) as pool:
    pending = {}
    while not queue_list.empty():
        payload = queue_list.get()
        pending[pool.submit(handle_recipe_list, payload)] = payload['keyword']
    for future in as_completed(pending):
        error = future.exception()
        if error is not None:
            print("抓取失败:", pending[future], repr(error))
spider_douguo_3.py运行结果如下:
# -*- coding: utf-8 -*-
# @Author : 袁天琪
# @Time : 2022/3/14 21:18
# Scrape the recipes concurrently with a thread pool, through an HTTP proxy.
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue

import requests

from handle_mongo import mongo_info

# Work queue holding one search payload per catalogue dish. The workers are
# threads of this process, so the in-process queue.Queue is sufficient.
queue_list = Queue()


def handle_request(url, data, timeout=10):
    """POST ``data`` to ``url`` through the proxy, using the captured app headers.

    Every request of this scraper shares the same headers, so only the URL
    and the form payload vary.

    Args:
        url: Endpoint to post to.
        data: Dict sent as the form-encoded request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled or dead proxy cannot block a worker thread forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection/proxy failure or timeout.
    """
    # Headers were converted from the raw capture with the regex replace
    # (.*?):(.*)  ->  "$1":"$2",
    header = {
        "client": "4",
        "version": "7109.2",
        "device": "M2007J22C",
        "sdk": "25,7.1.2",
        "imei": "351564354264020",
        "channel": "qqkp",
        "resolution": "1600*900",
        "dpi": "2.0",
        "brand": "Xiaomi",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        # NOTE: the captured request carried no "carrier" header, so it is omitted.
        "User-Agent": "Mozilla/5.0(Linux;Android 7.1.2;M2007J22C Build/QP1A.190711.020;wv) AppleWebKit/537.36(KHTML,like Gecko) Version/4.0 Chrome/92.0.4515.131 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "1",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # NOTE(review): Host names logs.douguo.net while the URLs used by this
        # script target api.douguo.net; kept exactly as captured - confirm.
        "Host": "logs.douguo.net",
    }
    # Proxy used for plain-HTTP requests.
    proxy = {'http': '106.54.128.253:999'}
    response = requests.post(
        url=url, headers=header, data=data, proxies=proxy, timeout=timeout
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response


def handle_index():
    """Fetch the flat recipe catalogue and enqueue one search payload per dish.

    The response is walked three ``cs`` levels deep below ``result``; the
    ``name`` of every innermost entry becomes the ``keyword`` of a payload
    that is put on ``queue_list``.
    """
    # Form body converted from the capture: '&' replaced by newlines, values quoted.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_vs": "2305",
    }
    response = handle_request(url=url, data=data)
    catalog_response_dict = json.loads(response.text)
    for catalog_list in catalog_response_dict['result']['cs']:
        for catalog in catalog_list['cs']:
            for dishes in catalog['cs']:
                queue_list.put({
                    "client": "4",
                    "keyword": dishes['name'],
                    "order": "3",
                    "_vs": "400",
                })


def handle_recipe_list(data):
    """Thread worker: search recipes for one ingredient and store them.

    Args:
        data: Search payload taken from ``queue_list``; its ``keyword`` entry
            is the ingredient being searched.

    The first request fetches the recipe list; for every hit whose ``type``
    is 13 a second request fetches the recipe detail, and the merged record
    is handed to ``mongo_info.insert_item``.
    """
    print("当前处理的食材是:", data['keyword'])
    # NOTE(review): the trailing /0/20 looks like offset/page-size - confirm.
    recipe_list_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    # First request: the recipe list for this ingredient.
    recipe_list_response = handle_request(url=recipe_list_url, data=data)
    recipe_list_response_dict = json.loads(recipe_list_response.text)
    for recipe_list in recipe_list_response_dict['result']['list']:
        # Only hits of type 13 are processed; every other entry is skipped.
        if recipe_list['type'] != 13:
            continue
        recipe = recipe_list['r']
        recipe_id = recipe['id']
        recipe_info = {
            'main_ingredient': data['keyword'],
            'username': recipe['an'],
            'ingredients_id': recipe_id,
            'describe': recipe['cookstory'].replace('\n', '').replace(' ', ''),
            'dishname': recipe['n'],
            'ingredients': recipe['major'],
        }
        detail_url = 'http://api.douguo.net/recipe/detail/' + str(recipe_id)
        detail_data = {
            "client": "4",
            "author_id": "0",
            "_vs": "2803",
            # Serialised with json.dumps so the keyword is quoted and the
            # members are comma-separated; the hand-concatenated string it
            # replaces was not valid JSON.
            "_ext": json.dumps(
                {
                    "query": {
                        "id": recipe_id,
                        "kw": data['keyword'],
                        "idx": "4",
                        "src": "2803",
                        "type": "13",
                    }
                },
                ensure_ascii=False,
                separators=(",", ":"),
            ),
        }
        # Second request: the detail page of this recipe.
        detail_response = handle_request(url=detail_url, data=detail_data)
        detail_recipe = json.loads(detail_response.text)['result']['recipe']
        recipe_info['tips'] = detail_recipe['tips']
        recipe_info['cookstep'] = detail_recipe['cookstep']
        print('当前入库的菜谱是:', recipe_info['dishname'])
        # Persist the merged record to MongoDB.
        mongo_info.insert_item(recipe_info)


handle_index()

# Fan the queued payloads out over a small thread pool. The with-block waits
# for all workers and shuts the pool down; each future is inspected so that
# a failing worker is reported instead of vanishing silently.
with ThreadPoolExecutor(max_workers=2) as pool:
    pending = {}
    while not queue_list.empty():
        payload = queue_list.get()
        pending[pool.submit(handle_recipe_list, payload)] = payload['keyword']
    for future in as_completed(pending):
        error = future.exception()
        if error is not None:
            print("抓取失败:", pending[future], repr(error))
spider_douguo_4.py运行结果如下:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。