赞
踩
要爬取的内容是农业智能知识服务平台-病虫害百科的相关内容
pandas、requests等第三方库的下载(注:concurrent.futures属于Python标准库,无需额外安装)
导入第三方库:
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Thread
(1)打开抓包工具—network并且刷新网页,在抓包工具中找内容所在,在一个json文件里面(注意guid内容)
(2)点击网页,看子页链接,发现是要爬取网页的负载
(3)所以我们的步骤是:爬取主页的json文件获取子页url负载,接着爬取子页json获取需要的内容
(1)爬取主页json文件,获取guid部分
def insert_list(target_list, item):  # target_list: the guid list created in main
    """Append one item to the given list.

    Parameter renamed from ``list`` so it no longer shadows the builtin;
    all call sites pass arguments positionally, so callers are unaffected.
    """
    target_list.append(item)
# Scrape the list endpoint and collect every record's guid
def func_main(url):
    """POST the list page and append each record's guid to ``guid_list``.

    Reads the module-level ``dic`` (form payload set in main) and appends
    into the module-level ``guid_list`` — NOTE(review): this coupling to
    globals is preserved from the original design.
    """
    resp = requests.post(url, data=dic)
    try:
        # Parse BEFORE closing (the original closed first, then parsed).
        content = resp.json()
    finally:
        resp.close()
    for dic_guid in content['data']:
        # Append directly: the original spawned one unjoined Thread per
        # guid, so guid_list could still be incomplete when the caller
        # moved on to the next stage.
        guid_list.append(dic_guid['guid'])
def func_child(guid):
    """Fetch one pest-detail page by guid and append its fields to ``pd_list``.

    ``pd_list`` is the module-level accumulator defined in main; the dict
    keys below become the Excel column headers.
    """
    new_url = "http://agri.nais.net.cn/insectpests/insectPestsDetail"
    param = {"gid": f'{guid}'}
    resp = requests.get(new_url, params=param)
    try:
        # Parse before closing the response (original closed first).
        all_content = resp.json()
    finally:
        resp.close()
    content = all_content['data']
    # 主题 简介 主要危害作物 危害症状 发生因素 防治方法
    pd_list.append({
        '主题': content['zhName'],
        '简介': content['introduction'],
        '主要危害作物': content['mainHarmCrops'],
        '危害症状': content['hazardSymptoms'],
        '发生因素': content['driver'],
        '防治方法': content['preventionMethods'],
    })
if __name__ == '__main__':
    import os

    guid_list = []  # guids collected from the list endpoint
    pd_list = []    # one dict per pest record, consumed by pandas at the end

    url = "http://agri.nais.net.cn/insectpests/insectPestsList"
    for i in range(0, 315):
        # `dic` is the form payload read (as a global) by func_main —
        # module-level coupling preserved from the original design.
        dic = {
            "start": f'{i}',
            "end": 10,
            "subType": 0,
            "searchText": ""
        }
        # The original wrapped this in thread_run(), which built a 30-worker
        # pool, submitted a single task, and waited for it — i.e. a plain
        # sequential call. Call directly instead.
        func_main(url)

    # Fan out the detail-page fetches across worker threads (I/O-bound).
    with ThreadPoolExecutor(200) as t:
        for guid in guid_list:
            t.submit(func_child, guid)

    df = pd.DataFrame(pd_list)
    # mode="a" raises FileNotFoundError when the workbook does not exist
    # yet; fall back to "w" on the first run.
    mode = "a" if os.path.exists("病害虫百科.xlsx") else "w"
    with pd.ExcelWriter("病害虫百科.xlsx", mode=mode) as w:
        df.to_excel(w, sheet_name="第一个表格", index=False)
import os

import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Thread


def thread_run(target, args, num):
    """Run ``target(args)`` on a fresh pool of ``num`` workers and wait.

    NOTE(review): only one task is ever submitted, so the pool adds no
    real concurrency — kept only for interface compatibility.
    """
    with ThreadPoolExecutor(num) as t:
        t.submit(target, args)


def insert_list(target_list, item):
    """Append one item (param renamed so it no longer shadows builtin list)."""
    target_list.append(item)


# Scrape the list endpoint and collect every record's guid
def func_main(url):
    """POST the list page and append each record's guid to ``guid_list``.

    Reads the module-level ``dic`` payload and appends into the
    module-level ``guid_list`` (coupling preserved from the original).
    """
    resp = requests.post(url, data=dic)
    try:
        content = resp.json()  # parse BEFORE closing (original closed first)
    finally:
        resp.close()
    for dic_guid in content['data']:
        # Append directly: the original spawned one unjoined Thread per
        # guid, so guid_list could still be incomplete afterwards.
        guid_list.append(dic_guid['guid'])


# Fetch one detail page per guid
def func_child(guid):
    """Fetch one pest-detail page and append its fields to ``pd_list``."""
    new_url = "http://agri.nais.net.cn/insectpests/insectPestsDetail"
    param = {"gid": f'{guid}'}
    resp = requests.get(new_url, params=param)
    try:
        all_content = resp.json()
    finally:
        resp.close()
    content = all_content['data']
    # 主题 简介 主要危害作物 危害症状 发生因素 防治方法
    pd_list.append({
        '主题': content['zhName'],
        '简介': content['introduction'],
        '主要危害作物': content['mainHarmCrops'],
        '危害症状': content['hazardSymptoms'],
        '发生因素': content['driver'],
        '防治方法': content['preventionMethods'],
    })


if __name__ == '__main__':
    guid_list = []  # guids collected from the list endpoint
    pd_list = []    # one dict per pest record

    url = "http://agri.nais.net.cn/insectpests/insectPestsList"
    for i in range(0, 315):
        # Form payload read (as a global) by func_main.
        dic = {
            "start": f'{i}',
            "end": 10,
            "subType": 0,
            "searchText": ""
        }
        # thread_run() submitted a single task and waited — i.e. a plain
        # sequential call; call directly.
        func_main(url)

    # Fan out detail-page fetches across worker threads (I/O-bound).
    with ThreadPoolExecutor(200) as t:
        for guid in guid_list:
            t.submit(func_child, guid)

    df = pd.DataFrame(pd_list)
    # mode="a" raises FileNotFoundError when the workbook does not exist
    # yet; fall back to "w" on the first run.
    mode = "a" if os.path.exists("病害虫百科.xlsx") else "w"
    with pd.ExcelWriter("病害虫百科.xlsx", mode=mode) as w:
        df.to_excel(w, sheet_name="第一个表格", index=False)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。