赞
踩
这个脚本使用了requests,json,time,pandas以及BeautifulSoup等模块。requests用于发送网络请求,json用于处理JSON类型数据,time用于在请求之间添加休眠,以降低因请求过于频繁而被服务器封禁的风险,pandas用于处理和存储数据,BeautifulSoup用于解析HTML页面。
import requests
import json
import time
import pandas as pd
from bs4 import BeautifulSoup
此部分定义了头部信息(headers)和请求网址(url)。头部信息用于构造符合服务器要求的http请求,避免因为缺乏必要的头部信息而导致请求被拒绝。请求网址是数据抓取的源头。
# Browser-like request headers sent with the requests made by this script.
# NOTE(review): 'Cookie' embeds what looks like a logged-in session (BDUSS);
# hard-coding credentials in source is a security risk -- consider loading the
# value from the environment or a local, untracked config file.
# NOTE(review): 'Accept-Encoding' advertises br/zstd; presumably the HTTP
# client can only decode those when the optional decoders are installed --
# confirm, otherwise the JSON parsing of responses may fail.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'yiqifu.baidu.com',
    'Referer': 'https://yiqifu.baidu.com/g/aqc/joblist?q=python',
    'Sec-Ch-Ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': 'BIDUPSID=FFE582BA7343E4BDE8F2B0969587933A; PSTM=1701944630; BAIDUID=FFE582BA7343E4BDDB41B7BF2E661BA5:FG=1; BDUSS=NrUG9jTlVkRFBXa3V0bW5pNjNFUGdHaTdnc21rdXpkZUpvTU9nbFpaaGpVZEJsSVFBQUFBJCQAAAAAAAAAAAEAAABJQjjR0-nA1mNhcmV5eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGPEqGVjxKhlc0; BDUSS_BFESS=NrUG9jTlVkRFBXa3V0bW5pNjNFUGdHaTdnc21rdXpkZUpvTU9nbFpaaGpVZEJsSVFBQUFBJCQAAAAAAAAAAAEAAABJQjjR0-nA1mNhcmV5eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGPEqGVjxKhlc0; MCITY=-75%3A; H_WISE_SIDS_BFESS=40045_40166_40202_39662_40210_40216_40222; H_WISE_SIDS=39662_40210_40216_40222_40271_40294_40291_40289_40286_40317_40079; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=39662_40210_40216_40222_40271_40294_40291_40289_40286_40317_40079_40364_40352_40301_40381_40366; BA_HECTOR=81ak8h048gak8ga1a485a1849i0vgo1iuja9s1t; ZFY=SJTaRNG4jPGf5XpXAboM31VLOh8ATplB5TW1u:Atu7Tk:C; BAIDUID_BFESS=FFE582BA7343E4BDDB41B7BF2E661BA5:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=7; clue_site=pc; clue_ext=%7B%22referer%22%3A%22www.baidu.com%22%2C%22ref_eqid%22%3A%22b9d3408400103e780000000665e9c22e%22%7D; log_guid=9c965543f29ee6e76083129d371aaa8a; log_first_time=1709818419524; Hm_lvt_37e1bd75d9c0b74f7b4a8ba07566c281=1709818420; Hm_lpvt_37e1bd75d9c0b74f7b4a8ba07566c281=1709818903; log_last_time=1709818910917',
    # The original literal was broken across two lines inside the string;
    # it is rejoined here with the single separating space.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
}
这个函数在抓取分页数据时使用。它从参数中获取要抓取的页码,然后构造请求url并发送GET请求,最后解析得到的JSON数据并返回'data'字段中的'list'元素(这个元素包含了职位的详细信息)。
# Request address of the job-list AJAX endpoint.
url = 'https://yiqifu.baidu.com/g/aqc/joblist/getDataAjax?'


def send_get(page):
    """Fetch one page of job listings from the search endpoint.

    Args:
        page: Page number to request.

    Returns:
        The value stored under ``['data']['list']`` in the JSON response, or
        an empty list when the request fails or the response does not have
        that structure.
    """
    # q is the search keyword, page the page number, district the city code
    # and salaryrange the salary range (left empty).
    params = {
        'q': 'python',
        'page': page,
        'district': '510100',
        'salaryrange': '',
    }
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(url, headers=headers, params=params, timeout=10)
        # Parse the body as JSON and pick out the listing entries.
        return json.loads(res.text)['data']['list']
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Best effort: report the failure and let the caller skip this page.
        print(f'Failed to fetch page {page}: {exc!r}')
        return []
这个方法用于处理从send_get方法获取的JSON数据。首先构造一个字典,然后从参数传入的职位数据中提取出所需的信息并放入字典中,同时还调用了responsibility()函数获取职位详细描述。函数最后将处理后的职位信息返回。
def process_data(data):
    """Turn one raw job record into a flat dict suitable for pandas.

    Args:
        data: Mapping for a single job; the keys ``city``, ``company``,
            ``edu``, ``exp``, ``jobName``, ``salary``, ``bid`` and ``jobId``
            are read.

    Returns:
        A dict with the job's fields under their output column names,
        including the job description fetched via ``responsibility``.
    """
    row = {
        '城市': data['city'],
        '公司名称': data['company'],
        '学历要求': data['edu'],
        '工作经验': data['exp'],
        # Drop the <em>...</em> tags wrapped around parts of the job title.
        '招聘岗位': data['jobName'].replace('<em>', '').replace('</em>', ''),
        '薪资待遇': data['salary'],
    }

    # Build the link to the job detail page.
    detail_url = (
        'https://yiqifu.baidu.com/g/aqc/jobDetail'
        f"?bid={data['bid']}&jobId={data['jobId']}&from=ps&fr=job_ald&rq=pos"
    )

    # Fetch the responsibilities text and store it alongside the other fields.
    row['岗位职责'] = responsibility(detail_url)
    print(f'正在获取{row}')
    return row
这个方法基于BeautifulSoup模块,用于解析职位详情页中的职责信息。它首先发送请求获取职位详情页的内容,然后使用BeautifulSoup解析页面并进一步提取出职位职责信息。
def responsibility(job_url):
    """Fetch a job detail page and return its description text.

    The function looks for a ``<script>`` tag containing
    ``window.pageData = <json> || {}``, parses the JSON part and reads its
    ``desc`` field.

    Args:
        job_url: URL of the job detail page.

    Returns:
        The ``desc`` text with ``<br />``, ``<p>``, ``</p>`` and spaces
        removed, or an empty string when the page cannot be fetched or does
        not contain the expected data.
    """
    marker = "window.pageData = "

    # One request only; the original issued an extra header-less request
    # whose result was never used.
    try:
        html = requests.get(job_url, headers=headers, timeout=10).text
    except requests.RequestException as exc:
        print(f'Failed to fetch {job_url}: {exc!r}')
        html = ""
    # Pause between detail requests to avoid hammering the server.
    time.sleep(1)

    # As in the original, the last script mentioning the marker wins.
    text = ""
    for script in BeautifulSoup(html, "html.parser").find_all("script"):
        if "window.pageData" in script.text:
            text = script.text

    start = text.find(marker)
    # Look for the terminator only after the marker so that an earlier
    # " || {}" elsewhere in the script cannot produce an empty slice.
    end = text.find(" || {}", start) if start != -1 else -1
    if start == -1 or end == -1:
        print(f'No page data found in {job_url}')
        return ""

    try:
        desc = json.loads(text[start + len(marker):end])["desc"]
        return desc.replace("<br />","").replace("</p>","").replace("<p>","").replace(" ","")
    except (ValueError, KeyError, TypeError, AttributeError) as exc:
        print(f'Could not parse page data from {job_url}: {exc!r}')
        return ""
这是主函数,用于执行脚本的主要任务。它定义一个空列表all_data来存放所有解析到的职位数据。然后循环调用send_get()和process_data()方法以获取和处理数据。处理完的数据被添加到all_data列表中。循环结束后,返回包含所有职位信息的all_data列表。
def while_data(pages=range(1, 3)):
    """Fetch and process the job listings of each requested page.

    Args:
        pages: Iterable of page numbers to fetch. Defaults to pages 1 and 2,
            the range that was previously hard-coded.

    Returns:
        A list with one processed job dict (as returned by ``process_data``)
        for every listing found on the requested pages.
    """
    all_data = []
    for page in pages:
        data = send_get(page)
        # Pause between list requests.
        time.sleep(1)
        # Skip pages that returned nothing.
        if data:
            all_data.extend(process_data(item) for item in data)
    return all_data
调用上述定义的函数进行请求、处理数据并使用Pandas将最后的结果存储为Excel文件。
# Run the scrape, load the collected job dicts into a DataFrame and write
# them to job.xlsx (index=False is passed so the index is not written).
total_data = while_data()
df = pd.DataFrame(total_data)
df.to_excel('job.xlsx',index=False)
整体上,这个脚本用于抓取网站的招聘信息,并进行相关的清洗和整理工作,最后将得到的数据保存为Excel文件,方便后续的分析和使用。
import json
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# The headers must be complete, otherwise the request is rejected
# (per the original author's note).
# NOTE(review): 'Cookie' embeds what looks like a logged-in session (BDUSS);
# hard-coding credentials in source is a security risk -- consider loading the
# value from the environment or a local, untracked config file.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'yiqifu.baidu.com',
    'Referer': 'https://yiqifu.baidu.com/g/aqc/joblist?q=python',
    'Sec-Ch-Ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': 'BIDUPSID=FFE582BA7343E4BDE8F2B0969587933A; PSTM=1701944630; BAIDUID=FFE582BA7343E4BDDB41B7BF2E661BA5:FG=1; BDUSS=NrUG9jTlVkRFBXa3V0bW5pNjNFUGdHaTdnc21rdXpkZUpvTU9nbFpaaGpVZEJsSVFBQUFBJCQAAAAAAAAAAAEAAABJQjjR0-nA1mNhcmV5eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGPEqGVjxKhlc0; BDUSS_BFESS=NrUG9jTlVkRFBXa3V0bW5pNjNFUGdHaTdnc21rdXpkZUpvTU9nbFpaaGpVZEJsSVFBQUFBJCQAAAAAAAAAAAEAAABJQjjR0-nA1mNhcmV5eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGPEqGVjxKhlc0; MCITY=-75%3A; H_WISE_SIDS_BFESS=40045_40166_40202_39662_40210_40216_40222; H_WISE_SIDS=39662_40210_40216_40222_40271_40294_40291_40289_40286_40317_40079; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=39662_40210_40216_40222_40271_40294_40291_40289_40286_40317_40079_40364_40352_40301_40381_40366; BA_HECTOR=81ak8h048gak8ga1a485a1849i0vgo1iuja9s1t; ZFY=SJTaRNG4jPGf5XpXAboM31VLOh8ATplB5TW1u:Atu7Tk:C; BAIDUID_BFESS=FFE582BA7343E4BDDB41B7BF2E661BA5:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=7; clue_site=pc; clue_ext=%7B%22referer%22%3A%22www.baidu.com%22%2C%22ref_eqid%22%3A%22b9d3408400103e780000000665e9c22e%22%7D; log_guid=9c965543f29ee6e76083129d371aaa8a; log_first_time=1709818419524; Hm_lvt_37e1bd75d9c0b74f7b4a8ba07566c281=1709818420; Hm_lpvt_37e1bd75d9c0b74f7b4a8ba07566c281=1709818903; log_last_time=1709818910917',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
}

# Request address of the job-list AJAX endpoint.
url = 'https://yiqifu.baidu.com/g/aqc/joblist/getDataAjax?'


def send_get(page):
    """Fetch one page of job listings from the search endpoint.

    Args:
        page: Page number to request.

    Returns:
        The value stored under ``['data']['list']`` in the JSON response, or
        an empty list when the request fails or the response does not have
        that structure.
    """
    # q is the search keyword, page the page number, district the city code
    # and salaryrange the salary range (left empty).
    params = {
        'q': 'python',
        'page': page,
        'district': '510100',
        'salaryrange': '',
    }
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(url, headers=headers, params=params, timeout=10)
        # Parse the body as JSON and pick out the listing entries.
        return json.loads(res.text)['data']['list']
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Best effort: report the failure and let the caller skip this page.
        print(f'Failed to fetch page {page}: {exc!r}')
        return []


def process_data(data):
    """Turn one raw job record into a flat dict suitable for pandas.

    Args:
        data: Mapping for a single job; the keys ``city``, ``company``,
            ``edu``, ``exp``, ``jobName``, ``salary``, ``bid`` and ``jobId``
            are read.

    Returns:
        A dict with the job's fields under their output column names,
        including the job description fetched via ``responsibility``.
    """
    job_data = {
        '城市': data['city'],
        '公司名称': data['company'],
        '学历要求': data['edu'],
        '工作经验': data['exp'],
        # Drop the <em>...</em> tags wrapped around parts of the job title.
        '招聘岗位': data['jobName'].replace('<em>', '').replace('</em>', ''),
        '薪资待遇': data['salary'],
    }

    # Build the link to the job detail page.
    bid = data['bid']
    jobId = data['jobId']
    job_url = f'https://yiqifu.baidu.com/g/aqc/jobDetail?bid={bid}&jobId={jobId}&from=ps&fr=job_ald&rq=pos'

    # Fetch the responsibilities text and store it alongside the other fields.
    job_data['岗位职责'] = responsibility(job_url)
    print(f'正在获取{job_data}')
    return job_data


def responsibility(job_url):
    """Fetch a job detail page and return its description text.

    The function looks for a ``<script>`` tag containing
    ``window.pageData = <json> || {}``, parses the JSON part and reads its
    ``desc`` field.

    Args:
        job_url: URL of the job detail page.

    Returns:
        The ``desc`` text with ``<br />``, ``<p>``, ``</p>`` and spaces
        removed, or an empty string when the page cannot be fetched or does
        not contain the expected data.
    """
    marker = "window.pageData = "

    # One request only; the original issued an extra header-less request
    # whose result was never used.
    try:
        html = requests.get(job_url, headers=headers, timeout=10).text
    except requests.RequestException as exc:
        print(f'Failed to fetch {job_url}: {exc!r}')
        html = ""
    # Pause between detail requests to avoid hammering the server.
    time.sleep(1)

    # As in the original, the last script mentioning the marker wins.
    text = ""
    for script in BeautifulSoup(html, "html.parser").find_all("script"):
        if "window.pageData" in script.text:
            text = script.text

    start = text.find(marker)
    # Look for the terminator only after the marker so that an earlier
    # " || {}" elsewhere in the script cannot produce an empty slice.
    end = text.find(" || {}", start) if start != -1 else -1
    if start == -1 or end == -1:
        print(f'No page data found in {job_url}')
        return ""

    try:
        desc = json.loads(text[start + len(marker):end])["desc"]
        return desc.replace("<br />","").replace("</p>","").replace("<p>","").replace(" ","")
    except (ValueError, KeyError, TypeError, AttributeError) as exc:
        print(f'Could not parse page data from {job_url}: {exc!r}')
        return ""


def while_data(pages=range(1, 3)):
    """Fetch and process the job listings of each requested page.

    Args:
        pages: Iterable of page numbers to fetch. Defaults to pages 1 and 2,
            the range that was previously hard-coded.

    Returns:
        A list with one processed job dict (as returned by ``process_data``)
        for every listing found on the requested pages.
    """
    all_data = []
    for page in pages:
        data = send_get(page)
        # Pause between list requests.
        time.sleep(1)
        # Skip pages that returned nothing.
        if data:
            all_data.extend(process_data(item) for item in data)
    return all_data


# Run the scrape, load the collected job dicts into a DataFrame and write
# them to job.xlsx (index=False is passed so the index is not written).
total_data = while_data()
df = pd.DataFrame(total_data)
df.to_excel('job.xlsx', index=False)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。