If you need the source code, follow the WeChat official account: 麻不辣清汤
pip install selenium
pip install beautifulsoup4
(json and time ship with the Python standard library, so they do not need to be installed with pip.)
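Optionally, a quick sanity check that both third-party packages import cleanly:

```python
# Optional: confirm the installs worked before going further
import selenium
import bs4

print(selenium.__version__)
print(bs4.__version__)
```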
Use Selenium to capture the login cookies
```python
from selenium import webdriver
import time
import json

def 获取cookie(url, cookie文件名):
    # 1. Open the browser
    driver = webdriver.Chrome()
    # 2. Navigate to the page
    driver.get(url)
    # 3. Once the page is open, log in manually (e.g. scan the QR code)
    time.sleep(15)
    # 4. Log in within the 15 s window; then grab all cookie info
    #    (returned as a list of dicts)
    dictCookies = driver.get_cookies()
    # 5. Serialize the cookie dicts into a JSON string
    jsonCookies = json.dumps(dictCookies)
    # After login, a file named boss直聘.json is created in the working
    # directory and the cookies are saved into it
    with open(cookie文件名, "w") as fp:
        fp.write(jsonCookies)
    print('Cookies saved!')

url = 'https://www.zhipin.com/web/geek/job-recommend'
获取cookie(url, cookie文件名='boss直聘.json')
```
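Before moving on, it can be worth loading the saved file back to confirm it is usable; a minimal sketch, assuming the file was written to the current working directory and at least one cookie was captured:

```python
import json

# Read the JSON string back and parse it into a list of cookie dicts
with open('boss直聘.json', 'r') as fp:
    cookies = json.loads(fp.read())

print(f'{len(cookies)} cookies on file')
# Each entry carries at least 'name', 'value' and 'domain' keys
print(cookies[0]['name'], cookies[0]['domain'])
```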
With the cookies saved, the next step is to collect the job links from each listing page. Every job card on the page is an `<a>` tag with `class="job-card-left"`, so we find all of them, read each one's relative `href`, prepend the site root to build the full detail-page URL, and append it to an (initially empty) list:

```python
# Find every tag with class="job-card-left"
job_card_left_elements = soup.find_all(class_='job-card-left')

# Walk through each element and pull out its href link
for element in job_card_left_elements:
    href = element['href']
    full_link = 'https://www.zhipin.com' + href
    # Put each full link into the (initially empty) list
    详情列表.append(full_link)
```
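To see what that loop does in isolation, here is a self-contained toy run with a hard-coded HTML fragment standing in for the real page source (the fragment is invented for illustration, not copied from the site):

```python
from bs4 import BeautifulSoup

# Made-up fragment mimicking the structure of a listing page
html = '''
<a class="job-card-left" href="/job_detail/abc123.html"></a>
<a class="job-card-left" href="/job_detail/def456.html"></a>
'''

详情列表 = []
soup = BeautifulSoup(html, 'html.parser')
for element in soup.find_all(class_='job-card-left'):
    详情列表.append('https://www.zhipin.com' + element['href'])

print(详情列表)
# ['https://www.zhipin.com/job_detail/abc123.html',
#  'https://www.zhipin.com/job_detail/def456.html']
```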
The complete code for this part, with comments:
```python
from selenium import webdriver
import time
import json
from bs4 import BeautifulSoup

# 1. Create an empty list to hold the detail-page links
详情列表 = []

# 2. Visit each listing page and grab its page source
for i in range(1, 11):
    print(i)
    boss = webdriver.Chrome()
    # Open the listing page (query=bi, city=101210100, page i)
    url = f'https://www.zhipin.com/web/geek/job?query=bi&city=101210100&page={i}'
    boss.get(url)
    # Inject the saved cookies
    with open(r"E:\Anacada__\工作项目\爬虫\boss直聘.json", "r") as fp:
        jsonCookies = fp.read()
    # Parse the JSON cookie string back into a list of dicts
    cookies = json.loads(jsonCookies)
    # Add each cookie to the WebDriver session
    for cookie in cookies:
        boss.add_cookie(cookie)
    # Reload the page as a logged-in user, wait 6 s, then read the source
    boss.get(url)
    time.sleep(6)
    boss_text = boss.page_source
    boss.quit()  # close the window so ten browsers don't pile up

    # 3. Extract the job links from this page's source
    soup = BeautifulSoup(boss_text, 'html.parser')
    # Find every element with class="job-card-left"
    job_card_left_elements = soup.find_all(class_='job-card-left')
    # Read the href from each <a> tag and build the full link
    for element in job_card_left_elements:
        href = element['href']
        full_link = 'https://www.zhipin.com' + href
        详情列表.append(full_link)

# 4. Visit every detail page and store its source
count = 0
源代码 = []
for link in 详情列表:
    count = count + 1
    boss = webdriver.Chrome()
    # Open the detail page
    boss.get(link)
    # Inject the saved cookies
    with open(r"E:\Anacada__\工作项目\爬虫\boss直聘.json", "r") as fp:
        jsonCookies = fp.read()
    # Convert the JSON cookies back to dicts and add them to the driver
    cookies = json.loads(jsonCookies)
    for cookie in cookies:
        boss.add_cookie(cookie)
    # Reload while logged in, wait, then keep the page source
    boss.get(link)
    time.sleep(5)
    boss_text = boss.page_source
    源代码.append(boss_text)
    boss.quit()
```
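One detail worth spelling out: Selenium's add_cookie only accepts cookies for the domain that is currently loaded, which is why the script calls boss.get(url) once before injecting the cookies and again afterwards. A small helper that wraps this visit-inject-revisit pattern (the function name and signature are mine, not the author's):

```python
import json
import time

def 带cookie打开(driver, url, cookie文件, 等待秒数=6):
    driver.get(url)                    # first visit establishes the domain
    with open(cookie文件, 'r') as fp:
        for cookie in json.loads(fp.read()):
            driver.add_cookie(cookie)  # only allowed once the domain matches
    driver.get(url)                    # second visit reloads with the session
    time.sleep(等待秒数)
    return driver.page_source

# Usage: boss_text = 带cookie打开(webdriver.Chrome(), url, 'boss直聘.json')
```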
Putting everything together, here is the full script, now with a random pause between detail pages so the visits look less mechanical:

```python
from selenium import webdriver
import time
import json
import random
from bs4 import BeautifulSoup
import pandas as pd  # used by the follow-up field-extraction step

def 获取cookie(url, cookie文件名):
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(15)  # log in manually within these 15 seconds
    dictCookies = driver.get_cookies()     # all cookie info (a list of dicts)
    jsonCookies = json.dumps(dictCookies)  # dumps converts the dicts to a JSON string
    # After login, save the cookies to a local file
    with open(cookie文件名, "w") as fp:
        fp.write(jsonCookies)
    print('Cookies saved!')

获取cookie('https://www.zhipin.com/web/geek/job-recommend', cookie文件名='boss直聘.json')

# Collect detail-page links from the first ten listing pages
详情列表 = []
for i in range(1, 11):
    print(i)
    boss = webdriver.Chrome()
    # Open the listing page
    url = f'https://www.zhipin.com/web/geek/job?query=bi&city=101210100&page={i}'
    boss.get(url)
    # Inject the saved cookies
    with open(r"E:\Anacada__\工作项目\爬虫\boss直聘.json", "r") as fp:
        jsonCookies = fp.read()
    # Convert the JSON cookies back to dicts and add them to the WebDriver
    cookies = json.loads(jsonCookies)
    for cookie in cookies:
        boss.add_cookie(cookie)
    boss.get(url)
    time.sleep(6)
    boss_text = boss.page_source
    boss.quit()

    soup = BeautifulSoup(boss_text, 'html.parser')
    # Find every element with class="job-card-left"
    job_card_left_elements = soup.find_all(class_='job-card-left')
    # Read the href from each <a> tag and build the full link
    for element in job_card_left_elements:
        href = element['href']
        full_link = 'https://www.zhipin.com' + href
        详情列表.append(full_link)

# Visit each detail page and keep its source
count = 0
源代码 = []
for link in 详情列表:
    count = count + 1
    boss = webdriver.Chrome()
    boss.get(link)
    # Inject the saved cookies
    with open(r"E:\Anacada__\工作项目\爬虫\boss直聘.json", "r") as fp:
        jsonCookies = fp.read()
    cookies = json.loads(jsonCookies)
    for cookie in cookies:
        boss.add_cookie(cookie)
    boss.get(link)
    time.sleep(random.uniform(5, 15))  # random pause so requests look less bot-like
    boss_text = boss.page_source
    源代码.append(boss_text)
    boss.quit()
```
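The 源代码 list lives only in memory, so a crash on, say, page 80 loses everything collected so far. A minimal sketch (my addition, not part of the original script) that dumps each collected page to its own HTML file, letting the follow-up field extraction run offline:

```python
import os

# Write each page source to 详情页/1.html, 详情页/2.html, ...
os.makedirs('详情页', exist_ok=True)
for 序号, 页面 in enumerate(源代码, start=1):
    with open(os.path.join('详情页', f'{序号}.html'), 'w', encoding='utf-8') as fp:
        fp.write(页面)
```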
The follow-up code that extracts the individual job fields is covered in the blog post: https://blog.csdn.net/weixin_52001949/article/details/135480669