赞
踩
# -*- coding:utf-8 -*-
import requests
import re
import random
import time
import pandas as pd
from bs4 import BeautifulSoup
import lxml
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) ### Suppress urllib3's InsecureRequestWarning (SSL warning) messages
##清除无效字符
def clear(val_list):
    """Remove filler whitespace characters from a string.

    Every space, newline, carriage return, tab and form feed in ``val_list``
    is deleted, wherever it occurs.

    Args:
        val_list: The text to clean. Despite the name it must be a ``str``;
            any other type makes ``re.sub`` raise ``TypeError``.

    Returns:
        A copy of ``val_list`` with all of the characters above removed.
    """
    # A single character-class substitution strips every unwanted character
    # in one pass. Substituting one character per loop iteration only works
    # if each result is fed into the next call; otherwise only the last
    # character in the list is actually removed.
    return re.sub(r'[ \n\r\t\f]', '', val_list)
class job(object):
def __init__(self, url):
    """Remember the target URL and prepare a session with browser-like headers.

    Args:
        url: Address stored on ``self.url`` for the requests this object sends.
    """
    # Default headers imitating a desktop Chrome browser.
    # NOTE(review): 'br' is advertised in Accept-Encoding -- confirm the
    # runtime can decode Brotli-compressed responses.
    browser_headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Connection':'keep-alive',
        'Host':'jobs.51job.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
    }

    # Create one session object and install the headers as its defaults.
    session = requests.session()
    session.headers.update(browser_headers)

    self.url = url
    self.s = session
def getdata(self):###获取企业内招聘目录
all=pd.DataFrame()
for i in range(1,16):
time.sleep(0.3) ##延时
p=str(i)
##请求数据
data={
'pageno':p,
'hidTotal':'1060',
'type':'undefined',
'code':'undefined',
}
req=self.s.post(url=self.url,data=data,verify=False).text
title=re.findall('title="(.*?)">',req) #职位
href=re.findall('href="([https].*?)"',req) ##链接
t2=re.findall('class="t2">(.*?)(.*?)(.*?)(.*?)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。