当前位置:   article > 正文

python爬取前程无忧_python爬虫爬取前程无忧51job企业招聘信息

from bs4 import BeautifulSoup; import lxml; import pandas as pd; gdps = []  # 保…

# -*- coding:utf-8 -*-

import requests

import re

import random

import time

import pandas as pd

from bs4 import BeautifulSoup

import lxml

from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning) # Suppress urllib3's InsecureRequestWarning (SSL warning) messages

## Remove invalid (whitespace) characters

def clear(val_list):
    """Strip whitespace-like filler characters from a scraped string.

    The earlier implementation restarted from the untouched input on every
    loop iteration, so only the last character in its list (form feed) was
    ever removed. All listed characters are now removed in a single pass.

    Args:
        val_list: The text to clean. Despite the name, this must be a
            single string, because it is passed straight to ``re.sub``.

    Returns:
        The input with every space, newline, carriage return, tab and
        form feed removed.
    """
    # The original character list also held an empty-string pattern, which
    # never changed the text; it may have been a character lost when the
    # code was pasted -- TODO confirm against the original script.
    return re.sub(r'[ \n\r\t\f]', '', val_list)

class job(object):

def __init__(self, url):
    """Remember the target URL and prepare a session with fixed headers.

    Args:
        url: Stored on the instance as ``self.url``.
    """
    self.url = url

    # Header set sent with every request made through the session.
    browser_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'jobs.51job.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
    }

    # Create one session object and install the headers on it.
    session = requests.session()
    session.headers.update(browser_headers)
    self.s = session

def getdata(self):###获取企业内招聘目录

all=pd.DataFrame()

for i in range(1,16):

time.sleep(0.3) ##延时

p=str(i)

##请求数据

data={

'pageno':p,

'hidTotal':'1060',

'type':'undefined',

'code':'undefined',

}

req=self.s.post(url=self.url,data=data,verify=False).text

title=re.findall('title="(.*?)">',req) #职位

href=re.findall('href="([https].*?)"',req) ##链接

t2=re.findall('class="t2">(.*?)(.*?)(.*?)(.*?)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/从前慢现在也慢/article/detail/499212
推荐阅读
相关标签
  

闽ICP备14008679号