I spent a few days tinkering with this code, and it scrapes basically all of the job-related fields from 51job; you can add or remove fields as needed. The code is a bit rough, but it ran without errors while I was scraping, and it should be suitable for beginners to learn from. Without further ado, here is the code:
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from time import sleep
import pymysql
import re


class Crawler:
    def __init__(self):
        self.wd = webdriver.Chrome()
        self.wd.implicitly_wait(20)
        self.DBHOST = "localhost"
        self.DBUSER = "root"
        self.DBPASS = "123456"
        self.DBNAME = "51job"

    # Scrape the job listings on the current page
    def getData(self, len_Css):
        rows = []
        for i in range(1, len_Css):
            # Job title
            job_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.jname.at'.format(i)).text
            # Company name
            company_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).text
            # City / work experience / education / number of openings
            al = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.d.at'.format(i)).text.split('|')
            # Handle the different cases: some postings omit education, some omit work experience
            if len(al) == 4:
                city = al[0]
                experience = al[1]
                education = al[2]
                recruits_Number = al[3]
            elif len(al) == 3:
                city = al[0]
                experience = al[1]
                education = None
                recruits_Number = al[2]
            elif len(al) == 2:
                city = al[0]
                experience = None
                education = None
                recruits_Number = al[1]
            else:
                city = None
                experience = None
                education = None
                recruits_Number = None
            # Posting date
            release_Date = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.time'.format(i)).text
            # Company benefits
            # Some postings have no benefits element; the custom NoExists method checks whether it can be located
            # if self.NoExists('div.j_joblist > div:nth-child({0}) p.tags'.format(i)):
            #     welfare = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.tags'.format(i)).get_attribute("title")
            # else:
            #     welfare = None
            # Salary
            # Some postings have a salary element that is an empty string; guard against that
            if bool(self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text):
                salary = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text
            else:
                salary = None
            # Company type
            company_type = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.int.at'.format(i)).text
            # URL of the job detail page
            job_ex_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.el[target=_blank]'.format(i)).get_attribute("href")
            # Company URL
            company_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).get_attribute("href")
            rows.append([job_name, company_name, city, experience, education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url])
        return rows

    # Save the scraped rows into the database
    def saveData(self, rows):
        db = pymysql.connect(host=self.DBHOST, user=self.DBUSER, password=self.DBPASS, database=self.DBNAME)
        cur = db.cursor()
        sql = "INSERT INTO ods_51job_job(job_name, company_name, job_city, job_experience, job_education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url) " \
              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            for row in rows:
                cur.execute(sql, row)
                db.commit()
        except pymysql.Error as e:
            print(e)
        finally:
            cur.close()
            db.close()

    # Scrape and save one page at a time, advancing automatically until the last page
    def scrapingData(self, City, keyWord, start_Page):
        wait = WebDriverWait(self.wd, 20, 0.5)

        # Work out the total number of pages
        isNextpage = self.wd.find_element(By.CSS_SELECTOR,
                                          'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_page > div > div > div > span:nth-child(1)').text
        result = re.findall(r'\d+', isNextpage)
        condition = int(result[0])

        sleep(2)
        print('City code: %s  Keyword: %s  Total pages: %d' % (City, keyWord, condition))

        while start_Page <= condition:
            # Number of job postings on the current page (usually 50)
            pubCss = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                                                                     'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_joblist > div.e')))
            # Scrape the current page and save it to the database
            rows1 = self.getData(len(pubCss) + 1)
            self.saveData(rows1)
            print('\tScraped page %d;' % start_Page)

            # Check whether this is the last page
            if start_Page < condition:
                nextpage = self.wd.find_element(By.CSS_SELECTOR, 'li.next a[style="cursor: pointer;"]')
                nextpage.click()
                self.wd.refresh()
                start_Page += 1
            else:
                print('Finished scraping this city/keyword combination!')
                break
            sleep(2)

    # Return True if the element identified by Css can be located, False otherwise
    def NoExists(self, Css):
        try:
            self.wd.find_element(By.CSS_SELECTOR, Css)
            return True
        except NoSuchElementException:
            return False

    # Loop over every city and keyword automatically
    def getUrl(self, workCity, startPage, keywords):
        # If the crawl is interrupted, change the starting indexes of i and j and start_page to resume
        for i in range(0, len(workCity)):
            for j in range(0, len(keywords)):
                suffix = str(
                    startPage) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                url = 'https://search.51job.com/list/' + str(
                    workCity[i]) + ',000000,0000,00,9,99,' + keywords[j] + ',2,' + suffix
                self.wd.get(url)
                self.scrapingData(workCity[i], keywords[j], startPage)
                # After changing start_page the crawl resumes from that page; once the next keyword starts, reset start_page to 1
                if startPage > 1:
                    startPage = 1


# Codes for popular cities
# {"北京", "010000"}, {"上海", "020000"}, {"广州", "030200"}, {"深圳", "040000"}, {"武汉", "180200"},
# {"西安", "200200"}, {"杭州", "080200"}, {"南京", "070200"}, {"成都", "090200"}, {"重庆", "060000"},
# {"东莞", "030800"}, {"大连", "230300"}, {"沈阳", "230200"}, {"苏州", "070300"}, {"昆明", "250200"},
# {"长沙", "190200"}, {"合肥", "150200"}, {"宁波", "080300"}, {"郑州", "170200"}, {"天津", "050000"},
# {"青岛", "120300"}, {"哈尔滨", "220200"}, {"长春", "240200"}, {"福州", "110200"}, {"珠三角", "01"};

if __name__ == '__main__':
    # Put the city codes and keywords to scrape into lists; start_page is the page to start from
    cities = ['040000', '080200', '070200', '190200', '090200', '180200']
    keyword = ['大数据', 'python', '爬虫', 'Hadoop', '数据分析师', 'Hadoop']
    start_page = 1

    a = Crawler()
    a.getUrl(cities, start_page, keyword)
In the code above I commented out the company-benefits field, because nearly every page has a few postings without a benefits element; handling those misses takes too long, which becomes painful when scraping large amounts of data, so I simply dropped the field. Also, the CSS paths were copied straight from the browser, and many of them could still be trimmed and simplified, but I'm lazy; you could also switch to XPath paths, which can be more concise. Finally, you need to create the database table yourself; just adjust the connection parameters in the code and the column names in the SQL, which is fairly simple.
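For reference, here is a minimal sketch of a table definition that matches the column list used in saveData. The table and column names are taken from the INSERT statement above; the column types and lengths are my own assumptions, so adjust them to your data.

import pymysql

# Possible DDL for the target table; types/lengths are assumptions, tweak as needed.
create_sql = """
CREATE TABLE IF NOT EXISTS ods_51job_job (
    job_name        VARCHAR(255),
    company_name    VARCHAR(255),
    job_city        VARCHAR(100),
    job_experience  VARCHAR(100),
    job_education   VARCHAR(100),
    recruits_Number VARCHAR(100),
    release_Date    VARCHAR(50),
    salary          VARCHAR(100),
    company_type    VARCHAR(255),
    job_ex_url      VARCHAR(500),
    company_url     VARCHAR(500)
) DEFAULT CHARSET = utf8mb4
"""

# Connection parameters copied from the Crawler class; change them to match your setup.
db = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
try:
    with db.cursor() as cur:
        cur.execute(create_sql)
    db.commit()
finally:
    db.close()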
When I ran the code myself, the errors I hit were usually timeout errors after scraping for a long while. You can lengthen the wait times a bit, though it will probably still fail eventually if you scrape enough: 51job is fairly lax, but it does push back against heavy scraping, just not as harshly as boss直聘, which bans your IP for two days after a few thousand records (that has happened to me several times).
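If you do keep hitting timeouts, the two knobs to lengthen are the implicit wait set in __init__ and the WebDriverWait used in scrapingData. A minimal sketch of the tweak; the 60-second and 1.0-second values here are arbitrary choices of mine, not values from the original post:

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

wd = webdriver.Chrome()
wd.implicitly_wait(60)             # was implicitly_wait(20) in Crawler.__init__
wait = WebDriverWait(wd, 60, 1.0)  # was WebDriverWait(self.wd, 20, 0.5) in scrapingData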