
Python: Scraping 51job (前程无忧) with Selenium and Storing the Data in MySQL — a Code Example

        After tinkering for a few days I ended up with the code below. It pulls down essentially all of the job-related fields on 51job, and you can add or remove fields as needed. The code is a bit rough, but it ran without errors for me while scraping, and it should be suitable for beginners to learn from. Without further ado, here is the code:

from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from time import sleep
import pymysql
import re


class Crawler:
    def __init__(self):
        self.wd = webdriver.Chrome()
        self.wd.implicitly_wait(20)
        self.DBHOST = "localhost"
        self.DBUSER = "root"
        self.DBPASS = "123456"
        self.DBNAME = "51job"

    # Scrape the data from the current result page
    def getData(self, len_Css):
        rows = []
        for i in range(1, len_Css):
            # Job title
            job_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.jname.at'.format(i)).text
            # Company name
            company_name = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).text
            # City / work experience / education / number of openings
            al = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.d.at'.format(i)).text.split('|')
            # Handle the different cases: some postings omit education, some omit experience
            if len(al) == 4:
                city = al[0]
                experience = al[1]
                education = al[2]
                recruits_Number = al[3]
            elif len(al) == 3:
                city = al[0]
                experience = al[1]
                education = None
                recruits_Number = al[2]
            elif len(al) == 2:
                city = al[0]
                experience = None
                education = None
                recruits_Number = al[1]
            else:
                city = None
                experience = None
                education = None
                recruits_Number = None
            # Posting date
            release_Date = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.time'.format(i)).text
            # Company benefits
            # Some postings have no benefits element; the custom NoExists method checks whether it can be located
            # if self.NoExists('div.j_joblist > div:nth-child({0}) p.tags'.format(i)):
            #     welfare = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.tags'.format(i)).get_attribute("title")
            # else:
            #     welfare = None
            # Salary
            # The salary element sometimes exists but holds an empty string; guard against that
            if bool(self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text):
                salary = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) span.sal'.format(i)).text
            else:
                salary = None
            # Company type
            company_type = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) p.int.at'.format(i)).text
            # URL of the job detail page
            job_ex_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.el[target=_blank]'.format(i)).get_attribute("href")
            # URL of the company page
            company_url = self.wd.find_element(By.CSS_SELECTOR, 'div.j_joblist > div:nth-child({0}) a.cname.at'.format(i)).get_attribute("href")
            rows.append([job_name, company_name, city, experience, education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url])
        return rows

    # Store the scraped rows in the database
    def saveData(self, rows):
        db = pymysql.connect(host=self.DBHOST, user=self.DBUSER, password=self.DBPASS, database=self.DBNAME)
        cur = db.cursor()
        sql = "INSERT INTO ods_51job_job(job_name, company_name, job_city, job_experience, job_education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url) " \
              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            for row in rows:
                cur.execute(sql, row)
            db.commit()
        except pymysql.Error as e:
            print(e)
        finally:
            cur.close()
            db.close()

    # Scrape and store one page at a time, advancing automatically until the last page
    def scrapingData(self, City, keyWord, start_Page):
        wait = WebDriverWait(self.wd, 20, 0.5)
        # Read the total number of pages
        isNextpage = self.wd.find_element(By.CSS_SELECTOR,
            'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_page > div > div > div > span:nth-child(1)').text
        result = re.findall(r'\d+', isNextpage)
        condition = int(result[0])
        sleep(2)
        print('City code: %s  Keyword: %s  Total pages: %d' % (City, keyWord, condition))
        while start_Page <= condition:
            # Number of postings on the current page (usually 50)
            pubCss = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_joblist > div.e')))
            # Scrape the current page and store it in the database
            rows1 = self.getData(len(pubCss) + 1)
            self.saveData(rows1)
            print('\tScraped page %d;' % start_Page)
            # Check whether this is the last page
            if start_Page < condition:
                nextpage = self.wd.find_element(By.CSS_SELECTOR, 'li.next a[style="cursor: pointer;"]')
                nextpage.click()
                self.wd.refresh()
                start_Page += 1
            else:
                print('Finished all pages for this city/keyword!')
                break
            sleep(2)

    # Return True if an element matching the CSS selector exists, otherwise False
    def NoExists(self, Css):
        try:
            self.wd.find_element(By.CSS_SELECTOR, Css)
            return True
        except NoSuchElementException:
            return False

    # Loop over all cities and keywords automatically
    def getUrl(self, workCity, startPage, keywords):
        # If a run is interrupted, adjust the start indices of i, j and start_page to resume
        for i in range(0, len(workCity)):
            for j in range(0, len(keywords)):
                suffix = str(startPage) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                url = 'https://search.51job.com/list/' + str(workCity[i]) + ',000000,0000,00,9,99,' + keywords[j] + ',2,' + suffix
                self.wd.get(url)
                self.scrapingData(workCity[i], keywords[j], startPage)
                # After resuming from a later start_page, reset it to 1 so the next keyword starts from page 1
                if startPage > 1:
                    startPage = 1


# Codes for popular cities:
# {"北京", "010000"}, {"上海", "020000"}, {"广州", "030200"}, {"深圳", "040000"}, {"武汉", "180200"},
# {"西安", "200200"}, {"杭州", "080200"}, {"南京", "070200"}, {"成都", "090200"}, {"重庆", "060000"},
# {"东莞", "030800"}, {"大连", "230300"}, {"沈阳", "230200"}, {"苏州", "070300"}, {"昆明", "250200"},
# {"长沙", "190200"}, {"合肥", "150200"}, {"宁波", "080300"}, {"郑州", "170200"}, {"天津", "050000"},
# {"青岛", "120300"}, {"哈尔滨", "220200"}, {"长春", "240200"}, {"福州", "110200"}, {"珠三角", "01"};
if __name__ == '__main__':
    # Put the city codes and keywords to scrape into lists; start_page is the page to start from
    cities = ['040000', '080200', '070200', '190200', '090200', '180200']
    keyword = ['大数据', 'python', '爬虫', 'Hadoop', '数据分析师', 'Hadoop']
    start_page = 1
    a = Crawler()
    a.getUrl(cities, start_page, keyword)

         In the code above I commented out the company-benefits field, because almost every page has a few postings without it; handling those errors took too long and made large scrapes painful, so I simply dropped it. The CSS selectors are copied straight from the browser, so many of them could be trimmed further (or swapped for XPath expressions, which would be more concise), but I was lazy. Finally, you need to create the database table yourself (see the sketch below); when connecting, just adjust the connection parameters in the code and the column names in the SQL, which is straightforward.
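For reference, here is a minimal sketch of creating a table that matches the INSERT statement in saveData(). The column names come from the code above, but the column types and lengths are my own assumptions, not part of the original post; adjust them to your needs.

# A sketch of creating the ods_51job_job table with pymysql.
# Column types/lengths below are assumptions; only the column names come from the original code.
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS ods_51job_job (
    id INT AUTO_INCREMENT PRIMARY KEY,
    job_name VARCHAR(255),
    company_name VARCHAR(255),
    job_city VARCHAR(64),
    job_experience VARCHAR(64),
    job_education VARCHAR(64),
    recruits_Number VARCHAR(64),
    release_Date VARCHAR(32),
    salary VARCHAR(64),
    company_type VARCHAR(128),
    job_ex_url VARCHAR(512),
    company_url VARCHAR(512)
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
try:
    with db.cursor() as cur:
        cur.execute(ddl)
    db.commit()
finally:
    db.close()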

        When I ran the code myself, the errors I hit were usually timeout errors after scraping for a long time. You can increase the wait time a bit, though it will probably still error out eventually if you scrape enough: 51job is fairly lenient, but heavy scraping still triggers its anti-bot measures, just not as harshly as BOSS直聘, which banned my IP for two days after only a few thousand records (and that happened to me more than once).
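If timeouts are a problem, one option (my own sketch, not part of the original code) is to wrap the wait for the job list in a small retry helper that refreshes the page and tries again a couple of times before giving up. The shorter selector used here is assumed to match the same job cards as the long copied one.

# A retry sketch (an assumption, not from the original code): if waiting for the job list
# times out, refresh the page and try again a few times before re-raising the exception.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By

def wait_for_joblist(wd, timeout=40, retries=2):
    locator = (By.CSS_SELECTOR, 'div.j_joblist > div.e')  # assumed equivalent to the long copied selector
    for attempt in range(retries + 1):
        try:
            return WebDriverWait(wd, timeout, 0.5).until(
                EC.presence_of_all_elements_located(locator))
        except TimeoutException:
            if attempt == retries:
                raise
            wd.refresh()  # reload the page and try again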
