当前位置:   article > 正文

python实现对前程无忧的信息的爬取_python 51job爬取

python 51job爬取

1,使用的模块:

  1. import codecs
  2. import csv
  3. import requests
  4. import re
  5. import json
  6. import pprint

2,主要爬取内容:

  1. '职位名称',
  2. '基本信息',
  3. '公司名字',
  4. '工作地点',
  5. '公司类型',
  6. '公司规模',
  7. '公司性质',
  8. '福利',
  9. '工资',
  10. '信息发布时间',
  11. '职位详情页',

3,url资源路径不固定:程序根据用户输入的城市名进行判断,选择该城市对应的url,因此可以爬取不同城市的招聘信息。

4,最终实现代码

  1. import codecs
  2. import csv
  3. import requests
  4. import re
  5. import json
  6. import pprint
  # 51job (前程无忧) crawler: ask for a city, download one search-result page
# for that city and append every job found on it to a CSV file.

# Output file. It is opened in append mode, so repeated runs accumulate rows.
CSV_PATH = '前程无忧.csv'

# Column names of the CSV file, in output order.
FIELDNAMES = [
    '职位名称',
    '基本信息',
    '公司名字',
    '工作地点',
    '公司类型',
    '公司规模',
    '公司性质',
    '福利',
    '工资',
    '信息发布时间',
    '职位详情页',
]

# Area code that search.51job.com expects in the URL path, keyed by city name.
CITY_CODES = {
    '成都': '090200',
    '北京': '010000',
    '上海': '020000',
    '广州': '030200',
    '深圳': '040000',
    '武汉': '180200',
    '西安': '200200',
    '杭州': '080200',
    '南京': '070200',
    '重庆': '060000',
    '东莞': '030800',
    '大连': '230300',
    '沈阳': '230200',
    '苏州': '070300',
}

# The search URL is identical for every city except for the leading area
# code. The percent-encoded path segment is the (double-encoded) search
# keyword "软件工程".
SEARCH_URL_TEMPLATE = (
    'https://search.51job.com/list/{code},000000,0000,00,9,99,'
    '%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25B7%25A5%25E7%25A8%258B'
    ',2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# Browser-like request header so the request looks like a normal page visit.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# The page embeds the result list as JSON assigned to a JavaScript variable.
SEARCH_RESULT_PATTERN = re.compile(r'window\.__SEARCH_RESULT__ =(.*?)</script>')


def build_search_url(city: str) -> str:
    """Return the 51job search URL for *city*.

    Raises:
        ValueError: if *city* is not one of the keys of ``CITY_CODES``.
    """
    try:
        code = CITY_CODES[city]
    except KeyError:
        supported = ', '.join(CITY_CODES)
        raise ValueError(
            f'unsupported city {city!r}; choose one of: {supported}'
        ) from None
    return SEARCH_URL_TEMPLATE.format(code=code)


def extract_jobs(html: str) -> list:
    """Return the ``engine_jds`` job list embedded in a search-result page.

    Raises:
        ValueError: if the page does not contain the embedded result JSON.
    """
    match = SEARCH_RESULT_PATTERN.search(html)
    if match is None:
        raise ValueError(
            'window.__SEARCH_RESULT__ was not found in the page; '
            'the site layout may have changed or the request was blocked'
        )
    # Decode the captured JSON text into Python objects.
    return json.loads(match.group(1))['engine_jds']


def job_to_row(job: dict) -> dict:
    """Map one job entry of the result JSON to a CSV row keyed by FIELDNAMES."""
    return {
        '职位名称': job['job_name'],
        # attribute_text is a list of short strings; flatten it into one cell.
        '基本信息': ' '.join(job['attribute_text']),
        '公司名字': job['company_name'],
        '工作地点': job['workarea_text'],
        '公司类型': job['companyind_text'],
        '公司规模': job['companysize_text'],
        '公司性质': job['companytype_text'],
        '福利': job['jobwelf'],
        '工资': job['providesalary_text'],
        '信息发布时间': job['updatedate'],
        '职位详情页': job['job_href'],
    }


def main() -> None:
    """Prompt for a city, fetch its search page and append the jobs to the CSV."""
    print("输入你的城市:")
    city = input().strip()
    try:
        url = build_search_url(city)
    except ValueError as err:
        # Exit with a readable message instead of a NameError further down.
        raise SystemExit(str(err)) from None

    # Download the search-result page.
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    print(response.text)

    jobs = extract_jobs(response.text)
    pprint.pprint(jobs)
    rows = [job_to_row(job) for job in jobs]

    # mode='a' appends to an existing file; newline='' lets the csv module
    # control line endings; utf-8-sig makes the file open cleanly in Excel.
    with open(CSV_PATH, mode='a', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        # Only a brand-new (empty) file gets the header, so re-running the
        # script does not insert header rows in the middle of the data.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(rows)


if __name__ == '__main__':
    main()

5,结果:

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/499216
推荐阅读
相关标签
  

闽ICP备14008679号