
Python Web Scraping: Collecting Job Listings from Lagou.com

Building on the URL analysis of Lagou.com in the previous post, this post starts scraping the actual job listings. Since we are writing a web scraper anyway, let's collect the listings for scraper-engineer positions on Lagou. The previous post: python爬虫 —爬拉勾网python爬虫职位(一)

(I) Analysis Before Coding

1. Information to collect:

(1) Position name

(2) Salary

(3) Required years of experience

(4) Job location

2. Functional breakdown

Based on the analysis above, the program splits naturally into three parts: (1) building the URLs, (2) retrieving the job information, and (3) storing what was retrieved. A rough skeleton of this structure is sketched below.
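The following is only a structural sketch; each stub is filled in with real code in the sections that follow, using these same function names.

def url_create():
    # Part 1: build the listing/Ajax URLs and drive the requests
    ...

def html_parse(item):
    # Part 2: extract the wanted fields from one JSON response
    ...

def result_save(result_item):
    # Part 3: append one record to the CSV file
    ...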

With this analysis in hand, we can start writing the program.

(II) Writing the Program

1. Importing the required modules

import requests
from bs4 import BeautifulSoup
from time import sleep, time
import csv
import json
import random

The results are ultimately saved to a CSV file, which is why the csv module is imported here.
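As a quick refresher on csv.DictWriter, which the storage code below relies on (a minimal standalone sketch; demo.csv and the sample values are just placeholders):

import csv

# write a header row plus one record keyed by field name
with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['positionName', 'salary', 'workYear'])
    writer.writeheader()
    writer.writerow({'positionName': 'python爬虫', 'salary': '15k-25k', 'workYear': '1-3年'})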

2. Building the URLs

Following the analysis in python爬虫 —爬拉勾网python爬虫职位(一), the URL-building function looks like this:

# Build the URLs and drive the scraping
def url_create():
    headers = {'Cookie': 'user_trace_token=20180617062143-a2c67f89-f721-42a0-a431-0713866d0fc1; __guid=237742470.3953059058839704600.1529187726497.5256;\
LGUID=20180617062145-a70aea81-71b3-11e8-a55c-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD;\
JSESSIONID=ABAAABAAAIAACBIA653C35B2B23133DCDB86365CEC619AE; PRE_UTM=; PRE_HOST=; PRE_SITE=;\
PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_pythonpytho%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E5%2585%25A8%25E5%259B%25BD;\
TG-TRACK-CODE=search_code; X_MIDDLE_TOKEN=8a8c6419e33ae49c13de4c9881b4eb1e; X_HTTP_TOKEN=5dd639be7b63288ce718c96fdd4a0035;\
_ga=GA1.2.1060168094.1529187728; _gid=GA1.2.1053446384.1529187728; _gat=1;\
Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1529190520,1529198463,1529212181,1529212712;\
Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1529225935; LGSID=20180617164003-0752289a-720a-11e8-a8bc-525400f775ce;\
LGRID=20180617165832-9c78c400-720c-11e8-a8bf-525400f775ce; SEARCH_ID=1dab13db9fc14397a080b2d8a32b7f27; monitor_count=70',
               'Host': 'www.lagou.com',
               'Origin': 'https://www.lagou.com',
               'Referer': 'https://www.lagou.com/jobs/list_python%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
               'X-Requested-With': 'XMLHttpRequest'}
    city_list = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']
    position_list = ['python爬虫', ]  # list of positions to search; more can be added
    for city in city_list:
        save_single_info(city)  # write the city name as a marker row
        print("I am in {}".format(city))
        for position in position_list:
            url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
            # listing page used to find out how many result pages there are
            url_page = "https://www.lagou.com/jobs/list_{}?city={}".format(position, city)
            # fetch the listing page and read the total page count
            r = requests.post(url_page, headers=headers)
            doc = BeautifulSoup(r.text, 'lxml')
            page_count = int(doc.find(class_='span totalNum').string)
            # walk through every result page
            for page in range(1, page_count + 1):
                if page == 1:
                    flag = 'true'
                else:
                    flag = 'false'
                data = {
                    'first': flag,
                    'pn': page,
                    'kd': position,
                }
                time_sleep = random.random()
                sleep(time_sleep * 10)  # random pause of up to 10 s between requests
                response = requests.post(url, headers=headers, data=data, timeout=10)
                result = response.json()
                html_parse(result)
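A note on the form data: the positionAjax.json endpoint is paged, and the three fields drive it: first marks whether this is the first results page, pn is the page number, and kd is the search keyword. The random sleep of up to ten seconds between requests makes the traffic look less machine-like; without it Lagou tends to answer with an "operating too frequently" message instead of the expected JSON.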

3. Retrieving the job information:

# Extract the details of each position
def html_parse(item):
    info = item.get('content')
    p_list = info.get('positionResult').get('result')
    print(len(p_list))
    for p in p_list:
        result_item = {
            "positionName": p.get("positionName"),
            "salary": p.get("salary"),
            "workYear": p.get('workYear'),
        }
        result_save(result_item)

More fields can be added as needed; here only the position name, the salary, and the required years of experience are collected. For reference, the response that html_parse receives is shaped roughly as sketched below.
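(A sketch only; the field values are made-up placeholders, and only the keys the code actually reads are shown.)

# rough shape of one parsed positionAjax.json response
response_sample = {
    'content': {
        'positionResult': {
            'result': [
                {'positionName': 'python爬虫工程师', 'salary': '15k-25k', 'workYear': '1-3年'},
                # ...one dict per listed position...
            ]
        }
    }
}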

4. Storing the information

# Save one scraped record
def result_save(result_item):
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:  # open the CSV file for appending
        fieldnames = ['positionName', 'salary', 'workYear']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(result_item)

# Save a single row (the header or a city marker)
def save_single_info(info):
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if isinstance(info, list):
            writer.writerow(info)
        else:
            writer.writerow([info])
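One design note: result_save reopens lagou.csv for every single record, which is simple and crash-safe but slow over thousands of rows. If that matters, the file can be opened once and the writer reused; a minimal sketch of that variant (save_all is a hypothetical helper, not part of the code above):

import csv

def save_all(rows, path='lagou.csv'):
    # open the file once, write the header, then stream every record
    fieldnames = ['positionName', 'salary', 'workYear']
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)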

5. The main function

# Main program
def main():
    box_header = ['positionName', 'salary', 'workYear']
    save_single_info(box_header)  # write the header row
    url_create()  # build the URLs and extract the information

6. Running the program

# Entry point
if __name__ == '__main__':
    start_time = time()
    print("working...")
    main()
    end_time = time()
    print("Finished; elapsed time in minutes:")
    total_time = (end_time - start_time) / 60
    print(total_time)

7. Results

(1) Run log and total elapsed time


(2) The stored records


(3) Total number of positions collected


(III) Full Source Code

import requests
from bs4 import BeautifulSoup
from time import sleep, time
import csv
import json
import random


# Build the URLs and drive the scraping
def url_create():
    headers = {'Cookie': 'user_trace_token=20180617062143-a2c67f89-f721-42a0-a431-0713866d0fc1; __guid=237742470.3953059058839704600.1529187726497.5256;\
LGUID=20180617062145-a70aea81-71b3-11e8-a55c-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD;\
JSESSIONID=ABAAABAAAIAACBIA653C35B2B23133DCDB86365CEC619AE; PRE_UTM=; PRE_HOST=; PRE_SITE=;\
PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_pythonpytho%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E5%2585%25A8%25E5%259B%25BD;\
TG-TRACK-CODE=search_code; X_MIDDLE_TOKEN=8a8c6419e33ae49c13de4c9881b4eb1e; X_HTTP_TOKEN=5dd639be7b63288ce718c96fdd4a0035;\
_ga=GA1.2.1060168094.1529187728; _gid=GA1.2.1053446384.1529187728; _gat=1;\
Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1529190520,1529198463,1529212181,1529212712;\
Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1529225935; LGSID=20180617164003-0752289a-720a-11e8-a8bc-525400f775ce;\
LGRID=20180617165832-9c78c400-720c-11e8-a8bf-525400f775ce; SEARCH_ID=1dab13db9fc14397a080b2d8a32b7f27; monitor_count=70',
               'Host': 'www.lagou.com',
               'Origin': 'https://www.lagou.com',
               'Referer': 'https://www.lagou.com/jobs/list_python%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
               'X-Requested-With': 'XMLHttpRequest'}
    city_list = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']
    position_list = ['python爬虫', ]  # list of positions to search; more can be added
    for city in city_list:
        save_single_info(city)  # write the city name as a marker row
        print("I am in {}".format(city))
        for position in position_list:
            url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
            # listing page used to find out how many result pages there are
            url_page = "https://www.lagou.com/jobs/list_{}?city={}".format(position, city)
            # fetch the listing page and read the total page count
            r = requests.post(url_page, headers=headers)
            doc = BeautifulSoup(r.text, 'lxml')
            page_count = int(doc.find(class_='span totalNum').string)
            # walk through every result page
            for page in range(1, page_count + 1):
                if page == 1:
                    flag = 'true'
                else:
                    flag = 'false'
                data = {
                    'first': flag,
                    'pn': page,
                    'kd': position,
                }
                time_sleep = random.random()
                sleep(time_sleep * 10)  # random pause of up to 10 s between requests
                response = requests.post(url, headers=headers, data=data, timeout=10)
                result = response.json()
                html_parse(result)


# Extract the details of each position
def html_parse(item):
    info = item.get('content')
    p_list = info.get('positionResult').get('result')
    print(len(p_list))
    for p in p_list:
        result_item = {
            "positionName": p.get("positionName"),
            "salary": p.get("salary"),
            "workYear": p.get('workYear'),
        }
        result_save(result_item)


# Save one scraped record
def result_save(result_item):
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:  # open the CSV file for appending
        fieldnames = ['positionName', 'salary', 'workYear']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(result_item)


# Save a single row (the header or a city marker)
def save_single_info(info):
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if isinstance(info, list):
            writer.writerow(info)
        else:
            writer.writerow([info])


# Main program
def main():
    box_header = ['positionName', 'salary', 'workYear']
    save_single_info(box_header)  # write the header row
    url_create()  # build the URLs and extract the information


# Entry point
if __name__ == '__main__':
    start_time = time()
    print("working...")
    main()
    end_time = time()
    print("Finished; elapsed time in minutes:")
    total_time = (end_time - start_time) / 60
    print(total_time)
