
Node crawler in practice: scraping job data from Lagou.com


Node crawler in practice: scraping job data from Lagou.com. The main goal is to collect the data for big data practice, so that later I can run my own analysis of the job market and compare my current position against developers in Shenzhen and across the country.

Tech stack involved: Node.js, MongoDB, Express.

Source code: https://gitee.com/draven_lee/node-spider. If it helps you, there is no need for a donation, but a star is welcome.

The crawler is quite simple and is meant for learning and reference only.

  1. Crawl the menu URLs from the lagou.com home page, push the first 30 pages of each menu URL into the crawl queue, then walk the queue and extract the fields we need from every page.
  2. While crawling I found that if requests are too frequent, or carry no logged-in session, you get silently redirected to another page: Lagou apparently has an anti-crawler mechanism. So I slowed down the crawl rate and added a simulated login state, after which the data could be fetched successfully. At this rather slow pace it took four days and nights to collect 100,000+ job records. A rough sketch of how such a redirect might be detected follows this list.
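Since the anti-crawler mechanism redirects you to another page instead of returning an error, it helps to sanity-check every response before parsing it. Below is a minimal sketch (not part of the original project) of how such a redirect could be detected; it assumes that a genuine list page contains '.con_list_item' elements, the same selector the full code below relies on, and that a redirected page does not.

function looksRedirected($) {
    // a real job-list page has .con_list_item nodes; the anti-crawler page does not
    return $('.con_list_item').length === 0;
}

// Possible use inside the Crawler callback shown in the full code below:
//   if (looksRedirected(res.$)) {
//       console.log('probably redirected by the anti-crawler check:', res.options.uri);
//       c.queue(res.options.uri);   // retry later; rateLimit keeps retries slow
//       return done();
//   }

Re-queueing the page this way means it will simply be retried at the normal 25-second rate limit, so a single redirect does not lose the page.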

Enough talk, here is the code.

var express = require('express');
var MongoClient = require('mongodb').MongoClient;
const request = require('superagent');
var url = "mongodb://localhost:27017/draven";
var router = express.Router();
var cheerio = require('cheerio');
var Crawler = require("crawler");

// crawl the data
router.get('/crawlData', function (req, res, next) {
    var url = "mongodb://localhost:27017/draven";
    var menuList = [];
    var urlList = [];
    var location = '深圳';
    MongoClient.connect(url, function (err, db) {
        if (err) throw err;
        var dbo = db.db("draven");
        var c = new Crawler({
            preRequest: function (options, done) {
                // 'options' here is not the 'options' you pass to 'c.queue', instead, it's the options that is going to be passed to 'request' module
                console.log(options.uri);
                // when done is called, the request will start
                done();
            },
            jQuery: true,        // use cheerio's jQuery-style selectors (available as res.$)
            rateLimit: 25000,    // crawl speed: one request every 25 seconds
            maxConnections: 1,   // maximum number of concurrent connections: 1
            headers: {           // simulate a real, logged-in browser
                'Cookie': 'index_location_city=' + encodeURI(location) + '; user_trace_token=20181127172617-5d56fc60-618b-4486-9762-21efad3c49df; JSESSIONID=ABAAABAAAFCAAEG70DFEA8B139FF80287ABDF2F4C137946; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; _ga=GA1.2.405959562.1543310779; _gid=GA1.2.577762828.1543310779; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543310402,1543310779; LGSID=20181127172618-7ef404ed-f226-11e8-80e4-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20181127172618-7ef406c2-f226-11e8-80e4-525400f775ce; _gat=1; TG-TRACK-CODE=index_navigation; SEARCH_ID=88db5c7fa2464090a6dd7041f35074ba; X_HTTP_TOKEN=492369107a1a20441020ab9b771f2f6d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221675489482d244-0f3ef5ad6aef94-4313362-2073600-1675489482e36f%22%2C%22%24device_id%22%3A%221675489482d244-0f3ef5ad6aef94-4313362-2073600-1675489482e36f%22%7D; sajssdk_2015_cross_new_user=1; ab_test_random_num=0; _putrc=69D503B669D896FC123F89F2B170EADC; login=true; hasDeliver=0; gate_login_token=33f3414d87f12e09e089b3b6daf10134f0a5ebf49fad63dfd9b8bc4e3a4f162b; unick=hello; LGRID=20181127174101-8d501f2b-f228-11e8-8c21-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543311662',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            },
            // global callback: runs for every list page taken from the crawl queue
            callback: function (error, res, done) {
                if (error) {
                    console.log(error);
                    done();
                } else {
                    var $ = res.$;
                    var jobList = [];
                    console.log($('title').text());
                    // extract one record per job listing on the page
                    $('.con_list_item').each(function (idx, item) {
                        var $item = $(item);
                        jobList.push({
                            name: $item.find('.position_link').find('h3').text(),
                            address: $item.find('.add').find('em').text(),
                            company: $item.find('.company_name').find('a').text(),
                            companyLink: $item.find('a').attr('href'),
                            companyImg: $item.find('.com_logo').find('img').attr('src'),
                            money: $item.find('.money').text(),
                            label: $item.find('.list_item_bot').find('span').text(),
                            welfare: $item.find('.li_b_r').text()
                        });
                    });
                    try {
                        console.log('c.queueSize', c.queueSize);
                        console.log('jobList', jobList.length);
                        if (jobList.length > 0) {
                            // save this page's jobs to the database
                            dbo.collection("job").insertMany(jobList, function (err, res) {
                                if (err) throw err;
                                console.log('job data imported successfully!');
                                // db.close();
                            });
                        }
                        done();
                    } catch (e) {
                        console.log(e);
                        done();
                    }
                }
            }
        });

        // crawl the menu data from the home page
        c.queue([
            {
                uri: 'https://www.lagou.com/',
                headers: {
                    // simulated login state sent as cookies
                    'Cookie': 'index_location_city=' + encodeURI(location) + '; JSESSIONID=ABAAABAAAGFABEFC5E29AF672C4DAF0B10AEE494D83FD62; login=true',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
                },
                // The global callback won't be called; this per-request callback runs instead
                callback: function (error, res, done) {
                    if (error) {
                        console.log(error);
                    } else {
                        var $ = res.$;
                        $('.menu_sub a').each(function (idx, element) {
                            var $element = $(element);
                            menuList.push({
                                name: $element.text(),
                                tjId: $element.attr('data-lg-tj-id'),
                                // tjIdName: changeName($element.attr('data-lg-tj-id')),
                                tjNo: $element.attr('data-lg-tj-no'),
                                tjCid: $element.attr('data-lg-tj-cid'),
                                link: $element.attr('href'),
                            });
                            // build the URLs for the first 30 pages of each menu entry
                            for (var i = 1; i <= 30; i++) {
                                urlList.push($element.attr('href') + i + '/');
                            }
                        });
                        // push the menu URLs collected from the home page into the crawl queue
                        c.queue(urlList);
                        console.log(urlList, urlList.length);
                        console.log('menuList:', menuList.length, 'entries');
                        dbo.collection("menu").insertMany(menuList, function (err, res) {
                            if (err) throw err;
                            console.log('menu data imported successfully!');
                            // db.close();
                        });
                    }
                    done();
                }
            }
        ]);
        res.render('craw');
    });
});

module.exports = router;
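The file above only exports a router; to actually start a crawl it has to be mounted in an Express app, with MongoDB running on localhost:27017. A minimal sketch follows, assuming the router is saved as routes/spider.js and that an EJS view named craw exists for the res.render('craw') call (both names are assumptions, not taken from the repository).

// app.js - minimal host application for the crawler route (filename and port are assumptions)
var express = require('express');
var app = express();

app.set('view engine', 'ejs');                // res.render('craw') needs a view engine and a views/craw template
app.use('/', require('./routes/spider'));     // the router exported above

app.listen(3000, function () {
    console.log('open http://localhost:3000/crawlData to start the crawl');
});

Once the crawl finishes, the results can be inspected directly in the mongo shell, for example db.job.find({address: '深圳'}).count() or db.menu.find().limit(5), which is the starting point for the kind of analysis mentioned at the beginning of the post.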

 
