Node.js 爬虫实战:爬取拉勾网的职位数据。主要目的是把这些数据用于大数据学习——之后可以自己对职位情况做一些分析,并比较一下我现在的职位在深圳乃至全国开发人员中所处的水平。
涉及到的技术栈:Node.js、MongoDB、Express
源码下载:https://gitee.com/draven_lee/node-spider 。如果对你有帮助,不需要打赏,欢迎给个 star。
/**
 * Express router exposing GET /crawlData.
 *
 * The route connects to the local MongoDB instance, crawls the lagou.com home
 * page to collect the job-category menu, stores that menu in the "menu"
 * collection, then queues the first pages of every category and stores each
 * job listing it finds in the "job" collection. The HTTP response (the 'craw'
 * view) is rendered right after the crawl is queued; crawling continues in
 * the background.
 */
var express = require('express');
var MongoClient = require('mongodb').MongoClient;
const request = require('superagent');
var url = "mongodb://localhost:27017/draven";
var router = express.Router();
var cheerio = require('cheerio');
var Crawler = require("crawler");

// Browser User-Agent sent with every request so the crawler looks like a real user.
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36';

// Cookie string captured from a logged-in browser session.
// NOTE(review): this embeds session/login tokens in source code; it presumably
// expires and should come from configuration instead — confirm before reuse.
// NOTE(review): it also contains its own hard-coded index_location_city value
// (Shenzhen), duplicating the dynamic one prepended by buildSessionCookie().
const SESSION_COOKIE = 'user_trace_token=20181127172617-5d56fc60-618b-4486-9762-21efad3c49df; JSESSIONID=ABAAABAAAFCAAEG70DFEA8B139FF80287ABDF2F4C137946; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; _ga=GA1.2.405959562.1543310779; _gid=GA1.2.577762828.1543310779; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543310402,1543310779; LGSID=20181127172618-7ef404ed-f226-11e8-80e4-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20181127172618-7ef406c2-f226-11e8-80e4-525400f775ce; _gat=1; TG-TRACK-CODE=index_navigation; SEARCH_ID=88db5c7fa2464090a6dd7041f35074ba; X_HTTP_TOKEN=492369107a1a20441020ab9b771f2f6d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221675489482d244-0f3ef5ad6aef94-4313362-2073600-1675489482e36f%22%2C%22%24device_id%22%3A%221675489482d244-0f3ef5ad6aef94-4313362-2073600-1675489482e36f%22%7D; sajssdk_2015_cross_new_user=1; ab_test_random_num=0; _putrc=69D503B669D896FC123F89F2B170EADC; login=true; hasDeliver=0; gate_login_token=33f3414d87f12e09e089b3b6daf10134f0a5ebf49fad63dfd9b8bc4e3a4f162b; unick=hello; LGRID=20181127174101-8d501f2b-f228-11e8-8c21-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543311662';

// Number of listing pages queued for each menu entry.
const PAGES_PER_MENU = 30;

/**
 * Builds the city-selection cookie pair for the given city name.
 *
 * @param {string} city - City name, e.g. '深圳'.
 * @returns {string} A single `name=value` cookie pair.
 */
function buildCityCookie(city) {
  return 'index_location_city=' + encodeURI(city);
}

/**
 * Builds the full Cookie header used for listing pages: the city cookie
 * followed by the captured session cookies, separated by "; " so the two are
 * sent as distinct cookies.
 *
 * @param {string} city - City name, e.g. '深圳'.
 * @returns {string} Value for the `Cookie` request header.
 */
function buildSessionCookie(city) {
  return buildCityCookie(city) + '; ' + SESSION_COOKIE;
}

/**
 * Extracts job entries from a listing page.
 *
 * @param {Function} $ - cheerio instance for the fetched page.
 * @returns {Array<Object>} One record per `.con_list_item` element.
 */
function parseJobList($) {
  const jobList = [];
  $('.con_list_item').each(function (idx, item) {
    const $item = $(item);
    jobList.push({
      name: $item.find('.position_link').find('h3').text(),
      address: $item.find('.add').find('em').text(),
      company: $item.find('.company_name').find('a').text(),
      // NOTE(review): this takes the href of the first <a> in the item, which
      // may be the position link rather than the company link — confirm.
      companyLink: $item.find('a').attr('href'),
      companyImg: $item.find('.com_logo').find('img').attr('src'),
      money: $item.find('.money').text(),
      label: $item.find('.list_item_bot').find('span').text(),
      welfare: $item.find('.li_b_r').text()
    });
  });
  return jobList;
}

/**
 * Extracts the category menu from the home page together with the listing
 * page URLs derived from each menu link.
 *
 * @param {Function} $ - cheerio instance for the fetched home page.
 * @returns {{menuList: Array<Object>, urlList: Array<string>}}
 */
function parseMenu($) {
  const menuList = [];
  const urlList = [];
  $('.menu_sub a').each(function (idx, element) {
    const $element = $(element);
    const href = $element.attr('href');
    menuList.push({
      name: $element.text(),
      tjId: $element.attr('data-lg-tj-id'),
      tjNo: $element.attr('data-lg-tj-no'),
      tjCid: $element.attr('data-lg-tj-cid'),
      link: href
    });
    // Anchors without an href would otherwise yield URLs like "undefined1/".
    if (!href) {
      return;
    }
    // Build the URLs of the first PAGES_PER_MENU pages for this menu entry.
    for (let page = 1; page <= PAGES_PER_MENU; page++) {
      urlList.push(href + page + '/');
    }
  });
  return { menuList: menuList, urlList: urlList };
}

/**
 * Inserts documents into a collection and always signals completion.
 * Insert failures are logged rather than thrown: a throw inside the driver
 * callback cannot be caught by the caller and would terminate the process.
 *
 * @param {Object} collection - MongoDB collection handle.
 * @param {Array<Object>} docs - Documents to insert (must be non-empty).
 * @param {string} successMessage - Message logged when the insert succeeds.
 * @param {Function} done - Called exactly once after the insert settles.
 */
function insertAndFinish(collection, docs, successMessage, done) {
  let finished = false;
  // Makes sure done() runs once, whether the insert fails sync or async.
  function finish() {
    if (!finished) {
      finished = true;
      done();
    }
  }
  try {
    collection.insertMany(docs, function (err) {
      if (err) {
        console.log(err);
      } else {
        console.log(successMessage);
      }
      finish();
    });
  } catch (e) {
    console.log(e);
    finish();
  }
}

// Crawl the data.
router.get('/crawlData', function (req, res, next) {
  const location = '深圳';

  MongoClient.connect(url, function (err, db) {
    if (err) {
      // Hand the failure to Express instead of throwing from an async callback.
      return next(err);
    }
    const dbo = db.db("draven");

    const c = new Crawler({
      preRequest: function (options, done) {
        // 'options' here is the set passed to the underlying request module,
        // not the options given to c.queue.
        console.log(options.uri);
        // The request starts once done() is called.
        done();
      },
      jQuery: true,      // expose the page through cheerio's jQuery-like API
      rateLimit: 25000,  // wait 25 seconds between requests
      maxConnections: 1, // crawl one page at a time
      headers: {         // imitate a real, logged-in browser
        'Cookie': buildSessionCookie(location),
        'User-Agent': USER_AGENT
      },
      // Default handler: used for every queued listing page.
      callback: function (error, response, done) {
        if (error) {
          console.log(error);
          done();
          return;
        }
        let jobList;
        try {
          const $ = response.$;
          console.log($('title').text());
          jobList = parseJobList($);
          console.log('c.queueSize', c.queueSize);
          console.log('jobList', jobList.length);
        } catch (e) {
          console.log(e);
          done();
          return;
        }
        if (jobList.length === 0) {
          done();
          return;
        }
        // Save to the database; done() fires after the insert so the
        // connection is not closed while a write is still in flight.
        insertAndFinish(dbo.collection("job"), jobList, 'job 数据导入成功!', done);
      }
    });

    // Release the MongoDB connection once the crawl queue is empty.
    c.on('drain', function () {
      db.close();
    });

    // Crawl the menu data from the home page.
    c.queue([
      {
        uri: 'https://www.lagou.com/',
        headers: {
          // Request cookies go in 'Cookie'; 'Set-Cookie' is a response header.
          'Cookie': buildCityCookie(location),
          'User-Agent': USER_AGENT,
          'login': true
        },
        // Per-request handler: the crawler-wide callback is not called for this entry.
        callback: function (error, response, done) {
          if (error) {
            console.log(error);
            done();
            return;
          }
          let parsed;
          try {
            parsed = parseMenu(response.$);
          } catch (e) {
            console.log(e);
            done();
            return;
          }
          // Add the listing URLs derived from the menu to the crawl queue.
          c.queue(parsed.urlList);
          console.log(parsed.urlList, parsed.urlList.length);
          console.log('menuList 共', parsed.menuList.length, '条数据');
          if (parsed.menuList.length === 0) {
            // Nothing to store; skip the insert instead of passing it an empty list.
            done();
            return;
          }
          insertAndFinish(dbo.collection("menu"), parsed.menuList, '数据导入成功!', done);
        }
      }
    ]);

    res.render('craw');
  });
});

module.exports = router;
