Spring Boot + WebMagic: Implementing a Web Crawler



Source: www.jianshu.com/p/cfead4b3e34e

WebMagic is an open-source Java crawler framework.

Using the WebMagic framework itself is not the focus of this article; for details, see the official documentation: http://webmagic.io/docs/.

This article integrates Spring Boot, WebMagic, and MyBatis: WebMagic crawls the data, and MyBatis persists the crawled data to a MySQL database.

The source code provided here can serve as a scaffold for Java crawler projects.

1. Add Maven dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>hyzx</groupId>
    <artifactId>qbasic-crawler</artifactId>
    <version>1.0.0</version>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.21.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.test.skip>true</maven.test.skip>
        <java.version>1.8</java.version>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.3</webmagic.core.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>

2. Project configuration file: application.properties

This configures the MySQL datasource, the Druid connection pool, and the location of the MyBatis mapper XML files. The pattern classpath:mapper/**/*.xml means the CrawlerMapper.xml shown later should live under src/main/resources/mapper/.

# MySQL datasource configuration
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root
# Druid connection pool configuration
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000
# MyBatis configuration
mybatis.mapperLocations=classpath:mapper/**/*.xml

3. Database table structure

CREATE TABLE `cms_content` (
  `contentId` varchar(40) NOT NULL COMMENT 'Content ID',
  `title` varchar(150) NOT NULL COMMENT 'Title',
  `content` longtext COMMENT 'Article body',
  `releaseDate` datetime NOT NULL COMMENT 'Release date',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS content table';

4. Entity class

import java.util.Date;

public class CmsContentPO {
    private String contentId;
    private String title;
    private String content;
    private Date releaseDate;

    public String getContentId() {
        return contentId;
    }
    public void setContentId(String contentId) {
        this.contentId = contentId;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
    public Date getReleaseDate() {
        return releaseDate;
    }
    public void setReleaseDate(Date releaseDate) {
        this.releaseDate = releaseDate;
    }
}

5. Mapper interface

public interface CrawlerMapper {
    int addCmsContent(CmsContentPO record);
}

6. CrawlerMapper.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">
    <insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
        insert into cms_content (contentId,
                                 title,
                                 releaseDate,
                                 content)
        values (#{contentId,jdbcType=VARCHAR},
                #{title,jdbcType=VARCHAR},
                #{releaseDate,jdbcType=TIMESTAMP},
                #{content,jdbcType=LONGVARCHAR})
    </insert>
</mapper>

7. Page processor class: XXXPageProcessor

This class parses the crawled XXX HTML pages.


import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

@Component
public class XXXPageProcessor implements PageProcessor {
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        // Queue the answer-detail links found on the current page for crawling
        page.addTargetRequests(page.getHtml().links().regex("https://www\\.xxx\\.com/question/\\d+/answer/\\d+.*").all());
        page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString());
        page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString());
        if (page.getResultItems().get("title") == null) {
            // This is a list page: skip it so the pipeline does no further processing
            page.setSkip(true);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
}
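Before wiring the processor into the database pipeline, it can be handy to check the extraction rules on their own. Below is a small, optional smoke test (not part of the original project; the class name is made up here) that runs the processor with WebMagic's built-in ConsolePipeline, which simply prints the extracted fields; the start URL is the same placeholder used later in XXXTask.

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

public class XXXPageProcessorSmokeTest {
    public static void main(String[] args) {
        Spider.create(new XXXPageProcessor())
                .addUrl("https://www.xxx.com/explore")   // placeholder start URL from the article
                .addPipeline(new ConsolePipeline())      // print extracted fields instead of persisting them
                .thread(1)
                .run();                                  // run synchronously and block until finished
    }
}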

8. Data pipeline class: XXXPipeline

This class stores the data extracted from the XXX HTML pages into the MySQL database.

import java.util.Date;
import java.util.UUID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

@Component
public class XXXPipeline implements Pipeline {
    private static final Logger LOGGER = LoggerFactory.getLogger(XXXPipeline.class);

    @Autowired
    private CrawlerMapper crawlerMapper;

    @Override
    public void process(ResultItems resultItems, Task task) {
        String title = resultItems.get("title");
        String answer = resultItems.get("answer");
        CmsContentPO contentPO = new CmsContentPO();
        contentPO.setContentId(UUID.randomUUID().toString());
        contentPO.setTitle(title);
        contentPO.setReleaseDate(new Date());
        contentPO.setContent(answer);
        try {
            boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
            if (success) {
                LOGGER.info("Saved article: {}", title);
            }
        } catch (Exception ex) {
            LOGGER.error("Failed to save article", ex);
        }
    }
}

9. Crawler task class: XXXTask

Starts a crawl every ten minutes.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

@Component
public class XXXTask {
    private static final Logger LOGGER = LoggerFactory.getLogger(XXXTask.class);

    @Autowired
    private XXXPipeline xxxPipeline;

    @Autowired
    private XXXPageProcessor xxxPageProcessor;

    private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    public void crawl() {
        // Scheduled task: crawl every 10 minutes
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("xxxCrawlerThread");
            try {
                Spider.create(xxxPageProcessor)
                        // start crawling from https://www.xxx.com/explore
                        .addUrl("https://www.xxx.com/explore")
                        // persist the extracted data to the database
                        .addPipeline(xxxPipeline)
                        // crawl with 2 threads
                        .thread(2)
                        // start the spider asynchronously
                        .start();
            } catch (Exception ex) {
                LOGGER.error("Scheduled crawl thread threw an exception", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}
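One optional refinement, not in the original source: the ScheduledExecutorService keeps its own non-daemon thread alive, so a shutdown method inside XXXTask lets the schedule stop cleanly when the Spring context closes. A minimal sketch, assuming javax.annotation.PreDestroy (available on Java 8 / Spring Boot 1.5):

// Hypothetical addition to XXXTask, not part of the original article.
@PreDestroy
public void stop() throws InterruptedException {
    timer.shutdown();                                    // stop scheduling further crawl runs
    if (!timer.awaitTermination(30, TimeUnit.SECONDS)) { // grace period for an in-flight run
        timer.shutdownNow();                             // force termination if it overruns
    }
}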

10. Spring Boot application class

import java.io.IOException;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
@MapperScan(basePackages = "com.hyzx.qbasic.dao")
public class Application implements CommandLineRunner {

    @Autowired
    private XXXTask xxxTask;

    public static void main(String[] args) throws IOException {
        SpringApplication.run(Application.class, args);
    }

    @Override
    public void run(String... strings) throws Exception {
        // start crawling once the application is up
        xxxTask.crawl();
    }
}
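With the spring-boot-maven-plugin configured in step 1, the project can be started during development with mvn spring-boot:run, or packaged into an executable jar with mvn package and launched with java -jar; the CommandLineRunner then kicks off the scheduled crawl as soon as the application is up.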
