赞
踩
点击关注公众号,Java干货及时送达
来源:www.jianshu.com/p/cfead4b3e34e
WebMagic是一个开源的java爬虫框架。
WebMagic框架的使用并不是本文的重点,具体如何使用请参考官方文档:http://webmagic.io/docs/。
本文对 Spring Boot + WebMagic + MyBatis 做了整合:使用 WebMagic 爬取数据,再通过 MyBatis 将爬取到的数据持久化到 MySQL 数据库。
本文提供的源代码可以作为java爬虫项目的脚手架。
- <?xml version="1.0" encoding="UTF-8"?>
- <project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <groupId>hyzx</groupId>
- <artifactId>qbasic-crawler</artifactId>
- <version>1.0.0</version>
-
- <parent>
- <groupId>org.springframework.boot</groupId>
- <artifactId>spring-boot-starter-parent</artifactId>
- <version>1.5.21.RELEASE</version>
- <relativePath/> <!-- lookup parent from repository -->
- </parent>
-
- <properties>
- <!-- Build settings: source encoding, Java level and pinned plugin versions. -->
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <!-- NOTE(review): this skips compiling and running tests on every build. -->
- <maven.test.skip>true</maven.test.skip>
- <java.version>1.8</java.version>
- <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
- <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
-
- <!-- Versions of the dependencies declared in the dependencies section. -->
- <mysql.connector.version>5.1.47</mysql.connector.version>
- <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
- <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
- <!-- Raised from 1.2.58: 1.2.x releases before 1.2.83 are affected by the published autoType deserialization vulnerabilities. -->
- <fastjson.version>1.2.83</fastjson.version>
- <commons.lang3.version>3.9</commons.lang3.version>
- <joda.time.version>2.10.2</joda.time.version>
- <webmagic.core.version>0.7.3</webmagic.core.version>
- </properties>
-
- <dependencies>
- <!-- Spring Boot devtools: runtime scope and optional, so it is not passed on to dependents. -->
- <dependency>
- <groupId>org.springframework.boot</groupId>
- <artifactId>spring-boot-devtools</artifactId>
- <scope>runtime</scope>
- <optional>true</optional>
- </dependency>
-
- <!-- Spring Boot test starter, test scope only. -->
- <dependency>
- <groupId>org.springframework.boot</groupId>
- <artifactId>spring-boot-starter-test</artifactId>
- <scope>test</scope>
- </dependency>
-
- <!-- Spring Boot configuration processor, declared optional. -->
- <dependency>
- <groupId>org.springframework.boot</groupId>
- <artifactId>spring-boot-configuration-processor</artifactId>
- <optional>true</optional>
- </dependency>
-
- <!-- MySQL JDBC driver; the application properties reference com.mysql.jdbc.Driver. -->
- <dependency>
- <groupId>mysql</groupId>
- <artifactId>mysql-connector-java</artifactId>
- <version>${mysql.connector.version}</version>
- </dependency>
-
- <!-- Druid connection pool starter; the application properties select DruidDataSource. -->
- <dependency>
- <groupId>com.alibaba</groupId>
- <artifactId>druid-spring-boot-starter</artifactId>
- <version>${druid.spring.boot.starter.version}</version>
- </dependency>
-
- <!-- MyBatis starter; backs the mapper interface and mapper XML of this project. -->
- <dependency>
- <groupId>org.mybatis.spring.boot</groupId>
- <artifactId>mybatis-spring-boot-starter</artifactId>
- <version>${mybatis.spring.boot.starter.version}</version>
- </dependency>
-
- <!-- NOTE(review): fastjson, commons-lang3 and joda-time are not referenced by the code shown in this article; confirm they are needed. -->
- <dependency>
- <groupId>com.alibaba</groupId>
- <artifactId>fastjson</artifactId>
- <version>${fastjson.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- <version>${commons.lang3.version}</version>
- </dependency>
-
- <dependency>
- <groupId>joda-time</groupId>
- <artifactId>joda-time</artifactId>
- <version>${joda.time.version}</version>
- </dependency>
-
- <!-- WebMagic crawler core. The slf4j-log4j12 binding is excluded, presumably so it does not clash with Spring Boot's own logging setup (confirm). -->
- <dependency>
- <groupId>us.codecraft</groupId>
- <artifactId>webmagic-core</artifactId>
- <version>${webmagic.core.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <!-- Compiler plugin: source/target level and encoding come from the properties section. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>${maven.compiler.plugin.version}</version>
- <configuration>
- <source>${java.version}</source>
- <target>${java.version}</target>
- <encoding>${project.build.sourceEncoding}</encoding>
- </configuration>
- </plugin>
-
- <!-- Resources plugin: resources are processed with the project source encoding. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-resources-plugin</artifactId>
- <version>${maven.resources.plugin.version}</version>
- <configuration>
- <encoding>${project.build.sourceEncoding}</encoding>
- </configuration>
- </plugin>
-
- <!-- Spring Boot plugin: binds the repackage goal; its version is not set here, so it comes from the parent POM. -->
- <plugin>
- <groupId>org.springframework.boot</groupId>
- <artifactId>spring-boot-maven-plugin</artifactId>
- <configuration>
- <fork>true</fork>
- <addResources>true</addResources>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>repackage</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
-
- <repositories>
- <!-- Aliyun public mirror. HTTPS is required: Maven 3.8.1+ blocks plain-http repositories by default. -->
- <repository>
- <id>public</id>
- <name>aliyun nexus</name>
- <url>https://maven.aliyun.com/repository/public</url>
- <releases>
- <enabled>true</enabled>
- </releases>
- </repository>
- </repositories>
-
- <pluginRepositories>
- <!-- Aliyun public mirror for plugins, releases only. HTTPS is required: Maven 3.8.1+ blocks plain-http repositories by default. -->
- <pluginRepository>
- <id>public</id>
- <name>aliyun nexus</name>
- <url>https://maven.aliyun.com/repository/public</url>
- <releases>
- <enabled>true</enabled>
- </releases>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- </pluginRepository>
- </pluginRepositories>
- </project>
配置 MySQL 数据源、Druid 数据库连接池以及 MyBatis 的 mapper 文件位置。Spring Boot 基础就不介绍了,最全教程和示例源码推荐看这里:https://github.com/javastacks/spring-boot-best-practice
- # MySQL data source configuration
- # NOTE(review): host, user name and password below are hard-coded sample values; externalize them for a real deployment.
- spring.datasource.name=mysql
- spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
- spring.datasource.driver-class-name=com.mysql.jdbc.Driver
- spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
- spring.datasource.username=root
- spring.datasource.password=root
-
- # Druid connection pool configuration
- spring.datasource.druid.initial-size=5
- spring.datasource.druid.min-idle=5
- spring.datasource.druid.max-active=10
- spring.datasource.druid.max-wait=60000
- spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
- spring.datasource.druid.test-on-borrow=false
- spring.datasource.druid.test-on-return=false
- spring.datasource.druid.test-while-idle=true
- spring.datasource.druid.time-between-eviction-runs-millis=60000
- spring.datasource.druid.min-evictable-idle-time-millis=300000
- spring.datasource.druid.max-evictable-idle-time-millis=600000
-
- # MyBatis configuration: mapper XML files are loaded from classpath:mapper/ and its sub-directories
- mybatis.mapperLocations=classpath:mapper/**/*.xml
- -- One row per stored article: id, title, body text and release timestamp.
- -- utf8mb4 is used instead of utf8 (MySQL's 3-byte utf8mb3) so that crawled text
- -- containing 4-byte characters such as emoji can be stored.
- CREATE TABLE `cms_content` (
-   `contentId` varchar(40) NOT NULL COMMENT '内容ID',
-   `title` varchar(150) NOT NULL COMMENT '标题',
-   `content` longtext COMMENT '文章内容',
-   `releaseDate` datetime NOT NULL COMMENT '发布日期',
-   PRIMARY KEY (`contentId`)
- ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='CMS内容表';
- import java.util.Date;
-
- /**
-  * Persistence object carrying the four values written to the {@code cms_content}
-  * table: content id, title, article content and release date.
-  */
- public class CmsContentPO {
-     /** Content id (primary key column {@code contentId}). */
-     private String contentId;
-     /** Article title. */
-     private String title;
-     /** Article body text. */
-     private String content;
-     /** Release date of the article. */
-     private Date releaseDate;
-
-     // ---- contentId ----
-
-     /** @return the content id, or {@code null} if none has been set */
-     public String getContentId() {
-         return this.contentId;
-     }
-
-     /** @param contentId the content id to store */
-     public void setContentId(String contentId) {
-         this.contentId = contentId;
-     }
-
-     // ---- title ----
-
-     /** @return the title, or {@code null} if none has been set */
-     public String getTitle() {
-         return this.title;
-     }
-
-     /** @param title the title to store */
-     public void setTitle(String title) {
-         this.title = title;
-     }
-
-     // ---- content ----
-
-     /** @return the article content, or {@code null} if none has been set */
-     public String getContent() {
-         return this.content;
-     }
-
-     /** @param content the article content to store */
-     public void setContent(String content) {
-         this.content = content;
-     }
-
-     // ---- releaseDate ----
-
-     /** @return the same {@link Date} instance that was set (no copy is made), or {@code null} */
-     public Date getReleaseDate() {
-         return this.releaseDate;
-     }
-
-     /** @param releaseDate the release date to store; the reference is kept as-is */
-     public void setReleaseDate(Date releaseDate) {
-         this.releaseDate = releaseDate;
-     }
- }
- /**
-  * MyBatis mapper for persisting crawled content; its statement is defined in the
-  * mapper XML under the {@code com.hyzx.qbasic.dao.CrawlerMapper} namespace.
-  */
- public interface CrawlerMapper {
- /**
-  * Inserts the given record into the {@code cms_content} table.
-  *
-  * @param record the content to insert
-  * @return the value MyBatis reports for the insert statement (the affected row count)
-  */
- int addCmsContent(CmsContentPO record);
- }
- <?xml version="1.0" encoding="UTF-8"?>
- <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
- <!-- SQL statements bound to the com.hyzx.qbasic.dao.CrawlerMapper interface. -->
- <mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">
-
- <!-- Inserts one CmsContentPO into cms_content; each column is bound to the property of the same name. -->
- <insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
- insert into cms_content (contentId,
- title,
- releaseDate,
- content)
- values (#{contentId,jdbcType=VARCHAR},
- #{title,jdbcType=VARCHAR},
- #{releaseDate,jdbcType=TIMESTAMP},
- #{content,jdbcType=LONGVARCHAR})
- </insert>
- </mapper>
XXXPageProcessor 主要用于解析爬取到的 XXX HTML 页面。
点击关注公众号,Java干货及时送达
- @Component
- public class XXXPageProcessor implements PageProcessor {
-
- private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
-
- @Override
- public void process(Page page) {
- page.addTargetRequests(page.getHtml().links().regex("https://www\\.xxx\\.com/question/\\d+/answer/\\d+.*").all());
- page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString());
- page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString());
- if (page.getResultItems().get("title") == null) {
- // 如果是列表页,跳过此页,pipeline不进行后续处理
- page.setSkip(true);
- }
- }
-
- @Override
- public Site getSite() {
- return site;
- }
- }
XXXPipeline 主要用于将从 XXX HTML 页面解析出的数据存储到 MySQL 数据库。另外,MySQL 系列面试题和答案全部整理好了,微信搜索Java技术栈,在后台发送:面试,可以在线阅读。
- /**
-  * Pipeline that stores the "title" and "answer" fields of a processed page as a
-  * new row via {@link CrawlerMapper#addCmsContent}.
-  */
- @Component
- public class XXXPipeline implements Pipeline {
-     private static final Logger LOGGER = LoggerFactory.getLogger(XXXPipeline.class);
-
-     @Autowired
-     private CrawlerMapper crawlerMapper;
-
-     /**
-      * Builds a {@link CmsContentPO} from the extracted fields and inserts it.
-      * Failures are logged and never propagated to the caller.
-      *
-      * @param resultItems fields extracted from the page; "title" and "answer" are read
-      * @param task        the crawl task the result belongs to (not used here)
-      */
-     @Override
-     public void process(ResultItems resultItems, Task task) {
-         String title = resultItems.get("title");
-         String answer = resultItems.get("answer");
-
-         // The title column is NOT NULL, so an insert without a title can only fail.
-         if (title == null) {
-             LOGGER.warn("保存文章失败:标题为空");
-             return;
-         }
-
-         CmsContentPO contentPO = new CmsContentPO();
-         // Every stored article gets a fresh random id and the current time as release date.
-         contentPO.setContentId(UUID.randomUUID().toString());
-         contentPO.setTitle(title);
-         contentPO.setReleaseDate(new Date());
-         contentPO.setContent(answer);
-
-         try {
-             // Report success only when the insert actually affected a row.
-             boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
-             if (success) {
-                 LOGGER.info("保存文章成功:{}", title);
-             } else {
-                 LOGGER.warn("保存文章失败:{}", title);
-             }
-         } catch (Exception ex) {
-             LOGGER.error("保存文章失败", ex);
-         }
-     }
- }
- /**
-  * Schedules the XXX crawl on a single-threaded scheduler.
-  */
- @Component
- public class XXXTask {
-     // Bound to this class (was XXXPipeline.class) so log lines name the right source.
-     private static final Logger LOGGER = LoggerFactory.getLogger(XXXTask.class);
-
-     // Renamed from "XXXPipeline" to match the name used in crawl(); the original did not compile.
-     @Autowired
-     private XXXPipeline xxxPipeline;
-
-     @Autowired
-     private XXXPageProcessor xxxPageProcessor;
-
-     private final ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();
-
-     /**
-      * Registers the recurring crawl job and returns immediately. The job runs at
-      * once and then again 10 minutes after each run of the job returns.
-      */
-     public void crawl() {
-         // Scheduled job: crawl once every 10 minutes.
-         timer.scheduleWithFixedDelay(() -> {
-             Thread.currentThread().setName("xxxCrawlerThread");
-
-             try {
-                 Spider.create(xxxPageProcessor)
-                         // Start crawling from https://www.xxx.com/explore
-                         .addUrl("https://www.xxx.com/explore")
-                         // Store the crawled data in the database
-                         .addPipeline(xxxPipeline)
-                         // Crawl with 2 threads
-                         .thread(2)
-                         // Start the spider asynchronously.
-                         // NOTE(review): because start() does not block, this job returns right away,
-                         // so a crawl lasting over 10 minutes could overlap the next one; confirm this is intended.
-                         .start();
-             } catch (Exception ex) {
-                 LOGGER.error("定时抓取数据线程执行异常", ex);
-             }
-         }, 0, 10, TimeUnit.MINUTES);
-     }
- }
- /**
-  * Spring Boot entry point. Scans {@code com.hyzx.qbasic.dao} for MyBatis mappers
-  * and, as a {@link CommandLineRunner}, calls {@code xxxTask.crawl()} from {@code run}.
-  */
- @SpringBootApplication
- @MapperScan(basePackages = "com.hyzx.qbasic.dao")
- public class Application implements CommandLineRunner {
-
- @Autowired
- private XXXTask xxxTask;
-
- /**
-  * Launches the Spring application.
-  *
-  * @param args command-line arguments passed through to Spring
-  */
- public static void main(String[] args) throws IOException {
- SpringApplication.run(Application.class, args);
- }
-
- /**
-  * Starts the crawl task.
-  *
-  * @param strings command-line arguments (not used)
-  */
- @Override
- public void run(String... strings) throws Exception {
- // Crawl the data
- xxxTask.crawl();
- }
- }
最后,关注公众号Java技术栈,在后台回复:面试,可以获取我整理的 Java/ Spring Boot 系列面试题和答案,非常齐全。
关注Java技术栈看更多干货
获取 Spring Boot 实战笔记!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。