赞
踩
最近想自己学习一下 Hadoop,但手头缺少文本数据,所以需要爬取一些数据。我不会写 Python,就直接找了一个 Java 爬虫框架 WebMagic。
WebMagic 的原理图如下,这个框架很简单,也很好用:
POM.xml
<!-- mybatis start: MyBatis Spring Boot starter plus the MySQL JDBC driver -->
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<!-- resolved from the mybatis-version property, which is not defined in this fragment -->
<version>${mybatis-version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<!-- no version element here: presumably managed by a parent POM or dependencyManagement section (TODO confirm) -->
</dependency>
<!-- mybatis end -->
<!-- webMagic start: crawler core and its extension module, both pinned to the webMagic-version property -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${webMagic-version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${webMagic-version}</version>
</dependency>
<!-- webMagic end -->

GlobeFishWebMagicApplication.java
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.support.SpringBootServletInitializer;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring Boot entry point of the crawler application.
 *
 * <p>{@code @EnableScheduling} is declared here so that {@code @Scheduled} methods on
 * beans are picked up. The class extends {@link SpringBootServletInitializer} and
 * overrides {@link #configure(SpringApplicationBuilder)} in addition to providing a
 * plain {@code main} method.
 */
@SpringBootApplication
@EnableScheduling
public class GlobeFishWebMagicApplication extends SpringBootServletInitializer {

    /**
     * Starts the application from the command line.
     *
     * @param args command-line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(GlobeFishWebMagicApplication.class, args);
    }

    /**
     * Registers this class as the configuration source on the supplied builder.
     *
     * @param application builder supplied by {@link SpringBootServletInitializer}
     * @return the builder returned by {@code sources(...)}
     */
    @Override
    protected SpringApplicationBuilder configure(SpringApplicationBuilder application) {
        SpringApplicationBuilder configured = application.sources(GlobeFishWebMagicApplication.class);
        return configured;
    }
}

CSDNProcessor.java
import java.util.Date;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import com.panchen.globeFishWebMagic.entity.CSDNMessage;
import com.panchen.globeFishWebMagic.mapper.CSDNMessageMapper;
import com.panchen.globeFishWebMagic.util.SpringContextUtil;
import com.panchen.globeFishWebMagic.util.UUIDUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Crawler for CSDN blog listing pages and article pages, built on WebMagic.
 *
 * <p>This class plays three roles at once:
 * <ul>
 *   <li>as a Spring bean it owns the {@link #scheduled()} entry point and the injected
 *       {@code CSDNMessageMapper};</li>
 *   <li>as a {@link PageProcessor} it extracts articles and follow-up links in
 *       {@link #process(Page)};</li>
 *   <li>as a {@link Thread} (instances created with {@code new} in {@link #scheduled()})
 *       it runs one spider for one category in {@link #run()}.</li>
 * </ul>
 *
 * <p>The page and record counters are static atomics shared by all worker threads.
 *
 * @author pc
 */
@Component
// NOTE(review): a second @SpringBootApplication on a component is unusual; it is kept
// because the package layout is not visible here and removing it could change scanning.
@SpringBootApplication
public class CSDNProcessor extends Thread implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(CSDNProcessor.class);

    /** URL pattern of a single article page. */
    private static final String ARTICLE_URL_REGEX = "http://blog\\.csdn\\.net/(.*)/article/details/(.*)";
    /** URL pattern of any listing page (first page or a paged one). */
    private static final String LISTING_URL_REGEX = "http://blog\\.csdn\\.net/(.*)/newarticle.html(.*)";
    /** URL pattern of the first listing page, i.e. without a query string. */
    private static final String LISTING_FIRST_PAGE_REGEX = "http://blog\\.csdn\\.net/(.*)/newarticle.html";

    /** Category path segments to crawl; each is also used as the task name. */
    private static final String[] CATEGORIES = {
        "mobile",     // mobile development
        "web",        // web front end
        "software",   // R&D management
        "enterprise", // architecture design
        "code",       // programming languages
        "www",        // internet
        "database",   // databases
        "cloud",      // cloud
        "other"       // everything else
    };

    /** Number of article records stored in the current scheduled run (shared by all workers). */
    private static final AtomicInteger urlCount = new AtomicInteger(0);
    /** Number of listing pages processed in the current scheduled run (shared by all workers). */
    private static final AtomicInteger pageCount = new AtomicInteger(0);

    @Autowired
    private CSDNMessageMapper csdnMessageMapper;
    private String originalUrl;
    private String taskName;

    /**
     * Latch this worker counts down when its crawl ends. It is per instance (not static)
     * because a CountDownLatch cannot be reset: every scheduled run needs a fresh one.
     * It is {@code null} for the Spring-created instance.
     */
    private CountDownLatch cdl;

    // Crawl configuration: 1 s between requests, up to 30 retries, 300 s timeout.
    private Site site = Site.me().setSleepTime(1000).setRetryTimes(30).setCharset("utf-8").setTimeOut(300000)
            .setUserAgent(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /** Default constructor, used by Spring; no latch is attached. */
    public CSDNProcessor() {
    }

    /**
     * Creates a worker that signals the given latch when its crawl has finished.
     *
     * @param cdl latch to count down at the end of {@link #run()}
     */
    public CSDNProcessor(CountDownLatch cdl) {
        this.cdl = cdl;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: stores it if it is an article, queues the next
     * listing page if it is a listing, and queues every article link found on it.
     *
     * @param page the page handed over by the WebMagic spider
     */
    @Override
    public void process(Page page) {
        String currentUrl = page.getUrl().get();

        if (page.getUrl().regex(ARTICLE_URL_REGEX).match()) {
            storeArticle(page);
        }

        // NOTE(review): "dd/[@class=...]" has a '/' directly before the predicate, unlike
        // most selectors in this class; confirm the selector engine accepts it as intended.
        List<String> urls = page.getHtml()
                .xpath("//*[@class=\"blog_list clearfix\"]/dd/[@class=\"tracking-ad\"]/a/@href").all();

        // Pagination: follow the listing to its next page.
        if (currentUrl.matches(LISTING_URL_REGEX)) {
            pageCount.getAndIncrement();
            String nextUrl = nextListingPageUrl(currentUrl);
            if (nextUrl != null) {
                page.addTargetRequest(nextUrl);
            } else {
                logger.warn("Cannot determine next listing page for " + currentUrl);
            }
        }

        if (null != urls && 0 < urls.size()) {
            for (String url : urls) {
                // Drop a previously stored copy so the re-crawled article replaces it.
                if (null != csdnMessageMapper.getMessageByUrl(url)) {
                    csdnMessageMapper.deleteCSDNMessageByUrl(url);
                }
                page.addTargetRequest(url);
            }
        }
    }

    /** Extracts the article fields from the page, persists them and bumps the record counter. */
    private void storeArticle(Page page) {
        // NOTE(review): the "link_view" selector below contains "//*/[" (a '/' before the
        // predicate), unlike its siblings; confirm this is intended.
        CSDNMessage newCSDNMessage = new CSDNMessage(UUIDUtil.getUUID(), page.getUrl().get(),
                page.getHtml().xpath("//*[@id=\"blog_userface\"]/span/a/text()").get(),
                page.getHtml().xpath("//*[@class=\"article_title\"]/h1/span/text()").get(),
                page.getHtml().xpath("//*[@id=\"article_content\"]").get(),
                page.getHtml().xpath("//*[@class=\"link_postdate\"]/text()").get(), new Date(), 1, null, null,
                page.getHtml().xpath("//*[@id=\"btnDigg\"]/dd/text()").get(),
                page.getHtml().xpath("//*[@id=\"btnBury\"]/dd/text()").get(),
                page.getHtml().xpath("//*/[@class=\"link_view\"]/text()").get(),
                page.getHtml().xpath("//*[@class=\"link_comments\"]/text()").get(),
                page.getHtml().xpath("//*[@class=\"category_r\"]/label/span/text()").get());
        csdnMessageMapper.addCSDNMessage(newCSDNMessage);
        urlCount.getAndIncrement();
    }

    /**
     * Builds the URL of the listing page that follows {@code currentUrl}.
     *
     * <p>The next page number is derived from the current URL itself rather than from the
     * shared {@link #pageCount}: that counter is incremented by every category crawl at
     * once, so using it made concurrent crawls skip or repeat pages.
     *
     * @param currentUrl a URL already known to match {@link #LISTING_URL_REGEX}
     * @return the next page's URL, or {@code null} when no page number can be read
     */
    private static String nextListingPageUrl(String currentUrl) {
        if (currentUrl.matches(LISTING_FIRST_PAGE_REGEX)) {
            return currentUrl + "?&page=2";
        }
        int equalsIndex = currentUrl.lastIndexOf('=');
        if (equalsIndex < 0) {
            return null;
        }
        try {
            int currentPage = Integer.parseInt(currentUrl.substring(equalsIndex + 1).trim());
            return currentUrl.substring(0, equalsIndex + 1) + (currentPage + 1);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Crawls {@link #getOriginalUrl()} with a single-threaded spider and then counts
     * down the latch, if one was supplied.
     */
    @Override
    public void run() {
        logger.info(taskName + "START!!!!!");
        long startTime = System.currentTimeMillis();
        try {
            // Instances created with "new" get no injection, so the page processor handed
            // to the spider is the Spring-managed bean, looked up via SpringContextUtil.
            Spider.create(SpringContextUtil.getBeanByClass(CSDNProcessor.class)).addUrl(originalUrl).thread(1).run();
        } finally {
            long endTime = System.currentTimeMillis();
            logger.info(taskName + "END!!!!!,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + pageCount + "页、" + urlCount
                    + "条记录");
            // Always release the waiting scheduler thread, even if the spider threw.
            if (cdl != null) {
                cdl.countDown();
            }
        }
    }

    /**
     * Crawls all categories in parallel, one worker thread per category, and waits for
     * all of them to finish before logging the totals.
     */
    @Scheduled(cron = "0 46 16 ? * *")
    public void scheduled() {
        logger.info("START!!!!!");
        long startTime = System.currentTimeMillis();

        // Totals are reported per run, so start from zero each time.
        urlCount.set(0);
        pageCount.set(0);

        // A latch is one-shot: create a new one per run so later runs really wait.
        CountDownLatch latch = new CountDownLatch(CATEGORIES.length);
        CSDNProcessor[] workers = new CSDNProcessor[CATEGORIES.length];
        for (int i = 0; i < CATEGORIES.length; i++) {
            CSDNProcessor worker = new CSDNProcessor(latch);
            worker.setOriginalUrl("http://blog.csdn.net/" + CATEGORIES[i] + "/newarticle.html");
            worker.setTaskName(CATEGORIES[i]);
            workers[i] = worker;
        }

        // Start the worker threads.
        for (CSDNProcessor worker : workers) {
            worker.start();
        }

        // The scheduler thread waits for all workers.
        try {
            latch.await();
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            logger.warn("Interrupted while waiting for crawl workers", e);
        }

        long endTime = System.currentTimeMillis();
        logger.info("END!!!!!,总耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + pageCount + "页、" + urlCount + "条记录");
    }

    /** @return the first URL this worker crawls */
    public String getOriginalUrl() {
        return originalUrl;
    }

    /** @param originalUrl the first URL this worker crawls */
    public void setOriginalUrl(String originalUrl) {
        this.originalUrl = originalUrl;
    }

    /** @return the name used as a prefix in this worker's log messages */
    public String getTaskName() {
        return taskName;
    }

    /** @param taskName the name used as a prefix in this worker's log messages */
    public void setTaskName(String taskName) {
        this.taskName = taskName;
    }
}

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。