- -- Extraction-task table: one row holds the time window, paging position and
- -- request parameters of an extraction task.
- CREATE TABLE extract_task_temp
- (
-     id integer NOT NULL DEFAULT nextval('extract_task_temp_731_id_seq'::regclass),
-     task_init_time timestamp with time zone, -- time the extraction task was initialised
-     task_current_time timestamp with time zone, -- extraction time of the current task
-     task_next_time timestamp with time zone, -- extraction time of the next task
-     create_time timestamp with time zone DEFAULT now(),
-     update_time timestamp with time zone, -- modification time
-     task_type integer, -- task type: 1 = article, 2 = reply
-     website_id integer, -- website type id
-     start_size integer, -- paging start offset
-     limit_size integer, -- number of rows fetched per request
-     cid integer, -- customer id
-     authors text, -- author nicknames
-     interval_time integer, -- interval, in minutes
-     -- Surrogate key: named explicitly so constraint errors are greppable.
-     CONSTRAINT extract_task_temp_pkey PRIMARY KEY (id)
- );
- package com.cyyun.mobile.tools;
-
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
-
- import javax.annotation.Resource;
-
- import org.apache.commons.collections.CollectionUtils;
- import org.apache.commons.lang3.StringUtils;
- import org.apache.commons.lang3.time.DateUtils;
- import org.apache.log4j.Logger;
- import org.springframework.scheduling.annotation.Scheduled;
- import org.springframework.stereotype.Component;
-
- import com.cyyun.mobile.dao.ICommentAccountTempDao;
- import com.cyyun.mobile.dao.IExtractTaskTempDao;
- import com.cyyun.mobile.pojo.CommentAccountTemp;
- import com.cyyun.mobile.pojo.ExtractTaskTemp;
- import com.cyyun.mobile.service.ExtractTaskTempService;
- import com.cyyun.mobile.tools.httpconnection.HttpUrlConnection;
- import com.cyyun.mobile.tools.json.JsonEntity;
- import com.cyyun.mobile.tools.json.JsonEntityArray;
- import com.twmacinta.util.MD5;
-
- /**
- * 抽取数据任务
- *
- * @author zhangzm
- *
- */
- @Component
- public class ExtractTask {
-
- static Logger log = Logger.getLogger(ExtractTask.class);
- @Resource
- IExtractTaskTempDao iExtractTaskTempDao;
-
- @Resource
- ICommentAccountTempDao iCommentAccountTempDao;
-
- @Resource
- ExtractTaskTempService extractTaskTempService;
-
- /**
- * 获取任务对象
- *
- * @return
- */
- public List<ExtractTaskTemp> getExtractTaskTemp() {
- List<ExtractTaskTemp> extractTaskTemps = null;
- try {
- extractTaskTemps = extractTaskTempService
- .queryExtractTaskTemp(null);
- } catch (Exception e) {
- log.error(e.getMessage(), e);
- }
- return extractTaskTemps;
- }
-
- /**
-  * Scheduled entry point (cron {@code 0 0/1 * * * ?}, i.e. once a minute).
-  * <p>
-  * Loads all tasks and dispatches each one: task type {@code 1} goes to
-  * {@link #createTask(ExtractTaskTemp)}, every other value goes to
-  * {@link #createReplyTask(ExtractTaskTemp)}. A failure in one task is logged
-  * and does not prevent the remaining tasks of this run from being processed.
-  */
- @Scheduled(cron = "0 0/1 * * * ?")
- public void execute() {
-     List<ExtractTaskTemp> extractTaskTemps = getExtractTaskTemp();
-     if (CollectionUtils.isEmpty(extractTaskTemps)) {
-         log.warn("extractTaskTemps isEmpty");
-         return;
-     }
-     for (ExtractTaskTemp task : extractTaskTemps) {
-         if (task == null) {
-             continue;
-         }
-         try {
-             if ("1".equals(String.valueOf(task.getTaskType()))) {
-                 createTask(task);
-             } else {
-                 createReplyTask(task);
-             }
-         } catch (Exception ex) {
-             // Isolate the failure: the other tasks must still run this cycle.
-             log.error(ex.getMessage(), ex);
-         }
-     }
- }
-
- /**
- * 获取回复数
- *
- * @param bean
- */
- public void createReplyTask(ExtractTaskTemp bean) {
- if (bean == null) {
- log.warn("SpidTaskSynBean is null ");
- return;
- }
-
- if (null == bean.getTaskType()) {
- log.warn("ExtractTaskTemp getTaskType is null ");
- return;
- }
-
- initTask(bean);
-
- HttpUrlConnection connection = new HttpUrlConnection();
- Map<String, String> dataMap = new HashMap<String, String>();
- String url = Constant.GET_ARTICLE_REPLY_URL;
- dataMap.put("order", "rid");
- dataMap.put("desc", "asc");
- dataMap.put("cid", String.valueOf(bean.getCid()));
- dataMap.put("limit", String.valueOf(bean.getLimitSize()));
- dataMap.put("fid", String.valueOf(bean.getWebsiteId()));
- dataMap.put("authors", bean.getAuthors());
- dataMap.put("from", String.valueOf(bean.getTaskCurrentTime().getTime()));
- dataMap.put("to", String.valueOf(bean.getTaskNextTime().getTime()));
-
- StringBuilder logBuilder = new StringBuilder();
- logBuilder.append("开始时间为:").append(bean.getCreateTime())
- .append("结束时间为:").append(bean.getTaskNextTime())
- .append(" url :" + url).append(" dataMap :" + dataMap);
- log.info(logBuilder.toString());
-
- String response = null;
- try {
- response = connection.readData(dataMap, url);
- } catch (Exception e) {
- log.error(e.getMessage(), e);
- return;
- }
- if ("[]".equals(response)) {
- bean.setAuthors(null);
- bean.setStartSize(0);
- extractTaskTempService.updateExtractTaskTemp(bean);
- log.warn("get response is null " + url + " " + dataMap);
- return;
- }
-
- JsonEntityArray array = new JsonEntityArray(response);
- List<Map<String, Object>> addArticleBeans = new ArrayList<Map<String, Object>>();
- if (array != null && array.size() > 0) {
- for (int i = 0; i < array.size(); i++) {
- JsonEntity jsonE = array.getJsonEntity(i);
- String rid = jsonE.getString("rid");
- Map<String, Object> map = new HashMap<String, Object>();
- map.put("rid", Integer.valueOf(rid));
- addArticleBeans.add(map);
- }
- try {
- bean.setStartSize(bean.getStartSize() + addArticleBeans.size());
- extractTaskTempService
- .updateExtractTaskTempAndArticleReplyTemp(bean,
- addArticleBeans);
- } catch (Exception e) {
- log.error(e);
- }
- }
-
- }
-
- /**
-  * Joins the comment nicknames of the given accounts with commas.
-  * <p>
-  * {@code null} list entries are skipped. Any exception raised while
-  * iterating is logged and whatever was joined so far is returned.
-  *
-  * @param accountTemps accounts to read nicknames from; may be {@code null}
-  *                     or empty
-  * @return the comma-separated nicknames, or an empty string when there is
-  *         nothing to join
-  */
- public String getCommentAccountTempName(
-         List<CommentAccountTemp> accountTemps) {
-     StringBuilder joined = new StringBuilder();
-     try {
-         if (!CollectionUtils.isEmpty(accountTemps)) {
-             boolean first = true;
-             for (CommentAccountTemp account : accountTemps) {
-                 if (account == null) {
-                     continue;
-                 }
-                 String nickname = account.getCommentNickname();
-                 // Separator goes in front of every entry but the first, so
-                 // no trailing comma has to be trimmed afterwards.
-                 if (!first) {
-                     joined.append(",");
-                 }
-                 joined.append(nickname);
-                 first = false;
-             }
-         }
-     } catch (Exception ex) {
-         log.error(ex);
-     }
-     return joined.toString();
- }
-
- /**
- * 初始化任务,设置 开始时间,结束时间,以及账号表中的任务起始时间。(每一次任务表中的结束时间=账号表中的开始时间) 在账号表中的时间会
- * 出现的时间范围是 初始化时间+时间间隔*次数
- *
- * @param bean
- */
- public void initTask(ExtractTaskTemp bean) {
-
- if (StringUtils.isNotBlank(bean.getAuthors())) {
- return;
- }
-
- Map<String, Object> map = new HashMap<String, Object>();
- map.put("websiteId", bean.getWebsiteId());
- map.put("deleteFlag", 1);
- map.put("taskTime", bean.getTaskInitTime());
- List<CommentAccountTemp> accountTemps = null;
- try {
- accountTemps = extractTaskTempService.queryCommentAccountTemp(map);
- } catch (Exception e) {
- log.error(e);
- }
- if (CollectionUtils.isEmpty(accountTemps)) {
- map.clear();
- map.put("websiteId", bean.getWebsiteId());
- map.put("deleteFlag", 1);
- map.put("taskTime", bean.getTaskNextTime());
- try {
- accountTemps = extractTaskTempService
- .queryCommentAccountTemp(map);
- bean.setStartSize(0);// 起始页
- bean.setTaskCurrentTime(bean.getTaskNextTime());// 当前处理时间
- Date taskNextTime = DateUtils.addMinutes(
- bean.getTaskNextTime(), bean.getIntervalTime());
- bean.setTaskNextTime(taskNextTime);// 下次处理时间
-
- } catch (Exception e) {
- log.error(e);
- }
- } else {
- bean.setStartSize(0);// 起始页
- bean.setTaskCurrentTime(bean.getTaskInitTime());// 当前处理时间
- Date taskNextTime = DateUtils.addMinutes(bean.getTaskInitTime(),
- bean.getIntervalTime());
- bean.setTaskNextTime(taskNextTime);// 下次处理时间
- }
- try {
- bean.setAuthors(getCommentAccountTempName(accountTemps));// 设置作者
- extractTaskTempService.updateExtractTaskTempAndCommentAccountTemp(
- bean, accountTemps);
- } catch (Exception e) {
- log.error(e);
- }
-
- }
-
- /**
- * 抽取文章数据
- *
- * @param bean
- */
- public void createTask(ExtractTaskTemp bean) {
-
- if (bean == null) {
- log.warn("SpidTaskSynBean is null ");
- return;
- }
-
- if (null == bean.getTaskType()) {
- log.warn("ExtractTaskTemp getSysTypeId is null ");
- return;
- }
- initTask(bean);
-
- while (true) {
- HttpUrlConnection connection = new HttpUrlConnection();
- Map<String, String> dataMap = new HashMap<String, String>();
- String url = Constant.GET_ARTICLE_URL;
- dataMap.put("action", "full");
- dataMap.put("sort", "6");
- dataMap.put("order", "1");
- dataMap.put("start", String.valueOf(bean.getStartSize()));
- dataMap.put("cid", String.valueOf(bean.getCid()));
- dataMap.put("limit", String.valueOf(bean.getLimitSize()));
- dataMap.put("fid", String.valueOf(bean.getWebsiteId()));
- dataMap.put("authors", bean.getAuthors());
- dataMap.put("from",
- String.valueOf(bean.getTaskCurrentTime().getTime()));
- dataMap.put("to", String.valueOf(bean.getTaskNextTime().getTime()));
-
- StringBuilder logBuilder = new StringBuilder();
- logBuilder.append("开始时间为:").append(bean.getCreateTime())
- .append("结束时间为:").append(bean.getTaskNextTime())
- .append(" url :" + url).append(" dataMap :" + dataMap);
- log.info(logBuilder.toString());
-
- String response = null;
- try {
- response = connection.readData(dataMap, url);
- } catch (Exception e) {
- log.error(e.getMessage(), e);
- return;
- }
-
- JsonEntity jsonEntity = new JsonEntity(response);
- int result = Integer.valueOf(jsonEntity.getString("count"));
- if (result == 0) {
- log.error("抽取文件数据为0条 ");
- bean.setAuthors(null);
- bean.setStartSize(0);
- extractTaskTempService.updateExtractTaskTemp(bean);
- return;
- } else {
- JsonEntityArray array = jsonEntity.getJsonEntityArray("items");
- List<Map<String, Object>> addArticleBeans = new ArrayList<Map<String, Object>>();
- if (array != null && array.size() > 0) {
- for (int i = 0; i < array.size(); i++) {
- JsonEntity jsonE = array.getJsonEntity(i);
- String aid = jsonE.getString("aid");
- Map<String, Object> map = new HashMap<String, Object>();
- map.put("aid", Integer.valueOf(aid));
- addArticleBeans.add(map);
- }
- }
-
- try {
- bean.setStartSize(bean.getStartSize()
- + addArticleBeans.size());
- extractTaskTempService.updateExtractTaskTempAndArticleTemp(
- bean, addArticleBeans);
- } catch (Exception e) {
- log.error(e);
- }
-
- }
- }
- }
-
- /**
-  * Calls {@link #getArticleByGuid(String, String)} with {@code Constant.CID}
-  * for every guid in the set.
-  *
-  * @param guids guids to look up; a {@code null} or empty set does nothing
-  */
- public void getArticleByGuid(Set<String> guids) {
-     if (CollectionUtils.isEmpty(guids)) {
-         return;
-     }
-     for (String each : guids) {
-         getArticleByGuid(Constant.CID, each);
-     }
- }
-
- /**
-  * Requests a single article from {@code Constant.GET_ARTICLE_URL} by
-  * customer id and guid and logs an error when the response reports a
-  * {@code count} of 0. The response is not processed any further.
-  *
-  * @param cid  customer id sent as {@code cid}; the call is skipped when blank
-  * @param guid article guid sent as {@code guid}; the call is skipped when
-  *             blank
-  */
- public void getArticleByGuid(String cid, String guid) {
-     if (StringUtils.isBlank(cid)) {
-         log.warn("cid is null ");
-         return;
-     }
-
-     if (StringUtils.isBlank(guid)) {
-         // Name the argument that is actually missing.
-         log.warn("guid is null ");
-         return;
-     }
-     HttpUrlConnection connection = new HttpUrlConnection();
-     Map<String, String> dataMap = new HashMap<String, String>();
-     String url = Constant.GET_ARTICLE_URL;
-     dataMap.put("cid", cid);
-     dataMap.put("guid", guid);
-
-     String response = null;
-     try {
-         response = connection.readData(dataMap, url);
-     } catch (Exception e) {
-         log.error(e.getMessage(), e);
-         return;
-     }
-     if (StringUtils.isBlank(response)) {
-         log.warn("get response is blank " + url + " " + dataMap);
-         return;
-     }
-     // Parse the JSON response.
-     JsonEntity jsonEntity = new JsonEntity(response);
-     int result = Integer.valueOf(jsonEntity.getString("count"));
-     if (result == 0) {
-         log.error("获取0条数据");
-     }
- }
-
- /**
-  * Static scratch variant of {@link #getArticleByGuid(String, String)}:
-  * requests one article by customer id and guid and logs an error when the
-  * response reports a {@code count} of 0.
-  * <p>
-  * NOTE(review): the original body used an undeclared {@code url};
-  * {@code Constant.GET_ARTICLE_URL} is used here to match the instance
-  * method with the same parameters - confirm.
-  *
-  * @param cid  customer id sent as {@code cid}; the call is skipped when blank
-  * @param guid article guid sent as {@code guid}; the call is skipped when
-  *             blank
-  */
- public static void testGetArticleByGuid(String cid, String guid) {
-     if (StringUtils.isBlank(cid)) {
-         log.warn("cid is null ");
-         return;
-     }
-
-     if (StringUtils.isBlank(guid)) {
-         // Name the argument that is actually missing.
-         log.warn("guid is null ");
-         return;
-     }
-     HttpUrlConnection connection = new HttpUrlConnection();
-     Map<String, String> dataMap = new HashMap<String, String>();
-     String url = Constant.GET_ARTICLE_URL;
-     dataMap.put("cid", cid);
-     dataMap.put("guid", guid);
-
-     String response = null;
-     try {
-         response = connection.readData(dataMap, url);
-     } catch (Exception e) {
-         log.error(e.getMessage(), e);
-         return;
-     }
-     if (StringUtils.isBlank(response)) {
-         log.warn("get response is blank " + url + " " + dataMap);
-         return;
-     }
-     // Parse the JSON response.
-     JsonEntity jsonEntity = new JsonEntity(response);
-     int result = Integer.valueOf(jsonEntity.getString("count"));
-     if (result == 0) {
-         log.error("获取0条数据");
-     }
- }
-
- /**
- * 获取文章内容
- *
- * @param aid
- * @return
- */
- public String getArticleContent(Long aid) {
- HttpUrlConnection connection = new HttpUrlConnection();
- Map<String, String> dataMap = new HashMap<String, String>();
- String url = Constant.GET_ARTICLE_CONTENT_URL;
- dataMap.put("aid", String.valueOf(aid));
- String response = null;
- try {
- response = connection.readData(dataMap, url);
- } catch (Exception e) {
- log.error(e.getMessage(), e);
- return null;
- }
- if (StringUtils.isNotBlank(response)) {
- // 解析json
- JsonEntity jsonEntity = new JsonEntity(response);
- return jsonEntity.getString("content");
- }
- return null;
- }
-
- public static void testSpidArticle() {
- HttpUrlConnection connection = new HttpUrlConnection();
- Map<String, String> dataMap = new HashMap<String, String>();
-
- dataMap.put("start", "0");
- dataMap.put("cid", "731");
- dataMap.put("limit", "5");
- dataMap.put("action", "full");
- dataMap.put("sort", "6");
- dataMap.put("order", "1");
- dataMap.put("author", "品味咖啡");
-
- dataMap.clear();
- String response = connection.readData(dataMap, url);
-
- JsonEntity jsonEntity = new JsonEntity(response);
- JsonEntityArray array = jsonEntity.getJsonEntityArray("items");
- }
-
- public static void testArticleReply() {
- HttpUrlConnection connection = new HttpUrlConnection();
- Map<String, String> dataMap = new HashMap<String, String>();
- dataMap.put("start", "0");
- dataMap.put("cid", Constant.CID);
- dataMap.put("limit", "10");
- dataMap.put("author", "最爱看九爷");
- String response = connection.readData(dataMap, url);
- // JsonEntity jsonEntity = new JsonEntity(response);
- JsonEntityArray array = new JsonEntityArray(response);
- // 采集完成
- // JsonEntityArray array = jsonEntity.getJsonEntityArray("items");
- }
-
- public static String getMD5Url(String url) {
- if (StringUtils.isBlank(url)) {
- return url;
- }
- try {
- MD5 md5 = new MD5();
- md5.Update(url);
- return md5.asHex();
- } catch (Exception e) {
- log.error("md5 加密异常", e);
- }
- return null;
- }
-
- /**
-  * Command-line entry point; intentionally does nothing.
-  *
-  * @param args ignored
-  */
- public static void main(String[] args) {
-     // no-op
- }
-
- }
最好把传输数据接口的参数都配置到数据库中，
每次任务记录好当前任务的参数，当宕机或者重启的时候，有利于保存当前的查询参数，有利于下一次的查询。