赞
踩
Java是一门强大的编程语言,有很多库和框架可以用于网页抓取。常用的包括Jsoup、Selenium和HttpClient等。其中,Jsoup适用于抓取静态网页,Selenium适用于抓取动态网页,HttpClient适用于发送HTTP请求。根据实际需求选择合适的工具。
开发环境是内网(局域网),项目工程是 web 项目,JDK 使用 1.8,Tomcat 使用 8。
web项目: 启动时使用tomcat, tomcat会先加载web.xml配置文件里内容;
maven项目: 和web项目的区别是,需要的jar包,交给maven维护,不用自己导入jar包;
java项目: 启动时,使用main方法,没有web.xml配置文件;
一、创建项目
web.xml:web项目的配置文件,随着tomcat启动而加载;
TimerConfig.xml:Spring 定时任务(task)配置文件;
proxool.xml: 配置数据库连接池;
log4j.properties:log4j 日志配置文件;
hibernate.cfg.xml: hibernate配置文件;
LoadsRealTimeTask: 定时任务类;
二、环境准备
1、导入selenium所需要的包
2、导入项目所需要的包
3、安装chromedriver.exe
因为我使用的谷歌浏览器来打开页面,所以需要将chromedriver.exe安装到Chrome目录下。
三、编写代码
web.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Deployment descriptor of the GDreptile crawler web application (Servlet 3.0 schema). -->
<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://java.sun.com/xml/ns/javaee"
xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd"
id="WebApp_ID" version="3.0">
<display-name>GDreptile</display-name>
<!-- Default welcome pages, tried in the order listed. -->
<welcome-file-list>
<welcome-file>index.html</welcome-file>
<welcome-file>index.htm</welcome-file>
<welcome-file>index.jsp</welcome-file>
<welcome-file>default.html</welcome-file>
<welcome-file>default.htm</welcome-file>
<welcome-file>default.jsp</welcome-file>
</welcome-file-list>
<!-- Scheduler configuration file: location of the Spring context that declares the timed tasks. -->
<context-param>
<param-name>contextConfigLocation</param-name>
<param-value>/WEB-INF/TimerConfig.xml</param-value>
</context-param>
<!-- Spring context loader; it reads the contextConfigLocation parameter declared above. -->
<listener>
<listener-class>
org.springframework.web.context.ContextLoaderListener
</listener-class>
</listener>
<!-- Proxool: configurator servlet loaded at startup (load-on-startup=1) with the pool definition file passed as xmlFile. -->
<servlet>
<servlet-name>ServletConfigurator</servlet-name>
<servlet-class>
org.logicalcobwebs.proxool.configuration.ServletConfigurator
</servlet-class>
<init-param>
<param-name>xmlFile</param-name>
<param-value>/WEB-INF/proxool.xml</param-value>
</init-param>
<load-on-startup>1</load-on-startup>
</servlet>
<!-- Admin/monitoring tool shipped with Proxool; shows the current database connection status. Remove this servlet and its mapping if it fails to run. -->
<!-- NOTE(review): /admin is mapped without any security constraint in this file; confirm it is not reachable by untrusted users. -->
<servlet>
<servlet-name>Admin</servlet-name>
<servlet-class>org.logicalcobwebs.proxool.admin.servlet.AdminServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>Admin</servlet-name>
<url-pattern>/admin</url-pattern>
</servlet-mapping>
</web-app>
TimerConfig.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring context that declares the scheduled crawler tasks; web.xml points contextConfigLocation at this file. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context"
xmlns:mvc="http://www.springframework.org/schema/mvc" xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-3.1.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-3.1.xsd
http://www.springframework.org/schema/tx
http://www.springframework.org/schema/tx/spring-tx-3.1.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task-3.1.xsd">
<!-- Switch on annotation-driven Spring task scheduling -->
<task:annotation-driven />
<!-- Task beans; the run() method of each one is invoked by the schedule below. -->
<bean id="historyTask" class="com.sgcc.gridDispa.LoadsHistoryTask"></bean>
<bean id="realTimeTask" class="com.sgcc.gridDispa.LoadsRealTimeTask"></bean>
<!-- Cron fields: second minute hour day-of-month month day-of-week -->
<task:scheduled-tasks>
<task:scheduled ref="historyTask" method="run" cron="0 30 7 * * ?" /> <!-- Runs once a day at 07:30 -->
<task:scheduled ref="realTimeTask" method="run" cron="0 20 0-23 * * ?" /> <!-- Runs once an hour, at minute 20 -->
</task:scheduled-tasks>
</beans>
proxool.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Proxool connection pool definition; web.xml passes this file to the Proxool ServletConfigurator as xmlFile. -->
<something-else-entirely>
<proxool>
<!-- Pool alias; hibernate.cfg.xml refers to the pool by this name (hibernate.proxool.pool_alias). -->
<alias>proxoolpool</alias>
<!-- Oracle thin-driver JDBC URL; host and service name are masked placeholders. -->
<driver-url>jdbc:oracle:thin:@XX.XX.XX.XX:1521/XXXXX</driver-url>
<driver-class>oracle.jdbc.driver.OracleDriver</driver-class>
<!-- Database credentials (masked placeholders). -->
<driver-properties>
<property name="user" value="XXXXX" />
<property name="password" value="XXXXX" />
</driver-properties>
<!-- Upper and lower bound on the number of pooled connections. -->
<maximum-connection-count>200</maximum-connection-count>
<minimum-connection-count>10</minimum-connection-count>
<!-- NOTE(review): presumably the house-keeping interval in milliseconds (30 s); confirm against the Proxool documentation. -->
<house-keeping-sleep-time>30000</house-keeping-sleep-time>
<maximum-new-connections>10</maximum-new-connections>
<prototype-count>5</prototype-count>
<!-- Test a connection before it is used. NOTE(review): presumably with the test SQL below; confirm against the Proxool documentation. -->
<test-before-use>true</test-before-use>
<house-keeping-test-sql>select sysdate from dual</house-keeping-test-sql>
</proxool>
</something-else-entirely>
log4j.properties
# Root logger: level DEBUG, writing to the "console" and "FILE" appenders defined below.
# Both appenders set a threshold of INFO, so DEBUG events are dropped by each of them.
log4j.rootLogger=DEBUG,console,FILE
# Console appender.
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.threshold=INFO
log4j.appender.console.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp [level] - logger name - source file(line) - message, then a newline.
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH\:mm\:ss} [%5p] - %c -%F(%L) -%m%n
# Size-based rolling file appender; new events are appended to the existing file.
log4j.appender.FILE=org.apache.log4j.RollingFileAppender
log4j.appender.FILE.Append=true
# NOTE(review): this value is used as the log file itself (it has no extension); confirm that is intended.
log4j.appender.FILE.File=D:/log/pachong/logs
log4j.appender.FILE.Threshold=INFO
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.ConversionPattern=%d{yyyy-MM-dd HH\:mm\:ss} [%5p] - %c -%F(%L) -%m%n
# Roll the file over once it reaches 10MB; MaxBackupIndex is not set in this file.
log4j.appender.FILE.MaxFileSize=10MB
hibernate.cfg.xml
<!-- Hibernate configuration loaded by JDBCUtil via configure("hibernate.cfg.xml"). -->
<!DOCTYPE hibernate-configuration PUBLIC
"-//Hibernate/Hibernate Configuration DTD 3.0//EN"
"http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd">
<hibernate-configuration>
<session-factory>
<!-- Hibernate's own settings -->
<!-- Project-specific Oracle dialect (see BlobOracleDialect). -->
<property name="dialect">
com.sgcc.gridDispa.utils.BlobOracleDialect
</property>
<!-- <property name="hbm2ddl.auto">update</property> -->
<property name="hibernate.jdbc.batch_size">50</property>
<!-- Connection count setting (disabled) -->
<!-- <property name="connection.pool_size">60</property> -->
<property name="show_sql">false</property>
<property name="format_sql">false</property>
<!-- getCurrentSession() returns a session bound to the calling thread. -->
<property name="current_session_context_class">thread</property>
<!-- Release the connection after the transaction is committed -->
<property name="connection.release_mode">
after_transaction
</property>
<!-- Close the session after the transaction is committed -->
<property name="transaction.auto_close_session">true</property>
<!-- Statistics -->
<property name="hibernate.generate_statistics">true</property>
<!-- Proxool connection pool: reuse the existing pool registered under the alias "proxoolpool" (existing_pool=true). -->
<property name="hibernate.proxool.pool_alias">proxoolpool</property>
<property name="hibernate.proxool.xml">proxool.xml</property>
<property name="hibernate.connection.provider_class">
org.hibernate.connection.ProxoolConnectionProvider
</property>
<property name="hibernate.proxool.existing_pool">true</property>
<!-- Registration of the mapping files -->
<mapping resource="com/sgcc/gridDispa/po/LoadsHistory.hbm.xml" />
<mapping resource="com/sgcc/gridDispa/po/LoadsToday.hbm.xml" />
<mapping resource="com/sgcc/gridDispa/po/TgridLoads.hbm.xml" />
<mapping resource="com/sgcc/gridDispa/po/LoadsRealTime.hbm.xml" />
</session-factory>
</hibernate-configuration>
LoadsRealTimeTask
package com.sgcc.gridDispa;
import java.util.TimerTask;
import com.sgcc.gridDispa.impl.LoadsHistoryImpl;
import com.sgcc.gridDispa.impl.LoadsRealTimeImpl;
/**
 * Scheduled entry point for the real-time load crawler.
 *
 * <p>Each invocation of {@link #run()} hands the shared {@link LoadsRealTimeImpl}
 * instance to a freshly created thread, so the caller returns immediately while the
 * crawl proceeds in the background.
 */
public class LoadsRealTimeTask extends TimerTask {

    /** Crawler job reused by every scheduled invocation. */
    LoadsRealTimeImpl tsk = new LoadsRealTimeImpl();

    /** Starts the crawler on a new thread; any failure while starting it is printed to stderr. */
    @Override
    public void run() {
        try {
            Thread worker = new Thread(tsk);
            worker.start();
        } catch (Exception startFailure) {
            startFailure.printStackTrace();
        }
    }
}
utils文件夹下
WebDriverUtil.java
package com.sgcc.gridDispa.utils;
import java.util.concurrent.TimeUnit;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.Platform;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.remote.CapabilityType;
/**
 * Factory helper for Selenium {@link WebDriver} instances used by the crawler.
 *
 * @Description:crawler
 * @Author: old
 * @CreateTime:2017-11-15 :15:16:16
 */
public class WebDriverUtil {
    /**
     * Creates a Chrome-backed {@link WebDriver} with a 1200 second page-load timeout
     * and a 1024x768 window.
     *
     * <p>The caller owns the returned driver and must call {@link WebDriver#quit()}
     * on it. If the driver starts but cannot be configured, it is quit here before
     * the failure is rethrown, so no browser process is left behind.
     *
     * @param path file system path of the chromedriver executable; it is published
     *             through the {@code webdriver.chrome.driver} system property
     * @return a configured driver
     * @throws Exception if {@code path} is {@code null} or empty
     */
    public static WebDriver createChromeWebDriver(String path) throws Exception {
        if (path == null || "".equals(path)) {
            throw new Exception("配置错误, 没有配置:chrome path");
        }
        System.setProperty("webdriver.chrome.driver", path);
        WebDriver webDriver = new ChromeDriver();
        try {
            webDriver.manage().timeouts().pageLoadTimeout(1200, TimeUnit.SECONDS);
            webDriver.manage().window().setSize(new Dimension(1024, 768));
        } catch (RuntimeException configFailure) {
            // The browser is already running at this point; shut it down so the
            // process does not leak, then report the original problem.
            try {
                webDriver.quit();
            } catch (RuntimeException quitFailure) {
                configFailure.addSuppressed(quitFailure);
            }
            throw configFailure;
        }
        return webDriver;
    }
}
LogWriter.java
package com.sgcc.gridDispa.utils;
import org.apache.log4j.Logger;
/**
 * Thin static facade over a single log4j {@link Logger}, plus a helper that renders a
 * throwable's stack trace as text.
 */
public class LogWriter {

    private static Logger logger = Logger.getLogger(LogWriter.class);

    /** Logs {@code obj} at ERROR level. */
    public static void error(Object obj) {
        logger.error(obj);
    }

    /** Logs {@code message} at ERROR level together with the throwable {@code obj}. */
    public static void error(Object message, Throwable obj) {
        logger.error(message, obj);
    }

    /** Logs {@code obj} at INFO level. */
    public static void info(Object obj) {
        logger.info(obj);
    }

    /**
     * Renders a throwable as text: its {@code toString()} on the first line, followed
     * by one tab-indented {@code at ...} line per stack frame. Causes are not included.
     *
     * @param e the throwable to render
     * @return the rendered text, each line terminated by the platform line separator
     */
    public static String getError(Throwable e) {
        final String newline = System.getProperty("line.separator");
        StringBuilder text = new StringBuilder();
        text.append(e.toString()).append(newline);
        for (StackTraceElement frame : e.getStackTrace()) {
            text.append("\tat ").append(frame).append(newline);
        }
        return text.toString();
    }
}
JDBCUtil.java
package com.sgcc.gridDispa.utils;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.cfg.Configuration;
import org.hibernate.stat.SessionStatistics;
import org.hibernate.stat.Statistics;
/**
* 获得hibernate session对象
* @author kuang
*
*/
public final class JDBCUtil {
private static SessionFactory sessionFactory ;
private JDBCUtil(){}
static{
try{
sessionFactory = new Configuration().configure("hibernate.cfg.xml").buildSessionFactory();
}catch(Exception e){
e.printStackTrace();
LogWriter.error(e);
}
}
public static Session getThreadSession(){
return sessionFactory.getCurrentSession();
}
public static Session noOpen(){
return getThreadSession();
}
public static Session open(){
getThreadSession().beginTransaction();
return getThreadSession();
}
public static void commit(){
getThreadSession().getTransaction().commit();
getThreadSession().close();
}
public static void close(){
if(getThreadSession()!=null)
getThreadSession().close();
}
public static void getStatistics(){
SessionStatistics ss = getThreadSession().getStatistics();
LogWriter.info("SessionStatistics:"+ss);
Statistics st = sessionFactory.getStatistics();
LogWriter.info("Statistics:"+st);
}
public static void rollback(){
getThreadSession().getTransaction().rollback();
}
}
BlobOracleDialect.java
package com.sgcc.gridDispa.utils;
import java.sql.Types;
import org.hibernate.Hibernate;
import org.hibernate.dialect.OracleDialect;
/**
 * Oracle dialect that additionally maps the JDBC type {@link Types#LONGVARBINARY}
 * to Hibernate's BLOB type.
 */
public class BlobOracleDialect extends OracleDialect {

    /** Registers the extra type mapping on top of the standard Oracle dialect. */
    public BlobOracleDialect() {
        final String blobTypeName = Hibernate.BLOB.getName();
        registerHibernateType(Types.LONGVARBINARY, blobTypeName);
    }
}
BasicDaoImpl.java
package com.sgcc.gridDispa.utils;
import java.util.List;
import org.hibernate.HibernateException;
import org.hibernate.Query;
import org.hibernate.SQLQuery;
import org.hibernate.Session;
import org.hibernate.transform.Transformers;
//obj的属性名
public class BasicDaoImpl<T> {
/*
* 保存实体对象
*/
public void saveOrUpdate(T t) {
try {
JDBCUtil.open().save(t);
JDBCUtil.commit();
} catch (HibernateException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
JDBCUtil.close();
}
}
/*
* 查询sql,返回list
*/
public List queryListBySql(String sql){
try {
Session session =JDBCUtil.open();
SQLQuery sqlQuery = session.createSQLQuery(sql);
List result = sqlQuery.list();
JDBCUtil.commit();
return result;
} catch (HibernateException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
JDBCUtil.close();
}
return null;
}
/*
* 查询sql,返回list Map
*/
public List queryListMapBySql(String sql){
try {
Session session =JDBCUtil.open();
SQLQuery sqlQuery = session.createSQLQuery(sql);
Query query =sqlQuery.setResultTransformer(Transformers.ALIAS_TO_ENTITY_MAP);
List result = sqlQuery.list();
JDBCUtil.commit();
return result;
} catch (HibernateException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
JDBCUtil.close();
}
return null;
}
/**
* 对获取到的气象数据进行过滤,对无效、null进行处理
* @return 过滤后的数据
*/
protected String filterMothed(String object) {
if(object.contains("9999")){
return "";
}
if(object.equals("null")){
return "";
}
if(object==null){
return "";
}
return object.trim();
}
}
impl文件夹下:
LoadsRealTimeImpl.java
package com.sgcc.gridDispa.impl;
import java.math.BigDecimal;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import com.sgcc.gridDispa.po.LoadsHistory;
import com.sgcc.gridDispa.po.LoadsRealTime;
import com.sgcc.gridDispa.utils.BasicDaoImpl;
import com.sgcc.gridDispa.utils.WebDriverUtil;
/**
 * Crawler job that opens the real-time load page in Chrome, parses the text of the
 * page body and stores one {@link LoadsRealTime} row per data line.
 */
public class LoadsRealTimeImpl extends BasicDaoImpl implements Runnable {

    /** Location of the chromedriver executable on the host running the crawler. */
    private static final String CHROME_DRIVER_PATH = "D:\\chrome\\Chrome\\Application\\chromedriver.exe";

    /** Page that shows the real-time loads. */
    private static final String LOAD_PAGE_URL =
            "http://10.19.13.50:8080//MWWebSite//PROJECT-HOME//exchange//YYJC//AJBZHDPSJ.jsp";

    /** Fixed wait after navigation before the page body is read. */
    private static final long PAGE_WAIT_MILLIS = 3000;

    /** Index of the first line of the body text that holds a data row. */
    private static final int FIRST_DATA_LINE = 49;

    /** Index of the last line of the body text that holds a data row. */
    private static final int LAST_DATA_LINE = 85;

    /** Names that are looked up as regional grids (dwjb='2') instead of companies (dwjb='3'). */
    private static final String[] REGIONAL_GRID_NAMES = {
        "华北电网", "华东电网", "华中电网", "东北电网", "西北电网", "西南电网"
    };

    private Logger logger = Logger.getLogger(LoadsRealTimeImpl.class);

    /** Logs the start of the job and runs {@link #saveAllRealTimeLoad()}. */
    @Override
    public void run() {
        logger.info("=======各省实时负荷数据【定时任务】===============");
        saveAllRealTimeLoad();
    }

    /**
     * Scrapes the real-time load page and writes each data row to the database.
     *
     * <p>A row that cannot be parsed is logged and skipped so that the remaining rows
     * are still stored. The browser is always shut down before this method returns.
     */
    public synchronized void saveAllRealTimeLoad() {
        WebDriver webDriver = null;
        try {
            webDriver = WebDriverUtil.createChromeWebDriver(CHROME_DRIVER_PATH);
            webDriver.get(LOAD_PAGE_URL);
            Thread.sleep(PAGE_WAIT_MILLIS);
            System.out.println(webDriver.getTitle());
            System.out.println(webDriver.getPageSource());
            WebElement webBody = webDriver.findElement(By.xpath("//body"));
            String[] bodyLines = webBody.getText().split("\n");
            SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            int lastLine = Math.min(LAST_DATA_LINE, bodyLines.length - 1);
            for (int i = FIRST_DATA_LINE; i <= lastLine; i++) {
                try {
                    this.saveOrUpdate(parseRow(bodyLines[i], format));
                } catch (RuntimeException | java.text.ParseException e) {
                    // One malformed line must not abort the rest of the import.
                    logger.error("Skipping real-time load line " + i + ": " + bodyLines[i], e);
                }
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for the real-time load page", e);
        } catch (Exception e) {
            logger.error("Failed to import real-time loads", e);
        } finally {
            if (webDriver != null) {
                webDriver.quit();
            }
        }
    }

    /**
     * Converts one line of the page body into an entity.
     *
     * <p>Runs of two or more whitespace characters separate the columns: column 1 is
     * the grid or company name, column 2 the load value and column 3 holds the
     * timestamp after its first single quote.
     *
     * @param line   one data line of the page body
     * @param format parser for the {@code yyyy-MM-dd HH:mm:ss} timestamp
     * @return the populated entity
     * @throws java.text.ParseException if the timestamp cannot be parsed
     * @throws IllegalArgumentException if the line has fewer than four columns
     */
    private LoadsRealTime parseRow(String line, SimpleDateFormat format) throws java.text.ParseException {
        String[] columns = line.replaceAll("\\s{2,}", ",").trim().split(",");
        if (columns.length < 4) {
            throw new IllegalArgumentException("Expected at least 4 columns but found " + columns.length);
        }
        String deptName = columns[1];
        LoadsRealTime loadRealTime = new LoadsRealTime();
        loadRealTime.setDeptName(deptName);
        String deptId = resolveDeptId(deptName);
        if (deptId != null) {
            loadRealTime.setDeptId(deptId);
        }
        // NOTE(review): the page value is divided by 10 before rounding; presumably a unit conversion - confirm.
        float realtimeLoad = Float.parseFloat(columns[2]) / 10;
        loadRealTime.setRealtimeLoad(Math.round(realtimeLoad) + "");
        int quote = columns[3].indexOf("'");
        String rkTime = columns[3].substring(quote + 1, quote + 20);
        loadRealTime.setRkTie(format.parse(rkTime));
        return loadRealTime;
    }

    /**
     * Looks up the id stored for a grid or company name.
     *
     * @return the COMPANY_ID of the last matching row, or {@code null} when the query
     *         failed or matched nothing
     */
    private String resolveDeptId(String deptName) {
        List<Map> result = isRegionalGrid(deptName)
                ? queryAreaIdByname(deptName)
                : queryCompanyIdByname(deptName);
        if (result == null) {
            // The DAO returns null when the query itself failed.
            return null;
        }
        String deptId = null;
        for (Map row : result) {
            Object id = row.get("COMPANY_ID");
            if (id != null) {
                deptId = id.toString();
            }
        }
        return deptId;
    }

    /** Tells whether {@code deptName} is one of the regional grid names. */
    private boolean isRegionalGrid(String deptName) {
        for (String grid : REGIONAL_GRID_NAMES) {
            if (grid.equals(deptName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Queries company ids (dwjb='3') whose name contains {@code companyName}.
     */
    private List queryCompanyIdByname(String companyName) {
        String sql = "select y.company_id from t_company y where y.dwjb='3' ";
        if (companyName != null && !companyName.equals("")) {
            sql += "and y.company_name like '%" + escapeSqlLiteral(companyName) + "%' ";
        }
        return this.queryListMapBySql(sql);
    }

    /**
     * Queries regional grid ids (dwjb='2') whose name contains {@code companyName}
     * with the "电网" suffix removed.
     */
    private List queryAreaIdByname(String companyName) {
        String sql = "select y.company_id from t_company y where y.dwjb='2' ";
        if (companyName != null && !companyName.equals("")) {
            companyName = companyName.replace("电网", "");
            sql += "and y.company_name like '%" + escapeSqlLiteral(companyName) + "%' ";
        }
        return this.queryListMapBySql(sql);
    }

    /**
     * Doubles single quotes so that text scraped from the page cannot end the SQL
     * string literal it is embedded in. LIKE wildcards are left untouched.
     */
    private static String escapeSqlLiteral(String value) {
        return value.replace("'", "''");
    }
}
四、页面中数据
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。