开发工具:Eclipse、JDK、Maven、VMware Workstation
web环境:Tomcat、Spring、Spring MVC、MyBatis、Echarts
(1)Linux 系统虚拟机的安装与克隆
(2)配置虚拟机网络与 SSH 服务
(3)搭建 Hadoop 集群
(4)安装 MySQL 数据库
(5)安装 Hive
(6)安装 Sqoop
(3)将爬取数据存储到 HDFS
(3)实现 MapReduce 预处理程序进行数据集成和数据转换操作
(4)实现 MapReduce 预处理程序的两种运行模式
(2)通过 HSQL 进行职位区域分析
(3)通过 HSQL 进行职位薪资分析
(4)通过 HSQL 进行公司福利标签分析
(5)通过 HSQL 进行技能标签分析
(2)通过 Sqoop 实现数据迁移
(3)创建 Maven 项目配置项目依赖的信息
(4)编辑配置文件整合 SSM 框架
- #编辑网络
- vi /etc/sysconfig/network-scripts/ifcfg-ens33
- #重启
- service network restart
- #配置ip和主机名映射
- vi /etc/hosts
- #查看SSH服务
- rpm -qa | grep ssh
- #SSH安装命令
- yum -y install openssh openssh-server
- #查看SSH进程
- ps -ef | grep ssh
- #生成密钥对
- ssh-keygen -t rsa
- #复制公钥文件
- ssh-copy-id 主机名
- 1.安装rz,通过rz命令上传安装包
- yum install lrzsz
- 2.解压
- tar -zxvf jdk-8u181-linux-x64.tar.gz -C /usr/local
- 3.修改名字
- mv jdk1.8.0_181/ jdk
- 4.配置环境变量
- vi /etc/profile
- export JAVA_HOME=/usr/local/jdk
- export PATH=$PATH:$JAVA_HOME/bin
- 5.初始化环境变量
- source /etc/profile
- 6.验证配置
- java -version
- 1.通过rz命令上传安装包
- 2.解压
- tar -zxvf hadoop-2.7.1.tar.gz -C /usr/local
- 3.修改名字(在 /usr/local 目录下执行)
- mv hadoop-2.7.1/ hadoop
- 4.配置环境变量
- vi /etc/profile
- export HADOOP_HOME=/usr/local/hadoop
- export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
- 5.初始化环境变量
- source /etc/profile
- 6.验证配置
- hadoop version
- 1.cd hadoop/etc/hadoop
- 2.vi hadoop-env.sh
- export JAVA_HOME=/usr/local/jdk
- 3.vi yarn-env.sh
- #配置JAVA_HOME(记得去掉前面的#注释,注意别找错地方)
- 4.vi core-site.xml
- #配置主进程NameNode运行地址和Hadoop运行时生成数据的临时存放目录
- <configuration>
- <property>
- <name>fs.defaultFS</name>
- <value>hdfs://hadoop1:9000</value>
- </property>
- <property>
- <name>hadoop.tmp.dir</name>
- <value>/usr/local/hadoop/tmp</value>
- </property>
- </configuration>
- 5.vi hdfs-site.xml
- #配置Secondary NameNode节点运行地址和HDFS数据块的副本数量
- <configuration>
- <property>
- <name>dfs.replication</name>
- <value>3</value>
- </property>
- <property>
- <name>dfs.namenode.secondary.http-address</name>
- <value>hadoop2:50090</value>
- </property>
- </configuration>
- 6.cp mapred-site.xml.template mapred-site.xml
- vi mapred-site.xml
- #配置MapReduce程序在YARN上运行
- <configuration>
- <property>
- <name>mapreduce.framework.name</name>
- <value>yarn</value>
- </property>
- </configuration>
- 7.vi yarn-site.xml
- #配置Yarn的主进程ResourceManager管理者及附属服务mapreduce_shuffle
- <configuration>
- <!-- Site specific YARN configuration properties -->
- <property>
- <name>yarn.resourcemanager.hostname</name>
- <value>hadoop1</value>
- </property>
- <property>
- <name>yarn.nodemanager.aux-services</name>
- <value>mapreduce_shuffle</value>
- </property>
- </configuration>
- 8.vi slaves
- hadoop1
- hadoop2
- hadoop3
- 9.scp /etc/profile root@hadoop2:/etc/profile
- scp /etc/profile root@hadoop3:/etc/profile
- scp -r /usr/local/* root@hadoop2:/usr/local/
- scp -r /usr/local/* root@hadoop3:/usr/local/
- 10.记得在hadoop2、hadoop3初始化
- source /etc/profile
- #1.格式化文件系统
- 初次启动HDFS集群时,对主节点进行格式化处理
- hdfs namenode -format
- 或者hadoop namenode -format
- #2.进入hadoop/sbin/
- cd /usr/local/hadoop/sbin/
- #3.主节点上启动HDFSNameNode进程
- hadoop-daemon.sh start namenode
- #4.每个节点上启动HDFSDataNode进程
- hadoop-daemon.sh start datanode
- #5.主节点上启动YARNResourceManager进程
- yarn-daemon.sh start resourcemanager
- #6.每个节点上启动YARN NodeManager进程
- yarn-daemon.sh start nodemanager
- #7.规划节点上启动SecondaryNameNode进程
- hadoop-daemon.sh start secondarynamenode
- #8.jps(5个进程)
- DataNode
- ResourceManager
- NameNode
- NodeManager
- jps
- #安装mariadb
- yum install mariadb-server mariadb
- #启动服务
- systemctl start mariadb
- systemctl enable mariadb
- #切换到mysql数据库
- use mysql;
- #修改root用户密码
- update user set password=PASSWORD('123456') where user = 'root';
- #设置允许远程登录
- grant all privileges on *.* to 'root'@'%'
- identified by '123456' with grant option;
- #更新权限表
- flush privileges;
- #1.解压
- tar -zxvf apache-hive-1.2.2-bin.tar.gz -C /usr/local
- #2.修改名字
- mv apache-hive-1.2.2-bin/ hive
- #3.配置文件
- cd /usr/local/hive/conf
- cp hive-env.sh.template hive-env.sh
- vi hive-env.sh(修改 export HADOOP_HOME=/usr/local/hadoop)
- #4.
- vi hive-site.xml
- <configuration>
- <property>
- <name>javax.jdo.option.ConnectionURL</name>
- <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
- <description>JDBC connect string for a JDBC metastore</description>
- </property>
- <property>
- <name>javax.jdo.option.ConnectionDriverName</name>
- <value>com.mysql.jdbc.Driver</value>
- <description>Driver class name for a JDBC metastore</description>
- </property>
- <property>
- <name>javax.jdo.option.ConnectionUserName</name>
- <value>root</value>
- <description>username to use against metastore database</description>
- </property>
- <property>
- <name>javax.jdo.option.ConnectionPassword</name>
- <value>123456</value>
- <description>password to use against metastore database</description>
- </property>
- </configuration>
- #5.上传mysql驱动包
- cd ../lib
- rz(mysql-connector-java-5.1.40.jar)
- #6.配置环境变量
- vi /etc/profile
- export HIVE_HOME=/usr/local/hive
- export PATH=$PATH:$HIVE_HOME/bin
- source /etc/profile
- #7.启动hive
- cd ../bin/
- ./hive
- #1.解压
- tar -zxvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz -C /usr/local
- #2.修改名字
- mv sqoop-1.4.7.bin__hadoop-2.6.0/ sqoop
- #3.配置
- cd sqoop/conf/
- cp sqoop-env-template.sh sqoop-env.sh
- vi sqoop-env.sh
- 修改
- export HADOOP_COMMON_HOME=/usr/local/hadoop
- export HADOOP_MAPRED_HOME=/usr/local/hadoop
- export HIVE_HOME=/usr/local/hive
- #4.配置环境变量
- vi /etc/profile
- export SQOOP_HOME=/usr/local/sqoop
- export PATH=$PATH:$SQOOP_HOME/bin
- source /etc/profile
- #5.效果测试
- cd ../lib
- rz(mysql-connector-java-5.1.40.jar)#上传jar包到lib目录下
- cd ../bin/
- sqoop list-databases \
- --connect jdbc:mysql://localhost:3306/ \
- --username root --password 123456
- #(sqoop list-databases用于输出连接的本地MySQL数据库中的所有数据库,如果正确返回指定地址的MySQL数据库信息,说明Sqoop配置完毕)
- <dependencies>
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient</artifactId>
- <version>4.5.4</version>
- </dependency>
- <dependency>
- <groupId>jdk.tools</groupId>
- <artifactId>jdk.tools</artifactId>
- <version>1.8</version>
- <scope>system</scope>
- <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
- </dependency>
- </dependencies>
- //HttpClientResp.java
- package com.position.reptile;
- import java.io.Serializable;
- public class HttpClientResp implements Serializable {
- private static final long serialVersionUID = 2963835334380947712L;
- //响应状态码
- private int code;
- //响应内容
- private String content;
- //空参构造
- public HttpClientResp() {
- }
- public HttpClientResp(int code) {
- super();
- this.code = code;
- }
- public HttpClientResp(String content) {
- super();
- this.content = content;
- }
- public HttpClientResp(int code, String content) {
- super();
- this.code = code;
- this.content = content;
- }
- //getter和setter方法
- public int getCode() {
- return code;
- }
- public void setCode(int code) {
- this.code = code;
- }
- public String getContent() {
- return content;
- }
- public void setContent(String content) {
- this.content = content;
- }
- //重写toString方法
- @Override
- public String toString() {
- return "HttpClientResp [code=" + code + ", content=" + content + "]";
- }
- }
- //编码格式
- private static final String ENCODING = "UTF-8";
- //设置连接超时时间,单位毫秒
- private static final int CONNECT_TIMEOUT = 6000;
- //设置响应时间
- private static final int SOCKET_TIMEOUT = 6000;
- // Copies every entry of params onto the request as an HTTP header.
- // A null map is ignored and the request is left unchanged.
- public static void packageHeader(Map<String, String> params, HttpRequestBase httpMethod){
-     if (params == null) {
-         return;
-     }
-     // Each map entry becomes one header: key = header name, value = header value.
-     for (Entry<String, String> header : params.entrySet()) {
-         String headerName = header.getKey();
-         String headerValue = header.getValue();
-         httpMethod.setHeader(headerName, headerValue);
-     }
- }
- // Encodes params as a URL-encoded form body (charset ENCODING) and attaches it to the request.
- // A null map is ignored and no entity is set.
- public static void packageParam(Map<String,String> params,HttpEntityEnclosingRequestBase httpMethod) throws UnsupportedEncodingException {
-     if (params == null) {
-         return;
-     }
-     // Turn each key/value entry into a form name/value pair.
-     List<NameValuePair> formFields = new ArrayList<NameValuePair>();
-     for (Entry<String, String> field : params.entrySet()) {
-         NameValuePair pair = new BasicNameValuePair(field.getKey(), field.getValue());
-         formFields.add(pair);
-     }
-     UrlEncodedFormEntity formBody = new UrlEncodedFormEntity(formFields, ENCODING);
-     httpMethod.setEntity(formBody);
- }
- // Executes httpMethod on httpClient and converts the response into an HttpClientResp.
- //   httpResponse - the incoming value is never read; the parameter is only reused as a local
- //   httpClient   - client used to execute the request (not closed here)
- //   httpMethod   - fully prepared request
- // Returns the status code plus the body decoded with ENCODING ("" when there is no entity),
- // or a status-only SC_INTERNAL_SERVER_ERROR result when no response/status line is available.
- public static HttpClientResp getHttpClientResult(CloseableHttpResponse httpResponse,CloseableHttpClient httpClient,HttpRequestBase httpMethod) throws Exception{
-     httpResponse = httpClient.execute(httpMethod);
-     try {
-         if (httpResponse == null || httpResponse.getStatusLine() == null) {
-             return new HttpClientResp(HttpStatus.SC_INTERNAL_SERVER_ERROR);
-         }
-         String content = "";
-         if (httpResponse.getEntity() != null) {
-             content = EntityUtils.toString(httpResponse.getEntity(), ENCODING);
-         }
-         return new HttpClientResp(httpResponse.getStatusLine().getStatusCode(), content);
-     } finally {
-         // Java passes references by value, so the assignment above is invisible to the caller;
-         // the response obtained here therefore has to be closed here or it is never closed.
-         if (httpResponse != null) {
-             httpResponse.close();
-         }
-     }
- }
- // Sends a form-encoded POST request and returns the status code and body.
- //   url     - target URL
- //   headers - request headers, may be null
- //   params  - form parameters, may be null
- // The HTTP client created here is always closed, including when building the request fails.
- public static HttpClientResp doPost(String url,Map<String,String>headers,Map<String,String>params) throws Exception{
-     CloseableHttpClient httpclient = HttpClients.createDefault();
-     // Stays null in this method: getHttpClientResult only assigns to its own parameter copy.
-     CloseableHttpResponse httpResponse = null;
-     try {
-         // Request construction happens inside the try block so that a bad URL or an
-         // encoding failure cannot leak the client created above.
-         HttpPost httppost = new HttpPost(url);
-         // Connection and socket timeouts
-         RequestConfig requestConfig = RequestConfig.custom()
-             .setConnectTimeout(CONNECT_TIMEOUT)
-             .setSocketTimeout(SOCKET_TIMEOUT)
-             .build();
-         httppost.setConfig(requestConfig);
-         // Request headers
-         packageHeader(headers,httppost);
-         // Form parameters
-         packageParam(params,httppost);
-         return getHttpClientResult(httpResponse,httpclient,httppost);
-     } finally {
-         // Release resources
-         release(httpResponse,httpclient);
-     }
- }
- // Closes the response and then the client; either argument may be null.
- // The client is closed even when closing the response throws.
- private static void release(CloseableHttpResponse httpResponse,CloseableHttpClient httpClient) throws IOException{
-     try {
-         if (httpResponse != null) {
-             httpResponse.close();
-         }
-     } finally {
-         if (httpClient != null) {
-             httpClient.close();
-         }
-     }
- }
(1)在pom.xml文件中添加hadoop的依赖,用于调用HDFS API
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>2.7.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <version>2.7.1</version>
- </dependency>
- public class HttpClientHdfsUtils {
- public static void createFileBySysTime(String url,String fileName,String data) {
- System.setProperty("HADOOP_USER_NAME", "root");
- Path path = null;
- //读取系统时间
- Calendar calendar = Calendar.getInstance();
- Date time = calendar.getTime();
- //格式化系统时间
- SimpleDateFormat format = new SimpleDateFormat("yyyMMdd");
- //获取系统当前时间,将其转换为String类型
- String filepath = format.format(time);
- //构造Configuration对象,配置hadoop参数
- Configuration conf = new Configuration();
- URI uri= URI.create(url);
- FileSystem fileSystem;
- try {
- //获取文件系统对象
- fileSystem = FileSystem.get(uri,conf);
- //定义文件路径
- path = new Path("/JobData/"+filepath);
- if(!fileSystem.exists(path)) {
- fileSystem.mkdirs(path);
- }
- //在指定目录下创建文件
- FSDataOutputStream fsDataOutputStream = fileSystem.create(new Path(path.toString()+"/"+fileName));
- //向文件中写入数据
- IOUtils.copyBytes(new ByteArrayInputStream(data.getBytes()),fsDataOutputStream,conf,true);
- fileSystem.close();
- }catch(IOException e) {
- e.printStackTrace();
- }
- }
- }
- // Crawler entry point: requests result pages 1..30 of the job search and stores each
- // response in HDFS as page<N>.
- public class HttpClientData {
-     public static void main(String[] args) throws Exception {
-         // Request headers
-         Map<String,String>headers = new HashMap<String,String>();
-         // Session cookie captured from a browser. The literal is split into two concatenated
-         // parts because a Java string literal cannot contain a line break.
-         headers.put("Cookie","privacyPolicyPopup=false; user_trace_token=20221103113731-d2950fcd-eb36-486c-9032-feab09943d4d; LGUID=20221103113731-ef107f32-06e0-4453-a89c-683f5a558e86; _ga=GA1.2.11435994.1667446652; RECOMMEND_TIP=true; index_location_city=%E5%85%A8%E5%9B%BD; __lg_stoken__=a5abb0b1f9cda5e7a6da82dd7a4397075c675acce324397a86b9cbbd4fc31a58d921346f317ba5c8c92b5c4a9ebb0650576575b67ebae44f422aeb4b1a950643cd2854eece70; JSESSIONID=ABAAAECABIEACCAC2031D7A104C1E74CDC3FABFA00BCC7F; WEBTJ-ID=20221105161123-18446d82e00bcd-0f0b3aafbd8e8e-26021a51-921600-18446d82e018bf; _gid=GA1.2.1865104541.1667635884; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1667446652,1667456559,1667635885; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5F%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D%3FlabelWords%3Dhot; LGSID=20221105161124-df5ffe02-aefa-434b-b378-2d64367fddde; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fcommon-sec%2Fsecurity-check.html%3Fseed%3D5E87A87B3DA4AFE2BC190FBB560FB9266A5615D5937A536A0FA5205B13CAC74F0D0C1CC5AF1D2DD0C0060C9AF3B36CA5%26ts%3D16676358793441%26name%3Da5abb0b1f9cd%26callbackUrl%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%5F%2525E5%2525A4%2525A7%2525E6%252595%2525B0%2525E6%25258D%2525AE%253FlabelWords%253D%2526fromSearch%253Dtrue%2526suginput%253D%253FlabelWords%253Dhot%26srcReferer%3D; _gat=1; X_MIDDLE_TOKEN=668d4b4d5ba925cb7156e2d72086c745; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; TG-TRACK-CODE=index_search; "
-             +"sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221843b917f5d1b4-025994c92cf438-26021a51-921600-1843b917f5e3e5%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22103.0.0.0%22%2C%22%24latest_referrer_host%22%3A%22%22%7D%2C%22%24device_id%22%3A%221843b917f5d1b4-025994c92cf438-26021a51-921600-1843b917f5e3e5%22%7D; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1667636243; LGRID=20221105161724-fad126be-48da-4684-aa52-1ff6cfb2dffd; SEARCH_ID=535076fc2a094fa2913263e0079a9038; X_HTTP_TOKEN=a18b9f65c1cbf1490626367661a3afc88e7340da5d");
-         headers.put("Connection","keep-alive");
-         headers.put("Accept","application/json, text/javascript, */*; q=0.01");
-         headers.put("Accept-Language","zh-CN,zh;q=0.9");
-         // The product tokens are now separated by spaces (the original concatenation ran them
-         // together). NOTE(review): the Chrome version after "Chrome/" is blank in the original.
-         headers.put("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) "+"AppleWebKit/537.36 (KHTML, like Gecko) "+"Chrome/ Safari/537.36");
-         headers.put("content-type","application/x-www-form-urlencoded; charset=UTF-8");
-         headers.put("Referer", "https://www.lagou.com/jobs/list_%E5%A4%A7%E6%95%B0%E6%8D%AE?labelWords=&fromSearch=true&suginput=?labelWords=hot");
-         headers.put("Origin", "https://www.lagou.com");
-         headers.put("x-requested-with","XMLHttpRequest");
-         headers.put("x-anit-forge-token","None");
-         headers.put("x-anit-forge-code","0");
-         headers.put("Host","www.lagou.com");
-         headers.put("Cache-Control","no-cache");
-         // Form parameters: search keyword and city; "pn" (page number) is set per request.
-         Map<String,String>params = new HashMap<String,String>();
-         params.put("kd","大数据");
-         params.put("city","全国");
-         String searchUrl = "https://www.lagou.com/jobs/positionAjax.json?"+"needAddtionalResult=false";
-         // One request per page. The original had an extra loop that only overwrote "pn"
-         // thirty times without sending anything; it had no effect and is removed.
-         for (int i=1;i<31;i++){
-             params.put("pn",String.valueOf(i));
-             HttpClientResp result = HttpClientUtils.doPost(searchUrl,headers,params);
-             HttpClientHdfsUtils.createFileBySysTime("hdfs://hadoop1:9000","page"+i,result.toString());
-             // Pause 500 ms between requests
-             Thread.sleep(500);
-         }
-     }
- }
- <dependencies>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>2.7.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <version>2.7.1</version>
- </dependency>
- </dependencies>
- // Returns a copy of str with every occurrence of delChar removed.
- public static String deleteString(String str,char delChar) {
-     StringBuilder kept = new StringBuilder();
-     for (char current : str.toCharArray()) {
-         // Skip the character to delete; keep everything else in order.
-         if (current == delChar) {
-             continue;
-         }
-         kept.append(current);
-     }
-     return kept.toString();
- }
- // Merges the company label list and the free-text position advantage into one
- // '-'-separated tag string.
- //   position - free-text advantages; split on spaces, ';', ',', '/' and their full-width
- //              forms, then stripped of remaining punctuation
- //   company  - JSON array of company labels, appended first in their original order
- // Returns the tags joined by '-', or "null" when there is no tag at all.
- public static String mergeString(String position,JSONArray company) throws JSONException {
-     StringBuilder merged = new StringBuilder();
-     for (int i = 0; i < company.length(); i++) {
-         merged.append(company.get(i)).append('-');
-     }
-     // Compare by content: the original "position != \"\"" compared object references.
-     if (position != null && !position.isEmpty()) {
-         // The original pattern began with an empty alternative, which matches at every
-         // position and therefore split the text into single characters. Split on runs
-         // of real separator characters instead.
-         String[] positionList = position.split("[ ;;,,、/]+");
-         for (String part : positionList) {
-             String tag = part.replaceAll("[\\pP\\p{Punct}]", "");
-             // Drop pieces that consisted only of punctuation, avoiding empty "--" tags.
-             if (!tag.isEmpty()) {
-                 merged.append(tag).append('-');
-             }
-         }
-     }
-     if (merged.length() == 0) {
-         // Nothing to merge: the original threw StringIndexOutOfBoundsException from
-         // substring(0,-1) here. "null" is the placeholder killResult uses for an empty list.
-         return "null";
-     }
-     // Remove the trailing '-'
-     return merged.substring(0, merged.length() - 1);
- }
- // Joins the entries of the skill label array with '-'; an empty array yields "null".
- public static String killResult(JSONArray killData) throws JSONException {
-     if (killData.length() == 0) {
-         return "null";
-     }
-     StringBuilder joined = new StringBuilder();
-     for (int idx = 0; idx < killData.length(); idx++) {
-         joined.append(killData.get(idx)).append("-");
-     }
-     // Remove the trailing '-'
-     return joined.substring(0, joined.length() - 1);
- }
- //数据清洗结果
- public static String resultToString(JSONArray jobdata) throws JSONException {
- String jobResultData="";
- for(int i=0;i<jobdata.length();i++) {
- String everyData = jobdata.get(i).toString();
- JSONObject everyDataJson=new JSONObject(everyData);
- String city = everyDataJson.getString("city");
- String salary = everyDataJson.getString("salary");
- String positionAdvantage = everyDataJson.getString("positionAdvantage");
- JSONArray companyLabelList = everyDataJson.getJSONArray("companyLabelList");
- JSONArray skillLables = everyDataJson.getJSONArray("skillLables");
- //处理薪资字段数据
- String salaryNew = deleteString(salary,'k');
- String welfare = mergeString(positionAdvantage,companyLabelList);
- String kill = killResult(skillLables);
- if(i == jobdata.length() -1) {
- jobResultData = jobResultData+city+","+salaryNew+","+welfare+","+kill;
- }else {
- jobResultData = jobResultData+city+","+salaryNew+","+welfare+","+kill+"\n";
- }
- }
- return jobResultData;
- }
- }
- //CleanMapper类继承Mapper基类,并定义Map程序输入和输出的key和value
- public class CleanMapper extends Mapper<LongWritable,Text,Text,NullWritable>{
- //map()方法对输入的键值对进行处理
- protected void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException {
- String jobResultData="";
- String reptileData = value.toString();
- //通过截取字符串方式获取content中的数据
- String jobData = reptileData.substring(reptileData.indexOf("=",reptileData.indexOf("=")+1)+1,
- reptileData.length()-1
- );
- try {
- //获取content中的数据内容
- JSONObject contentJson = new JSONObject(jobData);
- String contentData = contentJson.getString("content");
- //获取content下positionResult中的数据内容
- JSONObject positionResultJson = new JSONObject(contentData);
- String positionResultData = positionResultJson.getString("positionResult");
- //获取最终result中的数据内容
- JSONObject resultJson = new JSONObject(positionResultData);
- JSONArray resultData = resultJson.getJSONArray("result");
- jobResultData = CleanJob.resultToString(resultData);
- context.write(new Text(jobResultData), NullWritable.get());
- } catch (JSONException e) {
- e.printStackTrace();
- }
- }
- }
- // Local-mode driver for the cleaning job.
- // Optional arguments: args[0] = input path, args[1] = output path. When omitted, the
- // paths hard-coded in the original version are used.
- public class CleanMain {
-     public static void main(String[] args) throws IOException,ClassNotFoundException,InterruptedException {
-         // Log to the console
-         BasicConfigurator.configure();
-         // Hadoop configuration
-         Configuration conf = new Configuration();
-         // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
-         Job job = Job.getInstance(conf,"job");
-         // Main class
-         job.setJarByClass(CleanMain.class);
-         // Mapper class
-         job.setMapperClass(CleanMapper.class);
-         // Output key/value types
-         job.setOutputKeyClass(Text.class);
-         job.setOutputValueClass(NullWritable.class);
-         // Input and output paths, falling back to the original defaults
-         String input = args.length > 0 ? args[0] : "hdfs://hadoop1:9000/JobData/20221105";
-         String output = args.length > 1 ? args[1] : "D:\\BigData\\out";
-         FileInputFormat.addInputPath(job, new Path(input));
-         FileOutputFormat.setOutputPath(job,new Path(output));
-         System.exit(job.waitForCompletion(true)?0:1);
-     }
- }
- package com.position.clean;
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapred.lib.CombineTextInputFormat;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- import org.apache.log4j.BasicConfigurator;
- // Cluster-mode driver for the cleaning job: hadoop jar <jar> CleanMain <in> <out>
- public class CleanMain {
-     public static void main(String[] args) throws IOException,ClassNotFoundException,InterruptedException {
-         // Log to the console
-         BasicConfigurator.configure();
-         // Hadoop configuration
-         Configuration conf = new Configuration();
-         // Strip the generic Hadoop options; what remains must be the input and output paths.
-         String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
-         if(otherArgs.length != 2) {
-             System.err.println("Usage: CleanMain <in> <out>");
-             System.exit(2);
-         }
-         // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
-         Job job = Job.getInstance(conf,"job");
-         // Main class
-         job.setJarByClass(CleanMain.class);
-         // Mapper class
-         job.setMapperClass(CleanMapper.class);
-         // Combine small files into larger splits. The class is written fully qualified on
-         // purpose: the file imports org.apache.hadoop.mapred.lib.CombineTextInputFormat,
-         // the old-API class, which does not provide a usable record reader for a
-         // new-API (mapreduce) Job.
-         job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat.class);
-         // Minimum combined split size: 2 MB
-         FileInputFormat.setMinInputSplitSize(job, 2097152);
-         // Maximum combined split size: 4 MB
-         FileInputFormat.setMaxInputSplitSize(job, 4194304);
-         // Output key/value types
-         job.setOutputKeyClass(Text.class);
-         job.setOutputValueClass(NullWritable.class);
-         // Input path
-         FileInputFormat.addInputPath(job,new Path(otherArgs[0]));
-         // Output path
-         FileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));
-         System.exit(job.waitForCompletion(true)?0:1);
-     }
- }
字段 | 数据类型 | 描述 |
city | String | 城市 |
salary | array<String> | 薪资 |
company | array<String> | 福利标签 |
kill | array<String> | 技能标签 |
字段 | 数据类型 | 描述 |
salary | String | 薪资分布区间 |
count | int | 区间内出现薪资的频次 |
字段 | 数据类型 | 描述 |
company | String | 每个福利标签 |
count | int | 每个福利标签的频次 |
字段 | 数据类型 | 描述 |
city | String | 城市 |
count | int | 城市频次 |
字段 | 数据类型 | 描述 |
kill | String | 每个标签技能 |
count | int | 每个标签技能的频次 |
- --创建数据仓库 jobdata
- create database jobdata;
- use jobdata;
- --创建事实表 ods_jobdata_origin
- create table ods_jobdata_origin(
- city string comment '城市',
- salary array<string> comment '薪资',
- company array<string> comment '福利',
- kill array<string> comment '技能')
- comment '原始职位数据表'
- row format delimited fields terminated by ','
- collection items terminated by '-'
- stored as textfile;
- --加载数据
- load data inpath '/JobData/output/part-r-00000' overwrite into table ods_jobdata_origin;
- --查询数据
- select * from ods_jobdata_origin;
- create table ods_jobdata_detail(
- city string comment '城市',
- salary array<string> comment '薪资',
- company array<string> comment '福利',
- kill array<string> comment '技能',
- low_salary int comment '低薪资',
- high_salary int comment '高薪资',
- avg_salary double comment '平均薪资')
- comment '职位数据明细表'
- row format delimited fields terminated by ','
- collection items terminated by '-'
- stored as textfile;
- insert overwrite table ods_jobdata_detail
- select city,salary,company,kill,salary[0],salary[1],(salary[0]+salary[1])/2
- from ods_jobdata_origin;
create table t_ods_tmp_salary as select explode(ojo.salary) from ods_jobdata_origin ojo;
- create table t_ods_tmp_salary_dist as select case
- when col>=0 and col<=5 then "0-5"
- when col>=6 and col<=10 then "6-10"
- when col>=11 and col<=15 then "11-15"
- when col>=16 and col<=20 then "16-20"
- when col>=21 and col<=25 then "21-25"
- when col>=26 and col<=30 then "26-30"
- when col>=31 and col<=35 then "31-35"
- when col>=36 and col<=40 then "36-40"
- when col>=41 and col<=45 then "41-45"
- when col>=46 and col<=50 then "46-50"
- when col>=51 and col<=55 then "51-55"
- when col>=56 and col<=60 then "56-60"
- when col>=61 and col<=65 then "61-65"
- when col>=66 and col<=70 then "66-70"
- when col>=71 and col<=75 then "71-75"
- when col>=76 and col<=80 then "76-80"
- when col>=81 and col<=85 then "81-85"
- when col>=86 and col<=90 then "86-90"
- when col>=91 and col<=95 then "91-95"
- when col>=96 and col<=100 then "96-100"
- when col>=101 then ">101" end from t_ods_tmp_salary;
create table t_ods_tmp_company as select explode(ojo.company) from ods_jobdata_origin ojo;
create table t_ods_tmp_kill as select explode(ojo.kill) from ods_jobdata_origin ojo;
- create table t_ods_kill(
- every_kill string comment '技能标签',
- count int comment '词频')
- comment '技能标签词频统计'
- row format delimited fields terminated by ','
- stored as textfile;
- create table t_ods_company(
- every_company string comment '福利标签',
- count int comment '词频')
- comment '福利标签词频统计'
- row format delimited fields terminated by ','
- stored as textfile;
- create table t_ods_salary(
- every_partition string comment '薪资分布',
- count int comment '聚合统计')
- comment '薪资分布聚合统计'
- row format delimited fields terminated by ','
- stored as textfile;
- create table t_ods_city(
- every_city string comment '城市',
- count int comment '词频')
- comment '城市统计'
- row format delimited fields terminated by ','
- stored as textfile;
- --职位区域分析
- insert overwrite table t_ods_city
- select city,count(1) from ods_jobdata_origin group by city;
- --倒序查询职位区域的信息(order by 保证全局有序)
- select * from t_ods_city order by count desc;
- --职位薪资分析
- insert overwrite table t_ods_salary
- select `_c0`,count(1) from t_ods_tmp_salary_dist group by `_c0`;
- --查看维度表t_ods_salary中的分析结果,使用order by对表中的count列进行全局倒序排序
- select * from t_ods_salary order by count desc;
- --平均值
- select avg(avg_salary) from ods_jobdata_detail;
- --众数
- select avg_salary,count(1) as cnt from ods_jobdata_detail group by avg_salary order by cnt desc limit 1;
- --中位数
- select percentile(cast(avg_salary as bigint),0.5) from ods_jobdata_detail;
- --公司福利分析
- insert overwrite table t_ods_company
- select col,count(1) from t_ods_tmp_company group by col;
- --查询维度表中的分析结果,按count全局倒序查询前10个
- select every_company,count from t_ods_company order by count desc limit 10;
- --职位技能要求分析
- insert overwrite table t_ods_kill
- select col,count(1) from t_ods_tmp_kill group by col;
- --查看技能维度表中的分析结果,倒序查看前3个
- select every_kill,count from t_ods_kill order by count desc limit 3;
- -- 创建数据库JobData
- CREATE DATABASE JobData CHARACTER SET utf8 COLLATE utf8_general_ci;
- USE JobData;
- -- 创建城市分布表
- create table t_city_count(
- city VARCHAR(30) DEFAULT NULL,
- count int(5) DEFAULT NULL
- );
- -- 创建薪资分布表
- create table t_salary_count(
- salary VARCHAR(30) DEFAULT NULL,
- count int(5) DEFAULT NULL
- );
- -- 创建福利标签统计表
- create table t_company_count(
- company VARCHAR(30) DEFAULT NULL,
- count int(5) DEFAULT NULL
- );
- -- 创建技能标签统计表
- create table t_kill_count(
- kills VARCHAR(30) DEFAULT NULL,
- count int(5) DEFAULT NULL
- );
- --将职位所在的城市的分布统计结果数据迁移到t_city_count表中
- bin/sqoop export \
- --connect jdbc:mysql://hadoop1:3306/JobData?characterEncoding=UTF-8 \
- --username root \
- --password 123456 \
- --table t_city_count \
- --columns "city,count" \
- --fields-terminated-by ',' \
- --export-dir /user/hive/warehouse/jobdata.db/t_ods_city
- --将职位薪资分布结果数据迁移到t_salary_count表中
- bin/sqoop export \
- --connect jdbc:mysql://hadoop1:3306/JobData?characterEncoding=UTF-8 \
- --username root \
- --password 123456 \
- --table t_salary_count \
- --columns "salary,count" \
- --fields-terminated-by ',' \
- --export-dir /user/hive/warehouse/jobdata.db/t_ods_salary
- --将职位福利统计结果数据迁移到t_company_count表中
- bin/sqoop export \
- --connect jdbc:mysql://hadoop1:3306/JobData?characterEncoding=UTF-8 \
- --username root \
- --password 123456 \
- --table t_company_count \
- --columns "company,count" \
- --fields-terminated-by ',' \
- --export-dir /user/hive/warehouse/jobdata.db/t_ods_company
- --将职位技能标签统计结果迁移到t_kill_count表中
- bin/sqoop export \
- --connect jdbc:mysql://hadoop1:3306/JobData?characterEncoding=UTF-8 \
- --username root \
- --password 123456 \
- --table t_kill_count \
- --columns "kills,count" \
- --fields-terminated-by ',' \
- --export-dir /user/hive/warehouse/jobdata.db/t_ods_kill
创建后会出现“web.xml is missing and <failOnMissingWebXml> is set to true”的错误,这是缺少 web.xml 文件导致的。在 src/main/webapp/WEB-INF 下添加 web.xml 即可。
- <project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
- http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <groupId>com.itcast.jobanalysis</groupId>
- <artifactId>job-web</artifactId>
- <version>0.0.1-SNAPSHOT</version>
- <packaging>war</packaging>
- <dependencies>
- <dependency>
- <groupId>org.codehaus.jettison</groupId>
- <artifactId>jettison</artifactId>
- <version>1.1</version>
- </dependency>
- <!-- Spring -->
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-context</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-beans</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-webmvc</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-jdbc</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-aspects</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-jms</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <dependency>
- <groupId>org.springframework</groupId>
- <artifactId>spring-context-support</artifactId>
- <version>4.2.4.RELEASE</version>
- </dependency>
- <!-- Mybatis -->
- <dependency>
- <groupId>org.mybatis</groupId>
- <artifactId>mybatis</artifactId>
- <version>3.2.8</version>
- </dependency>
- <dependency>
- <groupId>org.mybatis</groupId>
- <artifactId>mybatis-spring</artifactId>
- <version>1.2.2</version>
- </dependency>
- <dependency>
- <groupId>com.github.miemiedev</groupId>
- <artifactId>mybatis-paginator</artifactId>
- <version>1.2.15</version>
- </dependency>
- <!-- MySql -->
- <dependency>
- <groupId>mysql</groupId>
- <artifactId>mysql-connector-java</artifactId>
- <version>5.1.32</version>
- </dependency>
- <!-- 连接池 -->
- <dependency>
- <groupId>com.alibaba</groupId>
- <artifactId>druid</artifactId>
- <version>1.0.9</version>
- <exclusions>
- <exclusion>
- <groupId>com.alibaba</groupId>
- <artifactId>jconsole</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.alibaba</groupId>
- <artifactId>tools</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <!-- JSP相关 -->
- <dependency>
- <groupId>jstl</groupId>
- <artifactId>jstl</artifactId>
- <version>1.2</version>
- </dependency>
- <dependency>
- <groupId>javax.servlet</groupId>
- <artifactId>servlet-api</artifactId>
- <version>2.5</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>javax.servlet</groupId>
- <artifactId>jsp-api</artifactId>
- <version>2.0</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.12</version>
- </dependency>
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- <version>2.4.2</version>
- </dependency>
- <dependency>
- <groupId>org.aspectj</groupId>
- <artifactId>aspectjweaver</artifactId>
- <version>1.8.4</version>
- </dependency>
- </dependencies>
- <build>
- <finalName>${project.artifactId}</finalName>
- <resources>
- <resource>
- <directory>src/main/java</directory>
- <includes>
- <include>**/*.properties</include>
- <include>**/*.xml</include>
- </includes>
- <filtering>false</filtering>
- </resource>
- <resource>
- <directory>src/main/resources</directory>
- <includes>
- <include>**/*.properties</include>
- <include>**/*.xml</include>
- </includes>
- <filtering>false</filtering>
- </resource>
- </resources>
- <plugins>
- <!-- 指定maven编译的jdk版本,如果不指定,maven3默认用jdk 1.5-->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>3.2</version>
- <configuration>
- <!-- 源代码使用的JDK版本 -->
- <source>1.8</source>
- <!-- 需要生成的目标class文件的编译版本 -->
- <target>1.8</target>
- <!-- 字符集编码 -->
- <encoding>UTF-8</encoding>
- </configuration>
- </plugin>
- <!-- 配置Tomcat插件 -->
- <plugin>
- <groupId>org.apache.tomcat.maven</groupId>
- <artifactId>tomcat7-maven-plugin</artifactId>
- <version>2.2</version>
- <configuration>
- <path>/</path>
- <port>8080</port>
- </configuration>
- </plugin>
- </plugins>
- </build>
- </project>
- <?xml version="1.0" encoding="UTF-8"?>
- <beans xmlns="http://www.springframework.org/schema/beans"
- xmlns:context="http://www.springframework.org/schema/context"
- xmlns:p="http://www.springframework.org/schema/p"
- xmlns:aop="http://www.springframework.org/schema/aop"
- xmlns:tx="http://www.springframework.org/schema/tx"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://www.springframework.org/schema/beans
- http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
- http://www.springframework.org/schema/context
- http://www.springframework.org/schema/context/spring-context-4.2.xsd
- http://www.springframework.org/schema/aop
- http://www.springframework.org/schema/aop/spring-aop-4.2.xsd
- http://www.springframework.org/schema/tx
- http://www.springframework.org/schema/tx/spring-tx-4.2.xsd
- http://www.springframework.org/schema/util
- http://www.springframework.org/schema/util/spring-util-4.2.xsd">
- <!-- 加载数据库连接配置文件(db.properties),供下方连接池通过 ${jdbc.*} 占位符引用 -->
- <context:property-placeholder
- location="classpath:properties/db.properties" />
- <!-- 数据库连接池 -->
- <bean id="dataSource"
- class="com.alibaba.druid.pool.DruidDataSource"
- destroy-method="close">
- <property name="url" value="${jdbc.url}" />
- <property name="username" value="${jdbc.username}" />
- <property name="password" value="${jdbc.password}" />
- <property name="driverClassName" value="${jdbc.driver}" />
- <property name="maxActive" value="10" />
- <property name="minIdle" value="5" />
- </bean>
- <!-- 让 Spring 管理 SqlSessionFactory,使用 MyBatis 与 Spring 整合包中提供的 SqlSessionFactoryBean -->
- <bean id="sqlSessionFactory"
- class="org.mybatis.spring.SqlSessionFactoryBean">
- <!-- 数据库连接池 -->
- <property name="dataSource" ref="dataSource" />
- <!-- 加载mybatis的全局配置文件 -->
- <property name="configLocation"
- value="classpath:mybatis/mybatis-config.xml" />
- </bean>
- <!-- 使用扫描包的形式来创建mapper代理对象 -->
- <bean class="org.mybatis.spring.mapper.MapperScannerConfigurer">
- <property name="basePackage" value="cn.itcast.mapper" />
- </bean>
- <!-- 事务管理器 -->
- <bean id="transactionManager"
- class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
- <!-- 数据源 -->
- <property name="dataSource" ref="dataSource" />
- </bean>
- <!-- 通知 -->
- <tx:advice id="txAdvice" transaction-manager="transactionManager">
- <tx:attributes>
- <!-- 传播行为 -->
- <tx:method name="save*" propagation="REQUIRED" />
- <tx:method name="insert*" propagation="REQUIRED" />
- <tx:method name="add*" propagation="REQUIRED" />
- <tx:method name="create*" propagation="REQUIRED" />
- <tx:method name="delete*" propagation="REQUIRED" />
- <tx:method name="update*" propagation="REQUIRED" />
- <tx:method name="find*"
- propagation="SUPPORTS"
- read-only="true" />
- <tx:method name="select*"
- propagation="SUPPORTS"
- read-only="true" />
- <tx:method name="get*"
- propagation="SUPPORTS"
- read-only="true" />
- </tx:attributes>
- </tx:advice>
- <!-- 切面 -->
- <aop:config>
- <aop:advisor advice-ref="txAdvice"
- pointcut="execution(* cn.itcast.service..*.*(..))" />
- </aop:config>
- <!-- 配置包扫描器,扫描所有带@Service注解的类 -->
- <context:component-scan base-package="cn.itcast.service" />
- </beans>
- <?xml version="1.0" encoding="UTF-8"?>
- <beans xmlns="http://www.springframework.org/schema/beans"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xmlns:p="http://www.springframework.org/schema/p"
- xmlns:context="http://www.springframework.org/schema/context"
- xmlns:mvc="http://www.springframework.org/schema/mvc"
- xsi:schemaLocation="http://www.springframework.org/schema/beans
- http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
- http://www.springframework.org/schema/mvc
- http://www.springframework.org/schema/mvc/spring-mvc-4.2.xsd
- http://www.springframework.org/schema/context
- http://www.springframework.org/schema/context/spring-context-4.2.xsd">
- <!-- 扫描指定包路径,使该路径下的 @Controller 注解生效 -->
- <context:component-scan base-package="cn.itcast.controller" />
- <!-- mvc的注解驱动 -->
- <mvc:annotation-driven />
- <!-- 视图解析器 -->
- <bean
- class=
- "org.springframework.web.servlet.view.InternalResourceViewResolver">
- <property name="prefix" value="/WEB-INF/jsp/" />
- <property name="suffix" value=".jsp" />
- </bean>
- <!-- 配置资源映射 -->
- <mvc:resources location="/css/" mapping="/css/**"/>
- <mvc:resources location="/js/" mapping="/js/**"/>
- <mvc:resources location="/assets/" mapping="/assets/**"/>
- <mvc:resources location="/img/" mapping="/img/**"/>
- </beans>
- <web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://java.sun.com/xml/ns/javaee" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" version="2.5">
- <display-name>job-web</display-name>
- <welcome-file-list>
- <welcome-file>index.html</welcome-file>
- </welcome-file-list>
- <!-- 加载spring容器 -->
- <context-param>
- <param-name>contextConfigLocation</param-name>
- <param-value>classpath:spring/applicationContext.xml</param-value>
- </context-param>
- <listener>
- <listener-class> org.springframework.web.context.ContextLoaderListener </listener-class>
- </listener>
- <!-- 解决 POST 请求参数的中文乱码问题(统一按 utf-8 解码请求体) -->
- <filter>
- <filter-name>CharacterEncodingFilter</filter-name>
- <filter-class> org.springframework.web.filter.CharacterEncodingFilter </filter-class>
- <init-param>
- <param-name>encoding</param-name>
- <param-value>utf-8</param-value>
- </init-param>
- </filter>
- <filter-mapping>
- <filter-name>CharacterEncodingFilter</filter-name>
- <url-pattern>/*</url-pattern>
- </filter-mapping>
- <!-- 配置springmvc的前端控制器 -->
- <servlet>
- <servlet-name>data-report</servlet-name>
- <servlet-class> org.springframework.web.servlet.DispatcherServlet </servlet-class>
- <init-param>
- <param-name>contextConfigLocation</param-name>
- <param-value>classpath:spring/springmvc.xml</param-value>
- </init-param>
- <load-on-startup>1</load-on-startup>
- </servlet>
- <!-- 拦截所有请求(JSP 除外):url-pattern 为 "/" 时不匹配 *.jsp,JSP 仍由容器自带的 JSP Servlet 处理 -->
- <servlet-mapping>
- <servlet-name>data-report</servlet-name>
- <url-pattern>/</url-pattern>
- </servlet-mapping>
- <!-- 全局错误页面 -->
- <error-page>
- <error-code>404</error-code>
- <location>/WEB-INF/jsp/404.jsp</location>
- </error-page>
- </web-app>
- jdbc.driver=com.mysql.jdbc.Driver
- jdbc.url=jdbc:mysql://hadoop1:3306/JobData?characterEncoding=utf-8
- jdbc.username=root
- jdbc.password=123456
- <?xml version="1.0" encoding="UTF-8"?>
- <!DOCTYPE configuration PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
- "http://mybatis.org/dtd/mybatis-3-config.dtd">
- <configuration>
- </configuration>
