1. Cluster Preparation
(1) Hadoop cluster nodes
192.168.174.10 hadoop1
192.168.174.11 hadoop2
192.168.174.12 hadoop3
192.168.174.9 docker
(2) Start the Hadoop cluster
Turn on the VMnet8 service (VMware's NAT network), then start the cluster with the management script:
[root@hadoop1 ~]# manage_hadoop.sh
Please input "start" or "stop" or "status" or "jps": start
Check that the expected processes are up.
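manage_hadoop.sh is a custom wrapper, not a stock Hadoop tool. A minimal sketch of what it might look like, assuming the standard start-dfs.sh/start-yarn.sh scripts are on the PATH and the three nodes are reachable over ssh (host and service names taken from this cluster):

#!/bin/bash
# Sketch of a cluster management wrapper (the real script is not shown).
read -p 'Please input "start" or "stop" or "status" or "jps": ' action
case ${action} in
    start)
        start-dfs.sh
        start-yarn.sh
        ;;
    stop)
        stop-yarn.sh
        stop-dfs.sh
        ;;
    status)
        hdfs haadmin -getServiceState nn1
        hdfs haadmin -getServiceState nn2
        ;;
    jps)
        # List the Java processes on every node.
        for host in hadoop1 hadoop2 hadoop3; do
            echo "----- ${host} -----"
            ssh ${host} jps
        done
        ;;
    *)
        echo "unknown action: ${action}"
        ;;
esac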
2. Create the MySQL Database
The goal is to collect this data into Hive tables. First connect to the database on hadoop1 (MySQL must be started):
[root@hadoop1 ~]# mysql -uroot -p
Enter password:
ERROR 2002 (HY000): Can't connect to local MySQL server through socket '/usr/local/mysql/data/mysql.sock' (2)
[root@hadoop1 ~]# service mysql start
Starting MySQL.. SUCCESS!
Connect to the hadoop1 database with Navicat or SQLyog and create the test database (manually, or with a statement):
CREATE DATABASE test CHARSET utf8;
Import the data into the database.
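The import itself is not shown. Judging by the DataX job below (its jdbcUrl points at a MySQL database named data on hadoop1), the source is one MySQL table per day with the same four columns as the Hive tables. A sketch of loading one such table; the column types and the sample row are assumptions:

mysql -uroot -p123456 -e "CREATE DATABASE IF NOT EXISTS data CHARSET utf8;"
mysql -uroot -p123456 data <<'SQL'
CREATE TABLE order_status_log20200901 (
    id VARCHAR(32),
    order_id VARCHAR(32),
    order_status VARCHAR(16),
    operate_time DATETIME
);
-- made-up sample row, for illustration only
INSERT INTO order_status_log20200901 VALUES ('1', '1001', '1002', '2020-09-01 10:23:45');
SQL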
Enter Hive and create the working database:
hive> create database data;
OK
Time taken: 1.498 seconds
hive> show databases;
OK
data
default
hive> use data;
Write the script that creates the per-day Hive tables:
[root@hadoop1 sh]# touch mk_hive_dir.sh
[root@hadoop1 sh]# chmod +x mk_hive_dir.sh
[root@hadoop1 sh]# vi mk_hive_dir.sh
#!/bin/bash
# Create one Hive table per day in the [beg_date, end_date] range (dates as YYYYMMDD).
beg_date=`date -d "${1}" +%s`
end_date=`date -d "${2}" +%s`
if (( beg_date > end_date )); then
    echo "beg_date must not be later than end_date"
    exit 1
fi
currentDate=""
for (( i=beg_date; i<=end_date; i=i+86400 ))
do
    currentDate=`date -d @${i} +%Y%m%d`
    echo "-----create order_status_log${currentDate}-----"
    hive -e "use data;create table order_status_log${currentDate}(id string,order_id string,order_status string,operate_time string) row format delimited fields terminated by ',';"
done
Run the Hive table-creation script (remember to pass the dates):
[root@hadoop1 sh]# ./mk_hive_dir.sh 20200901 20200903
# Check the tables in Hive
hive> show tables;
OK
order_status_log20200901
order_status_log20200902
order_status_log20200903
Time taken: 0.052 seconds, Fetched: 3 row(s)
3. Collect the Data with DataX
Check which NameNode is active, nn1 or nn2 (the DataX job file must point at the active host):
[root@hadoop1 sh]# hdfs haadmin -getServiceState nn1
standby
[root@hadoop1 sh]# hdfs haadmin -getServiceState nn2
active
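Instead of checking by hand before every run, the active NameNode can be picked in the shell. A small helper sketch, assuming nn1/nn2 map to hadoop1/hadoop2 as in the hosts list above:

#!/bin/bash
# Print the defaultFS that job.json should use, based on which NameNode is active.
if [ "`hdfs haadmin -getServiceState nn1`" = "active" ]; then
    active_host="192.168.174.10"
else
    active_host="192.168.174.11"
fi
echo "defaultFS should be hdfs://${active_host}:8020"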
Check the table's storage path in Hive:
hive> desc formatted order_status_log20200901;
This Location is the HDFS path the DataX writer must point at:
Location: hdfs://mycluster/user/hive/warehouse/data.db/order_status_log20200901
Edit the DataX job file (set the date to 20200901 for a one-off manual run; set it to 20200831 for the automated script, which advances the date by one day before each run):
[root@hadoop1 ~]# cd datax/job/
[root@hadoop1 job]# ls
log.json test.json
[root@hadoop1 job]# cp log.json job.json
[root@hadoop1 job]# ls
job.json log.json test.json
[root@hadoop1 job]# vi job.json
{
    "job": {
        "setting": {
            "speed": {
                "channel": 3
            },
            "errorLimit": {
                "record": 0,
                "percentage": 0.02
            }
        },
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": "root",
                        "password": "123456",
                        "column": [
                            "id",
                            "order_id",
                            "order_status",
                            "operate_time"
                        ],
                        "splitPk": "id",
                        "connection": [
                            {
                                "table": [
                                    "order_status_log20200901"
                                ],
                                "jdbcUrl": [
                                    "jdbc:mysql://192.168.174.10:3306/data"
                                ]
                            }
                        ]
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "defaultFS": "hdfs://192.168.174.11:8020",
                        "fileType": "text",
                        "path": "/user/hive/warehouse/data.db/order_status_log20200901",
                        "fileName": "order_status_log20200901",
                        "column": [
                            {
                                "name": "id",
                                "type": "STRING"
                            },
                            {
                                "name": "order_id",
                                "type": "STRING"
                            },
                            {
                                "name": "order_status",
                                "type": "STRING"
                            },
                            {
                                "name": "operate_time",
                                "type": "STRING"
                            }
                        ],
                        "writeMode": "append",
                        "fieldDelimiter": ",",
                        "compress": "GZIP"
                    }
                }
            }
        ]
    }
}
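With the table name set to 20200901, the job can be run once by hand before wiring up the automation (the same invocation the script below uses):

[root@hadoop1 job]# python /root/datax/bin/datax.py /root/datax/job/job.json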
Implement the automated collection script:
[root@hadoop1 sh]# touch auto_mysql_hive.sh
[root@hadoop1 sh]# chmod +x auto_mysql_hive.sh
[root@hadoop1 sh]# echo 20200901 > /root/sh/date
[root@hadoop1 sh]# vi auto_mysql_hive.sh
#!/bin/bash
# Step 1: read the date currently used in the json file. The date file was
# seeded by hand earlier with: echo 20200901 > /root/sh/date
date=`cat /root/sh/date`
# Step 2: compute the day after that date.
afterday_timestamp=$((`date -d "${date}" +%s` + 86400))
afterday=`date -d @${afterday_timestamp} +%Y%m%d`
# Globally replace the table name in the job file (e.g. ...0901 -> ...0902).
sed -i "s/order_status_log${date}/order_status_log${afterday}/g" /root/datax/job/job.json
# Keep /root/sh/date in sync with the date now in the json file.
echo ${afterday} > /root/sh/date
# Run DataX.
python /root/datax/bin/datax.py /root/datax/job/job.json
[root@hadoop1 sh]# sh auto_mysql_hive.sh
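To make the collection truly automatic (which is why the job file starts one day behind), the script can be scheduled with cron; the time below is only an example:

[root@hadoop1 sh]# crontab -e
# Run the collection once a day at 01:00 (illustrative schedule).
0 1 * * * /bin/sh /root/sh/auto_mysql_hive.sh >> /root/sh/auto_mysql_hive.log 2>&1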
The corresponding data can now be queried in Hive:
hive> select * from order_status_log20200901;
4. Create the Hive Partitioned Table (mind the columns and the delimiter)
hive> create table order_status_log(id string,order_id string,order_status string,operate_time string) partitioned by (day string) row format delimited fields terminated by ',';
Write the hive_count.sh script:
[root@hadoop1 sh]# touch hive_count.sh
[root@hadoop1 sh]# chmod +x hive_count.sh
[root@hadoop1 sh]# vi hive_count.sh
#!/bin/bash
# Load each day's staging table into the partitioned table; the partition
# column value is date(operate_time).
beg_date=`date -d "${1}" +%s`
end_date=`date -d "${2}" +%s`
if (( beg_date > end_date )); then
    echo "beg_date must not be later than end_date"
    exit 1
fi
currentDate=""
for (( i=beg_date; i<=end_date; i=i+86400 ))
do
    currentDate=`date -d @${i} +%Y%m%d`
    hive -e "
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.dynamic.partition=true;
    set hive.exec.max.dynamic.partitions=1000;
    insert into table data.order_status_log partition(day) select id,order_id,order_status,operate_time,date(operate_time) as day from data.order_status_log${currentDate};"
done
# Write the per-day counts to a result file for the export step.
hive -S -e "select day,count(id) from data.order_status_log group by day;" > /root/sh/result
[root@hadoop1 sh]# sh hive_count.sh 20200901 20200903
Check that the partitioned table has data:
hive> select * from order_status_log;
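The per-day partitions created by the dynamic-partition insert can also be listed directly:

hive> show partitions order_status_log;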
5. Export the Hive Data to the MySQL Database in the Docker Container
Create the MySQL container (on the docker host):
[root@localhost ~]# docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
centos 7 eeb6ee3f44bd 2 months ago 204MB
mysql 5.7 1d7aba917169 3 months ago 448MB
[root@localhost ~]# docker run -itd --name=mysql1 -p 5550:3306 -e MYSQL_ROOT_PASSWORD=123456 1d7aba917169 /bin/bash
f0f2bc48d28fdd37d50612faecd112249cddaa6d6a26209a92bedd4a283a327b
[root@localhost ~]# docker exec -it mysql1 /bin/bash
root@f0f2bc48d28f:/# service mysql start
root@f0f2bc48d28f:/# service mysql status
root@f0f2bc48d28f:/# mysql -uroot -p
Enter password:
mysql> GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY '123456' WITH GRANT OPTION;
Query OK, 0 rows affected, 1 warning (0.00 sec)
mysql> FLUSH PRIVILEGES;
Query OK, 0 rows affected (0.00 sec)
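The export script at the end writes into test.hive_count, so make sure the container has a test database first (the MySQL 5.7 image does not ship one by default):

mysql> CREATE DATABASE IF NOT EXISTS test CHARSET utf8;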
Connect with a client tool to verify access, then create the result table in MySQL:
CREATE TABLE `hive_count`( `day` DATE, `count` INT )ENGINE=INNODB CHARSET=utf8;
Write the script that exports the result to the MySQL container on docker:
[root@hadoop1 sh]# touch hive_mysql.sh
[root@hadoop1 sh]# chmod +x hive_mysql.sh
[root@hadoop1 sh]# vi hive_mysql.sh
#!/bin/bash
# Read the day/count pairs produced by hive_count.sh (tab-separated) and
# insert them into the hive_count table in the MySQL container.
user="root"
password="123456"
host="192.168.174.9"
port=5550
mysql_conn="mysql -h${host} -u${user} -P${port} -p${password}"
cat /root/sh/result | while read day count
do
    $mysql_conn -e "INSERT INTO test.hive_count VALUES('$day','$count')"
done
[root@hadoop1 sh]# sh hive_mysql.sh
mysql: [Warning] Using a password on the command line interface can be insecure.
mysql: [Warning] Using a password on the command line interface can be insecure.
mysql: [Warning] Using a password on the command line interface can be insecure.
The import succeeded, and the connection tool shows the data!
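Equivalently, a quick check from the mysql CLI:

mysql> SELECT * FROM test.hive_count;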
That's a full 100 points in the bag!