(1) Use a Flume-ng agent with a spooldir source to watch a directory on the Linux file system, and an hdfs sink to collect the logs onto HDFS. The contents of the agent configuration file a4.conf are as follows; a sample command for starting the agent follows the config.
#Name the agent and its source, channel, and sink
a4.sources = r1
a4.channels = c1
a4.sinks = k1
#Configure the source
a4.sources.r1.type = spooldir
a4.sources.r1.spoolDir = /root/Documents/logs
#Configure the channel
a4.channels.c1.type = memory
a4.channels.c1.capacity = 10000
a4.channels.c1.transactionCapacity = 100
#Define an interceptor that adds a timestamp header to each event (needed for the %Y%m%d escape in hdfs.path below)
a4.sources.r1.interceptors = i1
a4.sources.r1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder
#Configure the sink
a4.sinks.k1.type = hdfs
a4.sinks.k1.hdfs.path = hdfs://hadoop:9000/flume/%Y%m%d
a4.sinks.k1.hdfs.filePrefix = events-
a4.sinks.k1.hdfs.fileType = DataStream
#Do not roll files based on the number of events
a4.sinks.k1.hdfs.rollCount = 0
#Roll a new file on HDFS when the current file reaches 128 MB
a4.sinks.k1.hdfs.rollSize = 134217728
#Roll a new file on HDFS every 60 seconds
a4.sinks.k1.hdfs.rollInterval = 60
#Wire the source and sink to the channel
a4.sources.r1.channels = c1
a4.sinks.k1.channel = c1
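The agent is started with the flume-ng launcher; a minimal sketch, assuming Flume is installed under /usr/flume/apache-flume-1.4.0-bin and that a4.conf lives in its conf directory (both paths are assumptions, not given above):
#-n must match the agent name used in a4.conf (a4); the -D option just sends Flume's log output to the console
/usr/flume/apache-flume-1.4.0-bin/bin/flume-ng agent -n a4 -c /usr/flume/apache-flume-1.4.0-bin/conf -f /usr/flume/apache-flume-1.4.0-bin/conf/a4.conf -Dflume.root.logger=INFO,console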
(2) A shell script, intended to be run once per day, that cleans the collected logs with MapReduce, analyzes them with Hive, and exports the results to MySQL with Sqoop. CURRENT holds the current date in yyyyMMdd form.
CURRENT=$(date +%Y%m%d)
#Run a pre-written MapReduce program to clean the logs collected by Flume: drop useless records and keep only the fields ip, logtime and url
/usr/hadoop/hadoop-2.2.0/bin/hadoop jar /root/Documents/cleaner.jar /flume/$CURRENT /bbslog_cleaned/$CURRENT
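As a sanity check (not part of the original script), the cleaned output can be inspected before the Hive partition is added; the part-file name below is an assumption about the cleaner job's output:
#List today's cleaned output and peek at the first few records
/usr/hadoop/hadoop-2.2.0/bin/hadoop fs -ls /bbslog_cleaned/$CURRENT
/usr/hadoop/hadoop-2.2.0/bin/hadoop fs -cat /bbslog_cleaned/$CURRENT/part-r-00000 | head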
#Add partition $CURRENT to the manually created external, partitioned Hive table bbslog; without this the bbslog table does not see the new data under /bbslog_cleaned
/usr/hive/apache-hive-0.13.0-bin/bin/hive -e "alter table bbslog add partition(logdate=$CURRENT) location '/bbslog_cleaned/$CURRENT'"
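For reference, a minimal sketch of how the external table bbslog might have been created beforehand; the column types, the tab delimiter and the base location are assumptions based on the cleaner output described above:
#One-time DDL for the external, partitioned table that the daily partitions are added to
/usr/hive/apache-hive-0.13.0-bin/bin/hive -e "create external table bbslog (ip string, logtime string, url string) partitioned by (logdate string) row format delimited fields terminated by '\t' location '/bbslog_cleaned'"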
#Compute the day's PV: the total number of page views on the forum that day
/usr/hive/apache-hive-0.13.0-bin/bin/hive -e "create table pv_$CURRENT row format delimited fields terminated by '\t' as select $CURRENT, count(*) from bbslog where logdate=$CURRENT"
#Compute the day's UV: the number of distinct users (distinct IPs) that visited the forum that day
/usr/hive/apache-hive-0.13.0-bin/bin/hive -e "create table uv_$CURRENT row format delimited fields terminated by '\t' as select $CURRENT, count(distinct ip) from bbslog where logdate=$CURRENT"
#Compute the number of users who registered that day
/usr/hive/apache-hive-0.13.0-bin/bin/hive -e "create table newregister_$CURRENT row format delimited fields terminated by '\t' as select $CURRENT, count(*) from bbslog where logdate=$CURRENT and instr(url, 'member.php?mod=register')>0"
#Compute the day's VIP users: the top 10 IPs that viewed more than 20 forum pages that day
/usr/hive/apache-hive-0.13.0-bin/bin/hive -e "create table vip_$CURRENT row format delimited fields terminated by '\t' as select $CURRENT, ip, count(*) as hits from bbslog where logdate=$CURRENT group by ip having hits > 20 order by hits desc limit 10"
#Use Sqoop to export the Hive result table pv_$CURRENT to the manually created MySQL table pv
/usr/sqoop/sqoop-1.4.4.bin__hadoop-2.0.4-alpha/bin/sqoop export --connect jdbc:mysql://192.168.1.10:3306/bbslog_out --username root --password 123 --export-dir "/user/hive/warehouse/pv_$CURRENT" --table pv --fields-terminated-by '\t'
#Use Sqoop to export the Hive result table uv_$CURRENT to the manually created MySQL table uv
/usr/sqoop/sqoop-1.4.4.bin__hadoop-2.0.4-alpha/bin/sqoop export --connect jdbc:mysql://192.168.1.10:3306/bbslog_out --username root --password 123 --export-dir "/user/hive/warehouse/uv_$CURRENT" --table uv --fields-terminated-by '\t'
#Use Sqoop to export the Hive result table newregister_$CURRENT to the manually created MySQL table newRegister
/usr/sqoop/sqoop-1.4.4.bin__hadoop-2.0.4-alpha/bin/sqoop export --connect jdbc:mysql://192.168.1.10:3306/bbslog_out --username root --password 123 --export-dir "/user/hive/warehouse/newregister_$CURRENT" --table newRegister --fields-terminated-by '\t'
#Use Sqoop to export the Hive result table vip_$CURRENT to the manually created MySQL table vip. A Sqoop export runs MapReduce mapper tasks, and with the default of more than one mapper the row order in the exported vip table may differ from the order in vip_$CURRENT, so -m 1 forces a single mapper and preserves the order.
/usr/sqoop/sqoop-1.4.4.bin__hadoop-2.0.4-alpha/bin/sqoop export --connect jdbc:mysql://192.168.1.10:3306/bbslog_out --username root --password 123 -m 1 --export-dir "/user/hive/warehouse/vip_$CURRENT" --table vip --fields-terminated-by '\t'
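The MySQL target tables pv, uv, newRegister and vip are not shown above; a minimal sketch of what they might look like, issued through the mysql command-line client (the column names and types are assumptions chosen to match the exported columns; host and credentials are taken from the Sqoop commands):
#pv, uv and newRegister each receive (logdate, count); vip receives (logdate, ip, hits)
mysql -h192.168.1.10 -uroot -p123 bbslog_out -e "create table pv(logdate varchar(10), pv bigint); create table uv(logdate varchar(10), uv bigint); create table newRegister(logdate varchar(10), newregister bigint); create table vip(logdate varchar(10), ip varchar(30), hits bigint)"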