Because the data volume was too large, a single Sqoop job either could not finish or ran out of memory, so a script was written to import the data into HDFS in batches (100 batches of 100,000 rows each, keyed on person_id) and then load each batch into the Hive table.

The shell script is as follows:
#!/bin/bash
source /etc/profile

host=127.0.0.1

# Import 100 batches of 100,000 rows each, split on person_id.
for ((i = 1; i <= 100; i++))
do
    start=$(( (i - 1) * 100000 + 1 ))
    end=$(( i * 100000 ))

    # Query for one batch; $CONDITIONS is the placeholder Sqoop requires in --query imports.
    sql="select person_id,capture_time,write_time,capture_resource_id,major_capture_image_url,minor_capture_image_url,sex,age,orientation,glasses,knapsack,bag,messenger_bag,shoulder_bag,umbrella,hair,hat,mask,upper_color,upper_type,upper_texture,bottom_color,bottom_type,trolley_case,barrow,baby,feature_type,feature_code from big_data.pedestrian_sm where person_id>=${start} and person_id<=${end} and \$CONDITIONS"

    # Import this batch into its own HDFS directory with 8 parallel mappers.
    # NOTE: the --target-dir and the Hive "load data inpath" path below must point at the same namenode.
    sqoop import --connect jdbc:mysql://${host}:3306/big_data \
        --username root \
        --password 123456 \
        --query "${sql}" \
        --fields-terminated-by '\001' \
        --delete-target-dir \
        --target-dir hdfs://master:9000/tmp/big_data/pedestrian_sm/${start}-${end}/ \
        --split-by person_id \
        -m 8
    echo "Sqoop import from: ${start} to: ${end} success...................................."

    # Move the imported files for this batch from HDFS into the Hive table.
    hive -e "
    use big_data;
    load data inpath 'hdfs://master:9000/tmp/big_data/pedestrian_sm/${start}-${end}' into table big_data.pedestrian_sm;
    "
    echo "Hive load from: ${start}-${end} success...................................."
done
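Since "load data inpath" only moves the files, the target Hive table has to be declared with the same '\001' field delimiter that the Sqoop import writes, otherwise every row lands in a single column. Below is a minimal sketch of such a table definition, run the same way the script calls Hive; the column names come from the query above, but the column types are assumptions, since the original table DDL is not shown.

hive -e "
create table if not exists big_data.pedestrian_sm (
    person_id                bigint,
    capture_time             string,   -- type assumed; could also be timestamp
    write_time               string,   -- type assumed
    capture_resource_id      bigint,   -- type assumed
    major_capture_image_url  string,
    minor_capture_image_url  string,
    sex                      string,
    age                      int,      -- type assumed
    orientation              string,
    glasses                  string,
    knapsack                 string,
    bag                      string,
    messenger_bag            string,
    shoulder_bag             string,
    umbrella                 string,
    hair                     string,
    hat                      string,
    mask                     string,
    upper_color              string,
    upper_type               string,
    upper_texture            string,
    bottom_color             string,
    bottom_type              string,
    trolley_case             string,
    barrow                   string,
    baby                     string,
    feature_type             string,
    feature_code             string
)
row format delimited fields terminated by '\001'
stored as textfile;
"

With the delimiter matching on both sides, each batch directory is simply moved under the table's location as plain text files, and the data can be spot-checked afterwards with a count query per person_id range.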