赞
踩
通常用户在HIVE中用SELECT语句出来结果,无法确定结果是来自哪个文件或者具体位置信息,HIVE中考虑到了这点,在Virtual Column虚列中可以指定三个静态列:
当我们的数据当中出现了脏数据的时候,我们可以使用这种方式去定位具体的脏数据在哪里。是很不错的排查方式。
创建textfile表
create table temp.temp_text_file_name (
content_name string,
channel string
)
row format delimited fields terminated by '\t'
stored as textfile;
查询文件位置和偏移量,当不设置hive.exec.rowoffset为true时,会报如下错误:
Error: Error while compiling statement: FAILED: SemanticException [Error 10004]: Line 1:69 Invalid table alias or column reference 'row__offset__inside__block': (possible column names are: content_name, channel) (state=42000,code=10004)
设置set hive.exec.rowoffset=true;
select content_name, input__file__name, block__offset__inside__file, row__offset__inside__block from temp.temp_text_file_name limit 10; +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ | content_name | input__file__name | block__offset__inside__file | row__offset__inside__block | +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ | CCTV10科教 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 0 | 0 | | CCTV10科教高清 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 20 | 0 | | CCTV11戏曲 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 46 | 0 | | CCTV12社会与法 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 66 | 0 | | CCTV13新闻 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 92 | 0 | | CCTV14少儿 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 112 | 0 | | CCTV14少儿高清 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 132 | 0 | | CCTV15音乐 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 158 | 0 | | CCTV1综合 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 178 | 0 | | CCTV1综合高清 | hdfs://hive/warehouse/temp.db/temp_text_file_name/temp_test.csv | 196 | 0 | +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+
创建存储格式为ORC的表
create table temp.temp_orc_file_name (
content_name string,
channel string
)
row format delimited fields terminated by '\t'
stored as orc;
插入模拟数据
insert into table temp.temp_orc_file_name
select * from temp.temp_text_file_name;
查询
select content_name, input__file__name, block__offset__inside__file, row__offset__inside__block from temp.temp_orc_file_name limit 10; +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ | content_name | input__file__name | block__offset__inside__file | row__offset__inside__block | +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ | CCTV10科教 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 24 | 0 | | CCTV10科教高清 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 48 | 0 | | CCTV11戏曲 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 73 | 0 | | CCTV12社会与法 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 97 | 0 | | CCTV13新闻 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 121 | 0 | | CCTV14少儿 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 146 | 0 | | CCTV14少儿高清 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 170 | 0 | | CCTV15音乐 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 194 | 0 | | CCTV1综合 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 219 | 0 | | CCTV1综合高清 | hdfs://hive/warehouse/temp.db/temp_orc_file_name/000000_0 | 243 | 0 | +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ 10 rows selected (0.29 seconds)
创建sequencefile格式的表并插入数据
create table temp.temp_seq_file_name (
content_name string,
channel string
)
row format delimited fields terminated by '\t'
stored as sequencefile;
insert into table temp.temp_seq_file_name
select * from temp.temp_text_file_name;
select content_name, input__file__name, block__offset__inside__file, row__offset__inside__block from temp.temp_seq_file_name order by row__offset__inside__block desc limit 10; +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ | content_name | input__file__name | block__offset__inside__file | row__offset__inside__block | +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ | CCTV1综合高清 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 391 | 0 | | CCTV1综合 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 361 | 0 | | CCTV央视音乐 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0_copy_1 | 1818 | 0 | | CCTV14少儿高清 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 291 | 0 | | CCTV14少儿 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 259 | 0 | | CCTV13新闻 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 227 | 0 | | CCTV12社会与法 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 189 | 0 | | CCTV11戏曲 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 157 | 0 | | CCTV央视文化精品 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0_copy_1 | 1760 | 0 | | CCTV10科教 | hdfs://hive/warehouse/temp.db/temp_seq_file_name/000000_0 | 87 | 0 | +---------------+----------------------------------------------------+------------------------------+-----------------------------+--+ 10 rows selected (26.585 seconds)
疑问有待验证与解释:row__offset__inside__block字段不管是textfile格式还是ORC格式还是sequencefile格式,值都是为0???
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。