-- (web-page scrape artifact: 赞/踩 vote widgets — not part of the script)
-- NOTE(review): Hive indexes were removed in Hive 3.0; on modern versions
-- prefer ORC built-in indexes or materialized views instead.
-- Build a COMPACT index on tb_login_part(userid); DEFERRED REBUILD postpones
-- index population until the explicit REBUILD below.
-- FIX: keyword casing normalized ("deferred" -> "DEFERRED") for consistency.
CREATE INDEX idx_user_id_login ON TABLE tb_login_part(userid) AS 'COMPACT' WITH DEFERRED REBUILD;
-- Populate (rebuild) the index now.
ALTER INDEX idx_user_id_login ON tb_login_part REBUILD;
-- Syntax template (placeholder, not executable on its own): STORED AS <file_format>
-- SequenceFile-format copy of the source table, tab-delimited text rows.
CREATE TABLE tb_sogou_seq(
    stime  STRING,
    userid STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE;

-- Load all rows from the raw source table.
INSERT INTO TABLE tb_sogou_seq
SELECT * FROM tb_sogou_source;
-- Parquet-format copy of the source table, tab-delimited text rows.
CREATE TABLE tb_sogou_parquet(
    stime  STRING,
    userid STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET;

-- Load all rows from the raw source table.
INSERT INTO TABLE tb_sogou_parquet
SELECT * FROM tb_sogou_source;
-- ORC-format copy of the source table.
-- FIX: the original CREATE reused the name tb_sogou_parquet (already created
-- above), so it would fail and the INSERT into tb_sogou_orc would have no
-- target; the table is renamed tb_sogou_orc to match the INSERT below.
CREATE TABLE tb_sogou_orc(
stime STRING,
userid STRING
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC;
-- Load all rows from the raw source table.
INSERT INTO TABLE tb_sogou_orc
SELECT * FROM tb_sogou_source;
-- 1) Enable compression of Hive intermediate (between-stage) data.
SET hive.exec.compress.intermediate=true;
-- 2) Enable compression of MapReduce map output.
SET mapreduce.map.output.compress=true;
-- 3) Set the map-output compression codec.
-- FIX: the fully-qualified class is org.apache.hadoop.io.compress.SnappyCodec;
-- the original omitted the "hadoop" package segment, which would fail at
-- runtime with ClassNotFoundException.
SET mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- 1) Enable compression of Hive's final output data.
SET hive.exec.compress.output=true;
-- 2) Enable compression of the final MapReduce job output.
SET mapreduce.output.fileoutputformat.compress=true;
-- 3) Set the codec used to compress the final output.
SET mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
-- 4) Use block-level compression for the final output.
SET mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- CTAS: ORC table with Snappy compression, populated from the source table.
CREATE TABLE tb_sogou_orc_snappy
STORED AS ORC tblproperties ("orc.compress"="SNAPPY")
AS SELECT * FROM tb_sogou_source;
-- Map-only jobs: merge the small files produced by the MapTasks.
SET hive.merge.mapfiles=true;
-- Map+Reduce jobs: merge the small files produced by the ReduceTasks.
SET hive.merge.mapredfiles=true;
-- Target size of each merged file (bytes).
SET hive.merge.size.per.task=256000000;
-- If the average output-file size is below this threshold (bytes), trigger a merge.
SET hive.merge.smallfiles.avgsize=16000000;
-- Input format for Hive's underlying MapReduce reads: combine many small
-- input files into larger splits before processing.
SET hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- 1. Enable use of ORC indexes for predicate filtering.
SET hive.optimize.index.filter=true;
-- 2. Create the table with the ORC row-group index enabled; DISTRIBUTE/SORT
--    BY stime clusters and orders rows so the per-stripe min/max statistics
--    are selective on stime.
CREATE TABLE tb_sogou_orc_index
STORED AS ORC TBLPROPERTIES("orc.create.index"="true")
AS SELECT * FROM tb_sogou_source
DISTRIBUTE BY stime
SORT BY stime;
-- 3. Range and equality predicates (<, >, =) on stime can then be answered
--    using the built index to skip non-matching row groups.
SELECT COUNT(*) FROM tb_sogou_orc_index WHERE stime > '12:00:00' AND stime < '18:00:00';
-- Create a table with both the ORC row-group index and Bloom-filter indexes
-- on the stime and userid columns.
-- FIX: the original reused the table name tb_sogou_orc_index, which is
-- already created above and would make this CREATE fail; renamed to
-- tb_sogou_orc_bloom (the query below is updated to match).
CREATE TABLE tb_sogou_orc_bloom
STORED AS ORC TBLPROPERTIES
("orc.create.index"="true","orc.bloom.filter.columns"="stime,userid")
AS SELECT * FROM tb_sogou_source
DISTRIBUTE BY stime
SORT BY stime;
-- Range predicates (<, >) on stime can use the row-group index; the equality
-- predicate on userid can use the Bloom-filter index.
SELECT COUNT(*) FROM tb_sogou_orc_bloom WHERE stime > '12:00:00' AND stime < '18:00:00' AND userid = '123345';
-- Enable vectorized query execution (map side and reduce side).
-- FIX: the original had the typo "ture", which sets an invalid value and
-- leaves vectorization disabled.
SET hive.vectorized.execution.enabled = true;
SET hive.vectorized.execution.reduce.enabled = true;
-- (web-page scrape artifact: site copyright footer — not part of the script)