Log configuration file: /conf/hive-log4j2.properties
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
You can query variable values in the Hive CLI:
hive> set system:java.io.tmpdir;
system:java.io.tmpdir=/tmp
-- Default log location for the root user
/tmp/root/hive.log
-- Print all properties
hive> set;
hive> set env:HOME;
hive> set hivevar:col_name=name;
hive> set col_name;
hive> create table test3(id int, ${env:LOGNAME} string);
hive> create table test2(id int, ${hivevar:col_name} string);
-- Add configuration options at startup, e.g. show the current db
hive --hiveconf hive.cli.print.current.db=true
hive (default)> set hiveconf:hive.cli.print.current.db;
hiveconf:hive.cli.print.current.db=true
-- Turn the display back off
hive (default)> set hiveconf:hive.cli.print.current.db=false;
hive>
-- Show system properties; Java has read and write access to system properties
hive> set system:user.name;
system:user.name=root
hive> set system:myname=hujiawei;
hive> set system:myname;
system:myname=hujiawei
# Run a single command; -S is silent mode and suppresses output such as OK
hive -S -e 'show tables;'
hive -S -e 'set' | grep warehouse;
# Create the src table and load data
create table src(s string);
echo "one row" > /tmp/myfile
hive -e "LOAD DATA LOCAL INPATH '/tmp/myfile' into table src"
# Run a SQL file
hive -f test.hql
# Run a file from inside the Hive CLI
source /root/test/test.hql;
# The Hive CLI loads $HOME/.hiverc automatically at startup; if it does not exist you can create one, e.g.:
set hive.cli.print.current.db=true;
set hive.exec.mode.local.auto=true;
# hive -i loads the specified file at startup
All of Hive's default properties are listed explicitly in
/opt/install/hadoop/apache-hive-2.3.6-bin/conf/hive-default.xml.template
Inside Hive you can run shell commands, e.g. !pwd, and Hadoop commands, e.g. dfs -ls /
Comments in Hive scripts use the same syntax as SQL: --
create table employes(
name string,
salary float,
subordinates array<string>,
deductions map<String,float>,
address struct<street:string,city:string,state:string,zip:INT>
);
create table employes(
name string,
salary float,
subordinates array<string>,
deductions map<String,float>,
address struct<street:string,city:string,state:string,zip:INT>
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003'
lines terminated by '\n'
stored as textfile;
show databases like 't*';
create database test;
drop database test;
desc database test;
-- In Hive a database is an HDFS directory ending in .db, and a table is a directory named after the table; the location can be specified at creation time
hive> create database test location '/test/test.db';
dfs -ls -R /test;
-- Drop a database that still contains tables
drop database test cascade;
-- Add a database comment
create database test comment 'this is a test db';
-- Add database properties
create database test comment 'this is a test db' with dbproperties('creator'='hujiawie','date'='2019年12月10日');
-- View the properties
desc database extended test;
-- Modify database properties: new ones can be added, existing ones cannot be removed
alter database test set dbproperties('creators'='laohu');
create table employes(
name string,
salary float,
subordinates array<string>,
deductions map<String,float>,
address struct<street:string,city:string,state:string,zip:INT>
) location '/test/employes';
-- Copy the table definition
hive> create table employees like employes;
hive> show tables in mydb;
hive> create table t1(
> id int
> ,name string
> ,hobby array<string>
> ,add map<String,string>
> )
> partitioned by (pt_d string)
> row format delimited
> fields terminated by ','
> collection items terminated by '-'
> map keys terminated by ':'
> ;
-- Load data
1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
2,lilei,book-code,nanjing:jiangning-taiwan:taibei
3,lihua,music-book,heilongjiang:haerbin
load data local inpath '/root/test/myfile' overwrite into table t1 partition (pt_d = '201701');
-- Load data into another partition
load data local inpath '/root/test/myfile2' overwrite into table t1 partition (pt_d = '000000');
-- Querying the table now returns rows from both partitions
1 xiaoming ["book","TV","code"] {"beijing":"chaoyang","shagnhai":"pudong"} 000000
2 lilei ["book","code"] {"nanjing":"jiangning","taiwan":"taibei"} 000000
3 lihua ["music","book"] {"heilongjiang":"haerbin"} 000000
1 xiaoming ["book","TV","code"] {"beijing":"chaoyang","shagnhai":"pudong"} 201701
2 lilei ["book","code"] {"nanjing":"jiangning","taiwan":"taibei"} 201701
3 lihua ["music","book"] {"heilongjiang":"haerbin"} 201701
-- Look at the directories and files on HDFS
hive> dfs -ls -R /user/hive/warehouse/mydb.db;
drwxr-xr-x - root supergroup 0 2019-12-11 10:56 /user/hive/warehouse/mydb.db/employes
drwxr-xr-x - root supergroup 0 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1
drwxr-xr-x - root supergroup 0 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1/pt_d=000000
-rwxr-xr-x 1 root supergroup 474 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1/pt_d=000000/myfile2
drwxr-xr-x - root supergroup 0 2019-12-11 11:02 /user/hive/warehouse/mydb.db/t1/pt_d=201701
-rwxr-xr-x 1 root supergroup 147 2019-12-11 11:02 /user/hive/warehouse/mydb.db/t1/pt_d=201701/myfile
-- Adding a partition creates the corresponding directory
hive> alter table t1 add partition(pt_d ='3333');
-- Dropping a partition deletes the corresponding files (an external table's files are kept, and the partition can be restored with msck repair table table_name)
alter table t1 drop partition (pt_d = '201701');
-- Also note that a partition column is just a column used as an index: it improves performance by laying data out in directories
hive> desc extended t1;
id int
name string
hobby array<string>
add map<string,string>
pt_d string
# Partition Information
# col_name data_type comment
pt_d string
-- Show the partitions
hive> show partitions t1;
pt_d=000000
pt_d=3333
-- Rename
hive> alter table employes rename to employees;
-- Change a partition's location (this had no effect here)
hive> alter table t1 partition(pt_d=3333) set location "hdfs://localhost:9000/user/hive/warehouse/mydb.db/t1/pt_d=4444";
-- Change a column: rename it, add a comment, and place it after another column (first puts it in the first position);
-- note that reordering columns requires the two columns to have the same type
hive> create table src (c1 string,c2 string);
hive> alter table src change column c1 c3 string comment 'test' after c2;
-- Add a column
hive> alter table src add columns(c4 string comment 'column4');
-- Drop columns by replacing the whole column list
hive> alter table src replace columns(cl1 string,cl2 string);
-- Set a table property
hive> alter table src set tblproperties('name'='hujiawei');
-- View table properties
hive> show tblproperties src;
Dynamic partition configuration parameters
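The parameter values are not listed above; as a minimal sketch (values are illustrative and t1_staging is a hypothetical source table), enabling dynamic partitions usually involves:

set hive.exec.dynamic.partition=true;
-- nonstrict allows every partition column to be determined dynamically
set hive.exec.dynamic.partition.mode=nonstrict;
-- Caps on how many partitions one statement or one node may create
set hive.exec.max.dynamic.partitions=1000;
set hive.exec.max.dynamic.partitions.pernode=100;
-- Each row's pt_d value decides which partition it lands in
insert overwrite table t1 partition(pt_d)
select id, name, hobby, add, pt_d from t1_staging;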
create table test as select c1,c2 from src;
-- Or create a table with like
hive> create table test2 like test;
Note: with this kind of create, if the src table was itself loaded from an external file, src's data files are moved to the target table's location
exec.FileSinkOperator: Moving tmp dir: hdfs://localhost:9000/user/hive/warehouse/src/.hive-staging_hive_2019-12-10_16-11-45_214_2063272
320733291437-1/_tmp.-ext-10002 to: hdfs://localhost:9000/user/hive/warehouse/src/.hive-staging_hive_2019-12-10_16-11-45_214_2063272320733291437-1/-ext-10002
-- 1
hive> from test t
    > insert overwrite local directory '/root/test/'
    > select *;
hive> ! ls /root/test;
000000_0
hive> ! cat /root/test/000000_0;
20191212
20180112
20190212
20190312
20190712
-- 2
hive> from test t
    > insert overwrite local directory '/root/test/'
    > select *;
-- 3 Copy the files in HDFS directly
# Copy to the local filesystem
hadoop fs -get 'hdfs://localhost:9000/user/hive/warehouse/test/data' .
# Copy to another directory in HDFS
hadoop fs -cp 'hdfs://localhost:9000/user/hive/warehouse/test/data' '/test/'
-- Hive regular expressions use Java's regex syntax
select * from src a where a.s rlike '^a.*';
-- order by works like Oracle's: a global sort, which is expensive
select * from test a order by a.id1;
-- sort by sorts each reducer's output, but the merged result of multiple reducers is not guaranteed to be globally ordered
hive> select * from test a sort by a.id1;
-- distribute by, used when sorting on multiple fields, sends rows with the same value of a field to the same reducer
hive> select * from test a distribute by a.id1 sort by a.id1,a.id2;
-- cluster by is shorthand for distribute by plus sort by on the same column
hive> select * from test a cluster by a.id1;
hive> select cast(a.id1 as float) from test a ;
-- Randomly split the test table's rows into 2 buckets and take one of them
hive> select * from test tablesample(bucket 1 out of 2 on rand()) a;
-- Randomly split into 3 buckets and take the second one
hive> select * from test tablesample(bucket 2 out of 3 on rand());
-- Bucket on a column's value rather than randomly
hive> select * from test tablesample(bucket 2 out of 3 on id1);
-- Note that this bucketing is not an even split, so each bucket may hold a different number of rows
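For comparison, a sketch of a table that is physically bucketed, so tablesample can prune whole bucket files instead of scanning everything (test_bucketed is a hypothetical name; Hive 2.x enforces bucketing on insert automatically, while older versions also needed hive.enforce.bucketing=true):

create table test_bucketed(id1 int, id2 int, id3 int)
clustered by (id1) into 4 buckets;
insert overwrite table test_bucketed select * from test;
-- Reads only the files of the matching bucket
select * from test_bucketed tablesample(bucket 1 out of 4 on id1);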
The difference between logical and materialized views
Hive does not support materialized views. A view is really just the view's defining statement merged into the outer query when Hive builds the query plan, so it is purely a logical view whose job is to simplify queries.
No column-level authorization
In Oracle you can expose selected columns through a view to control access: users who should not query the ta_staff table directly can be given rights on a view that shows only certain columns. Hive cannot grant access per column this way, because a user needs query permission (file access) on the underlying table to read the view at all.
A Hive view can, however, use a where clause to restrict which rows are visible, for example:
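A minimal sketch of such a view over the test table used below (v_test_small is a hypothetical name; the columns id1, id2, id3 are taken from the explain output later in these notes):

-- Only rows with id1 < 10 are visible through the view
create view v_test_small as
select id1, id2, id3 from test where id1 < 10;
select * from v_test_small;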
hive> select * from test;
OK
1 2 3
88 77 66
33 22 11
1 2 3
88 77 66
33 22 11
Time taken: 0.123 seconds, Fetched: 6 row(s)
hive>
For Hive users, roles, and groups, see csdn1
Creating an index
hive> create index test1_index on table test(id1) as
> 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
> with deferred rebuild
> in table test_index_table;
-- Bitmap indexes suit columns with few distinct values
create index index_test_2 on table test(id2) as 'BITMAP' with deferred rebuild in table test_index2_table ;
Dropping an index
hive> drop index test1_index on test;
Listing indexes
hive> show formatted index on test;
Rebuilding an index
alter index test1_index on test rebuild;
Custom indexes
Implement Hive's index handler interface, package it into a jar, add the jar, and then point at the class with as when creating the index (a sketch follows).
See cwiki2 for details.
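A sketch of what that looks like; the jar path and com.example.MyIndexHandler are made-up placeholders for your own implementation of org.apache.hadoop.hive.ql.index.HiveIndexHandler:

add jar /root/test/my-index-handler.jar;
create index test1_custom_index on table test(id1)
as 'com.example.MyIndexHandler'
with deferred rebuild
in table test_custom_index_table;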
explain shows how a query is translated into MapReduce:
explain select sum(id1) from test;
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: test
            Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
            Select Operator
              expressions: id1 (type: int)
              outputColumnNames: id1
              Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: sum(id1)
                mode: hash
                outputColumnNames: _col0
                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                Reduce Output Operator
                  sort order:
                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                  value expressions: _col0 (type: bigint)
      Reduce Operator Tree:
        Group By Operator
          aggregations: sum(VALUE._col0)
          mode: mergepartial
          outputColumnNames: _col0
          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

Time taken: 0.31 seconds, Fetched: 44 row(s)
explain extended gives even more detail:
explain extended select sum(id1) from test;
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: test
            Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
            GatherStats: false
            Select Operator
              expressions: id1 (type: int)
              outputColumnNames: id1
              Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: sum(id1)
                mode: hash
                outputColumnNames: _col0
                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                Reduce Output Operator
                  null sort order:
                  sort order:
                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                  tag: -1
                  value expressions: _col0 (type: bigint)
                  auto parallelism: false
      Path -> Alias:
        hdfs://localhost:9000/user/hive/warehouse/test [test]
      Path -> Partition:
        hdfs://localhost:9000/user/hive/warehouse/test
          Partition
            base file name: test
            input format: org.apache.hadoop.mapred.TextInputFormat
            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
            properties:
              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
              bucket_count -1
              column.name.delimiter ,
              columns id1,id2,id3
              columns.comments
              columns.types int:int:int
              file.inputformat org.apache.hadoop.mapred.TextInputFormat
              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              location hdfs://localhost:9000/user/hive/warehouse/test
              name default.test
              numFiles 4
              numRows 6
              rawDataSize 42
              serialization.ddl struct test { i32 id1, i32 id2, i32 id3}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              totalSize 48
              transient_lastDdlTime 1576480718
            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

            input format: org.apache.hadoop.mapred.TextInputFormat
            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
            properties:
              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
              bucket_count -1
              column.name.delimiter ,
              columns id1,id2,id3
              columns.comments
              columns.types int:int:int
              file.inputformat org.apache.hadoop.mapred.TextInputFormat
              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              location hdfs://localhost:9000/user/hive/warehouse/test
              name default.test
              numFiles 4
              numRows 6
              rawDataSize 42
              serialization.ddl struct test { i32 id1, i32 id2, i32 id3}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              totalSize 48
              transient_lastDdlTime 1576480718
            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
            name: default.test
          name: default.test
      Truncated Path -> Alias:
        /test [test]
      Needs Tagging: false
      Reduce Operator Tree:
        Group By Operator
          aggregations: sum(VALUE._col0)
          mode: mergepartial
          outputColumnNames: _col0
          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            GlobalTableId: 0
            directory: hdfs://localhost:9000/tmp/hive/root/8d35f95d-893e-40ee-b831-6177341c7acb/hive_2019-12-17_11-12-25_359_70780381862038030-1/-mr-10001/.hive-staging_hive_2019-12-17_11-12-25_359_70780381862038030-1/-ext-10002
            NumFilesPerFileSink: 1
            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
            Stats Publishing Key Prefix: hdfs://localhost:9000/tmp/hive/root/8d35f95d-893e-40ee-b831-6177341c7acb/hive_2019-12-17_11-12-25_359_70780381862038030-1/-mr-10001/.hive-staging_hive_2019-12-17_11-12-25_359_70780381862038030-1/-ext-10002/
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                properties:
                  columns _col0
                  columns.types bigint
                  escape.delim \
                  hive.serialization.extend.additional.nesting.levels true
                  serialization.escape.crlf true
                  serialization.format 1
                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
            TotalFiles: 1
            GatherStats: false
            MultiFileSpray: false

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

Time taken: 0.324 seconds, Fetched: 119 row(s)
Put the large table on the right side of the join and the small table on the left.
Reason 1: the small table can be cached in memory, and each record of the large table is matched against it.
The underlying reason: every duplicate join key in the table written on the left costs one extra round of processing.
See csdn3 for details. A sketch of a map-side join, which makes the "cache the small table" idea explicit, follows.
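This is not from the original notes, but the related settings and hint are standard Hive (big_table and small_table are hypothetical names):

-- Let Hive convert a join to a map join when one side is below the size threshold
set hive.auto.convert.join=true;
set hive.mapjoin.smalltable.filesize=25000000;
-- Or force it with a hint; s is the small table's alias
select /*+ MAPJOIN(s) */ b.id1, s.id2
from big_table b join small_table s on b.id1 = s.id1;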
Only relevant for small data sets; of little practical significance.
Independent stages of a job can be executed in parallel to speed it up.
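A sketch of the settings this refers to (both are stock Hive parameters; 8 is the usual default thread count):

-- Run independent stages of the same query concurrently
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;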
In strict mode (a sketch of switching it on follows this list):
Queries on a partitioned table must specify a partition in the where clause
order by must be accompanied by limit
Cartesian products are disallowed
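Minimal sketch; hive.mapred.mode is the classic switch, and Hive 2.x additionally exposes finer-grained hive.strict.checks.* options:

-- Enforce the three restrictions above
set hive.mapred.mode=strict;
-- Back to the permissive default
set hive.mapred.mode=nonstrict;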
Adjust according to the number and size of input and output files.
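Assuming this note refers to tuning the number of mappers and reducers, a sketch of the usual knobs (the values are only examples):

-- A larger maximum split size means fewer mappers (bytes)
set mapreduce.input.fileinputformat.split.maxsize=256000000;
-- Input bytes handled per reducer, a cap on reducers, or an explicit count
set hive.exec.reducers.bytes.per.reducer=256000000;
set hive.exec.reducers.max=100;
set mapreduce.job.reduces=10;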
JVM reuse reduces the overhead of starting up and tearing down a JVM for every task; the downside is that the longest-running task keeps its slot occupied for a long time and can block other tasks.
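A sketch; the property belongs to MapReduce rather than Hive, and is spelled mapred.job.reuse.jvm.num.tasks in the old API:

-- Reuse each JVM for up to 10 tasks; -1 means unlimited reuse
set mapreduce.job.jvm.numtasks=10;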
As in Oracle, filtering on an indexed column can improve efficiency significantly, but maintaining the index costs time and requires rebuilds.
The improvement is obvious, but it can produce far too many files, and the NameNode's memory blows up.
In a distributed cluster, program bugs (including bugs in Hadoop itself), load imbalance, or uneven resource distribution can make tasks of the same job run at very different speeds; some tasks may be noticeably slower than the rest (for example, one task of a job is at 50% while all the others have finished) and drag down the job's overall progress. To avoid this, Hadoop uses speculative execution: it identifies straggler tasks by certain rules and launches a backup task for each, letting the backup and the original process the same data in parallel and taking the result of whichever finishes first.
If a job runs long map or reduce tasks simply because its input is very large, the waste caused by speculative execution can be enormous.
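A sketch of the switches (the map and reduce sides are controlled separately, and Hive has its own flag for reduce-side speculation):

set mapreduce.map.speculative=true;
set mapreduce.reduce.speculative=true;
set hive.mapred.reduce.tasks.speculative.execution=true;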
This setting controls whether multiple group by operations can be assembled into a single MapReduce job.
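The note does not name the parameter; the following is my assumption of which one is meant, so verify the name against your Hive version:

-- Combine multiple group bys that share keys into one MapReduce job
set hive.multigroupby.singlereducer=true;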
Used for diagnosing results; enabled via configuration parameters.
Compression saves disk space and speeds up file transfer, but costs CPU time for compressing and decompressing.
For large volumes of data that are rarely computed on, gzip is the usual choice (best compression ratio, slowest compression and decompression).
For smaller data that is computed on frequently, lzo or snappy is typical.
Enable Hadoop compression codecs
vi /opt/install/hadoop/hadoop-2.7.1/etc/hadoop/core-site.xml
Add:
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
-- After adding this, Hive shows the available codecs without restarting Hadoop
hive> set io.compression.codecs;
io.compression.codecs=org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec
Enable intermediate compression
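A sketch of the settings normally used for this (SnappyCodec is just a common choice, not prescribed by the original note):

-- Compress the data shuffled between the map and reduce stages
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;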
Enable output compression
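A sketch of the settings normally used for this (GzipCodec is just an example codec):

-- Compress the final output written by queries
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;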
Using sequence files
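A sketch of writing a compressed sequence-file table (seq_test is a hypothetical name; BLOCK is the usual compression granularity for sequence files):

set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
create table seq_test stored as sequencefile as select * from test;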
Using intermediate compression
hive> set hive.exec.compress.intermediate=true;
hive> create table interemediate_com_om row format delimited fields terminated by '\t' as select * from test;
Automatically selecting local only mode for query
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = root_20191218101653_df7baa3b-9570-40a7-b480-be91907bfb1e
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2019-12-18 10:17:03,294 Stage-1 map = 0%, reduce = 0%
Ended Job = job_local1362494287_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to directory hdfs://localhost:9000/user/hive/warehouse/.hive-staging_hive_2019-12-18_10-16-53_073_1921170228225164888-1/-ext-10002
Moving data to directory hdfs://localhost:9000/user/hive/warehouse/interemediate_com_om
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 48 HDFS Write: 132 SUCCESS
Total MapReduce CPU Time Spent: