
Hive Reading Notes


hive cli

Location of Hive execution logs

Log configuration file: /conf/hive-log4j2.properties

property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}

You can check the variable's value in the Hive CLI:


hive> set system:java.io.tmpdir;
system:java.io.tmpdir=/tmp
--default log location for the root user
/tmp/root/hive.log

Explicitly enabling local mode

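A minimal sketch, assuming the standard hive.exec.mode.local.auto switch (the same property used in the .hiverc example later in these notes):

--let Hive decide automatically whether a query is small enough to run locally
hive> set hive.exec.mode.local.auto=true;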

Specifying the data warehouse directory

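A minimal sketch, assuming the standard hive.metastore.warehouse.dir property (normally set once in hive-site.xml):

--directory under which managed tables are stored
hive> set hive.metastore.warehouse.dir=/user/hive/warehouse;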

Viewing and using custom and system properties in Hive

--print all properties
hive>set;
hive>set env:HOME;
hive> set hivevar:col_name=name;
hive> set col_name;
hive> create table test3(id int,${env:LOGNAME} string);
hive> create table test2(id int,${hivevar:col_name} string);
--add configuration options when starting the cli
--show the current db in the prompt
hive --hiveconf hive.cli.print.current.db=true
hive (default)> set hiveconf:hive.cli.print.current.db;
hiveconf:hive.cli.print.current.db=true
--turn the display off again
hive (default)> set hiveconf:hive.cli.print.current.db=false;
hive> 
--show system properties; Java has read and write access to system properties
hive> set system:user.name;
system:user.name=root
hive> set system:myname=hujiawei;
hive> set system:myname;
system:myname=hujiawei
#run a single command; -S is silent mode and suppresses messages such as OK
hive -S -e 'show tables;'   
hive -S -e 'set'|grep warehouse;
#create the src table and load data into it
create table src(s String);
echo "one row">/tmp/myfile
hive -e "LOAD DATA LOCAL INPATH '/tmp/myfile' into table src"
#run a sql file
hive -f test.hql 
#run a file from inside the hive cli
source /root/test/test.hql;
#the hive cli automatically loads $HOME/.hiverc at startup; create one if it does not exist
set hive.cli.print.current.db=true;
set hive.exec.mode.local.auto=true;
#hive -i loads the specified file at startup


All of Hive's default properties are listed explicitly in
/opt/install/hadoop/apache-hive-2.3.6-bin/conf/hive-default.xml.template

  • Run shell commands inside Hive with a leading !, e.g. !pwd; run Hadoop commands with dfs, e.g. dfs -ls /

  • Comments in Hive scripts use the same syntax as SQL: --

Data types

[Figures: Hive primitive data types and collection data types (ARRAY, MAP, STRUCT)]

create table employes(
	name string,
    salary float,
    subordinates array<string>,
    deductions map<String,float>,
    address struct<street:string,city:string,state:string,zip:INT>
);

Default field delimiters


create table employes(
	name string,
    salary float,
    subordinates array<string>,
    deductions map<String,float>,
    address struct<street:string,city:string,state:string,zip:INT>
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003'
lines terminated by '\n'
stored as textfile;

sql

database

show databases like 't*';
create database test;
drop database test;
desc database test;
--in hive a database is an hdfs directory ending in .db, and a table is a directory named after the table; the location can be specified at creation time
hive> create database test location '/test/test.db';
dfs -ls -R /test
--drop a database that still contains tables
drop database test cascade;
--add a database comment
create database test comment 'this is a test db';
--add database properties
create database test comment 'this is a test db' with dbproperties('creator'='hujiawie','date'='2019-12-10');
--view the properties
desc database extended  test;
--modify database properties; new ones can be added but existing ones cannot be removed
alter database test set dbproperties('creators'='laohu');

table

create table employes(
	name string,
    salary float,
    subordinates array<string>,
    deductions map<String,float>,
    address struct<street:string,city:string,state:string,zip:INT>
) location '/test/employes';
--copy a table definition
hive> create table employees  like employes;
hive> show tables in mydb;
Specifying delimiters
hive> create table t1(
    >     id      int
    >    ,name    string
    >    ,hobby   array<string>
    >    ,add     map<String,string>
    > )
    > partitioned by (pt_d string)
    > row format delimited
    > fields terminated by ','
    > collection items terminated by '-'
    > map keys terminated by ':'
    > ;
Partitioned tables
hive> create table t1(
    >     id      int
    >    ,name    string
    >    ,hobby   array<string>
    >    ,add     map<String,string>
    > )
    > partitioned by (pt_d string)
    > row format delimited
    > fields terminated by ','
    > collection items terminated by '-'
    > map keys terminated by ':'
    > ;
--load data; contents of /root/test/myfile:
1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
2,lilei,book-code,nanjing:jiangning-taiwan:taibei
3,lihua,music-book,heilongjiang:haerbin

load data local inpath '/root/test/myfile' overwrite into table t1 partition ( pt_d = '201701');
--load data for another partition (the rows below show the table contents after both partitions are loaded)
1   xiaoming    ["book","TV","code"]    {"beijing":"chaoyang","shagnhai":"pudong"}  000000
2   lilei   ["book","code"] {"nanjing":"jiangning","taiwan":"taibei"}   000000
3   lihua   ["music","book"]    {"heilongjiang":"haerbin"}  000000
1   xiaoming    ["book","TV","code"]    {"beijing":"chaoyang","shagnhai":"pudong"}  201701
2   lilei   ["book","code"] {"nanjing":"jiangning","taiwan":"taibei"}   201701
3   lihua   ["music","book"]    {"heilongjiang":"haerbin"}  201701

load data local inpath '/root/test/myfile2' overwrite into table t1 partition ( pt_d = '000000');
--list the directories and files on HDFS
hive> dfs -ls -R /user/hive/warehouse/mydb.db;
drwxr-xr-x   - root supergroup          0 2019-12-11 10:56 /user/hive/warehouse/mydb.db/employes
drwxr-xr-x   - root supergroup          0 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1
drwxr-xr-x   - root supergroup          0 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1/pt_d=000000
-rwxr-xr-x   1 root supergroup        474 2019-12-11 11:04 /user/hive/warehouse/mydb.db/t1/pt_d=000000/myfile2
drwxr-xr-x   - root supergroup          0 2019-12-11 11:02 /user/hive/warehouse/mydb.db/t1/pt_d=201701
-rwxr-xr-x   1 root supergroup        147 2019-12-11 11:02 /user/hive/warehouse/mydb.db/t1/pt_d=201701/myfile
--adding a partition creates the corresponding directory
hive> alter table t1 add partition(pt_d ='3333');
--dropping a partition deletes the corresponding files (for an external table the files are kept, and the partition can be restored with msck repair table table_name)
alter table test1 drop partition (pt_d =201701);
--note that a partition column is still a column; it is effectively used as an index, implemented as directories, to improve query performance
hive> desc extended t1;
id                      int                                         
name                    string                                      
hobby                   array<string>                               
add                     map<string,string>                          
pt_d                    string                                      
                 
# Partition Information          
# col_name              data_type               comment             
                 
pt_d                    string  
#list the partitions
hive> show partitions t1;
pt_d=000000
pt_d=3333

Altering tables
--rename a table
hive> alter table employes rename to employees;
--change a partition's location (this did not take effect here)
hive> alter table t1 partition(pt_d=3333) set location  "hdfs://localhost:9000/user/hive/warehouse/mydb.db/t1/pt_d=4444";
--change a column: rename it, add a comment, and move it after another column (use first to move it to the first position);
--note that reordering columns requires the two columns involved to have the same type
hive> create table src (c1 string,c2 string);
hive> alter table src change column c1 c3 string comment 'test' after c2;
--add columns
hive> alter table src add columns(c4 string comment 'column4');
--drop columns by replacing the full column list
hive> alter table src replace columns(cl1 string,cl2 string);
--set table properties
hive> alter table src set tblproperties('name'='hujiawei');
--show table properties
hive> show tblproperties src;


Data manipulation

Loading data

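A minimal sketch of the general LOAD DATA form being described; the path, table, and partition values are placeholders:

LOAD DATA [LOCAL] INPATH '/path/to/file'
[OVERWRITE] INTO TABLE t1
PARTITION (pt_d = '201701');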

  • LOCAL means the file is copied from the local filesystem; without LOCAL the file is moved within HDFS (so it cannot cross clusters)
  • OVERWRITE decides whether existing data is replaced; without it the data is appended
Static and dynamic partitions
  • Static partitioning, method 1:

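A minimal sketch, reusing the load statement from the partitioned-table example above, with the partition value fixed in the statement:

load data local inpath '/root/test/myfile' overwrite into table t1 partition (pt_d = '201701');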

  • Static partitioning, method 2:

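A minimal sketch of the insert-select form with a hard-coded partition value; t1_copy is a hypothetical table with the same non-partition columns as t1:

insert overwrite table t1_copy partition (pt_d = '201701')
select id, name, hobby, add from t1 where pt_d = '201701';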

  • Dynamic partitioning:

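A minimal sketch of a dynamic-partition insert, where the partition value is taken from the last column of the select list instead of being hard-coded (t1_copy is again hypothetical):

insert overwrite table t1_copy partition (pt_d)
select id, name, hobby, add, pt_d from t1;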

  • Dynamic-partition configuration parameters:

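    A minimal sketch of the usual switches; these are standard Hive properties, though the exact set in the original figure may have differed:

    set hive.exec.dynamic.partition=true;               -- enable dynamic partitions
    set hive.exec.dynamic.partition.mode=nonstrict;     -- allow every partition column to be dynamic
    set hive.exec.max.dynamic.partitions=1000;          -- cap on partitions created per statement
    set hive.exec.max.dynamic.partitions.pernode=100;   -- cap per mapper/reducer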

Creating a table from query results
create table test as select c1,c2 from src;
--or copy just the schema with like
hive> create table test2 like test;

Note: creating a table this way runs a job over the src table; the log shows Hive moving a temporary staging directory into its final location:

exec.FileSinkOperator: Moving tmp dir: hdfs://localhost:9000/user/hive/warehouse/src/.hive-staging_hive_2019-12-10_16-11-45_214_2063272320733291437-1/_tmp.-ext-10002 to: hdfs://localhost:9000/user/hive/warehouse/src/.hive-staging_hive_2019-12-10_16-11-45_214_2063272320733291437-1/-ext-10002
Exporting data
--1. insert overwrite into a local directory
hive> from test t
    > insert overwrite local directory '/root/test/'
    > select * ;
hive> ! ls /root/test;
000000_0
hive> ! cat /root/test/000000_0;
20191212
20180112
20190212
20190312
20190712
--2. the same statement without local writes the results to an HDFS directory instead
hive> from test t
    > insert overwrite directory '/root/test/'
    > select * ;
--3. copy the files directly out of HDFS
 # copy the files to the local filesystem
 hadoop fs -get 'hdfs://localhost:9000/user/hive/warehouse/test/data' .
 # copy the files to another directory inside HDFS
 hadoop fs -cp 'hdfs://localhost:9000/user/hive/warehouse/test/data' '/test/'
  

Querying data

Querying with regular expressions: rlike
--hive regular expressions use Java regex syntax
select * from src a where a.s rlike '^a.*';
Difference between order by and sort by
--order by is a global sort, like order by in Oracle; it is expensive
select * from test a order by a.id1;
--sort by sorts the output of each reducer, but the merged output of multiple reducers is not guaranteed to be globally sorted
hive> select * from test a sort by a.id1;
--distribute by sends rows with the same value of a column to the same reducer; it is used together with sort by
hive> select * from test a distribute by a.id1 sort by a.id1,a.id2;
--cluster by is shorthand for distribute by plus sort by on the same column
hive> select * from test a cluster by  a.id1;
cast for explicit type conversion
hive> select cast(a.id1 as float) from test a ;
Sampling and bucketing
--randomly split the test table into 2 buckets and take one of them
hive> select * from test  tablesample(bucket 1 out of 2 on rand()) a;
--randomly split into 3 buckets and take the second one
hive> select * from test tablesample(bucket 2 out of 3 on rand());
--bucket on a column value instead of randomly
hive> select * from test tablesample(bucket 2 out of 3 on id1);
--note that the buckets are not split exactly evenly, so they may not contain the same number of rows


Views

Differences
  1. Logical views vs. materialized views
    Hive does not support materialized views; a view simply combines its defining statement with the outer query when Hive builds the query plan, so it is only a logical view whose purpose is to simplify queries.

  2. No column-level authorization
    In Oracle you can expose selected columns through a view for access control: users who do not have query rights on the underlying ta_staff table can be granted rights on the view and see only those columns. Hive does not support this, because a user must already have query permission (file access permission) on the underlying table in order to query the view.

    However, a Hive view can still restrict rows through its where clause:

    hive> select * from test;
    OK
    1       2       3
    88      77      66
    33      22      11
    1       2       3
    88      77      66
    33      22      11
    Time taken: 0.123 seconds, Fetched: 6 row(s)
    hive> 
    

    For questions about Hive users, roles, and groups, see csdn1

Indexes

  1. Create an index

    hive> create index test1_index on table test(id1) as 
        > 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
        > with deferred rebuild 
        > in table test_index_table;
    --bitmap indexes suit columns that have few distinct values
    create index index_test_2 on table test(id2) as 'BITMAP' with deferred rebuild in table test_index2_table ;
    
  2. Drop an index

    hive> drop index test1_index on test;
    
  3. Show indexes

    hive> show formatted index on test;
    
  4. Rebuild an index

    alter index test1_index on test rebuild;
    
  5. Custom index handlers
    Implement Hive's index handler interface, package it as a jar, add it, and specify the class name with as when creating the index.

    See cwiki2 for details.

Tuning

explain

explain shows exactly how a query statement is translated into MapReduce stages:

 explain select sum(id1) from test;
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: test
            Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
            Select Operator
              expressions: id1 (type: int)
              outputColumnNames: id1
              Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: sum(id1)
                mode: hash
                outputColumnNames: _col0
                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                Reduce Output Operator
                  sort order: 
                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                  value expressions: _col0 (type: bigint)
      Reduce Operator Tree:
        Group By Operator
          aggregations: sum(VALUE._col0)
          mode: mergepartial
          outputColumnNames: _col0
          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

Time taken: 0.31 seconds, Fetched: 44 row(s)

You can also use explain extended to get more detailed information:

explain extended select sum(id1) from test;
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: test
            Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
            GatherStats: false
            Select Operator
              expressions: id1 (type: int)
              outputColumnNames: id1
              Statistics: Num rows: 6 Data size: 42 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: sum(id1)
                mode: hash
                outputColumnNames: _col0
                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                Reduce Output Operator
                  null sort order: 
                  sort order: 
                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                  tag: -1
                  value expressions: _col0 (type: bigint)
                  auto parallelism: false
      Path -> Alias:
        hdfs://localhost:9000/user/hive/warehouse/test [test]
      Path -> Partition:
        hdfs://localhost:9000/user/hive/warehouse/test 
          Partition
            base file name: test
            input format: org.apache.hadoop.mapred.TextInputFormat
            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
            properties:
              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
              bucket_count -1
              column.name.delimiter ,
              columns id1,id2,id3
              columns.comments 
              columns.types int:int:int
              file.inputformat org.apache.hadoop.mapred.TextInputFormat
              file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              location hdfs://localhost:9000/user/hive/warehouse/test
              name default.test
              numFiles 4
              numRows 6
              rawDataSize 42
              serialization.ddl struct test { i32 id1, i32 id2, i32 id3}
              serialization.format 1
              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              totalSize 48
              transient_lastDdlTime 1576480718
            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
          
              input format: org.apache.hadoop.mapred.TextInputFormat
              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              properties:
                COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
                bucket_count -1
                column.name.delimiter ,
                columns id1,id2,id3
                columns.comments 
                columns.types int:int:int
                file.inputformat org.apache.hadoop.mapred.TextInputFormat
                file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                location hdfs://localhost:9000/user/hive/warehouse/test
                name default.test
                numFiles 4
                numRows 6
                rawDataSize 42
                serialization.ddl struct test { i32 id1, i32 id2, i32 id3}
                serialization.format 1
                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                totalSize 48
                transient_lastDdlTime 1576480718
              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              name: default.test
            name: default.test
      Truncated Path -> Alias:
        /test [test]
      Needs Tagging: false
      Reduce Operator Tree:
        Group By Operator
          aggregations: sum(VALUE._col0)
          mode: mergepartial
          outputColumnNames: _col0
          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            GlobalTableId: 0
            directory: hdfs://localhost:9000/tmp/hive/root/8d35f95d-893e-40ee-b831-6177341c7acb/hive_2019-12-17_11-12-25_359_70780381862038030-1/-mr-10001/.hive-staging_hive_2019-12-17_11-12-25_359_70780381862038030-1/-ext-10002
            NumFilesPerFileSink: 1
            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
            Stats Publishing Key Prefix: hdfs://localhost:9000/tmp/hive/root/8d35f95d-893e-40ee-b831-6177341c7acb/hive_2019-12-17_11-12-25_359_70780381862038030-1/-mr-10001/.hive-staging_hive_2019-12-17_11-12-25_359_70780381862038030-1/-ext-10002/
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                properties:
                  columns _col0
                  columns.types bigint
                  escape.delim \
                  hive.serialization.extend.additional.nesting.levels true
                  serialization.escape.crlf true
                  serialization.format 1
                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
            TotalFiles: 1
            GatherStats: false
            MultiFileSpray: false

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

Time taken: 0.324 seconds, Fetched: 119 row(s)


join optimization

Put the large table on the right side of the join and the small table on the left.

Reason 1: the small table can be cached in memory, and each record of the large table is matched against it one by one.
The actual reason: for every duplicate join key in the table written on the left side of the join, the underlying engine performs one extra round of processing.
See csdn3 for details.
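A related switch, sketched here as an assumption rather than part of the original notes: when one side of the join is small enough to cache, Hive can convert the join to a map-side join automatically.

--let Hive convert the join to a map-side join when one table is small enough to be cached
set hive.auto.convert.join=true;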

Local mode

Only worthwhile for small data sets; it has little practical impact otherwise.

Parallel execution

Independent stages of a job can be executed in parallel to improve throughput.
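A minimal sketch, assuming the standard hive.exec.parallel properties:

set hive.exec.parallel=true;             -- run independent stages of one query in parallel
set hive.exec.parallel.thread.number=8;  -- how many stages may run at the same time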

Strict mode

In strict mode:

  1. queries on a partitioned table must restrict the partition in the where clause

  2. order by must be accompanied by limit

  3. Cartesian-product joins are forbidden
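A minimal sketch of turning it on, assuming the classic hive.mapred.mode switch:

set hive.mapred.mode=strict;     -- set back to nonstrict to disable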

Adjusting the number of mappers and reducers

Adjust according to the number and size of the input and output files.
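A minimal sketch of the usual knobs, assuming the standard Hive and MapReduce properties:

set hive.exec.reducers.bytes.per.reducer=256000000;  -- input bytes handled by each reducer
set hive.exec.reducers.max=100;                       -- upper bound on the number of reducers
set mapreduce.job.reduces=10;                         -- force a fixed reducer count for the next job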

JVM reuse

Reusing JVMs avoids the cost of creating and destroying a JVM for every task; the drawback is that the slot stays occupied until the longest-running task finishes, which can block other tasks.
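A minimal sketch, assuming the MapReduce-side property that controls reuse (on YARN it may have limited effect):

set mapreduce.job.jvm.numtasks=10;   -- each JVM runs up to 10 tasks before exiting; -1 means unlimited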

Indexes

As in Oracle, filtering on an indexed column speeds up queries significantly, but maintaining the index takes time and requires rebuilds.

Partitioning

The improvement is significant, but too many partitions mean too many files for the NameNode and can exhaust its memory.

Speculative execution

In a distributed cluster, program bugs (including bugs in Hadoop itself), load imbalance, or uneven resource distribution can make the tasks of a single job run at very different speeds; some tasks run noticeably slower than the rest (for example, one task of a job is only at 50% progress while all the others have finished), dragging down the overall progress of the job. To avoid this, Hadoop uses speculative execution: it identifies the "straggler" tasks according to certain rules and launches a backup task for each of them, so that the backup and the original process the same data in parallel, and the result of whichever finishes first is taken as the final result.

If the input data is so large that map or reduce tasks have to run for a long time anyway, the waste caused by enabling speculative execution can be very large.
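A minimal sketch of the relevant switches, assuming the standard Hadoop and Hive property names:

set mapreduce.map.speculative=false;                       -- map-side speculative execution
set mapreduce.reduce.speculative=false;                    -- reduce-side speculative execution
set hive.mapred.reduce.tasks.speculative.execution=false;  -- Hive's own reduce-side switch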

Multiple group bys in a single MR job

A configuration switch controls whether several group by operations can be assembled into a single MapReduce job.
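A minimal sketch, assuming hive.multigroupby.singlereducer is the property meant here:

set hive.multigroupby.singlereducer=true;  -- combine group bys that share keys into one MR job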

Virtual columns

Used for diagnosing results; some of them are enabled through configuration parameters.
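A minimal sketch using Hive's built-in virtual columns; the row-offset column additionally requires hive.exec.rowoffset=true:

set hive.exec.rowoffset=true;
select INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, id1 from test;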

Compression

Common compression formats

Compression saves disk space and speeds up file transfer, but costs CPU time for compressing and decompressing.

  1. For large volumes of data that are rarely computed on, gzip is the usual choice (highest compression ratio, slowest compression and decompression).

  2. For smaller data that is computed on frequently, LZO or Snappy is the usual choice.

Configuration

  1. Enable Hadoop compression codecs

    vi /opt/install/hadoop/hadoop-2.7.1/etc/hadoop/core-site.xml
    

    Add:

    <property>
    	<name>io.compression.codecs</name>
    	<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
    </property>
    
    --after this configuration, the available codecs show up in Hive without restarting Hadoop
    hive> set io.compression.codecs;
    io.compression.codecs=org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec
    
  2. Enable intermediate compression
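    A minimal sketch, using the same property as the test at the end of this section:

    set hive.exec.compress.intermediate=true;   -- compress data passed between MR stages
    set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;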

  3. Enable compression of the output results
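    A minimal sketch, assuming the standard output-compression properties:

    set hive.exec.compress.output=true;
    set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;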

  4. Use SequenceFile

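    A minimal sketch: ask for block-level compression of the output and store a table as SequenceFile:

    set mapreduce.output.fileoutputformat.compress.type=BLOCK;   -- NONE, RECORD, or BLOCK
    create table test_seq stored as sequencefile as select * from test;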

Testing

  1. Using intermediate compression

    hive> set hive.exec.compress.intermediate=true;
    hive> create table interemediate_com_om row format delimited fields terminated by '\t' as select * from test;
    Automatically selecting local only mode for query
    WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
    Query ID = root_20191218101653_df7baa3b-9570-40a7-b480-be91907bfb1e
    Total jobs = 3
    Launching Job 1 out of 3
    Number of reduce tasks is set to 0 since there's no reduce operator
    Job running in-process (local Hadoop)
    2019-12-18 10:17:03,294 Stage-1 map = 0%,  reduce = 0%
    Ended Job = job_local1362494287_0001
    Stage-4 is selected by condition resolver.
    Stage-3 is filtered out by condition resolver.
    Stage-5 is filtered out by condition resolver.
    Moving data to directory hdfs://localhost:9000/user/hive/warehouse/.hive-staging_hive_2019-12-18_10-16-53_073_1921170228225164888-1/-ext-10002
    Moving data to directory hdfs://localhost:9000/user/hive/warehouse/interemediate_com_om
    MapReduce Jobs Launched: 
    Stage-Stage-1:  HDFS Read: 48 HDFS Write: 132 SUCCESS
    Total MapReduce CPU Time Spent: 