The following uses the HDP release (HDP-3.1.5.0-centos7-rpm.tar.gz) as an example to walk through the installation and configuration of Hive 3 in detail. The environment can be CentOS 7 or CentOS 8 (although the tarball is built for centos7, the component RPMs used in this article also work on CentOS 8).
Before installing, the following must be installed and configured manually:
Components to install and their versions:
# 0 Unpack the HDP package
tar -zxf HDP-3.1.5.0-centos7-rpm.tar.gz
# 1 Dependencies (optional)
cd HDP/centos7/3.1.5.0-152/bigtop-tomcat
rpm -ivh bigtop-tomcat-7.0.94-1.noarch.rpm
cd ../bigtop-jsvc
rpm -ivh bigtop-jsvc-1.0.15-152.x86_64.rpm
cd ../hdp-select
rpm -ivh hdp-select-3.1.5.0-152.el7.noarch.rpm
yum install -y redhat-lsb
# 2 Time synchronization
yum -y install chrony
## Add: server ntp1.aliyun.com iburst
vim /etc/chrony.conf
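## (Sketch, assuming a systemd host) enable and start the chrony service after editing the config
systemctl enable --now chronyd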
## Check
chronyc sourcestats -v
timedatectl
# 3 ZooKeeper (HA and some components, e.g. Hive, depend on ZK)
cd ../zookeeper
rpm -ivh *.rpm
ln -s /usr/hdp/3.1.5.0-152 /usr/hdp/current
# 4 hadoop
cd ../hadoop
rpm -ivh *.rpm --nodeps
# 5 Tez (Hive execution engine)
cd ../tez
rpm -ivh *.rpm
# 6 hive
cd ../hive
rpm -ivh *.rpm --nodeps
# 7 spark2 (optional)
## Required if spark2_shuffle has been added to yarn.nodemanager.aux-services
cd ../spark2
rpm -ivh spark2_3_1_5_0_152-yarn-shuffle-2.3.2.3.1.5.0-152.noarch.rpm
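After the RPMs are installed, a quick sanity check of the stack can be run (a sketch; hdp-select comes from step 1 above):
# Confirm the registered HDP version and component links
hdp-select versions
hdp-select status
hadoop version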
In the configurations below, size any memory-related settings according to your own servers, making them as reasonable and generous as possible. After each configuration file, a set of production values is also given for reference.
/etc/hadoop/conf/workers
Add the Hadoop worker (DataNode/NodeManager) hostnames to this file, one per line.
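A minimal sketch for a single-node setup, assuming the only worker is node01 (list every worker hostname for a multi-node cluster):
cat > /etc/hadoop/conf/workers <<'EOF'
node01
EOF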
/etc/hadoop/conf/hadoop-env.sh
export JAVA_HOME=/usr/local/jdk8
export HADOOP_HOME_WARN_SUPPRESS=1
export HADOOP_HOME=/usr/hdp/current/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/conf
export JSVC_HOME=/usr/lib/bigtop-utils
export HADOOP_HEAPSIZE="1024"
export HADOOP_NAMENODE_INIT_HEAPSIZE="-Xms1024m"
export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true ${HADOOP_OPTS}"
USER="$(whoami)"
HADOOP_JOBTRACKER_OPTS="-server -XX:ParallelGCThreads=1 -XX:+UseConcMarkSweepGC -XX:ErrorFile=/var/log/hadoop/$USER/hs_err_pid%p.log -XX:NewSize=200m -XX:MaxNewSize=200m -Xloggc:/var/log/hadoop/$USER/gc.log-`date +'%Y%m%d%H%M'` -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xmx1024m -Dhadoop.security.logger=INFO,DRFAS -Dmapred.audit.logger=INFO,MRAUDIT -Dhadoop.mapreduce.jobsummary.logger=INFO,JSA ${HADOOP_JOBTRACKER_OPTS}"
HADOOP_TASKTRACKER_OPTS="-server -Xmx1024m -Dhadoop.security.logger=ERROR,console -Dmapred.audit.logger=ERROR,console ${HADOOP_TASKTRACKER_OPTS}"
SHARED_HDFS_NAMENODE_OPTS="-server -XX:ParallelGCThreads=1 -XX:+UseConcMarkSweepGC -XX:ErrorFile=/var/log/hadoop/$USER/hs_err_pid%p.log -XX:NewSize=256m -XX:MaxNewSize=256m -Xloggc:/var/log/hadoop/$USER/gc.log-`date +'%Y%m%d%H%M'` -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:CMSInitiatingOccupancyFraction=70 -XX:+UseCMSInitiatingOccupancyOnly -Xms1024m -Xmx1024m -Dhadoop.security.logger=INFO,DRFAS -Dhdfs.audit.logger=INFO,DRFAAUDIT"
export HDFS_NAMENODE_OPTS="${SHARED_HDFS_NAMENODE_OPTS} -XX:OnOutOfMemoryError=\"/usr/hdp/current/hadoop/bin/kill-name-node\" -Dorg.mortbay.jetty.Request.maxFormContentSize=-1 ${HDFS_NAMENODE_OPTS}"
export HDFS_DATANODE_OPTS="-server -XX:ParallelGCThreads=1 -XX:+UseConcMarkSweepGC -XX:OnOutOfMemoryError=\"/usr/hdp/current/hadoop/bin/kill-data-node\" -XX:ErrorFile=/var/log/hadoop/$USER/hs_err_pid%p.log -XX:NewSize=200m -XX:MaxNewSize=200m -Xloggc:/var/log/hadoop/$USER/gc.log-`date +'%Y%m%d%H%M'` -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xms1024m -Xmx1024m -Dhadoop.security.logger=INFO,DRFAS -Dhdfs.audit.logger=INFO,DRFAAUDIT ${HDFS_DATANODE_OPTS} -XX:CMSInitiatingOccupancyFraction=70 -XX:+UseCMSInitiatingOccupancyOnly"
export HDFS_SECONDARYNAMENODE_OPTS="${SHARED_HDFS_NAMENODE_OPTS} -XX:OnOutOfMemoryError=\"/usr/hdp/current/hadoop/bin/kill-secondary-name-node\" ${HDFS_SECONDARYNAMENODE_OPTS}"
export HADOOP_CLIENT_OPTS="-Xmx${HADOOP_HEAPSIZE}m $HADOOP_CLIENT_OPTS"
HDFS_NFS3_OPTS="-Xmx1024m -Dhadoop.security.logger=ERROR,DRFAS ${HDFS_NFS3_OPTS}"
HADOOP_BALANCER_OPTS="-server -Xmx1024m ${HADOOP_BALANCER_OPTS}"
export HDFS_DATANODE_SECURE_USER=${HDFS_DATANODE_SECURE_USER:-""}
export HADOOP_SSH_OPTS="-o ConnectTimeout=5 -o SendEnv=HADOOP_CONF_DIR"
export HADOOP_LOG_DIR=/var/log/hadoop/$USER
export HADOOP_SECURE_LOG_DIR=${HADOOP_SECURE_LOG_DIR:-/var/log/hadoop/$HDFS_DATANODE_SECURE_USER}
export HADOOP_PID_DIR=/var/run/hadoop/$USER
export HADOOP_SECURE_PID_DIR=${HADOOP_SECURE_PID_DIR:-/var/run/hadoop/$HDFS_DATANODE_SECURE_USER}
YARN_RESOURCEMANAGER_OPTS="-Dyarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY"
export HADOOP_IDENT_STRING=$USER
# Add database libraries
JAVA_JDBC_LIBS=""
if [ -d "/usr/share/java" ]; then
for jarFile in `ls /usr/share/java | grep -E "(mysql|ojdbc|postgresql|sqljdbc)" 2>/dev/null`
do
JAVA_JDBC_LIBS=${JAVA_JDBC_LIBS}:$jarFile
done
fi
export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}${JAVA_JDBC_LIBS}
export HADOOP_LIBEXEC_DIR=/usr/hdp/current/hadoop/libexec
export JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:/usr/hdp/current/hadoop/lib/native/Linux-amd64-64
export HADOOP_OPTS="-Dhdp.version=$HDP_VERSION $HADOOP_OPTS"
if [ "$command" == "datanode" ] && [ "$EUID" -eq 0 ] && [ -n "$HDFS_DATANODE_SECURE_USER" ]; then
ulimit -n 128000
fi
Production reference values for hadoop-env.sh:
HADOOP_NAMENODE_INIT_HEAPSIZE can be set to "-Xms20480m".
HADOOP_JOBTRACKER_OPTS: the number of parallel GC threads can be set to 8 (-XX:ParallelGCThreads=8); -XX:ErrorFile and -Xloggc can point to a path on a data disk.
SHARED_HDFS_NAMENODE_OPTS: the number of parallel GC threads can be set to 8 (-XX:ParallelGCThreads=8); -XX:ErrorFile and -Xloggc can point to a path on a data disk; the initial young-generation size can be raised somewhat, e.g. -XX:NewSize=2560m -XX:MaxNewSize=2560m (it must stay below -Xms).
HDFS_DATANODE_OPTS: the number of parallel GC threads can be set to 4 (-XX:ParallelGCThreads=4); -XX:ErrorFile and -Xloggc can point to a path on a data disk; the initial and maximum heap can be raised to -Xms13568m -Xmx13568m.
HADOOP_LOG_DIR and HADOOP_SECURE_LOG_DIR can be placed on a data disk.
/etc/hadoop/conf/core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://node01:8020</value>
<final>true</final>
</property>
<property>
<name>fs.trash.interval</name>
<value>360</value>
</property>
<property>
<name>hadoop.http.cross-origin.allowed-headers</name>
<value>X-Requested-With,Content-Type,Accept,Origin,WWW-Authenticate,Accept-Encoding,Transfer-Encoding</value>
</property>
<property>
<name>hadoop.http.cross-origin.allowed-methods</name>
<value>GET,PUT,POST,OPTIONS,HEAD,DELETE</value>
</property>
<property>
<name>hadoop.http.cross-origin.allowed-origins</name>
<value>*</value>
</property>
<property>
<name>hadoop.http.cross-origin.max-age</name>
<value>1800</value>
</property>
<property>
<name>hadoop.http.filter.initializers</name>
<value>org.apache.hadoop.security.AuthenticationFilterInitializer,org.apache.hadoop.security.HttpCrossOriginFilterInitializer</value>
</property>
<property>
<name>hadoop.security.auth_to_local</name>
<value>DEFAULT</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>simple</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>false</value>
</property>
<property>
<name>hadoop.security.instrumentation.requires.admin</name>
<value>false</value>
</property>
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>4096</value>
</property>
<property>
<name>io.serializations</name>
<value>org.apache.hadoop.io.serializer.WritableSerialization</value>
</property>
<property>
<name>ipc.client.connect.max.retries</name>
<value>10</value>
</property>
<property>
<name>ipc.client.connection.maxidletime</name>
<value>10000</value>
</property>
<property>
<name>ipc.client.idlethreshold</name>
<value>4000</value>
</property>
<property>
<name>ipc.server.tcpnodelay</name>
<value>true</value>
</property>
<property>
<name>mapreduce.jobtracker.webinterface.trusted</name>
<value>false</value>
</property>
<!--<property>
<name>net.topology.script.file.name</name>
<value>/etc/hadoop/conf/topology_script.py</value>
</property>-->
<property>
<name>hadoop.proxyuser.hdfs.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hdfs.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hive.hosts</name>
<!--<value>bdm0,bdm1</value>-->
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.impala.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.impala.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.livy.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.livy.hosts</name>
<value>*</value>
</property>
<!--<property>
<name>hadoop.proxyuser.oozie.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.oozie.hosts</name>
<value>bdm0</value>
</property>-->
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.yarn.hosts</name>
<value>*</value>
</property>
<!--
<property>
<name>ha.failover-controller.active-standby-elector.zk.op.retries</name>
<value>120</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>es1:2181,es2:2181,bdm0:2181,bdm1:2181,etl1:2181</value>
</property>
-->
fs.defaultFS: if HA is configured, this can be shortened to hdfs://nameservice, where nameservice can be any other legal name, as long as it is used consistently in the later configuration.
/etc/hadoop/conf/hdfs-site.xml
<property>
<name>dfs.permissions.superusergroup</name>
<value>hdfs</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>node01:50070</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/hadoop/hdfs/namenode</value>
<final>true</final>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/hadoop/hdfs/sda,/hadoop/hdfs/sdb</value>
<final>true</final>
</property>
<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>/hadoop/hdfs/namesecondary</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node01:50090</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<property>
<name>dfs.block.access.token.enable</name>
<value>true</value>
</property>
<property>
<name>dfs.blockreport.initialDelay</name>
<value>120</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.nameservice</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.client.read.shortcircuit.streams.cache.size</name>
<value>256</value>
</property>
<property>
<name>dfs.client.retry.policy.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.cluster.administrators</name>
<value>hdfs</value>
</property>
<property>
<name>dfs.content-summary.limit</name>
<value>5000</value>
</property>
<property>
<name>dfs.datanode.address</name>
<value>0.0.0.0:50010</value>
</property>
<property>
<name>dfs.datanode.balance.bandwidthPerSec</name>
<value>6250000</value>
</property>
<property>
<name>dfs.datanode.data.dir.perm</name>
<value>750</value>
</property>
<property>
<name>dfs.datanode.du.reserved</name>
<value>1340866560</value>
</property>
<property>
<name>dfs.datanode.failed.volumes.tolerated</name>
<value>0</value>
<final>true</final>
</property>
<property>
<name>dfs.datanode.http.address</name>
<value>0.0.0.0:50075</value>
</property>
<property>
<name>dfs.datanode.https.address</name>
<value>0.0.0.0:50475</value>
</property>
<property>
<name>dfs.datanode.ipc.address</name>
<value>0.0.0.0:8010</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>4096</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
<property>
<name>dfs.encrypt.data.transfer.cipher.suites</name>
<value>AES/CTR/NoPadding</value>
</property>
<property>
<name>dfs.heartbeat.interval</name>
<value>3</value>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/etc/hadoop/conf/dfs.exclude</value>
</property>
<property>
<name>dfs.http.policy</name>
<value>HTTP_ONLY</value>
</property>
<property>
<name>dfs.https.port</name>
<value>50470</value>
</property>
<property>
<name>dfs.namenode.accesstime.precision</name>
<value>0</value>
</property>
<property>
<name>dfs.namenode.acls.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.audit.log.async</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.avoid.read.stale.datanode</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.avoid.write.stale.datanode</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.checkpoint.edits.dir</name>
<value>${dfs.namenode.checkpoint.dir}</value>
</property>
<property>
<name>dfs.namenode.checkpoint.period</name>
<value>21600</value>
</property>
<property>
<name>dfs.namenode.checkpoint.txns</name>
<value>1000000</value>
</property>
<property>
<name>dfs.namenode.fslock.fair</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.handler.count</name>
<value>800</value>
</property>
<property>
<name>dfs.namenode.name.dir.restore</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.safemode.threshold-pct</name>
<value>0.99</value>
</property>
<property>
<name>dfs.namenode.stale.datanode.interval</name>
<value>30000</value>
</property>
<property>
<name>dfs.namenode.startup.delay.block.deletion.sec</name>
<value>3600</value>
</property>
<property>
<name>dfs.namenode.write.stale.datanode.ratio</name>
<value>1.0f</value>
</property>
<property>
<name>dfs.permissions.ContentSummary.subAccess</name>
<value>false</value>
</property>
<property>
<name>dfs.replication.max</name>
<value>50</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
<final>true</final>
</property>
<property>
<name>fs.permissions.umask-mode</name>
<value>022</value>
</property>
<property>
<name>hadoop.caller.context.enabled</name>
<value>true</value>
</property>
<property>
<name>hadoop.http.authentication.type</name>
<value>simple</value>
</property>
<property>
<name>manage.include.files</name>
<value>false</value>
</property>
<property>
<name>nfs.exports.allowed.hosts</name>
<value>* rw</value>
</property>
<property>
<name>nfs.file.dump.dir</name>
<value>/tmp/.hdfs-nfs</value>
</property>
<!--
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.ha.fencing.methods</name>
<value>shell(/bin/true)</value>
</property>
<property>
<name>dfs.ha.namenodes.nameservice</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.internal.nameservices</name>
<value>nameservice</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/hadoop/hdfs/journal</value>
</property>
<property>
<name>dfs.journalnode.http-address</name>
<value>0.0.0.0:8480</value>
</property>
<property>
<name>dfs.journalnode.https-address</name>
<value>0.0.0.0:8481</value>
</property>
<property>
<name>dfs.namenode.http-address.nameservice.nn1</name>
<value>bdm0:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.nameservice.nn2</name>
<value>bdm1:50070</value>
</property>
<property>
<name>dfs.namenode.https-address.nameservice.nn1</name>
<value>bdm0:50470</value>
</property>
<property>
<name>dfs.namenode.https-address.nameservice.nn2</name>
<value>bdm1:50470</value>
</property>
<property>
<name>dfs.namenode.rpc-address.nameservice.nn1</name>
<value>bdm0:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.nameservice.nn2</name>
<value>bdm1:8020</value>
</property>
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://bdm0:8485;bdm1:8485;etl1:8485/nameservice</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>nameservice</value>
</property>
-->
dfs.replication: the number of block replicas; for production a value of at least 3 is recommended.
dfs.namenode.http-address: with HA enabled, configure dfs.namenode.http-address.nameservice.nn1=bdm0:50070 and dfs.namenode.http-address.nameservice.nn2=bdm1:50070, where nameservice is the name specified by dfs.nameservices.
dfs.namenode.https-address: with HA enabled, configure dfs.namenode.https-address.nameservice.nn1=bdm0:50470 and dfs.namenode.https-address.nameservice.nn2=bdm1:50470.
dfs.domain.socket.path=/var/lib/hadoop-hdfs/dn_socket: the user that starts the DataNode must have permission on the directory of this path; ideally the directory is owned by that user, and the user belongs to the directory's group. A preparation sketch is given below.
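A minimal preparation sketch, assuming the HDFS daemons run as the hdfs user in the hadoop group:
# Short-circuit read socket directory (dfs.domain.socket.path)
mkdir -p /var/lib/hadoop-hdfs
# NameNode, DataNode and SecondaryNameNode local directories from hdfs-site.xml
mkdir -p /hadoop/hdfs/namenode /hadoop/hdfs/sda /hadoop/hdfs/sdb /hadoop/hdfs/namesecondary
chown -R hdfs:hadoop /var/lib/hadoop-hdfs /hadoop/hdfs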
/etc/hadoop/conf/mapred-env.sh
HDP_VERSION="3.1.5.0-152"
export HADOOP_JOB_HISTORYSERVER_HEAPSIZE=900
export HADOOP_LOGLEVEL=${HADOOP_LOGLEVEL:-INFO}
export HADOOP_ROOT_LOGGER=${HADOOP_ROOT_LOGGER:-INFO,console}
export HADOOP_DAEMON_ROOT_LOGGER=${HADOOP_DAEMON_ROOT_LOGGER:-${HADOOP_LOGLEVEL},RFA}
export HADOOP_OPTS="-Dhdp.version=$HDP_VERSION $HADOOP_OPTS"
#export HADOOP_OPTS="-Djava.io.tmpdir=/var/lib/ambari-server/data/tmp/hadoop_java_io_tmpdir $HADOOP_OPTS"
export JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:/var/lib/ambari-server/data/tmp/hadoop_java_io_tmpdir"
export HADOOP_LOG_DIR=/var/log/hadoop-mapreduce/$USER
export HADOOP_PID_DIR=/var/run/hadoop-mapreduce/$USER
/etc/hadoop/conf/mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapred.local.dir</name>
<value>/hadoop/mapred</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx1024m</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>256</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx1024m</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>256</value>
</property>
<property>
<name>yarn.app.mapreduce.am.command-opts</name>
<value>-Xmx512m -Dhdp.version=${hdp.version}</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>node01:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>node01:19888</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.https.address</name>
<value>node01:19890</value>
</property>
<property>
<name>hadoop.http.authentication.type</name>
<value>simple</value>
</property>
<property>
<name>mapred.map.tasks.speculative.execution</name>
<value>true</value>
</property>
<property>
<name>mapred.reduce.tasks.speculative.execution</name>
<value>true</value>
</property>
<property>
<name>mapreduce.admin.map.child.java.opts</name>
<value>-server -XX:NewRatio=8 -Djava.net.preferIPv4Stack=true -Dhdp.version=${hdp.version}</value>
</property>
<property>
<name>mapreduce.admin.reduce.child.java.opts</name>
<value>-server -XX:NewRatio=8 -Djava.net.preferIPv4Stack=true -Dhdp.version=${hdp.version}</value>
</property>
<property>
<name>mapreduce.admin.user.env</name>
<value>LD_LIBRARY_PATH=/usr/hdp/current/hadoop/lib/native:/usr/hdp/current/hadoop/lib/native/Linux-amd64-64</value>
</property>
<property>
<name>mapreduce.cluster.acls.enabled</name>
<value>false</value>
</property>
<property>
<name>mapreduce.am.max-attempts</name>
<value>2</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$PWD/mr-framework/hadoop/share/hadoop/mapreduce/*:$PWD/mr-framework/hadoop/share/hadoop/mapreduce/lib/*:$PWD/mr-framework/hadoop/share/hadoop/common/*:$PWD/mr-framework/hadoop/share/hadoop/common/lib/*:$PWD/mr-framework/hadoop/share/hadoop/yarn/*:$PWD/mr-framework/hadoop/share/hadoop/yarn/lib/*:$PWD/mr-framework/hadoop/share/hadoop/hdfs/*:$PWD/mr-framework/hadoop/share/hadoop/hdfs/lib/*:$PWD/mr-framework/hadoop/share/hadoop/tools/lib/*:/usr/hdp/current/hadoop/lib/hadoop-lzo-0.6.0.${hdp.version}.jar:/etc/hadoop/conf/secure</value>
</property>
<property>
<name>mapreduce.application.framework.path</name>
<value>/hdp/apps/${hdp.version}/mapreduce/mapreduce.tar.gz#mr-framework</value>
</property>
<property>
<name>mapreduce.cluster.administrators</name>
<value>hadoop</value>
</property>
<property>
<name>mapreduce.job.acl-modify-job</name>
<value> </value>
</property>
<property>
<name>mapreduce.job.acl-view-job</name>
<value> </value>
</property>
<property>
<name>mapreduce.job.counters.max</name>
<value>130</value>
</property>
<property>
<name>mapreduce.job.emit-timeline-data</name>
<value>true</value>
</property>
<property>
<name>mapreduce.job.queuename</name>
<value>default</value>
</property>
<property>
<name>mapreduce.job.reduce.slowstart.completedmaps</name>
<value>0.05</value>
</property>
<property>
<name>mapreduce.jobhistory.admin.acl</name>
<value>*</value>
</property>
<property>
<name>mapreduce.jobhistory.bind-host</name>
<value>0.0.0.0</value>
</property>
<!--<property>
<name>mapreduce.jobhistory.done-dir</name>
<value>/mr-history/done</value>
</property>-->
<property>
<name>mapreduce.jobhistory.http.policy</name>
<value>HTTP_ONLY</value>
</property>
<!--<property>
<name>mapreduce.jobhistory.intermediate-done-dir</name>
<value>/mr-history/tmp</value>
</property>-->
<property>
<name>mapreduce.jobhistory.recovery.enable</name>
<value>true</value>
</property>
<property>
<name>mapreduce.jobhistory.recovery.store.class</name>
<value>org.apache.hadoop.mapreduce.v2.hs.HistoryServerLeveldbStateStoreService</value>
</property>
<property>
<name>mapreduce.jobhistory.recovery.store.leveldb.path</name>
<value>/hadoop/mapreduce/jhs</value>
</property>
<property>
<name>mapreduce.map.log.level</name>
<value>INFO</value>
</property>
<property>
<name>mapreduce.map.output.compress</name>
<value>false</value>
</property>
<property>
<name>mapreduce.map.sort.spill.percent</name>
<value>0.7</value>
</property>
<property>
<name>mapreduce.map.speculative</name>
<value>false</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress</name>
<value>false</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.type</name>
<value>BLOCK</value>
</property>
<property>
<name>mapreduce.reduce.input.buffer.percent</name>
<value>0.0</value>
</property>
<property>
<name>mapreduce.reduce.log.level</name>
<value>INFO</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.fetch.retry.enabled</name>
<value>1</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.fetch.retry.interval-ms</name>
<value>1000</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.fetch.retry.timeout-ms</name>
<value>30000</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.input.buffer.percent</name>
<value>0.7</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.merge.percent</name>
<value>0.66</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.parallelcopies</name>
<value>30</value>
</property>
<property>
<name>mapreduce.reduce.speculative</name>
<value>false</value>
</property>
<property>
<name>mapreduce.shuffle.port</name>
<value>13562</value>
</property>
<property>
<name>mapreduce.task.io.sort.factor</name>
<value>100</value>
</property>
<property>
<name>mapreduce.task.io.sort.mb</name>
<value>256</value>
</property>
<property>
<name>mapreduce.task.timeout</name>
<value>300000</value>
</property>
<property>
<name>yarn.app.mapreduce.am.admin-command-opts</name>
<value>-Dhdp.version=${hdp.version}</value>
</property>
<property>
<name>yarn.app.mapreduce.am.log.level</name>
<value>INFO</value>
</property>
<property>
<name>yarn.app.mapreduce.am.resource.mb</name>
<value>512</value>
</property>
<property>
<name>yarn.app.mapreduce.am.staging-dir</name>
<value>/user</value>
</property>
Production reference values for mapred-site.xml:
mapreduce.map.java.opts=-Xmx12697m: the JVM heap size used when launching map tasks; if this value is too small, the JVM will throw an Out of Memory error when tasks run.
mapreduce.map.memory.mb=12288: the memory limit for each map container, -1 by default. If not specified, it is inferred from mapreduce.job.heap.memory-mb.ratio (default 0.8) and mapreduce.map.java.opts. When a container's memory exceeds this value, the NodeManager kills the container.
mapreduce.reduce.java.opts=-Xmx16384m: same meaning as mapreduce.map.java.opts, for reduce tasks.
mapreduce.reduce.memory.mb=12288: same meaning as mapreduce.map.memory.mb, for reduce tasks.
yarn.app.mapreduce.am.command-opts=-Xmx8192m -Dhdp.version=${hdp.version}: the ApplicationMaster heap size, -Xmx1024m by default. If many jobs are submitted and this value is set too large, the memory taken by AMs may exceed the share allowed by yarn.scheduler.capacity.maximum-am-resource-percent, and currently submitted as well as subsequent jobs will wait in the queue.
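The archive referenced by mapreduce.application.framework.path must exist on HDFS; a sketch for uploading it, assuming the tarball sits at /usr/hdp/current/hadoop/mapreduce.tar.gz as in a standard HDP layout:
# Upload the MapReduce framework archive referenced by mapreduce.application.framework.path
hdfs dfs -mkdir -p /hdp/apps/3.1.5.0-152/mapreduce
hdfs dfs -put /usr/hdp/current/hadoop/mapreduce.tar.gz /hdp/apps/3.1.5.0-152/mapreduce/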
/etc/hadoop/conf/yarn-env.sh
export HADOOP_YARN_HOME=/usr/hdp/current/hadoop-yarn
export HADOOP_LOG_DIR=/var/log/hadoop-yarn/yarn
export HADOOP_SECURE_LOG_DIR=/var/log/hadoop-yarn/yarn
export HADOOP_PID_DIR=/var/run/hadoop-yarn/yarn
export HADOOP_SECURE_PID_DIR=/var/run/hadoop-yarn/yarn
export HADOOP_LIBEXEC_DIR=/usr/hdp/current/hadoop/libexec
export JAVA_HOME=/usr/local/jdk8
#export JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:/var/lib/ambari-server/data/tmp/hadoop_java_io_tmpdir"
export HADOOP_LOGLEVEL=${HADOOP_LOGLEVEL:-INFO}
export HADOOP_ROOT_LOGGER=${HADOOP_ROOT_LOGGER:-INFO,console}
export HADOOP_DAEMON_ROOT_LOGGER=${HADOOP_DAEMON_ROOT_LOGGER:-${HADOOP_LOGLEVEL},EWMA,RFA}
# User for YARN daemons
export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn}
# some Java parameters
# export JAVA_HOME=/home/y/libexec/jdk1.6.0/
if [ "$JAVA_HOME" != "" ]; then
#echo "run java in $JAVA_HOME"
JAVA_HOME=$JAVA_HOME
fi
if [ "$JAVA_HOME" = "" ]; then
echo "Error: JAVA_HOME is not set."
exit 1
fi
JAVA=$JAVA_HOME/bin/java
JAVA_HEAP_MAX=-Xmx1000m
YARN_HEAPSIZE=1024
# check envvars which might override default args
if [ "$YARN_HEAPSIZE" != "" ]; then
JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m"
fi
export YARN_RESOURCEMANAGER_HEAPSIZE=1024
export YARN_NODEMANAGER_HEAPSIZE=1024
export YARN_TIMELINESERVER_HEAPSIZE=1024
IFS=
# default log directory and file
if [ "$HADOOP_LOG_DIR" = "" ]; then
HADOOP_LOG_DIR="$HADOOP_YARN_HOME/logs"
fi
if [ "$HADOOP_LOGFILE" = "" ]; then
HADOOP_LOGFILE='yarn.log'
fi
# default policy file for service-level authorization
if [ "$YARN_POLICYFILE" = "" ]; then
YARN_POLICYFILE="hadoop-policy.xml"
fi
# restore ordinary behaviour
unset IFS
HADOOP_OPTS="$HADOOP_OPTS -Dyarn.id.str=$YARN_IDENT_STRING"
HADOOP_OPTS="$HADOOP_OPTS -Dyarn.policy.file=$YARN_POLICYFILE"
#HADOOP_OPTS="$HADOOP_OPTS -Djava.io.tmpdir=/var/lib/ambari-server/data/tmp/hadoop_java_io_tmpdir"
export YARN_NODEMANAGER_OPTS="$YARN_NODEMANAGER_OPTS -Dnm.audit.logger=INFO,NMAUDIT"
export YARN_RESOURCEMANAGER_OPTS="$YARN_RESOURCEMANAGER_OPTS -Dyarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY -Drm.audit.logger=INFO,RMAUDIT"
export YARN_REGISTRYDNS_SECURE_USER=yarn
export YARN_REGISTRYDNS_SECURE_EXTRA_OPTS="-jvm server"
Production reference values for yarn-env.sh:
HADOOP_LOG_DIR and HADOOP_SECURE_LOG_DIR can be placed on a data disk.
YARN_RESOURCEMANAGER_HEAPSIZE can be increased as appropriate, e.g. 3072.
YARN_NODEMANAGER_HEAPSIZE can be increased as appropriate, e.g. 3072.
YARN_TIMELINESERVER_HEAPSIZE can be increased as appropriate, e.g. 8072.
/etc/hadoop/conf/capacity-scheduler.xml
This is the Capacity Scheduler configuration file. The main setting to pay attention to is the one below; the others can be left at their defaults or adjusted to your situation. It is the maximum fraction of total YARN memory that ApplicationMasters may use, 0.1 by default; if streaming jobs are running in YARN or many jobs run concurrently, this value can be raised, e.g. to 0.4 or 0.5.
<property>
<name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
<value>0.4</value>
</property>
/etc/hadoop/conf/yarn-site.xml
<property>
<name>yarn.resourcemanager.hostname</name>
<value>node01</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<!--<value>mapreduce_shuffle,spark2_shuffle,timeline_collector</value>-->
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>/hadoop/yarn/local</value>
</property>
<property>
<name>yarn.nodemanager.log-dirs</name>
<value>/hadoop/yarn/log</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log.server.url</name>
<value>http://node01:19888/jobhistory/logs</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address</name>
<value>node01:8090</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>
$HADOOP_CONF_DIR,
/usr/hdp/current/hadoop/*,
/usr/hdp/current/hadoop/lib/*,
/usr/hdp/current/hadoop-hdfs/*,
/usr/hdp/current/hadoop-hdfs/lib/*,
/usr/hdp/current/hadoop-yarn/*,
/usr/hdp/current/hadoop-yarn/lib/*
</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>4096</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>256</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>5000</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>8</value>
</property>
<property>
<name>hadoop.http.authentication.type</name>
<value>simple</value>
</property>
<!--<property>
<name>hadoop.http.cross-origin.allowed-origins</name>
<value>regex:.*[.]bdm1[.]com(:\d*)?</value>
</property>-->
<property>
<name>hadoop.registry.dns.bind-address</name>
<value>0.0.0.0</value>
</property>
<property>
<name>hadoop.registry.dns.bind-port</name>
<value>53</value>
</property>
<property>
<name>hadoop.registry.dns.domain-name</name>
<value>EXAMPLE.COM</value>
</property>
<property>
<name>hadoop.registry.dns.enabled</name>
<value>true</value>
</property>
<property>
<name>hadoop.registry.dns.zone-mask</name>
<value>255.255.255.0</value>
</property>
<property>
<name>hadoop.registry.dns.zone-subnet</name>
<value>172.17.0.0</value>
</property>
<property>
<name>hadoop.registry.zk.quorum</name>
<value>node01:2181</value>
</property>
<property>
<name>manage.include.files</name>
<value>false</value>
</property>
<property>
<name>yarn.acl.enable</name>
<value>false</value>
</property>
<property>
<name>yarn.admin.acl</name>
<value>activity_analyzer,yarn</value>
</property>
<property>
<name>yarn.client.nodemanager-connect.max-wait-ms</name>
<value>60000</value>
</property>
<property>
<name>yarn.client.nodemanager-connect.retry-interval-ms</name>
<value>10000</value>
</property>
<property>
<name>yarn.http.policy</name>
<value>HTTP_ONLY</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>2592000</value>
</property>
<property>
<name>yarn.log.server.web-service.url</name>
<value>http://node01:8188/ws/v1/applicationhistory</value>
</property>
<property>
<name>yarn.node-labels.enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.node-labels.fs-store.retry-policy-spec</name>
<value>2000, 500</value>
</property>
<property>
<name>yarn.node-labels.fs-store.root-dir</name>
<value>/system/yarn/node-labels</value>
</property>
<property>
<name>yarn.nodemanager.address</name>
<value>0.0.0.0:45454</value>
</property>
<property>
<name>yarn.nodemanager.admin-env</name>
<value>MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.spark2_shuffle.class</name>
<value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.spark2_shuffle.classpath</name>
<value>/usr/hdp/current/spark2/aux/*</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
<value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.spark_shuffle.classpath</name>
<value>/usr/hdp/current/spark/aux/*</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.timeline_collector.class</name>
<value>org.apache.hadoop.yarn.server.timelineservice.collector.PerNodeTimelineCollectorsAuxService</value>
</property>
<property>
<name>yarn.nodemanager.bind-host</name>
<value>0.0.0.0</value>
</property>
<property>
<name>yarn.nodemanager.container-executor.class</name>
<value>org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor</value>
</property>
<property>
<name>yarn.nodemanager.container-metrics.unregister-delay-ms</name>
<value>60000</value>
</property>
<property>
<name>yarn.nodemanager.container-monitor.interval-ms</name>
<value>3000</value>
</property>
<property>
<name>yarn.nodemanager.delete.debug-delay-sec</name>
<value>0</value>
</property>
<property>
<name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
<value>90</value>
</property>
<property>
<name>yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb</name>
<value>1000</value>
</property>
<property>
<name>yarn.nodemanager.disk-health-checker.min-healthy-disks</name>
<value>0.25</value>
</property>
<property>
<name>yarn.nodemanager.health-checker.interval-ms</name>
<value>135000</value>
</property>
<property>
<name>yarn.nodemanager.health-checker.script.timeout-ms</name>
<value>60000</value>
</property>
<property>
<name>yarn.nodemanager.linux-container-executor.cgroups.strict-resource-usage</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.linux-container-executor.group</name>
<value>hadoop</value>
</property>
<property>
<name>yarn.nodemanager.linux-container-executor.nonsecure-mode.limit-users</name>
<value>true</value>
</property>
<property>
<name>yarn.nodemanager.log-aggregation.compression-type</name>
<value>gz</value>
</property>
<property>
<name>yarn.nodemanager.log-aggregation.debug-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.log-aggregation.num-log-files-per-app</name>
<value>30</value>
</property>
<property>
<name>yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds</name>
<value>3600</value>
</property>
<property>
<name>yarn.nodemanager.log.retain-seconds</name>
<value>1209600</value>
</property>
<property>
<name>yarn.nodemanager.recovery.dir</name>
<value>/var/log/hadoop-yarn/nodemanager/recovery-state</value>
</property>
<property>
<name>yarn.nodemanager.recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.nodemanager.recovery.supervised</name>
<value>true</value>
</property>
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/hadoop/app-logs</value>
</property>
<property>
<name>yarn.nodemanager.remote-app-log-dir-suffix</name>
<value>logs</value>
</property>
<property>
<name>yarn.nodemanager.resource-plugins</name>
<value></value>
</property>
<property>
<name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name>
<value></value>
</property>
<property>
<name>yarn.nodemanager.resource-plugins.gpu.docker-plugin</name>
<value></value>
</property>
<property>
<name>yarn.nodemanager.resource-plugins.gpu.docker-plugin.nvidiadocker-v1.endpoint</name>
<value></value>
</property>
<property>
<name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name>
<value></value>
</property>
<property>
<name>yarn.nodemanager.resource.pcores-vcores-multiplier</name>
<value>2</value>
</property>
<property>
<name>yarn.nodemanager.resource.percentage-physical-cpu-limit</name>
<value>80</value>
</property>
<property>
<name>yarn.nodemanager.runtime.linux.allowed-runtimes</name>
<value>default,docker</value>
</property>
<property>
<name>yarn.nodemanager.runtime.linux.docker.allowed-container-networks</name>
<value>host,none,bridge</value>
</property>
<property>
<name>yarn.nodemanager.runtime.linux.docker.capabilities</name>
<value>
CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID,SETFCAP,
SETPCAP,NET_BIND_SERVICE,SYS_CHROOT,KILL,AUDIT_WRITE</value>
</property>
<property>
<name>yarn.nodemanager.runtime.linux.docker.default-container-network</name>
<value>host</value>
</property>
<property>
<name>yarn.nodemanager.runtime.linux.docker.privileged-containers.acl</name>
<value></value>
</property>
<property>
<name>yarn.nodemanager.runtime.linux.docker.privileged-containers.allowed</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>2.1</value>
</property>
<property>
<name>yarn.nodemanager.webapp.cross-origin.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>node01:8050</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>node01:8141</value>
</property>
<property>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>2</value>
</property>
<property>
<name>yarn.resourcemanager.bind-host</name>
<value>0.0.0.0</value>
</property>
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yarn-cluster</value>
</property>
<property>
<name>yarn.resourcemanager.connect.max-wait.ms</name>
<value>900000</value>
</property>
<property>
<name>yarn.resourcemanager.connect.retry-interval.ms</name>
<value>30000</value>
</property>
<property>
<name>yarn.resourcemanager.display.per-user-apps</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.fs.state-store.retry-policy-spec</name>
<value>2000, 500</value>
</property>
<property>
<name>yarn.resourcemanager.fs.state-store.uri</name>
<value> </value>
</property>
<property>
<name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
<value>/yarn-leader-election</value>
</property>
<property>
<name>yarn.resourcemanager.monitor.capacity.preemption.intra-queue-preemption.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.monitor.capacity.preemption.monitoring_interval</name>
<value>15000</value>
</property>
<property>
<name>yarn.resourcemanager.monitor.capacity.preemption.natural_termination_factor</name>
<value>1</value>
</property>
<property>
<name>yarn.resourcemanager.monitor.capacity.preemption.total_preemption_per_round</name>
<value>0.1</value>
</property>
<!--<property>
<name>yarn.resourcemanager.nodes.exclude-path</name>
<value>/etc/hadoop/conf/yarn.exclude</value>
</property>-->
<property>
<name>yarn.resourcemanager.placement-constraints.handler</name>
<value>scheduler</value>
</property>
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>node01:8025</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>node01:8030</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.monitor.enable</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.state-store.max-completed-applications</name>
<value>${yarn.resourcemanager.max-completed-applications}</value>
</property>
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<property>
<name>yarn.resourcemanager.system-metrics-publisher.dispatcher.pool-size</name>
<value>10</value>
</property>
<property>
<name>yarn.resourcemanager.system-metrics-publisher.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>node01:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.cross-origin.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.delegation-token-auth-filter.enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.resourcemanager.work-preserving-recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms</name>
<value>10000</value>
</property>
<property>
<name>yarn.resourcemanager.zk-acl</name>
<value>world:anyone:rwcda</value>
</property>
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>node01:2181</value>
</property>
<property>
<name>yarn.resourcemanager.zk-num-retries</name>
<value>1000</value>
</property>
<property>
<name>yarn.resourcemanager.zk-retry-interval-ms</name>
<value>1000</value>
</property>
<property>
<name>yarn.resourcemanager.zk-state-store.parent-path</name>
<value>/rmstore</value>
</property>
<property>
<name>yarn.resourcemanager.zk-timeout-ms</name>
<value>10000</value>
</property>
<property>
<name>yarn.rm.system-metricspublisher.emit-container-events</name>
<value>true</value>
</property>
<property>
<name>yarn.scheduler.capacity.ordering-policy.priority-utilization.underutilized-preemption.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.service.framework.path</name>
<value>/hdp/apps/${hdp.version}/hadoop-yarn/lib/service-dep.tar.gz</value>
</property>
<property>
<name>yarn.service.system-service.dir</name>
<value>/services</value>
</property>
<property>
<name>yarn.system-metricspublisher.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.timeline-service.address</name>
<value>node01:10200</value>
</property>
<property>
<name>yarn.timeline-service.bind-host</name>
<value>0.0.0.0</value>
</property>
<property>
<name>yarn.timeline-service.client.max-retries</name>
<value>30</value>
</property>
<property>
<name>yarn.timeline-service.client.retry-interval-ms</name>
<value>1000</value>
</property>
<property>
<name>yarn.timeline-service.enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.active-dir</name>
<value>/ats/active/</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.app-cache-size</name>
<value>10</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.cleaner-interval-seconds</name>
<value>3600</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.done-dir</name>
<value>/ats/done/</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.group-id-plugin-classes</name>
<value>org.apache.hadoop.yarn.applications.distributedshell.DistributedShellTimelinePlugin</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.group-id-plugin-classpath</name>
<value></value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.scan-interval-seconds</name>
<value>60</value>
</property>
<property>
<name>yarn.timeline-service.entity-group-fs-store.summary-store</name>
<value>org.apache.hadoop.yarn.server.timeline.RollingLevelDBTimelineStore</value>
</property>
<property>
<name>yarn.timeline-service.generic-application-history.save-non-am-container-meta-info</name>
<value>false</value>
</property>
<property>
<name>yarn.timeline-service.generic-application-history.store-class</name>
<value>org.apache.hadoop.yarn.server.applicationhistoryservice.NullApplicationHistoryStore</value>
</property>
<property>
<name>yarn.timeline-service.hbase-schema.prefix</name>
<value>prod.</value>
</property>
<property>
<name>yarn.timeline-service.hbase.configuration.file</name>
<value>file:///usr/hdp/${hdp.version}/hadoop/conf/embedded-yarn-ats-hbase/hbase-site.xml</value>
</property>
<property>
<name>yarn.timeline-service.hbase.coprocessor.jar.hdfs.location</name>
<value>file:///usr/hdp/${hdp.version}/hadoop-yarn/timelineservice/hadoop-yarn-server-timelineservice-hbase-coprocessor.jar</value>
</property>
<property>
<name>yarn.timeline-service.http-authentication.proxyuser.root.groups</name>
<value>*</value>
</property>
<property>
<name>yarn.timeline-service.http-authentication.proxyuser.root.hosts</name>
<value>node01</value>
</property>
<property>
<name>yarn.timeline-service.http-authentication.simple.anonymous.allowed</name>
<value>true</value>
</property>
<property>
<name>yarn.timeline-service.http-authentication.type</name>
<value>simple</value>
</property>
<property>
<name>yarn.timeline-service.http-cross-origin.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.timeline-service.leveldb-state-store.path</name>
<value>/hadoop/yarn/timeline</value>
</property>
<property>
<name>yarn.timeline-service.leveldb-timeline-store.path</name>
<value>/hadoop/yarn/timeline</value>
</property>
<property>
<name>yarn.timeline-service.leveldb-timeline-store.read-cache-size</name>
<value>104857600</value>
</property>
<property>
<name>yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size</name>
<value>10000</value>
</property>
<property>
<name>yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size</name>
<value>10000</value>
</property>
<property>
<name>yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms</name>
<value>300000</value>
</property>
<property>
<name>yarn.timeline-service.reader.webapp.address</name>
<value>node01:8198</value>
</property>
<property>
<name>yarn.timeline-service.reader.webapp.https.address</name>
<value>node01:8199</value>
</property>
<property>
<name>yarn.timeline-service.recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.timeline-service.state-store-class</name>
<value>org.apache.hadoop.yarn.server.timeline.recovery.LeveldbTimelineStateStore</value>
</property>
<property>
<name>yarn.timeline-service.store-class</name>
<value>org.apache.hadoop.yarn.server.timeline.EntityGroupFSTimelineStore</value>
</property>
<property>
<name>yarn.timeline-service.ttl-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.timeline-service.ttl-ms</name>
<value>2678400000</value>
</property>
<property>
<name>yarn.timeline-service.version</name>
<value>2.0f</value>
</property>
<property>
<name>yarn.timeline-service.versions</name>
<value>1.5f,2.0f</value>
</property>
<property>
<name>yarn.timeline-service.webapp.address</name>
<value>node01:8188</value>
</property>
<property>
<name>yarn.timeline-service.webapp.https.address</name>
<value>node01:8190</value>
</property>
<property>
<name>yarn.webapp.api-service.enable</name>
<value>true</value>
</property>
<property>
<name>yarn.webapp.ui2.enable</name>
<value>true</value>
</property>
<!--
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>bdm0</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>bdm1</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>bdm0:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>bdm1:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address.rm1</name>
<value>bdm0:8090</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address.rm2</name>
<value>bdm1:8090</value>
</property>
-->
Production reference values for yarn-site.xml:
yarn.nodemanager.resource.cpu-vcores=64: the number of cores each NodeManager can use, -1 by default; here, for example, it is set to twice the number of physical cores.
yarn.nodemanager.resource.memory-mb=131072: the amount of memory each NodeManager can use. If the value is -1 and yarn.nodemanager.resource.detect-hardware-capabilities=true, it is calculated automatically; otherwise the default is 8192 MB. Here it is set to 128 GB.
yarn.scheduler.minimum-allocation-vcores=1: the minimum number of virtual cores a single container can request, here 1.
yarn.scheduler.minimum-allocation-mb=2048: the minimum amount of memory a single container can request, here 2048.
yarn.scheduler.maximum-allocation-vcores=8: the maximum number of virtual cores a single container can request, here 8.
yarn.scheduler.maximum-allocation-mb=30720: the maximum amount of memory a single container can request, here 30720.
yarn.timeline-service.enabled=false: whether to enable the timeline service; here it is set to false, i.e. disabled.
yarn.nodemanager.log-aggregation.compression-type=gz: the compression algorithm for aggregated logs, none by default, here gz.
For the remaining settings, see the official yarn-default.xml documentation.
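The NodeManager local and log directories from yarn-site.xml, and the daemon log/pid directories from yarn-env.sh, must exist and be writable; a sketch, assuming YARN runs as the yarn user in the hadoop group:
# NodeManager local/log dirs (yarn.nodemanager.local-dirs / yarn.nodemanager.log-dirs)
mkdir -p /hadoop/yarn/local /hadoop/yarn/log
# Daemon log and pid dirs from yarn-env.sh
mkdir -p /var/log/hadoop-yarn/yarn /var/run/hadoop-yarn/yarn
chown -R yarn:hadoop /hadoop/yarn /var/log/hadoop-yarn /var/run/hadoop-yarn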
/etc/zookeeper/conf/zookeeper-env.sh
export JAVA_HOME=/usr/local/jdk8
export ZOOKEEPER_HOME=/usr/hdp/current/zookeeper
export ZOO_LOG_DIR=/var/log/zookeeper
export ZOOPIDFILE=/var/run/zookeeper/zookeeper_server.pid
export SERVER_JVMFLAGS=-Xmx256m
export JAVA=$JAVA_HOME/bin/java
export CLASSPATH=$CLASSPATH:/usr/share/zookeeper/*
ZOO_LOG_DIR: the logs can be placed on a data disk.
SERVER_JVMFLAGS: the maximum heap can be set to -Xmx1024m.
/etc/zookeeper/conf/zoo.cfg
tickTime=2000
maxClientCnxns=50
initLimit=10
syncLimit=5
dataDir=/var/lib/zookeeper
clientPort=2181
autopurge.snapRetainCount=5
autopurge.purgeInterval=24
admin.enableServer=false
server.1=node01:2887:3887
#...
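Each ZooKeeper server also needs a myid file in dataDir whose content matches its server.N id; a sketch, assuming this host is server.1:
mkdir -p /var/lib/zookeeper
echo 1 > /var/lib/zookeeper/myid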
/etc/tez/conf/tez-env.sh
export TEZ_CONF_DIR=/etc/tez/conf/
export HADOOP_HOME=${HADOOP_HOME:-/usr}
export JAVA_HOME=/usr/local/jdk8
/etc/tez/conf/tez-site.xml
<property>
<name>tez.task.resource.memory.mb</name>
<value>512</value>
</property>
<property>
<name>tez.am.resource.memory.mb</name>
<value>512</value>
</property>
<property>
<name>tez.counters.max</name>
<value>10000</value>
</property>
<property>
<name>tez.lib.uris</name>
<value>/hdp/apps/3.1.5.0-152/tez/tez.tar.gz</value>
</property>
<property>
<name>tez.runtime.io.sort.mb</name>
<value>256</value>
</property>
<property>
<name>tez.am.java.opts</name>
<value>-server -Xmx512m -Djava.net.preferIPv4Stack=true</value>
</property>
<property>
<name>tez.am.launch.env</name>
<value>LD_LIBRARY_PATH=/usr/hdp/current/hadoop/lib/native:/usr/hdp/current/hadoop/lib/native/Linux-amd64-64</value>
</property>
<property>
<name>tez.cluster.additional.classpath.prefix</name>
<value>/usr/hdp/current/hadoop/lib/hadoop-lzo-0.6.0.${hdp.version}.jar:/etc/hadoop/conf/secure</value>
</property>
<property>
<name>tez.task.launch.cmd-opts</name>
<value>-XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseG1GC -XX:+ResizeTLAB</value>
</property>
<property>
<name>tez.task.launch.env</name>
<value>LD_LIBRARY_PATH=/usr/hdp/current/hadoop/lib/native:/usr/hdp/current/hadoop/lib/native/Linux-amd64-64</value>
</property>
<property>
<name>tez.am.am-rm.heartbeat.interval-ms.max</name>
<value>250</value>
</property>
<property>
<name>tez.am.container.idle.release-timeout-max.millis</name>
<value>20000</value>
</property>
<property>
<name>tez.am.container.idle.release-timeout-min.millis</name>
<value>10000</value>
</property>
<property>
<name>tez.am.container.reuse.enabled</name>
<value>true</value>
</property>
<property>
<name>tez.am.container.reuse.locality.delay-allocation-millis</name>
<value>250</value>
</property>
<property>
<name>tez.am.container.reuse.non-local-fallback.enabled</name>
<value>false</value>
</property>
<property>
<name>tez.am.container.reuse.rack-fallback.enabled</name>
<value>true</value>
</property>
<property>
<name>tez.am.launch.cluster-default.cmd-opts</name>
<value>-server -Djava.net.preferIPv4Stack=true -Dhdp.version=${hdp.version}</value>
</property>
<property>
<name>tez.am.launch.cmd-opts</name>
<value>-XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseG1GC -XX:+ResizeTLAB</value>
</property>
<property>
<name>tez.am.log.level</name>
<value>INFO</value>
</property>
<property>
<name>tez.am.max.app.attempts</name>
<value>2</value>
</property>
<property>
<name>tez.am.maxtaskfailures.per.node</name>
<value>10</value>
</property>
<property>
<name>tez.am.tez-ui.history-url.template</name>
<value>__HISTORY_URL_BASE__?viewPath=%2F%23%2Ftez-app%2F__APPLICATION_ID__</value>
</property>
<property>
<name>tez.am.view-acls</name>
<value>*</value>
</property>
<property>
<name>tez.counters.max.groups</name>
<value>3000</value>
</property>
<property>
<name>tez.generate.debug.artifacts</name>
<value>false</value>
</property>
<property>
<name>tez.grouping.max-size</name>
<value>1073741824</value>
</property>
<property>
<name>tez.grouping.min-size</name>
<value>16777216</value>
</property>
<property>
<name>tez.grouping.split-waves</name>
<value>1.7</value>
</property>
<property>
<name>tez.history.logging.proto-base-dir</name>
<value>/warehouse/tablespace/external/hive/sys.db</value>
</property>
<property>
<name>tez.history.logging.service.class</name>
<value>org.apache.tez.dag.history.logging.proto.ProtoHistoryLoggingService</value>
</property>
<property>
<name>tez.history.logging.timeline-cache-plugin.old-num-dags-per-group</name>
<value>5</value>
</property>
<property>
<name>tez.queue.name</name>
<value>default</value>
</property>
<property>
<name>tez.runtime.compress</name>
<value>true</value>
</property>
<property>
<name>tez.runtime.compress.codec</name>
<value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
<name>tez.runtime.convert.user-payload.to.history-text</name>
<value>false</value>
</property>
<property>
<name>tez.runtime.optimize.local.fetch</name>
<value>true</value>
</property>
<property>
<name>tez.runtime.pipelined.sorter.sort.threads</name>
<value>2</value>
</property>
<property>
<name>tez.runtime.shuffle.fetch.buffer.percent</name>
<value>0.6</value>
</property>
<property>
<name>tez.runtime.shuffle.memory.limit.percent</name>
<value>0.25</value>
</property>
<property>
<name>tez.runtime.sorter.class</name>
<value>PIPELINED</value>
</property>
<property>
<name>tez.runtime.unordered.output.buffer.size-mb</name>
<value>768</value>
</property>
<property>
<name>tez.session.am.dag.submit.timeout.secs</name>
<value>600</value>
</property>
<property>
<name>tez.session.client.timeout.secs</name>
<value>-1</value>
</property>
<property>
<name>tez.shuffle-vertex-manager.max-src-fraction</name>
<value>0.4</value>
</property>
<property>
<name>tez.shuffle-vertex-manager.min-src-fraction</name>
<value>0.2</value>
</property>
<property>
<name>tez.staging-dir</name>
<value>/tmp/${user.name}/staging</value>
</property>
<property>
<name>tez.task.am.heartbeat.counter.interval-ms.max</name>
<value>4000</value>
</property>
<property>
<name>tez.task.generate.counters.per.io</name>
<value>true</value>
</property>
<property>
<name>tez.task.get-task.sleep.interval-ms.max</name>
<value>200</value>
</property>
<property>
<name>tez.task.launch.cluster-default.cmd-opts</name>
<value>-server -Djava.net.preferIPv4Stack=true -Dhdp.version=${hdp.version}</value>
</property>
<property>
<name>tez.task.max-events-per-heartbeat</name>
<value>500</value>
</property>
<property>
<name>tez.use.cluster.hadoop-libs</name>
<value>false</value>
</property>
<property>
<name>yarn.timeline-service.enabled</name>
<value>false</value>
</property>
Production reference values for tez-site.xml:
tez.task.resource.memory.mb=8192: the amount of memory used by Tez tasks, 1024 by default, here set to 8192; raising this value appropriately helps performance.
tez.am.resource.memory.mb=5120: the amount of memory used by the Tez AppMaster, 1024 by default, here set to 5120.
tez.counters.max=10000: an advanced setting, default 1200, limiting the number of counters per DAG (AppMaster and tasks), here set to 10000.
tez.lib.uris=/hdp/apps/3.1.5.0-152/tez/tez.tar.gz: required; an HDFS path to which the /usr/hdp/current/tez/lib/tez.tar.gz archive must be uploaded (see the sketch below).
tez.runtime.io.sort.mb=2703: here set to 2703.
tez.am.java.opts=-server -Xmx8192m -Djava.net.preferIPv4Stack=true.
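A sketch for uploading the Tez archive to the HDFS path configured in tez.lib.uris:
hdfs dfs -mkdir -p /hdp/apps/3.1.5.0-152/tez
hdfs dfs -put /usr/hdp/current/tez/lib/tez.tar.gz /hdp/apps/3.1.5.0-152/tez/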
/etc/hive/conf/hive-env.sh
if [ "$SERVICE" = "metastore" ]; then
export HADOOP_HEAPSIZE=12288 # Setting for HiveMetastore
export HADOOP_OPTS="$HADOOP_OPTS -Xloggc:/var/log/hive/hivemetastore-gc-%t.log -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCCause -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=10M -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/var/log/hive/hms_heapdump.hprof -Dhive.log.dir=/var/log/hive -Dhive.log.file=hivemetastore.log -Duser.timezone=Asia/Shanghai"
fi
if [ "$SERVICE" = "hiveserver2" ]; then
export HADOOP_HEAPSIZE=12288 # Setting for HiveServer2 and Client
export HADOOP_OPTS="$HADOOP_OPTS -Xloggc:/var/log/hive/hiveserver2-gc-%t.log -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCCause -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=10M -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/var/log/hive/hs2_heapdump.hprof -Dhive.log.dir=/var/log/hive -Dhive.log.file=hiveserver2.log -Duser.timezone=Asia/Shanghai"
fi
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Xmx${HADOOP_HEAPSIZE}m"
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS"
HADOOP_HOME=${HADOOP_HOME:-/usr/hdp/current/hadoop}
export HIVE_HOME=${HIVE_HOME:-/usr/hdp/current/hive}
export HIVE_CONF_DIR=${HIVE_CONF_DIR:-/usr/hdp/current/hive/conf}
if [ "${HIVE_AUX_JARS_PATH}" != "" ]; then
if [ -f "${HIVE_AUX_JARS_PATH}" ]; then
export HIVE_AUX_JARS_PATH=${HIVE_AUX_JARS_PATH}
elif [ -d "/usr/hdp/current/hive-hcatalog/share/hcatalog" ]; then
export HIVE_AUX_JARS_PATH=/usr/hdp/current/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar
fi
elif [ -d "/usr/hdp/current/hive-hcatalog/share/hcatalog" ]; then
export HIVE_AUX_JARS_PATH=/usr/hdp/current/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar
fi
export METASTORE_PORT=9083
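The GC logs and heap dumps configured above are written under /var/log/hive, so that directory must exist and be writable; a sketch, assuming the Hive services run as the hive user in the hadoop group:
mkdir -p /var/log/hive
chown hive:hadoop /var/log/hive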
HADOOP_HEAPSIZE: for the metastore and hiveserver2 the heap can be increased as appropriate; if it is too small, the services may hang when many queries are executed. Here it is set to 12288.
HADOOP_OPTS: to avoid timezone problems, add -Duser.timezone=Asia/Shanghai to both the metastore and hiveserver2 options, setting the timezone to China (UTC+8).
/etc/hive/conf/hive-exec-log4j2.properties
See /etc/hive/conf/hive-exec-log4j2.properties.template for reference.
/etc/hive/conf/hive-log4j2.properties
See /etc/hive/conf/hive-log4j2.properties.template for reference.
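The hive-site.xml below points the metastore at a MySQL database on node01. A minimal preparation sketch, assuming MySQL is already installed; the connector jar path /usr/share/java/mysql-connector-java.jar is illustrative:
# Create the metastore database and user (names and password match hive-site.xml below)
mysql -uroot -p -e "CREATE DATABASE IF NOT EXISTS hive DEFAULT CHARACTER SET utf8;
CREATE USER IF NOT EXISTS 'hive'@'%' IDENTIFIED BY '123456';
GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'%';
FLUSH PRIVILEGES;"
# Copy the MySQL JDBC driver into Hive's lib directory
cp /usr/share/java/mysql-connector-java.jar /usr/hdp/current/hive/lib/
# Optional: initialize the schema explicitly instead of relying on datanucleus.schema.autoCreateAll
# /usr/hdp/current/hive/bin/schematool -dbType mysql -initSchema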
/etc/hive/conf/hive-site.xml
<property>
<name>hive.server2.thrift.bind.host</name>
<value>node01</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://node01:9083</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/warehouse/tablespace/managed/hive</value>
</property>
<property>
<name>hive.metastore.warehouse.external.dir</name>
<value>/warehouse/tablespace/external/hive</value>
</property>
<property>
<name>hive.metastore.db.type</name>
<value>mysql</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://node01:3306/hive?createDatabaseIfNotExist=true&amp;useUnicode=true&amp;characterEncoding=UTF-8&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.tez.container.size</name>
<value>512</value>
</property>
<property>
<name>hive.heapsize</name>
<value>512</value>
</property>
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<property>
<name>hive.exec.local.scratchdir</name>
<value>/hadoop/hive/exec/${user.name}</value>
</property>
<property>
<name>hive.downloaded.resources.dir</name>
<value>/hadoop/hive/${hive.session.id}_resources</value>
</property>
<property>
<name>hive.querylog.location</name>
<value>/hadoop/hive/log</value>
</property>
<property>
<name>hive.server2.logging.operation.log.location</name>
<value>/hadoop/hive/server2/${user.name}/operation_logs</value>
</property>
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>NONE</value>
</property>
<property>
<name>hive.server2.thrift.client.user</name>
<value>hive</value>
</property>
<property>
<name>hive.server2.thrift.client.password</name>
<value>hive</value>
</property>
<property>
<name>hive.server2.tez.initialize.default.sessions</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.event.listeners</name>
<value></value>
</property>
<property>
<name>hive.cluster.delegation.token.store.zookeeper.connectString</name>
<value>node01:2181</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>node01</value>
</property>
<property>
<name>hive.cluster.delegation.token.store.class</name>
<value>org.apache.hadoop.hive.thrift.ZooKeeperTokenStore</value>
</property>
<property>
<name>hive.cluster.delegation.token.store.zookeeper.znode</name>
<value>/hive/cluster/delegation</value>
</property>
<property>
<name>hive.server2.zookeeper.namespace</name>
<value>hiveserver2</value>
</property>
<property>
<name>hive.zookeeper.client.port</name>
<value>2181</value>
</property>
<property>
<name>hive.zookeeper.namespace</name>
<value>hive_zookeeper_namespace</value>
</property>
<property>
<name>hive.zookeeper.quorum</name>
<value>node01:2181</value>
</property>
<property>
<name>atlas.hook.hive.maxThreads</name>
<value>1</value>
</property>
<property>
<name>atlas.hook.hive.minThreads</name>
<value>1</value>
</property>
<property>
<name>datanucleus.autoCreateSchema</name>
<value>false</value>
</property>
<property>
<name>datanucleus.cache.level2.type</name>
<value>none</value>
</property>
<property>
<name>datanucleus.fixedDatastore</name>
<value>true</value>
</property>
<property>
<name>hive.auto.convert.join</name>
<value>true</value>
</property>
<property>
<name>hive.auto.convert.join.noconditionaltask</name>
<value>true</value>
</property>
<property>
<name>hive.auto.convert.join.noconditionaltask.size</name>
<value>10737418240</value>
</property>
<property>
<name>hive.auto.convert.sortmerge.join</name>
<value>true</value>
</property>
<property>
<name>hive.auto.convert.sortmerge.join.to.mapjoin</name>
<value>true</value>
</property>
<property>
<name>hive.cbo.enable</name>
<value>true</value>
</property>
<property>
<name>hive.cli.print.header</name>
<value>false</value>
</property>
<property>
<name>hive.compactor.abortedtxn.threshold</name>
<value>1000</value>
</property>
<property>
<name>hive.compactor.check.interval</name>
<value>300</value>
</property>
<property>
<name>hive.compactor.delta.num.threshold</name>
<value>10</value>
</property>
<property>
<name>hive.compactor.delta.pct.threshold</name>
<value>0.1f</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>4</value>
</property>
<property>
<name>hive.compactor.worker.timeout</name>
<value>86400</value>
</property>
<property>
<name>hive.compute.query.using.stats</name>
<value>true</value>
</property>
<property>
<name>hive.convert.join.bucket.mapjoin.tez</name>
<value>false</value>
</property>
<property>
<name>hive.create.as.insert.only</name>
<value>true</value>
</property>
<property>
<name>hive.default.fileformat</name>
<value>TextFile</value>
</property>
<property>
<name>hive.default.fileformat.managed</name>
<value>ORC</value>
</property>
<property>
<name>hive.driver.parallel.compilation</name>
<value>true</value>
</property>
<property>
<name>hive.enforce.sortmergebucketmapjoin</name>
<value>true</value>
</property>
<property>
<name>hive.exec.compress.intermediate</name>
<value>false</value>
</property>
<property>
<name>hive.exec.compress.output</name>
<value>false</value>
</property>
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
</property>
<property>
<name>hive.exec.failure.hooks</name>
<value>org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook</value>
</property>
<property>
<name>hive.exec.max.created.files</name>
<value>100000</value>
</property>
<property>
<name>hive.exec.max.dynamic.partitions</name>
<value>5000</value>
</property>
<property>
<name>hive.exec.max.dynamic.partitions.pernode</name>
<value>2000</value>
</property>
<property>
<name>hive.exec.orc.split.strategy</name>
<value>HYBRID</value>
</property>
<property>
<name>hive.exec.parallel</name>
<value>false</value>
</property>
<property>
<name>hive.exec.parallel.thread.number</name>
<value>8</value>
</property>
<property>
<name>hive.exec.post.hooks</name>
<value>org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook</value>
</property>
<property>
<name>hive.exec.pre.hooks</name>
<value>org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook</value>
</property>
<property>
<name>hive.exec.reducers.bytes.per.reducer</name>
<value>4294967296</value>
</property>
<property>
<name>hive.exec.reducers.max</name>
<value>1009</value>
</property>
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive</value>
</property>
<property>
<name>hive.exec.submit.local.task.via.child</name>
<value>true</value>
</property>
<property>
<name>hive.exec.submitviachild</name>
<value>false</value>
</property>
<property>
<name>hive.execution.mode</name>
<value>container</value>
</property>
<property>
<name>hive.fetch.task.aggr</name>
<value>false</value>
</property>
<property>
<name>hive.fetch.task.conversion</name>
<value>none</value>
</property>
<property>
<name>hive.fetch.task.conversion.threshold</name>
<value>1073741824</value>
</property>
<property>
<name>hive.hook.proto.base-directory</name>
<value>/warehouse/tablespace/external/hive/sys.db/query_data/</value>
</property>
<property>
<name>hive.limit.optimize.enable</name>
<value>true</value>
</property>
<property>
<name>hive.limit.pushdown.memory.usage</name>
<value>0.04</value>
</property>
<property>
<name>hive.load.data.owner</name>
<value>hive</value>
</property>
<property>
<name>hive.lock.manager</name>
<value></value>
</property>
<property>
<name>hive.log.explain.output</name>
<value>true</value>
</property>
<property>
<name>hive.map.aggr</name>
<value>true</value>
</property>
<property>
<name>hive.map.aggr.hash.force.flush.memory.threshold</name>
<value>0.9</value>
</property>
<property>
<name>hive.map.aggr.hash.min.reduction</name>
<value>0.5</value>
</property>
<property>
<name>hive.map.aggr.hash.percentmemory</name>
<value>0.5</value>
</property>
<property>
<name>hive.mapjoin.bucket.cache.size</name>
<value>10000</value>
</property>
<property>
<name>hive.mapjoin.hybridgrace.hashtable</name>
<value>false</value>
</property>
<property>
<name>hive.mapjoin.optimized.hashtable</name>
<value>true</value>
</property>
<property>
<name>hive.mapred.reduce.tasks.speculative.execution</name>
<value>true</value>
</property>
<property>
<name>hive.materializedview.rewriting.incremental</name>
<value>false</value>
</property>
<property>
<name>hive.merge.mapfiles</name>
<value>true</value>
</property>
<property>
<name>hive.merge.mapredfiles</name>
<value>false</value>
</property>
<property>
<name>hive.merge.orcfile.stripe.level</name>
<value>true</value>
</property>
<property>
<name>hive.merge.rcfile.block.level</name>
<value>true</value>
</property>
<property>
<name>hive.merge.size.per.task</name>
<value>256000000</value>
</property>
<property>
<name>hive.merge.smallfiles.avgsize</name>
<value>100000000</value>
</property>
<property>
<name>hive.merge.tezfiles</name>
<value>false</value>
</property>
<property>
<name>hive.metastore.authorization.storage.checks</name>
<value>false</value>
</property>
<property>
<name>hive.metastore.cache.pinobjtypes</name>
<value>Table,Database,Type,FieldSchema,Order</value>
</property>
<property>
<name>hive.metastore.client.connect.retry.delay</name>
<value>5s</value>
</property>
<property>
<name>hive.metastore.client.socket.timeout</name>
<value>1800s</value>
</property>
<property>
<name>hive.metastore.connect.retries</name>
<value>24</value>
</property>
<property>
<name>hive.metastore.dml.events</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.execute.setugi</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.failure.retries</name>
<value>24</value>
</property>
<property>
<name>hive.metastore.pre.event.listeners</name>
<value>org.apache.hadoop.hive.ql.security.authorization.AuthorizationPreEventListener</value>
</property>
<property>
<name>hive.metastore.sasl.enabled</name>
<value>false</value>
</property>
<property>
<name>hive.metastore.server.max.threads</name>
<value>100000</value>
</property>
<property>
<name>hive.metastore.transactional.event.listeners</name>
<value>org.apache.hive.hcatalog.listener.DbNotificationListener</value>
</property>
<property>
<name>hive.optimize.bucketmapjoin</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.bucketmapjoin.sortedmerge</name>
<value>false</value>
</property>
<property>
<name>hive.optimize.constant.propagation</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.cp</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.dynamic.partition.hashjoin</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.index.filter</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.metadataonly</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.null.scan</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.reducededuplication</name>
<value>true</value>
</property>
<property>
<name>hive.optimize.reducededuplication.min.reducer</name>
<value>4</value>
</property>
<property>
<name>hive.optimize.sort.dynamic.partition</name>
<value>false</value>
</property>
<property>
<name>hive.orc.compute.splits.num.threads</name>
<value>10</value>
</property>
<property>
<name>hive.orc.splits.include.file.footer</name>
<value>false</value>
</property>
<property>
<name>hive.prewarm.enabled</name>
<value>false</value>
</property>
<property>
<name>hive.prewarm.numcontainers</name>
<value>3</value>
</property>
<property>
<name>hive.repl.cm.enabled</name>
<value></value>
</property>
<property>
<name>hive.repl.cmrootdir</name>
<value></value>
</property>
<property>
<name>hive.repl.rootdir</name>
<value></value>
</property>
<property>
<name>hive.security.authorization.createtable.owner.grants</name>
<value>ALL</value>
</property>
<property>
<name>hive.security.metastore.authenticator.manager</name>
<value>org.apache.hadoop.hive.ql.security.HadoopDefaultMetastoreAuthenticator</value>
</property>
<property>
<name>hive.security.metastore.authorization.auth.reads</name>
<value>false</value>
</property>
<property>
<name>hive.security.metastore.authorization.manager</name>
<value>org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider</value>
</property>
<property>
<name>hive.server2.allow.user.substitution</name>
<value>true</value>
</property>
<property>
<name>hive.server2.enable.doAs</name>
<value>true</value>
</property>
<property>
<name>hive.server2.idle.operation.timeout</name>
<value>6h</value>
</property>
<property>
<name>hive.server2.idle.session.timeout</name>
<value>1d</value>
</property>
<property>
<name>hive.server2.logging.operation.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.server2.max.start.attempts</name>
<value>5</value>
</property>
<property>
<name>hive.server2.support.dynamic.service.discovery</name>
<value>true</value>
</property>
<property>
<name>hive.server2.table.type.mapping</name>
<value>CLASSIC</value>
</property>
<property>
<name>hive.server2.tez.default.queues</name>
<value>default</value>
</property>
<property>
<name>hive.server2.tez.sessions.per.default.queue</name>
<value>1</value>
</property>
<property>
<name>hive.server2.thrift.http.path</name>
<value>cliservice</value>
</property>
<property>
<name>hive.server2.thrift.http.port</name>
<value>10001</value>
</property>
<property>
<name>hive.server2.thrift.max.worker.threads</name>
<value>1200</value>
</property>
<property>
<name>hive.server2.thrift.sasl.qop</name>
<value>auth</value>
</property>
<property>
<name>hive.server2.transport.mode</name>
<value>binary</value>
</property>
<property>
<name>hive.server2.use.SSL</name>
<value>false</value>
</property>
<property>
<name>hive.server2.webui.cors.allowed.headers</name>
<value>X-Requested-With,Content-Type,Accept,Origin,X-Requested-By,x-requested-by</value>
</property>
<property>
<name>hive.server2.webui.enable.cors</name>
<value>true</value>
</property>
<property>
<name>hive.server2.webui.port</name>
<value>10002</value>
</property>
<property>
<name>hive.server2.webui.use.ssl</name>
<value>false</value>
</property>
<property>
<name>hive.service.metrics.codahale.reporter.classes</name>
<value>org.apache.hadoop.hive.common.metrics.metrics2.JsonFileMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.JmxMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.Metrics2Reporter</value>
</property>
<property>
<name>hive.smbjoin.cache.rows</name>
<value>10000</value>
</property>
<property>
<name>hive.stats.autogather</name>
<value>true</value>
</property>
<property>
<name>hive.stats.dbclass</name>
<value>fs</value>
</property>
<property>
<name>hive.stats.fetch.column.stats</name>
<value>true</value>
</property>
<property>
<name>hive.stats.fetch.partition.stats</name>
<value>true</value>
</property>
<property>
<name>hive.strict.managed.tables</name>
<value>false</value>
</property>
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.tez.auto.reducer.parallelism</name>
<value>true</value>
</property>
<property>
<name>hive.tez.bucket.pruning</name>
<value>true</value>
</property>
<property>
<name>hive.tez.cartesian-product.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.tez.cpu.vcores</name>
<value>-1</value>
</property>
<property>
<name>hive.tez.dynamic.partition.pruning</name>
<value>true</value>
</property>
<property>
<name>hive.tez.dynamic.partition.pruning.max.data.size</name>
<value>104857600</value>
</property>
<property>
<name>hive.tez.dynamic.partition.pruning.max.event.size</name>
<value>1048576</value>
</property>
<property>
<name>hive.tez.exec.print.summary</name>
<value>true</value>
</property>
<property>
<name>hive.tez.input.format</name>
<value>org.apache.hadoop.hive.ql.io.HiveInputFormat</value>
</property>
<property>
<name>hive.tez.input.generate.consistent.splits</name>
<value>true</value>
</property>
<property>
<name>hive.tez.java.opts</name>
<value>-server -Djava.net.preferIPv4Stack=true -XX:NewRatio=8 -XX:+UseNUMA -XX:+UseG1GC -XX:+ResizeTLAB -XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps</value>
</property>
<property>
<name>hive.tez.log.level</name>
<value>INFO</value>
</property>
<property>
<name>hive.tez.max.partition.factor</name>
<value>2.0</value>
</property>
<property>
<name>hive.tez.min.partition.factor</name>
<value>0.25</value>
</property>
<property>
<name>hive.tez.smb.number.waves</name>
<value>0.5</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.txn.max.open.batch</name>
<value>1000</value>
</property>
<property>
<name>hive.txn.strict.locking.mode</name>
<value>false</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>300</value>
</property>
<property>
<name>hive.user.install.directory</name>
<value>/user/</value>
</property>
<property>
<name>hive.vectorized.execution.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.vectorized.execution.mapjoin.minmax.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.vectorized.execution.mapjoin.native.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.vectorized.execution.reduce.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.vectorized.groupby.checkinterval</name>
<value>4096</value>
</property>
<property>
<name>hive.vectorized.groupby.flush.percent</name>
<value>0.1</value>
</property>
<property>
<name>hive.vectorized.groupby.maxentries</name>
<value>100000</value>
</property>
<property>
<name>mapred.max.split.size</name>
<value>256000000</value>
</property>
<property>
<name>mapred.min.split.size.per.node</name>
<value>128000000</value>
</property>
<property>
<name>mapred.min.split.size.per.rack</name>
<value>128000000</value>
</property>
<property>
<name>metastore.create.as.acid</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.kerberos.keytab.file</name>
<value>/etc/security/keytabs/hive.service.keytab</value>
</property>
<property>
<name>hive.metastore.kerberos.principal</name>
<value>hive/_HOST@EXAMPLE.COM</value>
</property>
<property>
<name>hive.server2.authentication.spnego.keytab</name>
<value>/etc/security/keytabs/spnego.service.keytab</value>
</property>
<property>
<name>hive.server2.authentication.spnego.principal</name>
<value>HTTP/_HOST@EXAMPLE.COM</value>
</property>
<!--
<property>
<name>hive.kudu.master.addresses.default</name>
<value>bdd11:7051,bdd12:7051,bdd13:7051,app1:7051,es2:7051</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>LDAP</value>
</property>
<property>
<name>hive.server2.authentication.ldap.baseDN</name>
<value>ou=bigdata,dc=gdh,dc=yore,dc=com</value>
</property>
<property>
<name>hive.server2.authentication.ldap.url</name>
<value>ldap://bdm0:389</value>
</property>
<property>
<name>hive.cluster.delegation.token.store.zookeeper.connectString</name>
<value>bdm0:2181,bdm1:2181,etl1:2181,es1:2181,es2:2181</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>bdm0,bdm1,etl1,es1,es2</value>
</property>
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
<property>
<name>hive.cluster.delegation.token.store.class</name>
<value>org.apache.hadoop.hive.thrift.ZooKeeperTokenStore</value>
</property>
<property>
<name>hive.cluster.delegation.token.store.zookeeper.znode</name>
<value>/hive/cluster/delegation</value>
</property>
<property>
<name>hive.server2.zookeeper.namespace</name>
<value>hiveserver2</value>
</property>
<property>
<name>hive.zookeeper.client.port</name>
<value>2181</value>
</property>
<property>
<name>hive.zookeeper.namespace</name>
<value>hive_zookeeper_namespace</value>
</property>
<property>
<name>hive.zookeeper.quorum</name>
<value>node01:2181</value>
</property>
<property>
<name>zookeeper.znode.parent</name>
<value>/hbase-unsecure</value>
</property>
-->
hive.metastore.warehouse.dir=/warehouse/tablespace/managed/hive and hive.metastore.warehouse.external.dir=/warehouse/tablespace/external/hive
Specify the HDFS paths where managed (internal) and external tables are stored.
hive.metastore.db.type=mysql, javax.jdo.option.ConnectionDriverName=com.mysql.jdbc.Driver, javax.jdo.option.ConnectionURL=jdbc:mysql://node01:3306/hive?createDatabaseIfNotExist=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false, javax.jdo.option.ConnectionUserName=hive, javax.jdo.option.ConnectionPassword=******
Metastore database settings. For production it is recommended to keep the metastore in a relational database hosted outside the cluster (a preparation sketch is given right after this list).
hive.tez.container.size=8192
Keep this consistent with tez.task.resource.memory.mb in the Tez configuration.
hive.heapsize=2048
Can be increased as appropriate; here it is set to 2048.
hive.insert.into.multilevel.dirs=true
When true, multi-level directories may be created on insert; otherwise the parent directory must already exist.
hive.exec.stagingdir=/tmp/hive/.hive-staging
Defaults to .hive-staging, i.e. the staging files are created under the table's own directory. It is recommended to point this to an HDFS path outside the table directory: in production, Hive data is often extracted by synchronization tools that read the qualifying files directly from the table's HDFS path to speed up I/O (DataX, for example), and if Hive data is being changed at the same time, the synchronized data may be duplicated or doubled.
hive.metastore.event.listeners
Configure this if you need to listen to Hive metastore events: package an implementation class extending MetaStoreEventListener into a jar, put the jar under $HIVE_HOME/lib, and set the class name here.
Distribute the same configuration to the other nodes and adjust it as appropriate.
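As a minimal sketch (assuming a MySQL 5.7+ instance is already running on node01 and you have root access; the database name, user, and password are taken from the javax.jdo settings above), the metastore database and account could be prepared like this:
# Sketch only: create the metastore database and account referenced by javax.jdo.option.Connection*
mysql -h node01 -u root -p -e "
  CREATE DATABASE IF NOT EXISTS hive DEFAULT CHARACTER SET utf8;
  CREATE USER IF NOT EXISTS 'hive'@'%' IDENTIFIED BY '123456';
  GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'%';
  FLUSH PRIVILEGES;"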
If the services are started as a non-root user, you will most likely run into permission problems; based on the error logs, change the owner/group of the affected directories so that the startup user has access (see the example below). A test environment may also run into resource limits, in which case simply scale down some of the memory settings above to match the actual hardware.
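For example, a minimal sketch of fixing ownership of the usual log and runtime directories for the service users (the directories to fix should come from the actual error messages; the paths below are only illustrative):
# Illustrative only: give the service users access to their log/run directories
chown -R hdfs:hadoop /var/log/hadoop /var/run/hadoop
chown -R yarn:hadoop /var/log/hadoop-yarn /var/run/hadoop-yarn
chown -R hive:hadoop /var/log/hive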
If spark2_shuffle has been added to yarn.nodemanager.aux-services, starting YARN may fail with the following error:
ERROR org.apache.hadoop.yarn.server.nodemanager.NodeManager: Error starting NodeManager
java.lang.UnsatisfiedLinkError: Could not load library. Reasons: [no leveldbjni64-1.8 in java.library.path, no leveldbjni-1.8 in java.library.path, no leveldbjni in java.library.path, No such file or directory]
Hadoop loads /usr/hdp/3.1.5.0-152/hadoop-hdfs/lib/leveldbjni-all-1.8.jar at startup. Take the libleveldbjni.so that matches the version used by spark-2.3.2.3.1.5.0-152-yarn-shuffle.jar (installed from spark2_3_1_5_0_152-yarn-shuffle-2.3.2.3.1.5.0-152.noarch.rpm) and place it somewhere on java.library.path. The current value of java.library.path can be checked with:
java -XshowSettings:properties
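One way to do this, as a minimal sketch (assuming the Linux x86_64 native library is bundled under META-INF/native/linux64/ inside leveldbjni-all-1.8.jar, and that /usr/lib64 is on the default java.library.path; verify both with the command above):
# Sketch only: extract the bundled native library and copy it onto java.library.path
cd /tmp
unzip -o /usr/hdp/3.1.5.0-152/hadoop-hdfs/lib/leveldbjni-all-1.8.jar 'META-INF/native/linux64/libleveldbjni.so'
cp META-INF/native/linux64/libleveldbjni.so /usr/lib64/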
# Format the NameNode and prepare the local data directories
/usr/hdp/current/hadoop/bin/hdfs namenode -format
chown -R hdfs:hadoop /hadoop
mkdir /hadoop/{yarn,mapred,mapreduce}
# Create the HDFS directories (requires HDFS to be up; see the start commands below)
hadoop fs -mkdir /{home,user,tmp}
hadoop fs -mkdir -p /hdp/apps/3.1.5.0-152/{mapreduce,tez}
hadoop fs -put /usr/hdp/current/hadoop/mapreduce.tar.gz /hdp/apps/3.1.5.0-152/mapreduce/
chmod 755 /usr/hdp/3.1.5.0-152/hadoop-yarn/bin/container-executor
# Add the service users to the required groups (-a appends instead of replacing existing groups)
usermod -aG hadoop hdfs
usermod -aG hadoop yarn
usermod -aG hdfs yarn
chown root:hadoop /var/lib/{hadoop-hdfs,hadoop-mapreduce,hadoop-yarn}
# Check and, if necessary, leave HDFS safe mode
hdfs dfsadmin -safemode get
hdfs dfsadmin -safemode leave
# Start the HDFS daemons (run as the hdfs user)
#su - hdfs
/usr/hdp/current/hadoop/bin/hdfs --config /etc/hadoop/conf --daemon start namenode
/usr/hdp/current/hadoop/bin/hdfs --config /etc/hadoop/conf --daemon start secondarynamenode
/usr/hdp/current/hadoop/bin/hdfs --config /etc/hadoop/conf --daemon start datanode
# Start the YARN daemons (run as the yarn user)
#su - yarn
/usr/hdp/current/hadoop/bin/yarn --config /etc/hadoop/conf --daemon start nodemanager
/usr/hdp/current/hadoop/bin/yarn --config /etc/hadoop/conf --daemon start resourcemanager
# Start ZooKeeper
mkdir -p /var/lib/zookeeper
echo "1" > /var/lib/zookeeper/myid
/usr/hdp/current/zookeeper/bin/zkServer.sh start /etc/zookeeper/conf/zoo.cfg
# Download the MySQL JDBC driver and initialize the metastore schema
wget https://repo.huaweicloud.com/repository/maven/mysql/mysql-connector-java/5.1.47/mysql-connector-java-5.1.47.jar -P /usr/hdp/current/hive/lib/
/usr/hdp/current/hive/bin/schematool -dbType mysql -initSchema
# Upload the Tez archive referenced by tez.lib.uris
hadoop fs -put /usr/hdp/current/tez/lib/tez.tar.gz /hdp/apps/3.1.5.0-152/tez/
# Start the Hive metastore and HiveServer2 in the background
/usr/hdp/current/hive/bin/hive --service metastore >/dev/null 2>&1 &
/usr/hdp/current/hive/bin/hive --service hiveserver2 >/dev/null 2>&1 &
# Smoke-test MapReduce with the bundled wordcount example
hadoop fs -mkdir -p /tmp/input
hadoop fs -put /usr/hdp/current/hadoop/src/dev-support/README.md /tmp/input
hadoop jar /usr/hdp/current/hadoop-mapreduce/hadoop-mapreduce-examples.jar wordcount /tmp/input /tmp/output
# Check ZooKeeper status and connect to HiveServer2 with Beeline
/usr/hdp/current/zookeeper/bin/zkServer.sh status /etc/zookeeper/conf/zoo.cfg
/usr/hdp/current/hive/bin/beeline --color=true -u jdbc:hive2://node01:10000/default -n hive
0: jdbc:hive2://node01:10000/default> set hive.execution.engine;
+----------------------------+
| set |
+----------------------------+
| hive.execution.engine=tez |
+----------------------------+
1 row selected (0.403 seconds)
-- Create a test table
CREATE TABLE `visit_t01` (
uid string,
visit_date string,
visit_count bigint
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Insert test data
INSERT INTO visit_t01 VALUES ('u01', '2019/11/21', 5),('u02', '2019/11/23', 6),
('u03', '2019/11/22', 8),('u04', '2019/11/20', 3),('u01', '2019/11/23', 6),
('u01', '2019/12/21', 8),('u02', '2019/11/23', 6),('u01', '2019/12/22', 4);
-- Query the inserted data
0: jdbc:hive2://node01:10000/default> SELECT * FROM visit_t01 LIMIT 10;
+----------------+-----------------------+------------------------+
| visit_t01.uid | visit_t01.visit_date | visit_t01.visit_count |
+----------------+-----------------------+------------------------+
| u01 | 2019/11/21 | 5 |
| u02 | 2019/11/23 | 6 |
| u03 | 2019/11/22 | 8 |
| u04 | 2019/11/20 | 3 |
| u01 | 2019/11/23 | 6 |
| u01 | 2019/12/21 | 8 |
| u02 | 2019/11/23 | 6 |
| u01 | 2019/12/22 | 4 |
+----------------+-----------------------+------------------------+
8 rows selected
-- For each user, compute the monthly visit count and the cumulative (running) visit count
SELECT B.uid,B.visit_date2,B.v_count `月计`,
SUM(v_count) OVER(PARTITION BY uid ORDER BY visit_date2) `累计` FROM (
SELECT uid,visit_date2,SUM(visit_count) AS v_count FROM (
SELECT uid,date_format(regexp_replace(visit_date, '/','-'),'yyyy-MM') visit_date2,visit_count
FROM visit_t01
) A GROUP BY uid,visit_date2
) B;
----------------------------------------------------------------------------------------------
VERTICES MODE STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
----------------------------------------------------------------------------------------------
Map 1 .......... container SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... container SUCCEEDED 2 2 0 0 0 0
----------------------------------------------------------------------------------------------
VERTICES: 02/02 [==========================>>] 100% ELAPSED TIME: 20.42 s
----------------------------------------------------------------------------------------------
+--------+----------------+-----+-----+
| b.uid | b.visit_date2 | 月计 | 累计 |
+--------+----------------+-----+-----+
| u01 | 2019-11 | 11 | 11 |
| u01 | 2019-12 | 12 | 23 |
| u03 | 2019-11 | 8 | 8 |
| u02 | 2019-11 | 12 | 12 |
| u04 | 2019-11 | 3 | 3 |
+--------+----------------+-----+-----+
5 rows selected (25.988 seconds)