Environment: Hadoop 2.7.2 + Hive 1.2.1
A little over a year ago we upgraded Hive from 0.14 to 1.2.1. Afterwards we noticed that the final step of a job, writing out the result data, had become much slower than before. The cause turned out to be a changed default in the new version: Hive now creates its intermediate result files under the table directory, in staging directories whose names start with .hive-staging_hive_, whereas earlier versions put them under /tmp/hive. The fix we eventually found:
Change the following parameter in the Hive configuration file:
<property>
  <name>hive.exec.stagingdir</name>
  <value>/tmp/hive/.hive-staging</value>
</property>
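As a side note, before editing hive-site.xml you can try the new value for a single session from the Hive CLI. A quick sketch, using the same property as above (this assumes the property is not in your restricted-config list):

-- session-level override; affects only subsequent queries in this session
SET hive.exec.stagingdir=/tmp/hive/.hive-staging;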
The root cause: in Hive 1.2.1, when the intermediate result (staging) directory sits under the same root as the target directory, the intermediate data is copied into the target directory, whereas earlier versions simply renamed the staging directory under /tmp/hive onto the target. With the new default, for example, an INSERT into a table located at /user/hive/warehouse/t1 stages its output in /user/hive/warehouse/t1/.hive-staging_hive_…, i.e. inside the target directory itself. The slowdown is therefore this extra data copy. The relevant code:
MoveTask.java:

private void moveFile(Path sourcePath, Path targetPath, boolean isDfsDir)
    throws Exception {
  FileSystem fs = sourcePath.getFileSystem(conf);
  if (isDfsDir) {
    // Just do a rename on the URIs, they belong to the same FS
    String mesg = "Moving data to: " + targetPath.toString();
    String mesg_detail = " from " + sourcePath.toString();
    console.printInfo(mesg, mesg_detail);

    // if source exists, rename. Otherwise, create a empty directory
    if (fs.exists(sourcePath)) {
      Path deletePath = null;
      // If it multiple level of folder are there fs.rename is failing so first
      // create the targetpath.getParent() if it not exist
      if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_INSERT_INTO_MULTILEVEL_DIRS)) {
        deletePath = createTargetPath(targetPath, fs);
      }
      if (!Hive.moveFile(conf, sourcePath, targetPath, fs, true, false)) {
        try {
          if (deletePath != null) {
            fs.delete(deletePath, true);
          }
        } catch (IOException e) {
          LOG.info("Unable to delete the path created for facilitating rename"
              + deletePath);
        }
        throw new HiveException("Unable to rename: " + sourcePath
            + " to: " + targetPath);
      }
    } else if (!fs.mkdirs(targetPath)) {
      throw new HiveException("Unable to make directory: " + targetPath);
    }
  } else {
    // This is a local file
    String mesg = "Copying data to local directory " + targetPath.toString();
    String mesg_detail = " from " + sourcePath.toString();
    console.printInfo(mesg, mesg_detail);

    // delete the existing dest directory
    LocalFileSystem dstFs = FileSystem.getLocal(conf);

    if (dstFs.delete(targetPath, true) || !dstFs.exists(targetPath)) {
      console.printInfo(mesg, mesg_detail);
      // if source exists, rename. Otherwise, create a empty directory
      if (fs.exists(sourcePath)) {
        fs.copyToLocalFile(sourcePath, targetPath);
      } else {
        if (!dstFs.mkdirs(targetPath)) {
          throw new HiveException("Unable to make local directory: "
              + targetPath);
        }
      }
    } else {
      throw new AccessControlException(
          "Unable to delete the existing destination directory: "
              + targetPath);
    }
  }
}
Hive.java (the Hive.moveFile called above):

public static boolean moveFile(HiveConf conf, Path srcf, Path destf,
    FileSystem fs, boolean replace, boolean isSrcLocal) throws HiveException {
  boolean success = false;

  // needed for perm inheritance.
  boolean inheritPerms = HiveConf.getBoolVar(conf,
      HiveConf.ConfVars.HIVE_WAREHOUSE_SUBDIR_INHERIT_PERMS);
  HadoopShims shims = ShimLoader.getHadoopShims();
  HadoopShims.HdfsFileStatus destStatus = null;
  HadoopShims.HdfsEncryptionShim hdfsEncryptionShim = SessionState.get().getHdfsEncryptionShim();

  // If source path is a subdirectory of the destination path:
  // ex: INSERT OVERWRITE DIRECTORY 'target/warehouse/dest4.out' SELECT src.value WHERE src.key >= 300;
  // where the staging directory is a subdirectory of the destination directory
  // (1) Do not delete the dest dir before doing the move operation.
  // (2) It is assumed that subdir and dir are in same encryption zone.
  // (3) Move individual files from scr dir to dest dir.
  boolean destIsSubDir = isSubDir(srcf, destf, fs, isSrcLocal);
  try {
    if (inheritPerms || replace) {
      try {
        destStatus = shims.getFullFileStatus(conf, fs, destf.getParent());
        // if destf is an existing directory:
        //   if replace is true, delete followed by rename (mv) is equivalent to replace
        //   if replace is false, rename (mv) actually moves the src under the dest dir
        // if destf is an existing file, rename is actually a replace, and there is no need
        // to delete the file first
        if (replace && !destIsSubDir) {
          LOG.debug("The path " + destf.toString() + " is deleted");
          fs.delete(destf, true);
        }
      } catch (FileNotFoundException ignore) {
        // if dest dir does not exist, any re
        if (inheritPerms) {
          destStatus = shims.getFullFileStatus(conf, fs, destf.getParent());
        }
      }
    }
    if (!isSrcLocal) {
      // For NOT local src file, rename the file
      if (hdfsEncryptionShim != null
          && (hdfsEncryptionShim.isPathEncrypted(srcf) || hdfsEncryptionShim.isPathEncrypted(destf))
          && !hdfsEncryptionShim.arePathsOnSameEncryptionZone(srcf, destf)) {
        LOG.info("Copying source " + srcf + " to " + destf + " because HDFS encryption zones are different.");
        success = FileUtils.copy(srcf.getFileSystem(conf), srcf, destf.getFileSystem(conf), destf,
            true,    // delete source
            replace, // overwrite destination
            conf);
      } else {
        if (destIsSubDir) {
          FileStatus[] srcs = fs.listStatus(srcf, FileUtils.HIDDEN_FILES_PATH_FILTER);
          if (srcs.length == 0) {
            success = true; // Nothing to move.
          }
          for (FileStatus status : srcs) {
            success = FileUtils.copy(srcf.getFileSystem(conf), status.getPath(), destf.getFileSystem(conf), destf,
                true,    // delete source
                replace, // overwrite destination
                conf);

            if (!success) {
              throw new HiveException("Unable to move source " + status.getPath() + " to destination " + destf);
            }
          }
        } else {
          success = fs.rename(srcf, destf);
        }
      }
    } else {
      // For local src file, copy to hdfs
      fs.copyFromLocalFile(srcf, destf);
      success = true;
    }

    LOG.info((replace ? "Replacing src:" : "Renaming src: ") + srcf.toString()
        + ", dest: " + destf.toString() + ", Status:" + success);
  } catch (IOException ioe) {
    throw new HiveException("Unable to move source " + srcf + " to destination " + destf, ioe);
  }

  if (success && inheritPerms) {
    try {
      ShimLoader.getHadoopShims().setFullFileStatus(conf, destStatus, fs, destf);
    } catch (IOException e) {
      LOG.warn("Error setting permission of file " + destf + ": " + e.getMessage(), e);
    }
  }
  return success;
}
As the code shows, there are two strategies: when the source (staging) directory is a subdirectory of the destination directory (destIsSubDir above), every file under the source directory is copied to the destination individually; otherwise a rename is performed, which only updates metadata and moves no data.
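Incidentally, the subdirectory test that picks between the two strategies is essentially a string-prefix comparison on the path names. Below is a minimal sketch of such a check, modeled loosely on Hive's FileUtils.isSubDir; it is an illustration, not the exact Hive 1.2.1 implementation, and the class name SubDirCheck and the example paths are made up:

import org.apache.hadoop.fs.Path;

public class SubDirCheck {
  // Returns true if srcf lies underneath destf. A trailing separator is
  // appended so that /a/bc is not mistaken for a child of /a/b.
  static boolean isSubDir(Path srcf, Path destf) {
    String src = srcf.toUri().getPath() + Path.SEPARATOR;
    String dest = destf.toUri().getPath() + Path.SEPARATOR;
    return src.startsWith(dest);
  }

  public static void main(String[] args) {
    Path table = new Path("/user/hive/warehouse/t1");
    Path staging = new Path("/user/hive/warehouse/t1/.hive-staging_hive_x");
    Path tmpStaging = new Path("/tmp/hive/.hive-staging");
    System.out.println(isSubDir(staging, table));    // true  -> per-file copy
    System.out.println(isSubDir(tmpStaging, table)); // false -> metadata-only rename
  }
}

This is why moving the staging directory back under /tmp/hive restores the cheap rename path.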
Why does it behave this way? I ran a small verification of rename:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TestHdfs {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Rename a dot-prefixed directory onto an existing destination
    // directory and observe where its contents end up.
    Path src = new Path("/tmp/lgh/d1/.dd1");
    Path dst = new Path("/usr");
    fs.rename(src, dst);
  }
}
In this test the .dd1 directory contained data files, and after the program ran they had all been moved into /usr/. That is, for a source directory whose name starts with a dot, if the source and destination paths do not share a common prefix, rename moves the directory's data files into the destination directory; otherwise the directory itself is moved across.
Hive's intermediate result directories are, by convention, named with a leading .hive-staging, which is exactly how the situation above arises. You can verify the same behavior with the hadoop fs -mv command.
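For anyone who wants to try the hadoop fs -mv check without writing Java, here is a rough shell equivalent of the test above. The paths reuse the ones from the test program; on a real cluster, point them at a scratch directory rather than /usr:

# set up a dot-prefixed directory containing one data file
hadoop fs -mkdir -p /tmp/lgh/d1/.dd1
hadoop fs -touchz /tmp/lgh/d1/.dd1/part-00000
# move it onto an existing destination directory, then see where the file landed
hadoop fs -mv /tmp/lgh/d1/.dd1 /usr
hadoop fs -ls /usr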