Show block information:
% hdfs fsck / -files -blocks
Use a java.net.URL object to open a data stream:
InputStream in = null;
try {
  in = new URL("hdfs://host/path").openStream();
  // process in
} finally {
  IOUtils.closeStream(in);
}
Displaying a file from a Hadoop filesystem on standard output using a URLStreamHandler:
public class URLCat {

  static {
    URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
  }

  public static void main(String[] args) throws Exception {
    InputStream in = null;
    try {
      in = new URL(args[0]).openStream();
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      IOUtils.closeStream(in);
    }
  }
}
usage
% export HADOOP_CLASSPATH=hadoop-examples.jar
% hadoop URLCat hdfs://localhost/user/tom/quangle.txt
On the top of the Crumpetty Tree
The Quangle Wangle sat,
But his face you could not see,
On account of his Beaver Hat.
public class FileSystemCat {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    InputStream in = null;
    try {
      in = fs.open(new Path(uri));
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      IOUtils.closeStream(in);
    }
  }
}
usage
% hadoop FileSystemCat hdfs://localhost/user/tom/quangle.txt
On the top of the Crumpetty Tree
The Quangle Wangle sat,
But his face you could not see,
On account of his Beaver Hat.
FSDataInputStream
package org.apache.hadoop.fs;

public class FSDataInputStream extends DataInputStream
    implements Seekable, PositionedReadable {
  // implementation elided
}

public interface Seekable {
  void seek(long pos) throws IOException;
  long getPos() throws IOException;
}

public class FileSystemDoubleCat {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    FSDataInputStream in = null;
    try {
      in = fs.open(new Path(uri));
      IOUtils.copyBytes(in, System.out, 4096, false);
      in.seek(0); // go back to the start of the file
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      IOUtils.closeStream(in);
    }
  }
}
usage
% hadoop FileSystemDoubleCat hdfs://localhost/user/tom/quangle.txt
On the top of the Crumpetty Tree
The Quangle Wangle sat,
But his face you could not see,
On account of his Beaver Hat.
On the top of the Crumpetty Tree
The Quangle Wangle sat,
But his face you could not see,
On account of his Beaver Hat.
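As declared above, FSDataInputStream also implements the PositionedReadable interface, which lets a part of the file be read at a given offset without moving the stream's current position. In outline (method bodies and Javadoc omitted), the interface looks roughly like this:

public interface PositionedReadable {
  int read(long position, byte[] buffer, int offset, int length) throws IOException;
  void readFully(long position, byte[] buffer, int offset, int length) throws IOException;
  void readFully(long position, byte[] buffer) throws IOException;
}

These positioned reads do not change the stream's current offset, so they are convenient for fetching, say, a header while streaming the rest of the file.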
public class FileCopyWithProgress {
  public static void main(String[] args) throws Exception {
    String localSrc = args[0];
    String dst = args[1];
    InputStream in = new BufferedInputStream(new FileInputStream(localSrc));
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(dst), conf);
    OutputStream out = fs.create(new Path(dst), new Progressable() {
      public void progress() {
        System.out.print(".");
      }
    });
    IOUtils.copyBytes(in, out, 4096, true);
  }
}
usage
% hadoop FileCopyWithProgress input/docs/1400-8.txt hdfs://localhost/user/tom/1400-8.txt
.................
public class ShowFileStatusTest {

  private MiniDFSCluster cluster; // use an in-process HDFS cluster for testing
  private FileSystem fs;

  @Before
  public void setUp() throws IOException {
    Configuration conf = new Configuration();
    if (System.getProperty("test.build.data") == null) {
      System.setProperty("test.build.data", "/tmp");
    }
    cluster = new MiniDFSCluster.Builder(conf).build();
    fs = cluster.getFileSystem();
    OutputStream out = fs.create(new Path("/dir/file"));
    out.write("content".getBytes("UTF-8"));
    out.close();
  }

  @After
  public void tearDown() throws IOException {
    if (fs != null) { fs.close(); }
    if (cluster != null) { cluster.shutdown(); }
  }

  @Test(expected = FileNotFoundException.class)
  public void throwsFileNotFoundForNonExistentFile() throws IOException {
    fs.getFileStatus(new Path("no-such-file"));
  }

  @Test
  public void fileStatusForFile() throws IOException {
    Path file = new Path("/dir/file");
    FileStatus stat = fs.getFileStatus(file);
    assertThat(stat.getPath().toUri().getPath(), is("/dir/file"));
    assertThat(stat.isDirectory(), is(false));
    assertThat(stat.getLen(), is(7L));
    assertThat(stat.getModificationTime(), is(lessThanOrEqualTo(System.currentTimeMillis())));
    assertThat(stat.getReplication(), is((short) 1));
    assertThat(stat.getBlockSize(), is(128 * 1024 * 1024L));
    assertThat(stat.getOwner(), is(System.getProperty("user.name")));
    assertThat(stat.getGroup(), is("supergroup"));
    assertThat(stat.getPermission().toString(), is("rw-r--r--"));
  }

  @Test
  public void fileStatusForDirectory() throws IOException {
    Path dir = new Path("/dir");
    FileStatus stat = fs.getFileStatus(dir);
    assertThat(stat.getPath().toUri().getPath(), is("/dir"));
    assertThat(stat.isDirectory(), is(true));
    assertThat(stat.getLen(), is(0L));
    assertThat(stat.getModificationTime(), is(lessThanOrEqualTo(System.currentTimeMillis())));
    assertThat(stat.getReplication(), is((short) 0));
    assertThat(stat.getBlockSize(), is(0L));
    assertThat(stat.getOwner(), is(System.getProperty("user.name")));
    assertThat(stat.getGroup(), is("supergroup"));
    assertThat(stat.getPermission().toString(), is("rwxr-xr-x"));
  }
}
public class ListStatus {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path[] paths = new Path[args.length];
    for (int i = 0; i < paths.length; i++) {
      paths[i] = new Path(args[i]);
    }
    FileStatus[] status = fs.listStatus(paths);
    Path[] listedPaths = FileUtil.stat2Paths(status);
    for (Path p : listedPaths) {
      System.out.println(p);
    }
  }
}
usage
% hadoop ListStatus hdfs://localhost/ hdfs://localhost/user/tom
hdfs://localhost/user
hdfs://localhost/user/tom/books
hdfs://localhost/user/tom/quangle.txt
public class RegexExcludePathFilter implements PathFilter {

  private final String regex;

  public RegexExcludePathFilter(String regex) {
    this.regex = regex;
  }

  public boolean accept(Path path) {
    return !path.toString().matches(regex);
  }
}
usage
fs.globStatus(new Path("/2007/*/*"), new RegexExcludePathFilter("^.*/2007/12/31$"))
DistributedFileSystem calls the namenode via RPC to determine the locations of the blocks for the first part of the file. For each block, the namenode returns the addresses of the datanodes that hold a replica of that block, and these datanodes are sorted by their proximity to the client according to the cluster's network topology. If the client is itself a datanode (for example, in a MapReduce task), it reads from the local datanode.
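To make this concrete, client code can ask the FileSystem which datanodes hold each block via getFileBlockLocations(). A minimal sketch (the class name ListBlockLocations is just for illustration; imports are omitted, as in the other listings):

public class ListBlockLocations {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    FileStatus stat = fs.getFileStatus(new Path(uri));
    // one BlockLocation per block, listing the hosts that hold its replicas
    BlockLocation[] blocks = fs.getFileBlockLocations(stat, 0, stat.getLen());
    for (BlockLocation block : blocks) {
      System.out.println(block.getOffset() + "+" + block.getLength()
          + " on " + java.util.Arrays.toString(block.getHosts()));
    }
  }
}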
Placing all replicas on a single node costs almost nothing in write bandwidth, but it provides no real redundancy, and the bandwidth cost of off-rack reads is high.
The first replica is placed on the same node as the client (if the client is running outside the cluster, a node is chosen at random, although the system avoids nodes that are too full or too busy).
Once the replica locations have been chosen, a write pipeline is built, taking the network topology into account.
While data is being written, the behaviour shown in the code below can be observed: the block currently being written is not visible to other readers. Only after more than a block's worth of data has been written does the first block become visible to new readers.
Path p = new Path("p");
fs.create(p);
// the file is reported as existing
assertThat(fs.exists(p), is(true));
Path p = new Path("p");
OutputStream out = fs.create(p);
out.write("content".getBytes("UTF-8"));
out.flush();
// the file length is reported as zero
assertThat(fs.getFileStatus(p).getLen(), is(0L));
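To make the written content visible to new readers without closing the file, FSDataOutputStream offers hflush() (older releases called it sync()). A minimal sketch, reusing the same fs and Hamcrest assertions as in the test above:

Path p = new Path("p");
FSDataOutputStream out = fs.create(p);
out.write("content".getBytes("UTF-8"));
out.hflush();
// after hflush() the data written so far is visible to new readers
assertThat(fs.getFileStatus(p).getLen(), is((long) "content".length()));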
# copy a directory between clusters
% hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar
# this creates the directory /bar/foo on the second namenode; /bar is created automatically if it does not exist
# copy files or directories on the default filesystem
% hadoop distcp dir1 dir2
Useful extra options: -overwrite, -update, -m, -delete
% hadoop distcp -update hdfs://namenode1/foo hdfs://namenode2/bar/foo  # note: with -update the destination must name the same directory (foo)
% hadoop distcp -update -delete -p hdfs://namenode1/foo hdfs://namenode2/foo
The -delete flag makes distcp delete any files or directories in the destination that are not present in the source, and -p preserves file status attributes such as permissions, block size, and replication. Running distcp with no arguments prints the exact usage instructions.
The -m option sets the number of map tasks for the distcp MapReduce job.
For example, if there are 1,000 GB of files in total, -m 1000 allocates 1,000 maps, each copying 1 GB on average.
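For instance (the /data paths below are only placeholders), capping the copy at 1,000 maps:
% hadoop distcp -m 1000 hdfs://namenode1/data hdfs://namenode2/data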
% hadoop distcp webhdfs://namenode1:50070/foo webhdfs://namenode2:50070/foo
Hadoop Archives, or HAR files, are a file archiving facility that packs files into HDFS blocks more efficiently, reducing namenode memory usage while still allowing transparent access to the files. In particular, a HAR file can be used as input to MapReduce.
usage
# create an archive
hadoop archive -archiveName files.har /my/files /my
# list the files in the archive
hadoop fs -lsr har:///my/files.har
Why must a HAR file have the .har extension? Because the HAR filesystem translates the har URI into a URI for the underlying filesystem, as shown below.
hadoop fs -lsr har://hdfs-localhost:8020/my/files.har
# delete the archive
hadoop fs -rmr /my/files.har
1. A HAR file takes up as much disk space as the original files; archive compression is not currently supported.
2. Once created, an archive is immutable; to add or remove files you must re-create the archive.
3. There is no archive-aware InputFormat that can pack multiple files into a single MapReduce split, so processing many small files is still inefficient even when they are inside a HAR file. (Chapter 7 describes another way to tackle this problem.)