

Writing to HDFS with Spark

Writing to HDFS from Spark Streaming

Scenario: data needs to be written to HDFS and packed into gzip (.gz) files, with the Spark job running once every 5 minutes.

The final result is the HDFS output layout under /pub_stat_migu/tmp/log/ (screenshot omitted here).
Running Spark once every 5 minutes and writing to HDFS produces a large number of small files.
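
For context, here is a minimal sketch of how such a job could be wired up, assuming a 5-minute batch interval. The socket source and the names StreamingGzToHdfs and lines are hypothetical placeholders, not the original job's code; the actual write happens inside foreachRDD using the Spark code shown below.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Minutes, StreamingContext}

object StreamingGzToHdfs {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StreamingGzToHdfs")
    // 5-minute batches, matching the "run every 5 minutes" requirement
    val ssc = new StreamingContext(conf, Minutes(5))

    // Placeholder input source; the real job would read from its actual stream (e.g. Kafka)
    val lines = ssc.socketTextStream("localhost", 9999)

    lines.foreachRDD { rdd =>
      // Inside foreachRDD: set rdd.context.hadoopConfiguration and call
      // saveAsNewAPIHadoopFile, as shown in the "Spark code" section below.
      ()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}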

Spark code

// Enable block-level gzip compression for the new-API Hadoop output
val hadoopConf: Configuration = rdd.context.hadoopConfiguration
hadoopConf.set("mapreduce.output.fileoutputformat.compress", "true")
hadoopConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec")
hadoopConf.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK")
// Write the (Text, Text) pairs through the custom output format defined below
dataToSaveHdfs.saveAsNewAPIHadoopFile(ConfigInfo.saveHdfsMarkDataPathConfig, classOf[Text], classOf[Text], classOf[StreamingDataGzipOutputFormat[Text, Text]])

Notes:
      dataToSaveHdfs is an RDD of (Text, Text) pairs (a sketch of how it might be built follows these notes).
      ConfigInfo.saveHdfsMarkDataPathConfig is the HDFS output path read from the configuration file (the /pub_stat_migu/tmp/log/ path mentioned above).
      StreamingDataGzipOutputFormat is a custom output format class, shown below.
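
As a sketch only (the field delimiter and the way appId and batchTime are extracted are assumptions, not the original job's record layout), dataToSaveHdfs could be prepared like this. What matters is that the key follows the "<appId>_<batchTime>" pattern that StreamingDataGzipOutputFormat parses, e.g. 20001_202002132155:

import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD

def toSaveRdd(rdd: RDD[String]): RDD[(Text, Text)] = {
  // Hypothetical field extraction; the real job derives these from its own record layout
  def appId(line: String): String     = line.split('|')(0)   // e.g. "20001"
  def batchTime(line: String): String = line.split('|')(1)   // e.g. "202002132155" (yyyyMMddHHmm)

  rdd.map { line =>
    // Key format expected by StreamingDataGzipOutputFormat: "<appId>_<batchTime>".
    // Keys prefixed with "ERROR_" would instead be routed into the ERROR/ subdirectory.
    (new Text(s"${appId(line)}_${batchTime(line)}"), new Text(line))
  }
}

Because the driver explicitly sets the output codec to GzipCodec, getBaseRecordWriter in the class below takes its compressed branch and appends .gz to every file; the LzopCodec argument there is only the fallback default.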


StreamingDataGzipOutputFormat.java // a custom output format that extends Hadoop MapReduce's FileOutputFormat

import com.hadoop.compression.lzo.LzopCodec;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;


import java.io.DataOutputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Iterator;



// Custom FileOutputFormat for the Hadoop MapReduce new API

public class StreamingDataGzipOutputFormat<K, V> extends FileOutputFormat<K, V> {


    private StreamingDataGzipOutputFormat<K, V>.MultiRecordWriter writer;
    private String jobId;
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();


    public StreamingDataGzipOutputFormat() {
        this.jobId = null;
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (this.writer == null) {
            this.writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        if (this.jobId == null) {
            this.jobId = String.valueOf(job.getJobID().getId());
        }
        return this.writer;
    }


    private Path getTaskOutputPath(TaskAttemptContext job) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(job);
        if ((committer instanceof FileOutputCommitter)) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = FileOutputFormat.getOutputPath(job);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /**
     * Writes a whole batch of records; records in one batch may target the same
     * or different output paths, so one RecordWriter is kept per path key.
     */
    public class MultiRecordWriter extends RecordWriter<K, V> {
        private HashMap<String, RecordWriter<K, V>> recordWriters;
        private TaskAttemptContext job;
        private Path workPath;


        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            this.job = job;
            this.workPath = workPath;
            this.recordWriters = new HashMap<>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            for (RecordWriter<K, V> rw : this.recordWriters.values()) {
                rw.close(context);
            }
            this.recordWriters.clear();
        }


        /**
         * Derives the output location from the key (format "<appId>_<batchTime>", e.g. "20001_202002132155").
         * fileInfo[0] = target directory, fileInfo[1] = writer cache key, fileInfo[2] = file name prefix;
         * keys starting with "ERROR" are written under an ERROR/ subdirectory.
         */
        protected String[] generateFileNameForKeyValue(K key, V value) {
            if ((key == null) || (value == null)) {
                return null;
            }
            String keyStr = key.toString();
            String[] fileInfo = new String[3];
            if (keyStr.startsWith("ERROR")) {
                String[] keyStrs = StringUtils.split(keyStr.substring("ERROR".length() + 1), "_");
                fileInfo[0] = (this.workPath.toString() + "/ERROR/" + StringUtils.substring(keyStrs[1], 0, 10) + "/" + keyStrs[1] + "/" + keyStrs[0]);
                fileInfo[1] = ("ERROR_" + keyStrs[0] + "_" + keyStrs[1]);
            } else {
                //20001_202002132155_1583302930630.gz
                String[] keyStrs = StringUtils.split(keyStr, "_");
                fileInfo[0] = (this.workPath.toString() + "/" +StringUtils.substring(keyStrs[1], 0, 10)  + "/" + keyStrs[1] + "/" + keyStrs[0]);
                fileInfo[1] = (keyStrs[0] + "_" + keyStrs[1]);
            }
            fileInfo[2] = (fileInfo[1] + "_" + System.currentTimeMillis());
            return fileInfo;
        }

        @Override
        public void write(K key, V value)
                throws IOException, InterruptedException {
            String[] fileInfo = generateFileNameForKeyValue(key, value);
            if (fileInfo != null) {
                RecordWriter rw = (RecordWriter) this.recordWriters.get(fileInfo[1]);
                if (rw == null) {
                    rw = getBaseRecordWriter(this.job, fileInfo);
                    this.recordWriters.put(fileInfo[1], rw);
                }
                rw.write(key, value);
            }
        }


        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String[] fileInfo) throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            String keyValueSeparator = "_";
            RecordWriter recordWriter = null;
            if (!FileOutputFormat.getCompressOutput(job)) {  // uncompressed output: write the file as-is
                Path file = new Path(new Path(fileInfo[0]), fileInfo[2]);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false, 1048576, (short) 3, 10485760);
                recordWriter = new MessageRecordWriter(new DataOutputStream(fileOut), keyValueSeparator);
            } else {
                // compressed output: resolve the configured codec (LzopCodec is only the fallback
                // default; the driver configures GzipCodec) and append its default extension, e.g. ".gz"
                Class codecClass = FileOutputFormat.getOutputCompressorClass(job, LzopCodec.class);
                CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(new Path(fileInfo[0]), fileInfo[2] + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false, 1048576, (short) 3, 10485760);
                recordWriter = new MessageRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            }
            return recordWriter;
        }
    }
}

MultiRecordWriter handles a whole batch of records, and records in the same batch may need to go to the same or to different output paths.

MessageRecordWriter behaves like a writer for a single HDFS file. MultiRecordWriter keeps every path it has already opened in HashMap<String, RecordWriter<K, V>> recordWriters, so when data arrives for a path that already has a writer it is written straight away, without opening a new HDFS output stream. See the illustration below for how a key maps to its output path.
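
The following is an illustration only, tracing the logic of generateFileNameForKeyValue for a normal (non-ERROR) key; workPath stands for the task output directory resolved by getTaskOutputPath, and the example values mirror the comment in the code above.

// Illustration of the key-to-path mapping in MultiRecordWriter
val workPath = "/pub_stat_migu/tmp/log"             // example output directory
val key      = "20001_202002132155"                 // "<appId>_<batchTime>"
val Array(appId, batchTime) = key.split("_")
val dir      = s"$workPath/${batchTime.take(10)}/$batchTime/$appId"
val fileName = s"${appId}_${batchTime}_${System.currentTimeMillis()}.gz"
// dir      == "/pub_stat_migu/tmp/log/2020021321/202002132155/20001"
// fileName == e.g. "20001_202002132155_1583302930630.gz"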


MessageRecordWriter.java

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

public class MessageRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public MessageRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes("UTF-8");
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find UTF-8 encoding");
        }
    }

    public MessageRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    private void writeObject(Object o) throws IOException {
        if ((o instanceof Text)) {
            Text to = (Text) o;
            this.out.write(to.getBytes(), 0, to.getLength());
        } else {
            this.out.write(o.toString().getBytes("UTF-8"));
        }
    }
    
    // Only the value is written out; the key is used purely for routing in MultiRecordWriter.
    @Override
    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = (key == null) || ((key instanceof NullWritable));
        boolean nullValue = (value == null) || ((value instanceof NullWritable));
        if ((nullKey) && (nullValue)) {
            return;
        }
        if (!nullValue) {
            writeObject(value);
        }
        this.out.write(newline);
    }
    
    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException {
        this.out.flush();
        this.out.close();
    }

    static {
        try {
            newline = "\n".getBytes("UTF-8");
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find UTF-8 encoding");
        }
    }
}