赞
踩
使用flume采集web服务器日志,架构见图
每台webserver 的agent的配置:
# Configuration for agent 'flume74Agent' (one such agent runs on each web server).
# Declares the agent's components: one source, two sinks, one channel.
flume74Agent.sources=source74
flume74Agent.sinks=sink74-1 sink74-2
flume74Agent.channels=channel74
# Declare the sink group. The key must be spelled 'sinkgroups' (the same
# prefix used by the group74 settings at the bottom of this section);
# with any other spelling the group is not declared and its processor
# settings are not applied.
flume74Agent.sinkgroups=group74

# Source: syslog over TCP, listening on 10.21.3.74:514.
flume74Agent.sources.source74.type=syslogtcp
flume74Agent.sources.source74.port=514
flume74Agent.sources.source74.host=10.21.3.74
flume74Agent.sources.source74.channels=channel74

# Memory channel. capacity must be at least transactionCapacity. The smaller
# the capacity, the fewer events are lost if the agent dies. keep-alive is
# expressed in seconds.
flume74Agent.channels.channel74.type=memory
flume74Agent.channels.channel74.capacity=2000
flume74Agent.channels.channel74.transactionCapacity=1000
flume74Agent.channels.channel74.keep-alive=30

# Alternative: file channel (disabled). For better throughput keep
# checkpointDir and dataDirs on separate directories.
#flume74Agent.channels.channel74.type=file
#flume74Agent.channels.channel74.checkpointDir=/usr/local/new-cluster/apache-flume-1.6.0-bin/checkpoint
#flume74Agent.channels.channel74.dataDirs=/usr/local/new-cluster/apache-flume-1.6.0-bin/data
#flume74Agent.channels.channel74.transactionCapacity=10000
#flume74Agent.channels.channel74.checkpointInterval=60000
#flume74Agent.channels.channel74.capacity=20000
#flume74Agent.channels.channel74.keep-alive=30

# First sink, sink74-1: Avro to the collector at 10.21.3.73:4141.
flume74Agent.sinks.sink74-1.type=avro
flume74Agent.sinks.sink74-1.port=4141
flume74Agent.sinks.sink74-1.hostname=10.21.3.73
flume74Agent.sinks.sink74-1.channel=channel74

# Second sink, sink74-2: Avro to the collector at 10.21.3.75:4141.
flume74Agent.sinks.sink74-2.type=avro
flume74Agent.sinks.sink74-2.port=4141
flume74Agent.sinks.sink74-2.hostname=10.21.3.75
flume74Agent.sinks.sink74-2.channel=channel74

# Sink group membership.
flume74Agent.sinkgroups.group74.sinks=sink74-1 sink74-2
# Load-balance across the two collectors: spreads the load and avoids losing
# data when one collector goes down (backoff temporarily skips a failed sink).
flume74Agent.sinkgroups.group74.processor.type = load_balance
flume74Agent.sinkgroups.group74.processor.backoff = true
flume74Agent.sinkgroups.group74.processor.selector = random
flume collector 的 agent 配置:
# Configuration for the collector agent 'collection75Agent'.
# NOTE: the .properties format has no inline comments; anything after the
# '=' (including '#...') becomes part of the value. Every comment below is
# therefore kept on its own line so numeric values parse correctly.
collection75Agent.sources=source75
collection75Agent.sinks=sink75-1
collection75Agent.channels=channel75

# Source: Avro, receiving events from the web-server agents on 10.21.3.75:4141.
collection75Agent.sources.source75.type=avro
collection75Agent.sources.source75.channels=channel75
collection75Agent.sources.source75.bind=10.21.3.75
collection75Agent.sources.source75.port=4141
# Interceptors: i1 writes a 'hostname' header (overwriting any existing one),
# i2 adds a timestamp header, which the %Y/%m/%d escapes in the HDFS sink use.
collection75Agent.sources.source75.interceptors = i1 i2
collection75Agent.sources.source75.interceptors.i1.type = org.apache.flume.interceptor.HostInterceptor$Builder
collection75Agent.sources.source75.interceptors.i1.preserveExisting = false
collection75Agent.sources.source75.interceptors.i1.hostHeader = hostname
collection75Agent.sources.source75.interceptors.i2.type = org.apache.flume.interceptor.TimestampInterceptor$Builder

# Memory channel.
collection75Agent.channels.channel75.type=memory
collection75Agent.channels.channel75.capacity=2000
collection75Agent.channels.channel75.transactionCapacity=1000
collection75Agent.channels.channel75.keep-alive=30

# Alternative: file channel (disabled).
#collection75Agent.channels.channel75.type=file
#collection75Agent.channels.channel75.checkpointDir=/usr/local/new-cluster/apache-flume-1.6.0-bin/checkpoint
#collection75Agent.channels.channel75.dataDirs=/usr/local/new-cluster/apache-flume-1.6.0-bin/data
#collection75Agent.channels.channel75.transactionCapacity=10000
#collection75Agent.channels.channel75.checkpointInterval=60000
#collection75Agent.channels.channel75.capacity=20000
#collection75Agent.channels.channel75.keep-alive=30

# Sink configuration: write events to HDFS, one directory per month and a
# file prefix per day.
collection75Agent.sinks.sink75-1.type=hdfs
collection75Agent.sinks.sink75-1.channel=channel75
collection75Agent.sinks.sink75-1.hdfs.path=hdfs://mycluster1/flume/%Y-%m
collection75Agent.sinks.sink75-1.hdfs.filePrefix=syslog75.%Y-%m-%d
collection75Agent.sinks.sink75-1.hdfs.fileSuffix=.log
collection75Agent.sinks.sink75-1.hdfs.round=true
collection75Agent.sinks.sink75-1.hdfs.roundValue=10
collection75Agent.sinks.sink75-1.hdfs.roundUnit=minute
# Seconds to wait before rolling to a new file; 0 disables time-based rolling.
collection75Agent.sinks.sink75-1.hdfs.rollInterval=0
# File size that triggers a roll; 0 disables size-based rolling.
collection75Agent.sinks.sink75-1.hdfs.rollSize=0
# Number of events written to a file before it is flushed to HDFS.
collection75Agent.sinks.sink75-1.hdfs.batchSize=1000
# Number of events that triggers a roll; 0 disables count-based rolling.
collection75Agent.sinks.sink75-1.hdfs.rollCount=0
collection75Agent.sinks.sink75-1.hdfs.fileType = DataStream
collection75Agent.sinks.sink75-1.hdfs.writeFormat=Text
# Timeout for HDFS operations.
collection75Agent.sinks.sink75-1.hdfs.callTimeout=600000
collection75Agent.sinks.sink75-1.hdfs.threadsPoolSize=20
collection75Agent.sinks.sink75-1.hdfs.rollTimerPoolSize=5
# Seconds without writes after which the file is closed and renamed out of
# its .tmp state. With all three roll triggers above set to 0, this is what
# finalizes a file.
collection75Agent.sinks.sink75-1.hdfs.idleTimeout=600

# Alternative second HDFS sink (disabled).
#collection75Agent.sinks.sink75-2.type=hdfs
#collection75Agent.sinks.sink75-2.channel=channel75
#collection75Agent.sinks.sink75-2.hdfs.path=hdfs://mycluster1/flume/%Y-%m
#collection75Agent.sinks.sink75-2.hdfs.filePrefix=syslog2.%Y-%m-%d
#collection75Agent.sinks.sink75-2.hdfs.fileSuffix=.log
#collection75Agent.sinks.sink75-2.hdfs.round=true
#collection75Agent.sinks.sink75-2.hdfs.roundValue=10
#collection75Agent.sinks.sink75-2.hdfs.roundUnit=minute
#collection75Agent.sinks.sink75-2.hdfs.rollInterval=0
#collection75Agent.sinks.sink75-2.hdfs.rollSize=0
#collection75Agent.sinks.sink75-2.hdfs.batchSize=1000
#collection75Agent.sinks.sink75-2.hdfs.rollCount=0
#collection75Agent.sinks.sink75-2.hdfs.fileType = DataStream
#collection75Agent.sinks.sink75-2.hdfs.writeFormat=Text
#collection75Agent.sinks.sink75-2.hdfs.callTimeout=600000
#collection75Agent.sinks.sink75-2.hdfs.threadsPoolSize=20
#collection75Agent.sinks.sink75-2.hdfs.rollTimerPoolSize=5
后台启动flume Agent:
# Launch the Flume agent in the background via nohup; stdout and stderr are
# both redirected to start.log.
# NOTE(review): this starts 'collection73Agent' from collection73Agent.conf,
# presumably the 10.21.3.73 collector's counterpart of the collection75Agent
# config shown above — confirm the -n name matches the property prefix used
# inside the -f file on each host.
nohup flume-ng agent -c conf/ -f conf/collection73Agent.conf -n collection73Agent > start.log 2>&1 &
rsyslog.conf配置图:
补充:flume-env.sh配置
# JVM options for the Flume agent: heap fixed at 2048 MB (-Xms/-Xmx),
# 512 MB young generation (-Xmn), 256 KB thread stacks (-Xss), ParNew + CMS
# garbage collectors, and the GC-overhead-limit check switched off.
# NOTE(review): the ParNew/CMS flags are not accepted by recent JDK releases —
# confirm the JDK version this agent runs on.
JAVA_OPTS="-Xms2048m -Xmx2048m -Xss256k -Xmn512m -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit"
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。