当前位置:   article > 正文

kafka producer 异常处理_got error produce response in correla tion id 5017

got error produce response in correla tion id 5017324 on topic-partition bos

1 重试

Sender.completeBatch
if (error != Errors.NONE && canRetry(batch, error)) {
            // retry
            log.warn("Got error produce response with correlation id {} on topic-partition {}, retrying ({} attempts left). Error: {}",
                     correlationId,
                     batch.topicPartition,
                     this.retries - batch.attempts - 1,
                     error);
            //重新把发送失败等着批次 加入到队列里面。
            this.accumulator.reenqueue(batch, now);
            this.sensors.recordRetries(batch.topicPartition.topic(), batch.recordCount);
        } else {

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13

2 超时

Sender.run. this.accumulator.abortExpiredBatches
    public List<RecordBatch> abortExpiredBatches(int requestTimeout, long now) {
        List<RecordBatch> expiredBatches = new ArrayList<>();
        int count = 0;
        //
        for (Map.Entry<TopicPartition, Deque<RecordBatch>> entry : this.batches.entrySet()) {
            //获取到每个分区的队列 -》 队列里面对应的批次
            Deque<RecordBatch> dq = entry.getValue();
            TopicPartition tp = entry.getKey();
            // We only check if the batch should be expired if the partition does not have a batch in flight.
            // This is to prevent later batches from being expired while an earlier batch is still in progress.
            // Note that `muted` is only ever populated if `max.in.flight.request.per.connection=1` so this protection
            // is only active in this case. Otherwise the expiration order is not guaranteed.
            if (!muted.contains(tp)) {
                synchronized (dq) {
                    // iterate over the batches and expire them if they have been in the accumulator for more than requestTimeOut
                    RecordBatch lastBatch = dq.peekLast();
                    //迭代的看每个分区里面的每个批次
                    Iterator<RecordBatch> batchIterator = dq.iterator();
                    while (batchIterator.hasNext()) {
                        RecordBatch batch = batchIterator.next();
                        boolean isFull = batch != lastBatch || batch.records.isFull();
                        // check if the batch is expired
                        //TODO 判断一下是否超时
                        if (batch.maybeExpire(requestTimeout, retryBackoffMs, now, this.lingerMs, isFull)) {
                            //增加到超时的数据结构里面
                            expiredBatches.add(batch);
                            count++;
                            //从数据结构里面移除
                            batchIterator.remove();
                            //释放资源
                            deallocate(batch);
                        } else {
                            // Stop at the first batch that has not expired.
                            break;
                        }
                    }
                }
            }
        }
        if (!expiredBatches.isEmpty())
            log.trace("Expired {} batches in accumulator", count);

        return expiredBatches;
    }

batch.maybeExpire
    public boolean maybeExpire(int requestTimeoutMs, long retryBackoffMs, long now, long lingerMs, boolean isFull) {
        boolean expire = false;
        String errorMessage = null;
        /**
         * requestTimeoutMs:代表的是请求发送的超时的时间。默认值是30.
         * now:当前时间
         * lastAppendTime:批次的创建的时间(上一次重试的时间)
         * now - this.lastAppendTime 大于30秒,说明批次超时了 还没发送出去。
         */
        if (!this.inRetry() && isFull && requestTimeoutMs < (now - this.lastAppendTime)) {
            expire = true;
            //记录异常信息
            errorMessage = (now - this.lastAppendTime) + " ms has passed since last append";
            /**
             * lingerMs: 100ms,无论如何都要把消息发送出去的时间
             *
             * createdMs:批次创建的时间
             *
             * 已经大于30秒了。 说明也是超时了。
             *
             */
        } else if (!this.inRetry() && requestTimeoutMs < (now - (this.createdMs + lingerMs))) {
            expire = true;
            errorMessage = (now - (this.createdMs + lingerMs)) + " ms has passed since batch creation plus linger time";
            /**
             * 针对重试
             * lastAttemptMs: 上一次重试的时间(批次创建的时间)
             * retryBackoffMs: 重试的时间间隔
             * 说明也是超时了。
             */
        } else if (this.inRetry() && requestTimeoutMs < (now - (this.lastAttemptMs + retryBackoffMs))) {
            expire = true;
            errorMessage = (now - (this.lastAttemptMs + retryBackoffMs)) + " ms has passed since last attempt plus backoff time";
        }

        if (expire) {
            this.records.close();
            //调用done方法
            //方法里面传过去了一个TimeoutException的异常。(超时了)
            //TODO 处理超时的批次
            this.done(-1L, Record.NO_TIMESTAMP,
                      new TimeoutException("Expiring " + recordCount + " record(s) for " + topicPartition + " due to " + errorMessage));
        }

        return expire;
    }

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94

3 长时间没有响应的消息处理

NetWorkClient.poll.handleTimedOutRequests
    private void handleTimedOutRequests(List<ClientResponse> responses, long now) {
        //获取到请求超时的主机。
        List<String> nodeIds = this.inFlightRequests.getNodesWithTimedOutRequests(now, this.requestTimeoutMs);
        for (String nodeId : nodeIds) {
            // close connection to the node
            //关闭请求超时的主机的连接
            this.selector.close(nodeId);
            log.debug("Disconnecting from node {} due to request timeout.", nodeId);
            //我们猜应该是会去修改 连接的状态
            processDisconnection(responses, nodeId, now);
        }

        // we disconnected, so we should probably refresh our metadata
        if (nodeIds.size() > 0)
            metadataUpdater.requestUpdate();
    }
processDisconnection
    private void processDisconnection(List<ClientResponse> responses, String nodeId, long now) {
        //修改连接状态
        connectionStates.disconnected(nodeId, now);
        //
        for (ClientRequest request : this.inFlightRequests.clearAll(nodeId)) {
            log.trace("Cancelled request {} due to node {} being disconnected", request, nodeId);
            if (!metadataUpdater.maybeHandleDisconnection(request))
                //对这些请求进行处理
                //大家会看到一个比较有意思的事
                //自己封装了一个响应。这个响应里面没有服务端响应消息(服务端没给响应)
                //失去连接的状态表标识为true
                responses.add(new ClientResponse(request, now, true, null));
        }
    }
disconnected
    public void disconnected(String id, long now) {
        NodeConnectionState nodeState = nodeState(id);
        //修缓存的对应主机的连接转态:DISCONNECTED
        //sender -> 检查网络是否可以举报发送消息的条件 -> 是否可以尝试建立网络连接。
        //如果主机的状态是:DISCONNECTED,可以尝试初始化连接。
        //最后调用networkclient的poll方法(Selector 去完成的最后的网络连接)
        nodeState.state = ConnectionState.DISCONNECTED;
        nodeState.lastConnectAttemptMs = now;
    }
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/寸_铁/article/detail/839287
推荐阅读
相关标签
  

闽ICP备14008679号