赞
踩
PROC文件tcp_notsent_lowat控制发送缓存队列中允许的未发送数据量上限。未发送数据量低于此值时,应用层可继续写入数据;达到或超出此值时,暂停写入。
内核在TCP初始化函数tcp_sk_init中,将sysctl_tcp_notsent_lowat设置为无符号整数的最大值,此值为TCP套接口全局的tcp_notsent_lowat控制值。如果用户层针对特定套接口使用setsockopt的设置选项TCP_NOTSENT_LOWAT设定了新的notsent_lowat值,其优先级大于全局的sysctl_tcp_notsent_lowat值。
- static int __net_init tcp_sk_init(struct net *net) /* Excerpt of the TCP init function; only the notsent_lowat default is shown, the rest is elided. */
- {
- net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; /* Default for this struct net: the largest unsigned int value. */
- }
- $ cat /proc/sys/net/ipv4/tcp_notsent_lowat
- -4294967295
- $
- static int do_tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) /* Excerpt: only the TCP_NOTSENT_LOWAT case is shown; the declarations of tp and val are elided. */
- {
- switch (optname) {
- case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val; /* Store the per-socket limit requested by the caller. */
- sk->sk_write_space(sk); /* Call the socket's write-space callback right after the change (presumably so waiters re-check writability). */
- break;
- }
- }
如函数tcp_stream_memory_free所示,当未发送的数据长度小于设定的notsent_lowat时,内核即认为TCP的发送缓存为空,而不必真正等到全部数据发送完成。
- static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) /* Return the effective notsent_lowat limit for this socket. */
- {
- struct net *net = sock_net((struct sock *)tp);
- return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; /* GNU "?:": use the per-socket value when non-zero, otherwise the sysctl value. */
- }
- static inline bool tcp_stream_memory_free(const struct sock *sk) /* True while the unsent byte count is below the notsent_lowat limit (excerpt: declaration of tp is elided). */
- {
- u32 notsent_bytes = tp->write_seq - tp->snd_nxt; /* Difference between the write sequence and the next sequence to send. */
-
- return notsent_bytes < tcp_notsent_lowat(tp); /* Strictly below the limit counts as free. */
- }
套接口层函数sk_stream_memory_free封装了TCP函数tcp_stream_memory_free,套接口层对缓存的判断,首先比较发送队列占用的空间sk_wmem_queued与设定的限值sk_sndbuf,大于等于限值时缓存不可用;否则,调用TCP的判定函数,即未发送数据notsent_bytes大于等于设定的notsent_lowat值时,认为未发送的数据量过大,发送缓存暂时不可用。
- static inline bool sk_stream_memory_free(const struct sock *sk) /* Socket-layer check of whether the send buffer can accept more data. */
- {
- if (sk->sk_wmem_queued >= sk->sk_sndbuf) /* Queued memory has reached the send-buffer limit: not free. */
- return false;
-
- return sk->sk_prot->stream_memory_free ? sk->sk_prot->stream_memory_free(sk) : true; /* Defer to the protocol's hook when one is set; otherwise report free. */
- }
相反的,在判断发送缓存何时变得可用时,内核使用sk_stream_is_writeable函数,首要条件是发送队列缓存空间的余量(sk_stream_wspace)大于等于当前发送队列占用空间的一半,即还有1/3以上的空余空间。其次,是未发送的数据量低于notsent_lowat的值。
- static inline bool sk_stream_is_writeable(const struct sock *sk) /* Writable only if free space is at least half of the queued amount AND the stream memory check passes. */
- {
- return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sk_stream_memory_free(sk);
- }
- static inline int sk_stream_wspace(const struct sock *sk) /* Remaining send-buffer space: limit minus what is currently queued. */
- {
- return sk->sk_sndbuf - sk->sk_wmem_queued;
- }
- static inline int sk_stream_min_wspace(const struct sock *sk) /* Minimum free space required for writability: half of the queued amount. */
- {
- return sk->sk_wmem_queued >> 1; /* Right shift by one, i.e. divide by two. */
- }
发送路径缓存判断
发送路径函数do_tcp_sendpages和tcp_sendmsg_locked函数都要对notsent_lowat进行判断,以后者为例,看一下其中的逻辑。首先在分配发送skb之前,判断发送缓存是否可用,不可用的话,跳转到wait_for_sndbuf标签,虽然字面上是等待缓存变得可用,但是如果用户设置了MSG_DONTWAIT标志,不执行等待直接返回。详细信息见函数sk_stream_wait_memory。此处设置了套接口的SOCK_NOSPACE标志。
- int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* Excerpt of the send path: shows only the send-buffer check and the wait path; locals and the copy logic are elided. */
- {
- while (msg_data_left(msg)) { /* Loop while the message still has data left. */
- if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
- new_segment:
- if (!sk_stream_memory_free(sk)) /* Check the send buffer before allocating a new skb. */
- goto wait_for_sndbuf;
- skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, first_skb);
- }
- continue;
-
- wait_for_sndbuf:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Record on the socket that the send buffer had no space. */
- wait_for_memory:
- if (copied) /* Push out what was already copied before waiting, with MSG_MORE cleared. */
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal);
-
- err = sk_stream_wait_memory(sk, &timeo);
- if (err != 0) /* Any non-zero result from the wait goes to the error path. */
- goto do_error;
- }
接下来是do_error标签的处理,如果在发送缓存不可用之前,有数据拷贝发生,接收了部分应用层数据,返回拷贝的数据长度。反之,如果没有数据拷贝发生,返回错误码。
- out: /* Tail labels of tcp_sendmsg_locked (excerpt). */
- if (copied) { /* Something was copied: timestamp it and push the queued data. */
- tcp_tx_timestamp(sk, sockc.tsflags);
- tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
- }
- out_nopush:
- sock_zerocopy_put(uarg);
- return copied + copied_syn; /* Success path returns the sum of both copied counters. */
- do_error:
- if (copied + copied_syn) /* A partial copy takes the success path instead of returning the error. */
- goto out;
- out_err:
- sock_zerocopy_put_abort(uarg);
- err = sk_stream_error(sk, flags, err); /* Final error code comes from sk_stream_error(). */
- /* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&err == -EAGAIN)) { /* Only when the write queue is empty and the error is -EAGAIN. */
- sk->sk_write_space(sk);
- }
- return err;
- }
等待缓存可用函数sk_stream_wait_memory如下,如果timeo_p所定义的超时时间为0,并且用户设定了MSG_DONTWAIT标志,立即返回。否则,使用sk_wait_event函数等待缓存可用事件发生,或者超时。
- int sk_stream_wait_memory(struct sock *sk, long *timeo_p) /* Wait until stream send memory is free or *timeo_p runs out (excerpt: declarations of err and vm_wait, and the do_error/do_nonblock/do_interrupted labels, are elided). */
- {
- long current_timeo = *timeo_p;
- bool noblock = (*timeo_p ? false : true); /* A zero timeout on entry means a non-blocking caller. */
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- if (sk_stream_memory_free(sk)) /* Stream check already passes: use a short random wait of (prandom_u32() % (HZ / 5)) + 2 instead. */
- current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
-
- add_wait_queue(sk_sleep(sk), &wait);
-
- while (1) {
- sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) /* Socket error or send side shut down. */
- goto do_error;
- if (!*timeo_p) { /* No time left to wait. */
- if (noblock)
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- goto do_nonblock;
- }
- if (signal_pending(current))
- goto do_interrupted;
- sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- if (sk_stream_memory_free(sk) && !vm_wait) /* Memory is free and no random wait is pending: done. */
- break;
-
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && !vm_wait), &wait); /* Fixed: the timeout argument had been garbled to "¤t_timeo" by HTML-entity decoding of "&curren". */
- sk->sk_write_pending--;
-
- if (vm_wait) { /* Charge the time spent in the random wait against the caller's timeout, clamping at zero. */
- vm_wait -= current_timeo;
- current_timeo = *timeo_p;
- if (current_timeo != MAX_SCHEDULE_TIMEOUT && (current_timeo -= vm_wait) < 0)
- current_timeo = 0;
- vm_wait = 0;
- }
- *timeo_p = current_timeo; /* Report the remaining timeout back to the caller. */
- }
- out:
- remove_wait_queue(sk_sleep(sk), &wait);
- return err;
- }
如果发送缓存可写,清除套接口socket的SOCK_NOSPACE标志,唤醒等待的进程。
- void sk_stream_write_space(struct sock *sk) /* When the stream is writable again, clear SOCK_NOSPACE and wake any sleeping writers. */
- {
- struct socket *sock = sk->sk_socket;
- struct socket_wq *wq;
-
- if (sk_stream_is_writeable(sk) && sock) { /* Act only if the socket is writable and a struct socket is attached. */
- clear_bit(SOCK_NOSPACE, &sock->flags);
-
- wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq)) /* Wake only when someone is actually sleeping on the wait queue. */
- wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND);
- }
- }
如下内核函数tcp_poll,通过函数sk_stream_is_writeable的结果判定,可通知用户层发送数据的时机。当发送缓存有足够的空余空间,并且发送缓存队列中的未发送数据量小于notsent_lowat值的时候,由POLLOUT通知应用层可写,无需等待;反之,应用层需要等待。
- unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Excerpt: only the writability part of the poll mask is shown; locals (sk, tp, state, mask) and the return are elided. */
- {
- if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { /* Writability is reported only while the send side is not shut down. */
- if (sk_stream_is_writeable(sk)) {
- mask |= POLLOUT | POLLWRNORM;
- } else {
- /* Race breaker. If space is freed after wspace test but before the flags are set, IO signal will be lost. Memory barrier pairs with the input side. */
- smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk)) /* Re-test after the barrier, as the race-breaker comment above describes. */
- mask |= POLLOUT | POLLWRNORM;
- }
- }
- }
- }
内核版本 4.15
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。