]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/netinet/tcp_cc.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_cc.c
index ade6b7d03933cf2025ebfd553ab17f9c3b24a3eb..461b180f9e98361b446583035d5b61e777da0e93 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <mach/sdt.h>
 #include <libkern/OSAtomic.h>
 
-struct tcp_cc_debug_state {
-       u_int64_t ccd_tsns;
-       char ccd_srcaddr[INET6_ADDRSTRLEN];
-       uint16_t ccd_srcport;
-       char ccd_destaddr[INET6_ADDRSTRLEN];
-       uint16_t ccd_destport;
-       uint32_t ccd_snd_cwnd;
-       uint32_t ccd_snd_wnd;
-       uint32_t ccd_snd_ssthresh;
-       uint32_t ccd_pipeack;
-       uint32_t ccd_rttcur;
-       uint32_t ccd_rxtcur;
-       uint32_t ccd_srtt;
-       uint32_t ccd_event;
-       uint32_t ccd_sndcc;
-       uint32_t ccd_sndhiwat;
-       uint32_t ccd_bytes_acked;
-       u_int8_t ccd_cc_index;
-       u_int8_t ccd_unused_1__;
-       u_int16_t ccd_unused_2__;
-       union {
-               struct {
-                       uint32_t ccd_last_max;
-                       uint32_t ccd_tcp_win;
-                       uint32_t ccd_target_win;
-                       uint32_t ccd_avg_lastmax;
-                       uint32_t ccd_mean_deviation;
-               } cubic_state;
-               struct {
-                       u_int32_t led_base_rtt;
-               } ledbat_state;
-       } u;
-};
-
-int tcp_cc_debug = 0;
+static int tcp_cc_debug;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &tcp_cc_debug, 0, "Enable debug data collection");
+    &tcp_cc_debug, 0, "Enable debug data collection");
 
 extern struct tcp_cc_algo tcp_cc_newreno;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets,
-       CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_newreno.num_sockets,
-       0, "Number of sockets using newreno");
+    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_newreno.num_sockets,
+    0, "Number of sockets using newreno");
 
 extern struct tcp_cc_algo tcp_cc_ledbat;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets,
-       CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_ledbat.num_sockets,
-       0, "Number of sockets using background transport");
+    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_ledbat.num_sockets,
+    0, "Number of sockets using background transport");
 
 extern struct tcp_cc_algo tcp_cc_cubic;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_sockets,
-       CTLFLAG_RD | CTLFLAG_LOCKED,&tcp_cc_cubic.num_sockets, 
-       0, "Number of sockets using cubic");
+    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_cubic.num_sockets,
+    0, "Number of sockets using cubic");
 
-int tcp_use_newreno = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, use_newreno,
-       CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_newreno, 0, 
-       "Use TCP NewReno by default");
+SYSCTL_SKMEM_TCP_INT(OID_AUTO, use_newreno,
+    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_use_newreno, 0,
+    "Use TCP NewReno by default");
 
 static int tcp_check_cwnd_nonvalidated = 1;
 #if (DEBUG || DEVELOPMENT)
@@ -115,8 +80,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, cwnd_nonvalidated,
 struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
 struct zone *tcp_cc_zone;
 
-/* Information for colelcting TCP debug information using control socket */
-#define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug"
 #define TCP_CCDBG_NOUNIT 0xffffffff
 static kern_ctl_ref tcp_ccdbg_ctlref = NULL;
 volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT;
@@ -124,14 +87,14 @@ volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT;
 void tcp_cc_init(void);
 static void tcp_cc_control_register(void);
 static errno_t tcp_ccdbg_control_connect(kern_ctl_ref kctl,
-       struct sockaddr_ctl *sac, void **uinfo);
+    struct sockaddr_ctl *sac, void **uinfo);
 static errno_t tcp_ccdbg_control_disconnect(kern_ctl_ref kctl,
-       u_int32_t unit, void *uinfo);
+    u_int32_t unit, void *uinfo);
 static struct tcp_cc_algo tcp_cc_algo_none;
 /*
  * Initialize TCP congestion control algorithms.
  */
+
 void
 tcp_cc_init(void)
 {
@@ -153,12 +116,13 @@ tcp_cc_control_register(void)
        errno_t err;
 
        bzero(&ccdbg_control, sizeof(ccdbg_control));
-       strlcpy(ccdbg_control.ctl_name, TCP_CCDEBUG_CONTROL_NAME,
+       strlcpy(ccdbg_control.ctl_name, TCP_CC_CONTROL_NAME,
            sizeof(ccdbg_control.ctl_name));
        ccdbg_control.ctl_connect = tcp_ccdbg_control_connect;
        ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect;
        ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED;
        ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM;
+       ccdbg_control.ctl_sendsize = 32 * 1024;
 
        err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref);
        if (err != 0) {
@@ -169,7 +133,7 @@ tcp_cc_control_register(void)
 /* Allow only one socket to connect at any time for debugging */
 static errno_t
 tcp_ccdbg_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac,
-       void **uinfo)
+    void **uinfo)
 {
 #pragma unused(kctl)
 #pragma unused(uinfo)
@@ -177,13 +141,15 @@ tcp_ccdbg_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac,
        UInt32 old_value = TCP_CCDBG_NOUNIT;
        UInt32 new_value = sac->sc_unit;
 
-       if (tcp_ccdbg_unit != old_value)
-               return (EALREADY);
+       if (tcp_ccdbg_unit != old_value) {
+               return EALREADY;
+       }
 
-       if (OSCompareAndSwap(old_value, new_value, &tcp_ccdbg_unit))
-               return (0);
-       else
-               return (EALREADY);
+       if (OSCompareAndSwap(old_value, new_value, &tcp_ccdbg_unit)) {
+               return 0;
+       } else {
+               return EALREADY;
+       }
 }
 
 static errno_t
@@ -194,15 +160,17 @@ tcp_ccdbg_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo)
        if (unit == tcp_ccdbg_unit) {
                UInt32 old_value = tcp_ccdbg_unit;
                UInt32 new_value = TCP_CCDBG_NOUNIT;
-               if (tcp_ccdbg_unit == new_value)
-                       return (0);
+               if (tcp_ccdbg_unit == new_value) {
+                       return 0;
+               }
 
                if (!OSCompareAndSwap(old_value, new_value,
-                       &tcp_ccdbg_unit))
-                       log(LOG_DEBUG, 
+                   &tcp_ccdbg_unit)) {
+                       log(LOG_DEBUG,
                            "failed to disconnect tcp_cc debug control");
+               }
        }
-       return (0);
+       return 0;
 }
 
 inline void
@@ -218,11 +186,11 @@ tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event)
                struct timespec tv;
 
                bzero(&dbg_state, sizeof(dbg_state));
-               
+
                nanotime(&tv);
                /* Record the timestamp in nanoseconds */
                dbg_state.ccd_tsns = (tv.tv_sec * 1000000000) + tv.tv_nsec;
-               inet_ntop(SOCK_DOM(inp->inp_socket), 
+               inet_ntop(SOCK_DOM(inp->inp_socket),
                    ((SOCK_DOM(inp->inp_socket) == PF_INET) ?
                    (void *)&inp->inp_laddr.s_addr :
                    (void *)&inp->in6p_laddr), dbg_state.ccd_srcaddr,
@@ -248,34 +216,33 @@ tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event)
                dbg_state.ccd_bytes_acked = tp->t_bytes_acked;
                dbg_state.ccd_cc_index = tp->tcp_cc_index;
                switch (tp->tcp_cc_index) {
-                   case TCP_CC_ALGO_CUBIC_INDEX:
+               case TCP_CC_ALGO_CUBIC_INDEX:
                        dbg_state.u.cubic_state.ccd_last_max =
                            tp->t_ccstate->cub_last_max;
                        dbg_state.u.cubic_state.ccd_tcp_win =
                            tp->t_ccstate->cub_tcp_win;
-                       dbg_state.u.cubic_state.ccd_target_win =
-                           tp->t_ccstate->cub_target_win;
                        dbg_state.u.cubic_state.ccd_avg_lastmax =
                            tp->t_ccstate->cub_avg_lastmax;
                        dbg_state.u.cubic_state.ccd_mean_deviation =
                            tp->t_ccstate->cub_mean_dev;
                        break;
-                   case TCP_CC_ALGO_BACKGROUND_INDEX:
+               case TCP_CC_ALGO_BACKGROUND_INDEX:
                        dbg_state.u.ledbat_state.led_base_rtt =
                            get_base_rtt(tp);
                        break;
-                   default:
+               default:
                        break;
                }
 
                ctl_enqueuedata(tcp_ccdbg_ctlref, tcp_ccdbg_unit,
-                       &dbg_state, sizeof(dbg_state), 0);
+                   &dbg_state, sizeof(dbg_state), 0);
        }
        DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
-               struct tcpcb *, tp, struct tcphdr *, th, int32_t, event);
+           struct tcpcb *, tp, struct tcphdr *, th, int32_t, event);
 }
 
-void tcp_cc_resize_sndbuf(struct tcpcb *tp)
+void
+tcp_cc_resize_sndbuf(struct tcpcb *tp)
 {
        struct sockbuf *sb;
        /*
@@ -285,7 +252,7 @@ void tcp_cc_resize_sndbuf(struct tcpcb *tp)
         */
        sb = &tp->t_inpcb->inp_socket->so_snd;
        if (sb->sb_hiwat > tp->snd_ssthresh &&
-               (sb->sb_flags & SB_AUTOSIZE)) {
+           (sb->sb_flags & SB_AUTOSIZE)) {
                if (sb->sb_idealsize > tp->snd_ssthresh) {
                        SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh);
                }
@@ -293,13 +260,14 @@ void tcp_cc_resize_sndbuf(struct tcpcb *tp)
        }
 }
 
-void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp)
+void
+tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp)
 {
        struct sockbuf *sb;
        sb = &tp->t_inpcb->inp_socket->so_snd;
-       if ((sb->sb_flags & (SB_TRIM|SB_AUTOSIZE)) == (SB_TRIM|SB_AUTOSIZE)) {
+       if ((sb->sb_flags & (SB_TRIM | SB_AUTOSIZE)) == (SB_TRIM | SB_AUTOSIZE)) {
                /*
-                * If there was a retransmission that was not necessary 
+                * If there was a retransmission that was not necessary
                 * then the size of socket buffer can be restored to
                 * what it was before
                 */
@@ -313,9 +281,6 @@ void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp)
 
 /*
  * Calculate initial cwnd according to RFC3390.
- *
- * Keep the old ss_fltsz sysctl for ABI compabitility issues.
- * but it will be overriden if tcp_do_rfc3390 sysctl when it is set.
  */
 void
 tcp_cc_cwnd_init_or_reset(struct tcpcb *tp)
@@ -323,68 +288,103 @@ tcp_cc_cwnd_init_or_reset(struct tcpcb *tp)
        if (tp->t_flags & TF_LOCAL) {
                tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
        } else {
-               /* initial congestion window according to RFC 3390 */
-               if (tcp_do_rfc3390)
+               if (tcp_cubic_minor_fixes) {
+                       tp->snd_cwnd = tcp_initial_cwnd(tp);
+               } else {
+                       /* initial congestion window according to RFC 3390 */
                        tp->snd_cwnd = min(4 * tp->t_maxseg,
-                               max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES));
-               else
-                       tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
+                           max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES));
+               }
        }
 }
 
 /*
  * Indicate whether this ack should be delayed.
  * Here is the explanation for different settings of tcp_delack_enabled:
- *  - when set to 1, the bhavior is same as when set to 2. We kept this 
+ *  - when set to 1, the behavior is the same as when set to 2. We kept this
  *    for binary compatibility.
  *  - when set to 2, will "ack every other packet"
  *      - if our last ack wasn't a 0-sized window.
- *      - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245). 
- *              If TH_PUSH is set, take this as a clue that we need to ACK 
- *              with no delay. This helps higher level protocols who 
- *              won't send us more data even if the window is open 
+ *      - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245).
+ *              If TH_PUSH is set, take this as a clue that we need to ACK
+ *              with no delay. This helps higher level protocols who
+ *              won't send us more data even if the window is open
  *              because their last "segment" hasn't been ACKed
- *  - when set to 3,  will do "streaming detection" 
- *      - if we receive more than "maxseg_unacked" full packets 
+ *  - when set to 3,  will do "streaming detection"
+ *      - if we receive more than "maxseg_unacked" full packets
  *        in the last 100ms
- *      - if the connection is not in slow-start or idle or 
+ *      - if the connection is not in slow-start or idle or
  *        loss/recovery states
  *      - if those criteria aren't met, it will ack every other packet.
  */
 int
 tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th)
 {
-       /* If any flags other than TH_ACK is set, set "end-of-write" bit */
-       if ((th->th_flags & ~TH_ACK))
-               tp->t_flagsext |= TF_STREAMEOW;
-       else    
-               tp->t_flagsext &= ~(TF_STREAMEOW);
-    
        switch (tcp_delack_enabled) {
-           case 1:
-           case 2:
-               if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
-                   (th->th_flags & TH_PUSH) == 0 &&
-                   (tp->t_unacksegs == 1))
-                       return(1);
-               break;  
-           case 3:
+       case 1:
+       case 2:
                if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
                    (th->th_flags & TH_PUSH) == 0 &&
-                   ((tp->t_unacksegs == 1) ||
-                   ((tp->t_flags & TF_STRETCHACK) != 0 &&
-                       tp->t_unacksegs < (maxseg_unacked))))
-                       return(1);
+                   (tp->t_unacksegs == 1)) {
+                       return 1;
+               }
+               break;
+       case 3:
+               if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
+                       if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
+                           (th->th_flags & TH_PUSH) == 0 &&
+                           ((tp->t_unacksegs == 1) ||
+                           ((tp->t_flags & TF_STRETCHACK) &&
+                           tp->t_unacksegs < maxseg_unacked))) {
+                               return 1;
+                       }
+               } else {
+                       uint32_t recwin;
+
+                       /* Get the receive-window we would announce */
+                       recwin = tcp_sbspace(tp);
+                       if (recwin > (uint32_t)(TCP_MAXWIN << tp->rcv_scale)) {
+                               recwin = (uint32_t)(TCP_MAXWIN << tp->rcv_scale);
+                       }
+
+                       /* Delay ACK, if:
+                        *
+                        * 1. We are not sending a zero-window
+                        * 2. We are not forcing fast ACKs
+                        * 3. We have more than the low-water mark in receive-buffer
+                        * 4. The receive-window is not increasing
+                        * 5. We have less than or equal to one MSS unacked, or
+                        *    Window actually has been growing larger than the initial value by half of it.
+                        *    (this makes sure that during ramp-up we ACK every second MSS
+                        *    until we pass the tcp_recvspace * 1.5-threshold)
+                        * 6. We haven't waited for half a BDP
+                        *
+                        * (a note on 6: The receive-window is
+                        * roughly 2 BDP. Thus, recwin / 4 means half a BDP and
+                        * thus we enforce an ACK roughly twice per RTT - even
+                        * if the app does not read)
+                        */
+                       if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
+                           tp->t_forced_acks == 0 &&
+                           tp->t_inpcb->inp_socket->so_rcv.sb_cc > tp->t_inpcb->inp_socket->so_rcv.sb_lowat &&
+                           recwin <= tp->t_last_recwin &&
+                           (tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg ||
+                           recwin > (uint32_t)(tcp_recvspace + (tcp_recvspace >> 1))) &&
+                           (tp->rcv_nxt - tp->last_ack_sent) < (recwin >> 2)) {
+                               tp->t_stat.acks_delayed++;
+                               return 1;
+                       }
+               }
                break;
        }
-       return(0);
+       return 0;
 }
 
 void
 tcp_cc_allocate_state(struct tcpcb *tp)
 {
        if (tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX &&
-               tp->t_ccstate == NULL) {
+           tp->t_ccstate == NULL) {
                tp->t_ccstate = (struct tcp_ccstate *)zalloc(tcp_cc_zone);
 
                /*
@@ -392,29 +392,32 @@ tcp_cc_allocate_state(struct tcpcb *tp)
                 * state, revert to using TCP NewReno as it does not
                 * require any state
                 */
-               if (tp->t_ccstate == NULL)
+               if (tp->t_ccstate == NULL) {
                        tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
-               else
+               } else {
                        bzero(tp->t_ccstate, sizeof(*tp->t_ccstate));
+               }
        }
 }
 
 /*
- * If stretch ack was disabled automatically on long standing connections, 
+ * If stretch ack was disabled automatically on long standing connections,
  * re-evaluate the situation after 15 minutes to enable it.
  */
-#define        TCP_STRETCHACK_DISABLE_WIN      (15 * 60 * TCP_RETRANSHZ)
+#define TCP_STRETCHACK_DISABLE_WIN      (15 * 60 * TCP_RETRANSHZ)
 void
 tcp_cc_after_idle_stretchack(struct tcpcb *tp)
 {
        int32_t tdiff;
 
-       if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK))
+       if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) {
                return;
+       }
 
        tdiff = timer_diff(tcp_now, 0, tp->rcv_nostrack_ts, 0);
-       if (tdiff < 0)
+       if (tdiff < 0) {
                tdiff = -tdiff;
+       }
 
        if (tdiff > TCP_STRETCHACK_DISABLE_WIN) {
                tp->t_flagsext &= ~TF_DISABLE_STRETCHACK;
@@ -432,15 +435,24 @@ tcp_cc_after_idle_stretchack(struct tcpcb *tp)
 inline uint32_t
 tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp)
 {
+       struct socket *so = tp->t_inpcb->inp_socket;
        if (tp->t_pipeack == 0 || tcp_check_cwnd_nonvalidated == 0) {
                tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
-               return (0);
+               return 0;
        }
-       if (tp->t_pipeack >= (tp->snd_cwnd) >> 1)
+
+       /*
+        * The congestion window is validated if the number of bytes acked
+        * is more than half of the current window or if there is more
+        * data to send in the send socket buffer
+        */
+       if (tp->t_pipeack >= (tp->snd_cwnd >> 1) ||
+           (so != NULL && so->so_snd.sb_cc > tp->snd_cwnd)) {
                tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
-       else
+       } else {
                tp->t_flagsext |= TF_CWND_NONVALIDATED;
-       return (tp->t_flagsext & TF_CWND_NONVALIDATED);
+       }
+       return tp->t_flagsext & TF_CWND_NONVALIDATED;
 }
 
 /*
@@ -453,7 +465,11 @@ tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp)
        tp->t_pipeack = tcp_get_max_pipeack(tp);
        tcp_clear_pipeack_state(tp);
        tp->snd_cwnd = (max(tp->t_pipeack, tp->t_lossflightsize) >> 1);
-       tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+       if (tcp_cubic_minor_fixes) {
+               tp->snd_cwnd = max(tp->snd_cwnd, tp->t_maxseg);
+       } else {
+               tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+       }
        tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh;
        tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
 }
@@ -472,7 +488,7 @@ tcp_get_max_pipeack(struct tcpcb *tp)
        max_pipeack = (tp->t_pipeack_sample[2] > max_pipeack) ?
            tp->t_pipeack_sample[2] : max_pipeack;
 
-       return (max_pipeack);
+       return max_pipeack;
 }
 
 inline void