]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/netinet/tcp_cc.c
xnu-7195.50.7.100.1.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_cc.c
index 3512bc9a1e0163cf45af1fef8a1e57d64bc958f1..461b180f9e98361b446583035d5b61e777da0e93 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <mach/sdt.h>
 #include <libkern/OSAtomic.h>
 
-struct tcp_cc_debug_state {
-       u_int64_t ccd_tsns;
-       char ccd_srcaddr[INET6_ADDRSTRLEN];
-       uint16_t ccd_srcport;
-       char ccd_destaddr[INET6_ADDRSTRLEN];
-       uint16_t ccd_destport;
-       uint32_t ccd_snd_cwnd;
-       uint32_t ccd_snd_wnd;
-       uint32_t ccd_snd_ssthresh;
-       uint32_t ccd_pipeack;
-       uint32_t ccd_rttcur;
-       uint32_t ccd_rxtcur;
-       uint32_t ccd_srtt;
-       uint32_t ccd_event;
-       uint32_t ccd_sndcc;
-       uint32_t ccd_sndhiwat;
-       uint32_t ccd_bytes_acked;
-       u_int8_t ccd_cc_index;
-       u_int8_t ccd_unused_1__;
-       u_int16_t ccd_unused_2__;
-       union {
-               struct {
-                       uint32_t ccd_last_max;
-                       uint32_t ccd_tcp_win;
-                       uint32_t ccd_target_win;
-                       uint32_t ccd_avg_lastmax;
-                       uint32_t ccd_mean_deviation;
-               } cubic_state;
-               struct {
-                       u_int32_t led_base_rtt;
-               } ledbat_state;
-       } u;
-};
-
-SYSCTL_SKMEM_TCP_INT(OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
-    int, tcp_cc_debug, 0, "Enable debug data collection");
+static int tcp_cc_debug;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &tcp_cc_debug, 0, "Enable debug data collection");
 
 extern struct tcp_cc_algo tcp_cc_newreno;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets,
@@ -113,8 +80,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, cwnd_nonvalidated,
 struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
 struct zone *tcp_cc_zone;
 
-/* Information for colelcting TCP debug information using control socket */
-#define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug"
 #define TCP_CCDBG_NOUNIT 0xffffffff
 static kern_ctl_ref tcp_ccdbg_ctlref = NULL;
 volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT;
@@ -151,12 +116,13 @@ tcp_cc_control_register(void)
        errno_t err;
 
        bzero(&ccdbg_control, sizeof(ccdbg_control));
-       strlcpy(ccdbg_control.ctl_name, TCP_CCDEBUG_CONTROL_NAME,
+       strlcpy(ccdbg_control.ctl_name, TCP_CC_CONTROL_NAME,
            sizeof(ccdbg_control.ctl_name));
        ccdbg_control.ctl_connect = tcp_ccdbg_control_connect;
        ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect;
        ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED;
        ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM;
+       ccdbg_control.ctl_sendsize = 32 * 1024;
 
        err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref);
        if (err != 0) {
@@ -255,8 +221,6 @@ tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event)
                            tp->t_ccstate->cub_last_max;
                        dbg_state.u.cubic_state.ccd_tcp_win =
                            tp->t_ccstate->cub_tcp_win;
-                       dbg_state.u.cubic_state.ccd_target_win =
-                           tp->t_ccstate->cub_target_win;
                        dbg_state.u.cubic_state.ccd_avg_lastmax =
                            tp->t_ccstate->cub_avg_lastmax;
                        dbg_state.u.cubic_state.ccd_mean_deviation =
@@ -317,9 +281,6 @@ tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp)
 
 /*
  * Calculate initial cwnd according to RFC3390.
- *
- * Keep the old ss_fltsz sysctl for ABI compabitility issues.
- * but it will be overriden if tcp_do_rfc3390 sysctl when it is set.
  */
 void
 tcp_cc_cwnd_init_or_reset(struct tcpcb *tp)
@@ -327,12 +288,12 @@ tcp_cc_cwnd_init_or_reset(struct tcpcb *tp)
        if (tp->t_flags & TF_LOCAL) {
                tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
        } else {
-               /* initial congestion window according to RFC 3390 */
-               if (tcp_do_rfc3390) {
+               if (tcp_cubic_minor_fixes) {
+                       tp->snd_cwnd = tcp_initial_cwnd(tp);
+               } else {
+                       /* initial congestion window according to RFC 3390 */
                        tp->snd_cwnd = min(4 * tp->t_maxseg,
                            max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES));
-               } else {
-                       tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
                }
        }
 }
@@ -340,7 +301,7 @@ tcp_cc_cwnd_init_or_reset(struct tcpcb *tp)
 /*
  * Indicate whether this ack should be delayed.
  * Here is the explanation for different settings of tcp_delack_enabled:
- *  - when set to 1, the bhavior is same as when set to 2. We kept this
+ *  - when set to 1, the behavior is same as when set to 2. We kept this
  *    for binary compatibility.
  *  - when set to 2, will "ack every other packet"
  *      - if our last ack wasn't a 0-sized window.
@@ -369,12 +330,50 @@ tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th)
                }
                break;
        case 3:
-               if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
-                   (th->th_flags & TH_PUSH) == 0 &&
-                   ((tp->t_unacksegs == 1) ||
-                   ((tp->t_flags & TF_STRETCHACK) != 0 &&
-                   tp->t_unacksegs < (maxseg_unacked)))) {
-                       return 1;
+               if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
+                       if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
+                           (th->th_flags & TH_PUSH) == 0 &&
+                           ((tp->t_unacksegs == 1) ||
+                           ((tp->t_flags & TF_STRETCHACK) &&
+                           tp->t_unacksegs < maxseg_unacked))) {
+                               return 1;
+                       }
+               } else {
+                       uint32_t recwin;
+
+                       /* Get the receive-window we would announce */
+                       recwin = tcp_sbspace(tp);
+                       if (recwin > (uint32_t)(TCP_MAXWIN << tp->rcv_scale)) {
+                               recwin = (uint32_t)(TCP_MAXWIN << tp->rcv_scale);
+                       }
+
+                       /* Delay ACK, if:
+                        *
+                        * 1. We are not sending a zero-window
+                        * 2. We are not forcing fast ACKs
+                        * 3. We have more than the low-water mark in receive-buffer
+                        * 4. The receive-window is not increasing
+                        * 5. We have at most one MSS of unacked data, or the receive
+                        *    window has grown beyond 1.5 times tcp_recvspace.
+                        *    (this makes sure that during ramp-up we ACK every second MSS
+                        *    until we pass the tcp_recvspace * 1.5-threshold)
+                        * 6. We haven't waited for half a BDP
+                        *
+                        * (a note on 6: The receive-window is
+                        * roughly 2 BDP. Thus, recwin / 4 means half a BDP and
+                        * thus we enforce an ACK roughly twice per RTT - even
+                        * if the app does not read)
+                        */
+                       if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
+                           tp->t_forced_acks == 0 &&
+                           tp->t_inpcb->inp_socket->so_rcv.sb_cc > tp->t_inpcb->inp_socket->so_rcv.sb_lowat &&
+                           recwin <= tp->t_last_recwin &&
+                           (tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg ||
+                           recwin > (uint32_t)(tcp_recvspace + (tcp_recvspace >> 1))) &&
+                           (tp->rcv_nxt - tp->last_ack_sent) < (recwin >> 2)) {
+                               tp->t_stat.acks_delayed++;
+                               return 1;
+                       }
                }
                break;
        }
@@ -466,7 +465,11 @@ tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp)
        tp->t_pipeack = tcp_get_max_pipeack(tp);
        tcp_clear_pipeack_state(tp);
        tp->snd_cwnd = (max(tp->t_pipeack, tp->t_lossflightsize) >> 1);
-       tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+       if (tcp_cubic_minor_fixes) {
+               tp->snd_cwnd = max(tp->snd_cwnd, tp->t_maxseg);
+       } else {
+               tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
+       }
        tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh;
        tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
 }