2  * Copyright (c) 2013-2014 Apple Inc. All rights reserved. 
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. The rights granted to you under the License 
  10  * may not be used to create, or enable the creation or redistribution of, 
  11  * unlawful or unlicensed copies of an Apple operating system, or to 
  12  * circumvent, violate, or enable the circumvention or violation of, any 
  13  * terms of an Apple operating system software license agreement. 
  15  * Please obtain a copy of the License at 
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file. 
  18  * The Original Code and all software distributed under the License are 
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  23  * Please see the License for the specific language governing rights and 
  24  * limitations under the License. 
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 
  28 #include <sys/param.h> 
  29 #include <sys/systm.h> 
  30 #include <sys/kernel.h> 
  31 #include <sys/protosw.h> 
  32 #include <sys/socketvar.h> 
  33 #include <sys/syslog.h> 
  35 #include <net/route.h> 
  36 #include <netinet/in.h> 
  37 #include <netinet/in_systm.h> 
  38 #include <netinet/ip.h> 
  41 #include <netinet/ip6.h> 
  44 #include <netinet/ip_var.h> 
  45 #include <netinet/tcp.h> 
  46 #include <netinet/tcp_timer.h> 
  47 #include <netinet/tcp_var.h> 
  48 #include <netinet/tcp_fsm.h> 
  49 #include <netinet/tcp_var.h> 
  50 #include <netinet/tcp_cc.h> 
  51 #include <netinet/tcpip.h> 
  52 #include <netinet/tcp_seq.h> 
  53 #include <kern/task.h> 
  54 #include <libkern/OSAtomic.h> 
  56 static int tcp_cubic_init(struct tcpcb 
*tp
); 
  57 static int tcp_cubic_cleanup(struct tcpcb 
*tp
); 
  58 static void tcp_cubic_cwnd_init_or_reset(struct tcpcb 
*tp
); 
  59 static void tcp_cubic_congestion_avd(struct tcpcb 
*tp
, struct tcphdr 
*th
); 
  60 static void tcp_cubic_ack_rcvd(struct tcpcb 
*tp
, struct tcphdr 
*th
); 
  61 static void tcp_cubic_pre_fr(struct tcpcb 
*tp
); 
  62 static void tcp_cubic_post_fr(struct tcpcb 
*tp
, struct tcphdr 
*th
); 
  63 static void tcp_cubic_after_timeout(struct tcpcb 
*tp
); 
  64 static int tcp_cubic_delay_ack(struct tcpcb 
*tp
, struct tcphdr 
*th
); 
  65 static void tcp_cubic_switch_cc(struct tcpcb 
*tp
, u_int16_t old_index
); 
  66 static uint32_t tcp_cubic_update(struct tcpcb 
*tp
, u_int32_t rtt
); 
  67 static uint32_t tcp_cubic_tcpwin(struct tcpcb 
*tp
, struct tcphdr 
*th
); 
  68 static inline void tcp_cubic_clear_state(struct tcpcb 
*tp
); 
  71 extern float cbrtf(float x
); 
  73 struct tcp_cc_algo tcp_cc_cubic 
= { 
  75         .init 
= tcp_cubic_init
, 
  76         .cleanup 
= tcp_cubic_cleanup
, 
  77         .cwnd_init 
= tcp_cubic_cwnd_init_or_reset
, 
  78         .congestion_avd 
= tcp_cubic_congestion_avd
, 
  79         .ack_rcvd 
= tcp_cubic_ack_rcvd
, 
  80         .pre_fr 
= tcp_cubic_pre_fr
, 
  81         .post_fr 
= tcp_cubic_post_fr
, 
  82         .after_idle 
= tcp_cubic_cwnd_init_or_reset
, 
  83         .after_timeout 
= tcp_cubic_after_timeout
, 
  84         .delay_ack 
= tcp_cubic_delay_ack
, 
  85         .switch_to 
= tcp_cubic_switch_cc
 
  88 const float tcp_cubic_backoff 
= 0.2; /* multiplicative decrease factor */ 
  89 const float tcp_cubic_coeff 
= 0.4; 
  90 const float tcp_cubic_fast_convergence_factor 
= 0.875; 
  92 static int tcp_cubic_tcp_friendliness 
= 0; 
  93 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, cubic_tcp_friendliness
, 
  94         CTLFLAG_RW 
| CTLFLAG_LOCKED
, &tcp_cubic_tcp_friendliness
, 0, 
  95         "Enable TCP friendliness"); 
  97 static int tcp_cubic_fast_convergence 
= 0; 
  98 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, cubic_fast_convergence
, 
  99         CTLFLAG_RW 
| CTLFLAG_LOCKED
, &tcp_cubic_fast_convergence
, 0, 
 100         "Enable fast convergence"); 
 102 static int tcp_cubic_use_minrtt 
= 0; 
 103 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, cubic_use_minrtt
, 
 104         CTLFLAG_RW 
| CTLFLAG_LOCKED
, &tcp_cubic_use_minrtt
, 0, 
 105         "use a min of 5 sec rtt"); 
 107 static int tcp_cubic_init(struct tcpcb 
*tp
) 
 109         OSIncrementAtomic((volatile SInt32 
*)&tcp_cc_cubic
.num_sockets
); 
 111         VERIFY(tp
->t_ccstate 
!= NULL
); 
 112         tcp_cubic_clear_state(tp
); 
 116 static int tcp_cubic_cleanup(struct tcpcb 
*tp
) 
 119         OSDecrementAtomic((volatile SInt32 
*)&tcp_cc_cubic
.num_sockets
); 
 124  * Initialize the congestion window at the beginning of a connection or  
 127 static void tcp_cubic_cwnd_init_or_reset(struct tcpcb 
*tp
) 
 129         VERIFY(tp
->t_ccstate 
!= NULL
);   
 131         tcp_cubic_clear_state(tp
); 
 132         tcp_cc_cwnd_init_or_reset(tp
); 
 134         tcp_clear_pipeack_state(tp
); 
 136         /* Start counting bytes for RFC 3465 again */ 
 137         tp
->t_bytes_acked 
= 0; 
 140          * slow start threshold could get initialized to a lower value 
 141          * when there is a cached value in the route metrics. In this case, 
 142          * the connection can enter congestion avoidance without any packet 
 143          * loss and Cubic will enter steady-state too early. It is better 
 144          * to always probe to find the initial slow-start threshold. 
 146         if (tp
->t_inpcb
->inp_stat
->txbytes 
<= TCP_CC_CWND_INIT_BYTES
 
 147             && tp
->snd_ssthresh 
< (TCP_MAXWIN 
<< TCP_MAX_WINSHIFT
)) 
 148                 tp
->snd_ssthresh 
= TCP_MAXWIN 
<< TCP_MAX_WINSHIFT
; 
 150         /* Initialize cubic last max to be same as ssthresh */ 
 151         tp
->t_ccstate
->cub_last_max 
= tp
->snd_ssthresh
; 
 155  * Compute the target congestion window for the next RTT according to  
 156  * cubic equation when an ack is received. 
 158  * W(t) = C(t-K)^3 + W(last_max) 
 161 tcp_cubic_update(struct tcpcb 
*tp
, u_int32_t rtt
) 
 164         u_int32_t elapsed_time
, win
; 
 166         win 
= min(tp
->snd_cwnd
, tp
->snd_wnd
); 
 167         if (tp
->t_ccstate
->cub_last_max 
== 0) 
 168                 tp
->t_ccstate
->cub_last_max 
= tp
->snd_ssthresh
; 
 170         if (tp
->t_ccstate
->cub_epoch_start 
== 0) { 
 172                  * This is the beginning of a new epoch, initialize some of 
 173                  * the variables that we need to use for computing the  
 174                  * congestion window later. 
 176                 tp
->t_ccstate
->cub_epoch_start 
= tcp_now
; 
 177                 if (tp
->t_ccstate
->cub_epoch_start 
== 0) 
 178                         tp
->t_ccstate
->cub_epoch_start 
= 1; 
 179                 if (win 
< tp
->t_ccstate
->cub_last_max
) { 
 181                         VERIFY(current_task() == kernel_task
); 
 184                          * Compute cubic epoch period, this is the time 
 185                          * period that the window will take to increase to 
 186                          * last_max again after backoff due to loss. 
 188                         K 
= (tp
->t_ccstate
->cub_last_max 
- win
) 
 189                             / tp
->t_maxseg 
/ tcp_cubic_coeff
; 
 191                         tp
->t_ccstate
->cub_epoch_period 
= K 
* TCP_RETRANSHZ
; 
 193                         tp
->t_ccstate
->cub_origin_point 
=  
 194                                 tp
->t_ccstate
->cub_last_max
; 
 196                         tp
->t_ccstate
->cub_epoch_period 
= 0; 
 197                         tp
->t_ccstate
->cub_origin_point 
= win
; 
 199                 tp
->t_ccstate
->cub_target_win 
= 0; 
 202         VERIFY(tp
->t_ccstate
->cub_origin_point 
> 0);     
 204          * Compute the target window for the next RTT using smoothed RTT 
 205          * as an estimate for next RTT. 
 207         elapsed_time 
= timer_diff(tcp_now
, 0,  
 208                 tp
->t_ccstate
->cub_epoch_start
, 0); 
 210         if (tcp_cubic_use_minrtt
) 
 211                 elapsed_time 
+= max(tcp_cubic_use_minrtt
, rtt
); 
 214         var 
= (elapsed_time  
- tp
->t_ccstate
->cub_epoch_period
) / TCP_RETRANSHZ
; 
 215         var 
= var 
* var 
* var 
* (tcp_cubic_coeff 
* tp
->t_maxseg
); 
 217         tp
->t_ccstate
->cub_target_win 
= tp
->t_ccstate
->cub_origin_point 
+ var
; 
 218         return (tp
->t_ccstate
->cub_target_win
); 
 222  * Standard TCP utilizes bandwidth well in low RTT and low BDP connections 
 223  * even when there is some packet loss. Enabling TCP mode will help Cubic 
 224  * to achieve this kind of utilization. 
 226  * But if there is a bottleneck link in the path with a fixed size queue 
 227  * and fixed bandwidth, TCP Cubic will help to reduce packet loss at this 
 228  * link because of the steady-state behavior. Using average and mean 
 229  * absolute deviation of W(lastmax), we try to detect if the congestion 
 230  * window is close to the bottleneck bandwidth. In that case, disabling 
 231  * TCP mode will help to minimize packet loss at this link.  
 233  * Disable TCP mode if the W(lastmax) (the window where previous packet 
 234  * loss happened) is within a small range from the average last max 
 237 #define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \ 
 238         ((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \ 
 239         (_tp_)->t_ccstate->cub_mean_dev > (tp->t_maxseg << 1)) ? 1 : 0) 
 242  * Compute the window growth if standard TCP (AIMD) was used with  
 243  * a backoff of 0.5 and additive increase of 1 packet per RTT. 
 245  * TCP window at time t can be calculated using the following equation 
 248  * W(t) <- Wmax * beta + 3 * ((1 - beta)/(1 + beta)) * t/RTT 
 252 tcp_cubic_tcpwin(struct tcpcb 
*tp
, struct tcphdr 
*th
) 
 254         if (tp
->t_ccstate
->cub_tcp_win 
== 0) { 
 255                 tp
->t_ccstate
->cub_tcp_win 
= min(tp
->snd_cwnd
, tp
->snd_wnd
); 
 256                 tp
->t_ccstate
->cub_tcp_bytes_acked 
= 0; 
 258                 tp
->t_ccstate
->cub_tcp_bytes_acked 
+= 
 260                 if (tp
->t_ccstate
->cub_tcp_bytes_acked 
>= 
 261                     tp
->t_ccstate
->cub_tcp_win
) { 
 262                         tp
->t_ccstate
->cub_tcp_bytes_acked 
-= 
 263                             tp
->t_ccstate
->cub_tcp_win
; 
 264                         tp
->t_ccstate
->cub_tcp_win 
+= tp
->t_maxseg
; 
 267         return (tp
->t_ccstate
->cub_tcp_win
); 
 271  * Handle an in-sequence ack during congestion avoidance phase. 
 274 tcp_cubic_congestion_avd(struct tcpcb 
*tp
, struct tcphdr 
*th
) 
 276         u_int32_t cubic_target_win
, tcp_win
, rtt
; 
 278         /* Do not increase congestion window in non-validated phase */ 
 279         if (tcp_cc_is_cwnd_nonvalidated(tp
) != 0) 
 282         tp
->t_bytes_acked 
+= BYTES_ACKED(th
, tp
); 
 284         rtt 
= get_base_rtt(tp
); 
 286          * First compute cubic window. If cubic variables are not 
 287          * initialized (after coming out of recovery), this call will 
 290         cubic_target_win 
= tcp_cubic_update(tp
, rtt
); 
 292         /* Compute TCP window if a multiplicative decrease of 0.2 is used */ 
 293         tcp_win 
= tcp_cubic_tcpwin(tp
, th
); 
 295         if (tp
->snd_cwnd 
< tcp_win 
&& 
 296             (tcp_cubic_tcp_friendliness 
== 1 || 
 297             TCP_CUBIC_ENABLE_TCPMODE(tp
))) { 
 298                 /* this connection is in TCP-friendly region */ 
 299                 if (tp
->t_bytes_acked 
>= tp
->snd_cwnd
) { 
 300                         tp
->t_bytes_acked 
-= tp
->snd_cwnd
; 
 301                         tp
->snd_cwnd 
= min(tcp_win
, TCP_MAXWIN 
<< tp
->snd_scale
); 
 304                 if (cubic_target_win 
> tp
->snd_cwnd
) { 
 306                          * The target win is computed for the next RTT. 
 307                          * To reach this value, cwnd will have to be updated 
 308                          * one segment at a time. Compute how many bytes  
 309                          * need to be acknowledged before we can increase  
 310                          * the cwnd by one segment. 
 313                         incr_win 
= tp
->snd_cwnd 
* tp
->t_maxseg
; 
 314                         incr_win 
/= (cubic_target_win 
- tp
->snd_cwnd
); 
 316                             tp
->t_bytes_acked 
>= incr_win
) { 
 317                                 tp
->t_bytes_acked 
-= incr_win
; 
 319                                     min((tp
->snd_cwnd 
+ tp
->t_maxseg
), 
 320                                     TCP_MAXWIN 
<< tp
->snd_scale
); 
 327 tcp_cubic_ack_rcvd(struct tcpcb 
*tp
, struct tcphdr 
*th
) 
 329         /* Do not increase the congestion window in non-validated phase */ 
 330         if (tcp_cc_is_cwnd_nonvalidated(tp
) != 0) 
 333         if (tp
->snd_cwnd 
>= tp
->snd_ssthresh
) { 
 334                 /* Congestion avoidance phase */ 
 335                 tcp_cubic_congestion_avd(tp
, th
); 
 338                  * Use 2*SMSS as limit on increment as suggested 
 339                  * by RFC 3465 section 2.3 
 341                 uint32_t acked
, abc_lim
, incr
; 
 343                 acked 
= BYTES_ACKED(th
, tp
); 
 344                 abc_lim 
= (tcp_do_rfc3465_lim2 
&&  
 345                         tp
->snd_nxt 
== tp
->snd_max
) ? 
 346                         2 * tp
->t_maxseg 
: tp
->t_maxseg
; 
 347                 incr 
= min(acked
, abc_lim
); 
 349                 tp
->snd_cwnd 
+= incr
; 
 350                 tp
->snd_cwnd 
= min(tp
->snd_cwnd
,  
 351                         TCP_MAXWIN 
<< tp
->snd_scale
); 
 356 tcp_cubic_pre_fr(struct tcpcb 
*tp
) 
 360         tp
->t_ccstate
->cub_epoch_start 
= 0; 
 361         tp
->t_ccstate
->cub_tcp_win 
= 0; 
 362         tp
->t_ccstate
->cub_target_win 
= 0; 
 363         tp
->t_ccstate
->cub_tcp_bytes_acked 
= 0; 
 365         win 
= min(tp
->snd_cwnd
, tp
->snd_wnd
); 
 366         if (tp
->t_flagsext 
& TF_CWND_NONVALIDATED
) { 
 367                 tp
->t_lossflightsize 
= tp
->snd_max 
- tp
->snd_una
; 
 368                 win 
= (max(tp
->t_pipeack
, tp
->t_lossflightsize
)) >> 1; 
 370                 tp
->t_lossflightsize 
= 0; 
 373          * Note the congestion window at which packet loss occurred as 
 376          * If the congestion window is less than the last max window when 
 377          * loss occurred, it indicates that capacity available in the  
 378          * network has gone down. This can happen if a new flow has started 
 379          * and it is capturing some of the bandwidth. To reach convergence 
 380          * quickly, backoff a little more. Disable fast convergence to  
 381          * disable this behavior. 
 383         if (win 
< tp
->t_ccstate
->cub_last_max 
&& 
 384                 tcp_cubic_fast_convergence 
== 1) 
 385                 tp
->t_ccstate
->cub_last_max 
= win 
*  
 386                         tcp_cubic_fast_convergence_factor
; 
 388                 tp
->t_ccstate
->cub_last_max 
= win
; 
 390         if (tp
->t_ccstate
->cub_last_max 
== 0) { 
 392                  * If last_max is zero because snd_wnd is zero or for 
 393                  * any other reason, initialize it to the amount of data 
 396                 tp
->t_ccstate
->cub_last_max 
= tp
->snd_max 
- tp
->snd_una
; 
 400          * Compute average and mean absolute deviation of the 
 401          * window at which packet loss occurred. 
 403         if (tp
->t_ccstate
->cub_avg_lastmax 
== 0) { 
 404                 tp
->t_ccstate
->cub_avg_lastmax 
= tp
->t_ccstate
->cub_last_max
; 
 407                  * Average is computed by taking 63 parts of 
 408                  * history and one part of the most recent value 
 410                 avg 
= tp
->t_ccstate
->cub_avg_lastmax
; 
 411                 avg 
= (avg 
<< 6) - avg
; 
 412                 tp
->t_ccstate
->cub_avg_lastmax 
= 
 413                     (avg 
+ tp
->t_ccstate
->cub_last_max
) >> 6;  
 416         /* caluclate deviation from average */ 
 417         dev 
= tp
->t_ccstate
->cub_avg_lastmax 
- tp
->t_ccstate
->cub_last_max
; 
 419         /* Take the absolute value */ 
 423         if (tp
->t_ccstate
->cub_mean_dev 
== 0) { 
 424                 tp
->t_ccstate
->cub_mean_dev 
= dev
; 
 426                 dev 
= dev 
+ ((tp
->t_ccstate
->cub_mean_dev 
<< 4) 
 427                     - tp
->t_ccstate
->cub_mean_dev
); 
 428                 tp
->t_ccstate
->cub_mean_dev 
= dev 
>> 4; 
 431         /* Backoff congestion window by tcp_cubic_backoff factor */ 
 432         win 
= win 
- (win 
* tcp_cubic_backoff
); 
 433         win 
= (win 
/ tp
->t_maxseg
); 
 436         tp
->snd_ssthresh 
= win 
* tp
->t_maxseg
; 
 437         tcp_cc_resize_sndbuf(tp
); 
 441 tcp_cubic_post_fr(struct tcpcb 
*tp
, struct tcphdr 
*th
) 
 443         uint32_t flight_size 
= 0; 
 445         if (SEQ_LEQ(th
->th_ack
, tp
->snd_max
)) 
 446                 flight_size 
= tp
->snd_max 
- th
->th_ack
; 
 448         if (SACK_ENABLED(tp
) && tp
->t_lossflightsize 
> 0) { 
 449                 u_int32_t total_rxt_size 
= 0, ncwnd
; 
 451                  * When SACK is enabled, the number of retransmitted bytes 
 452                  * can be counted more accurately. 
 454                 total_rxt_size 
= tcp_rxtseg_total_size(tp
); 
 455                 ncwnd 
= max(tp
->t_pipeack
, tp
->t_lossflightsize
); 
 456                 if (total_rxt_size 
<= ncwnd
) { 
 457                         ncwnd 
= ncwnd 
- total_rxt_size
; 
 461                  * To avoid sending a large burst at the end of recovery 
 462                  * set a max limit on ncwnd 
 464                 ncwnd 
= min(ncwnd
, (tp
->t_maxseg 
<< 6)); 
 466                 flight_size 
= max(ncwnd
, flight_size
); 
 469          * Complete ack. The current window was inflated for fast recovery. 
 470          * It has to be deflated post recovery. 
 472          * Window inflation should have left us with approx snd_ssthresh  
 473          * outstanding data. If the flight size is zero or one segment, 
 474          * make congestion window to be at least as big as 2 segments to 
 475          * avoid delayed acknowledgements. This is according to RFC 6582. 
 477         if (flight_size 
< tp
->snd_ssthresh
) 
 478                 tp
->snd_cwnd 
= max(flight_size
, tp
->t_maxseg
)  
 481                 tp
->snd_cwnd 
= tp
->snd_ssthresh
; 
 482         tp
->t_ccstate
->cub_tcp_win 
= 0; 
 483         tp
->t_ccstate
->cub_target_win 
= 0; 
 484         tp
->t_ccstate
->cub_tcp_bytes_acked 
= 0; 
 488 tcp_cubic_after_timeout(struct tcpcb 
*tp
) 
 490         VERIFY(tp
->t_ccstate 
!= NULL
); 
 493          * Avoid adjusting congestion window due to SYN retransmissions. 
 494          * If more than one byte (SYN) is outstanding then it is still 
 495          * needed to adjust the window. 
 497         if (tp
->t_state 
< TCPS_ESTABLISHED 
&& 
 498             ((int)(tp
->snd_max 
- tp
->snd_una
) <= 1)) 
 501         if (!IN_FASTRECOVERY(tp
)) { 
 502                 tcp_cubic_clear_state(tp
); 
 503                 tcp_cubic_pre_fr(tp
); 
 507          * Close the congestion window down to one segment as a retransmit 
 508          * timeout might indicate severe congestion. 
 510         tp
->snd_cwnd 
= tp
->t_maxseg
; 
 514 tcp_cubic_delay_ack(struct tcpcb 
*tp
, struct tcphdr 
*th
) 
 516         return (tcp_cc_delay_ack(tp
, th
)); 
 520  * When switching from a different CC it is better for Cubic to start  
 521  * fresh. The state required for Cubic calculation might be stale and it 
 522  * might not represent the current state of the network. If it starts as 
 523  * a new connection it will probe and learn the existing network conditions. 
 526 tcp_cubic_switch_cc(struct tcpcb 
*tp
, uint16_t old_cc_index
) 
 528 #pragma unused(old_cc_index) 
 529         tcp_cubic_cwnd_init_or_reset(tp
); 
 531         OSIncrementAtomic((volatile SInt32 
*)&tcp_cc_cubic
.num_sockets
); 
 534 static inline void tcp_cubic_clear_state(struct tcpcb 
*tp
) 
 536         tp
->t_ccstate
->cub_last_max 
= 0; 
 537         tp
->t_ccstate
->cub_epoch_start 
= 0; 
 538         tp
->t_ccstate
->cub_origin_point 
= 0; 
 539         tp
->t_ccstate
->cub_tcp_win 
= 0; 
 540         tp
->t_ccstate
->cub_tcp_bytes_acked 
= 0; 
 541         tp
->t_ccstate
->cub_epoch_period 
= 0; 
 542         tp
->t_ccstate
->cub_target_win 
= 0;