2 * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/syslog.h>
33 #include <sys/protosw.h>
34 #include <sys/socketvar.h>
35 #include <sys/kern_control.h>
36 #include <sys/domain.h>
38 #include <netinet/in.h>
39 #include <netinet/tcp.h>
40 #include <netinet/tcp_var.h>
41 #include <netinet/tcp_cc.h>
43 #include <libkern/OSAtomic.h>
/*
 * One congestion-control debug sample. tcp_ccdbg_trace() zeroes an
 * instance, fills it from the connection and socket state, and passes it
 * to ctl_enqueuedata() for the debug control socket.
 */
45 struct tcp_cc_debug_state
{
/* Local address as text; written by inet_ntop() in tcp_ccdbg_trace(). */
47 char ccd_srcaddr
[INET6_ADDRSTRLEN
];
/* Foreign address as text; written by inet_ntop() in tcp_ccdbg_trace(). */
49 char ccd_destaddr
[INET6_ADDRSTRLEN
];
/* Foreign port, converted with ntohs() from inp_fport. */
50 uint16_t ccd_destport
;
/* Copy of tp->snd_cwnd. */
51 uint32_t ccd_snd_cwnd
;
/* Copy of tp->snd_ssthresh. */
53 uint32_t ccd_snd_ssthresh
;
/* Copy of the send socket buffer's sb_hiwat. */
60 uint32_t ccd_sndhiwat
;
/* Copy of tp->t_bytes_acked. */
61 uint32_t ccd_bytes_acked
;
/* Copy of tp->tcp_cc_index; tcp_ccdbg_trace() switches on the same value. */
62 u_int8_t ccd_cc_index
;
/* Not written by tcp_ccdbg_trace(); presumably padding -- TODO confirm. */
63 u_int8_t ccd_unused_1__
;
64 u_int16_t ccd_unused_2__
;
/*
 * The following four fields are assigned through u.cubic_state in
 * tcp_ccdbg_trace() from the cub_* members of tp->t_ccstate.
 */
67 uint32_t ccd_last_max
;
69 uint32_t ccd_target_win
;
70 uint32_t ccd_avg_lastmax
;
71 uint32_t ccd_mean_deviation
;
/* Assigned through u.ledbat_state in tcp_ccdbg_trace(). */
74 u_int32_t led_base_rtt
;
/*
 * Read-write integer tunable cc_debug, backed by tcp_cc_debug. A non-zero
 * value is one of the conditions tcp_ccdbg_trace() checks before it
 * builds a debug record.
 */
79 SYSCTL_SKMEM_TCP_INT(OID_AUTO
, cc_debug
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
80 int, tcp_cc_debug
, 0, "Enable debug data collection");
/* Read-only view of tcp_cc_newreno.num_sockets. */
82 extern struct tcp_cc_algo tcp_cc_newreno
;
83 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, newreno_sockets
,
84 CTLFLAG_RD
| CTLFLAG_LOCKED
, &tcp_cc_newreno
.num_sockets
,
85 0, "Number of sockets using newreno");
/* Read-only view of tcp_cc_ledbat.num_sockets (the background transport). */
87 extern struct tcp_cc_algo tcp_cc_ledbat
;
88 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, background_sockets
,
89 CTLFLAG_RD
| CTLFLAG_LOCKED
, &tcp_cc_ledbat
.num_sockets
,
90 0, "Number of sockets using background transport");
/* Read-only view of tcp_cc_cubic.num_sockets. */
92 extern struct tcp_cc_algo tcp_cc_cubic
;
93 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, cubic_sockets
,
94 CTLFLAG_RD
| CTLFLAG_LOCKED
, &tcp_cc_cubic
.num_sockets
,
95 0, "Number of sockets using cubic");
/* Read-write integer tunable use_newreno, backed by tcp_use_newreno. */
97 SYSCTL_SKMEM_TCP_INT(OID_AUTO
, use_newreno
,
98 CTLFLAG_RW
| CTLFLAG_LOCKED
, int, tcp_use_newreno
, 0,
99 "Use TCP NewReno by default");
/*
 * When zero, tcp_cc_is_cwnd_nonvalidated() always clears
 * TF_CWND_NONVALIDATED. Defaults to 1 and is exposed as the read-write
 * sysctl cwnd_nonvalidated only when DEBUG or DEVELOPMENT is defined.
 */
101 static int tcp_check_cwnd_nonvalidated
= 1;
102 #if (DEBUG || DEVELOPMENT)
103 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, cwnd_nonvalidated
,
104 CTLFLAG_RW
| CTLFLAG_LOCKED
, &tcp_check_cwnd_nonvalidated
, 0,
105 "Check if congestion window is non-validated");
106 #endif /* (DEBUG || DEVELOPMENT) */
/*
 * Assigns sndsb->sb_idealsize. The visible part of the expansion passes
 * the larger of tcp_sendspace and tp->snd_ssthresh as the first argument
 * of min(); the remainder of the expansion is not visible here.
 * NOTE(review): the expansion reads `tp` from the call site, and the
 * `size` parameter does not appear in the visible part of the expansion.
 * Confirm whether `size` is intentionally ignored.
 */
108 #define SET_SNDSB_IDEAL_SIZE(sndsb, size) \
109 sndsb->sb_idealsize = min(max(tcp_sendspace, tp->snd_ssthresh), \
112 /* Array containing pointers to currently implemented TCP CC algorithms */
/* Indexed by the TCP_CC_ALGO_* constants; sized by TCP_CC_ALGO_COUNT. */
113 struct tcp_cc_algo
* tcp_cc_algo_list
[TCP_CC_ALGO_COUNT
];
/* Zone handed to zalloc() by tcp_cc_allocate_state() to obtain a struct tcp_ccstate. */
114 struct zone
*tcp_cc_zone
;
116 /* Information for collecting TCP debug information using control socket */
/* Name under which tcp_cc_control_register() registers the debug control. */
117 #define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug"
/* Value held by tcp_ccdbg_unit while no debug client owns the slot. */
118 #define TCP_CCDBG_NOUNIT 0xffffffff
/* Control reference filled in by ctl_register() in tcp_cc_control_register(). */
119 static kern_ctl_ref tcp_ccdbg_ctlref
= NULL
;
/*
 * Unit of the connected debug client. The connect and disconnect
 * handlers update it with OSCompareAndSwap().
 */
120 volatile UInt32 tcp_ccdbg_unit
= TCP_CCDBG_NOUNIT
;
/* Forward declarations for the initialization and debug-control routines. */
122 void tcp_cc_init(void);
123 static void tcp_cc_control_register(void);
/* Callbacks installed as ctl_connect / ctl_disconnect of the debug control. */
124 static errno_t
tcp_ccdbg_control_connect(kern_ctl_ref kctl
,
125 struct sockaddr_ctl
*sac
, void **uinfo
);
126 static errno_t
tcp_ccdbg_control_disconnect(kern_ctl_ref kctl
,
127 u_int32_t unit
, void *uinfo
);
/* Zero-filled entry installed in the TCP_CC_ALGO_NONE slot of tcp_cc_algo_list. */
128 static struct tcp_cc_algo tcp_cc_algo_none
;
130 * Initialize TCP congestion control algorithms.
/*
 * Statements of the congestion-control initialization routine. The
 * function header is not visible here; presumably this is tcp_cc_init().
 */
/* Clear the algorithm table and the placeholder "none" algorithm. */
136 bzero(&tcp_cc_algo_list
, sizeof(tcp_cc_algo_list
));
137 bzero(&tcp_cc_algo_none
, sizeof(tcp_cc_algo_none
));
/* Install each algorithm in the table slot named by its index constant. */
139 tcp_cc_algo_list
[TCP_CC_ALGO_NONE
] = &tcp_cc_algo_none
;
140 tcp_cc_algo_list
[TCP_CC_ALGO_NEWRENO_INDEX
] = &tcp_cc_newreno
;
141 tcp_cc_algo_list
[TCP_CC_ALGO_BACKGROUND_INDEX
] = &tcp_cc_ledbat
;
142 tcp_cc_algo_list
[TCP_CC_ALGO_CUBIC_INDEX
] = &tcp_cc_cubic
;
/* Register the kernel control used for congestion-control debugging. */
144 tcp_cc_control_register();
/*
 * Registers the kernel control named TCP_CCDEBUG_CONTROL_NAME and stores
 * the resulting reference in tcp_ccdbg_ctlref.
 */
148 tcp_cc_control_register(void)
150 struct kern_ctl_reg ccdbg_control
;
/* Start from an all-zero registration so every unset field is zero. */
153 bzero(&ccdbg_control
, sizeof(ccdbg_control
));
/* strlcpy() is bounded by the size of the ctl_name field. */
154 strlcpy(ccdbg_control
.ctl_name
, TCP_CCDEBUG_CONTROL_NAME
,
155 sizeof(ccdbg_control
.ctl_name
));
156 ccdbg_control
.ctl_connect
= tcp_ccdbg_control_connect
;
157 ccdbg_control
.ctl_disconnect
= tcp_ccdbg_control_disconnect
;
/* Request the privileged and stream-socket registration flags. */
158 ccdbg_control
.ctl_flags
|= CTL_FLAG_PRIVILEGED
;
159 ccdbg_control
.ctl_flags
|= CTL_FLAG_REG_SOCK_STREAM
;
161 err
= ctl_register(&ccdbg_control
, &tcp_ccdbg_ctlref
);
/*
 * Report a registration failure. The condition guarding this log call is
 * not visible here; presumably it tests err.
 */
163 log(LOG_ERR
, "failed to register tcp_cc debug control");
167 /* Allow only one socket to connect at any time for debugging */
/*
 * Connect callback of the debug control. It tries to claim the single
 * debug slot by moving tcp_ccdbg_unit from TCP_CCDBG_NOUNIT to the unit
 * found in sac->sc_unit.
 */
169 tcp_ccdbg_control_connect(kern_ctl_ref kctl
, struct sockaddr_ctl
*sac
,
173 #pragma unused(uinfo)
/* The slot is free only while tcp_ccdbg_unit holds TCP_CCDBG_NOUNIT. */
175 UInt32 old_value
= TCP_CCDBG_NOUNIT
;
176 UInt32 new_value
= sac
->sc_unit
;
/* Slot already owned by some unit; the handling of this case is not visible here. */
178 if (tcp_ccdbg_unit
!= old_value
) {
/* Atomically replace TCP_CCDBG_NOUNIT with the connecting unit. */
182 if (OSCompareAndSwap(old_value
, new_value
, &tcp_ccdbg_unit
)) {
/*
 * Disconnect callback of the debug control. When the disconnecting unit
 * is the one recorded in tcp_ccdbg_unit, it tries to put
 * TCP_CCDBG_NOUNIT back with OSCompareAndSwap().
 */
190 tcp_ccdbg_control_disconnect(kern_ctl_ref kctl
, u_int32_t unit
, void *uinfo
)
/* NOTE(review): `unit` is listed as unused here but is read in the comparison below. */
192 #pragma unused(kctl, unit, uinfo)
/* Only the unit that currently owns the slot may release it. */
194 if (unit
== tcp_ccdbg_unit
) {
195 UInt32 old_value
= tcp_ccdbg_unit
;
196 UInt32 new_value
= TCP_CCDBG_NOUNIT
;
/* Slot already released; the body of this branch is not visible here. */
197 if (tcp_ccdbg_unit
== new_value
) {
/* A failed swap is reported with the message below; the remaining arguments of the swap are not visible here. */
201 if (!OSCompareAndSwap(old_value
, new_value
,
204 "failed to disconnect tcp_cc debug control");
/*
 * Records a congestion-control event for a connection. When debugging is
 * enabled, it fills a struct tcp_cc_debug_state from tp and its socket
 * and enqueues it on the debug control socket. It also fires a DTrace
 * probe carrying inp, tp, th and event.
 *
 * tp    - connection whose state is sampled.
 * th    - TCP header; in the visible code it is only passed to the probe.
 * event - event code stored in the record and passed to the probe.
 */
211 tcp_ccdbg_trace(struct tcpcb
*tp
, struct tcphdr
*th
, int32_t event
)
215 #endif /* !CONFIG_DTRACE */
216 struct inpcb
*inp
= tp
->t_inpcb
;
/*
 * NOTE(review): TCP_CCDBG_NOUNIT is 0xffffffff, so this "> 0" test is
 * also true while no debug client is connected. Confirm whether a
 * comparison against TCP_CCDBG_NOUNIT was intended.
 */
218 if (tcp_cc_debug
&& tcp_ccdbg_unit
> 0) {
219 struct tcp_cc_debug_state dbg_state
;
/* Zero the record so fields not assigned below are sent as zero. */
222 bzero(&dbg_state
, sizeof(dbg_state
));
225 /* Timestamp in nanoseconds: seconds scaled by 1000000000 plus tv_nsec */
226 dbg_state
.ccd_tsns
= (tv
.tv_sec
* 1000000000) + tv
.tv_nsec
;
/* Local address as text: the IPv4 field for PF_INET sockets, otherwise the IPv6 field. */
227 inet_ntop(SOCK_DOM(inp
->inp_socket
),
228 ((SOCK_DOM(inp
->inp_socket
) == PF_INET
) ?
229 (void *)&inp
->inp_laddr
.s_addr
:
230 (void *)&inp
->in6p_laddr
), dbg_state
.ccd_srcaddr
,
231 sizeof(dbg_state
.ccd_srcaddr
));
232 dbg_state
.ccd_srcport
= ntohs(inp
->inp_lport
);
/* Foreign address as text, selected the same way as the local one. */
233 inet_ntop(SOCK_DOM(inp
->inp_socket
),
234 ((SOCK_DOM(inp
->inp_socket
) == PF_INET
) ?
235 (void *)&inp
->inp_faddr
.s_addr
:
236 (void *)&inp
->in6p_faddr
), dbg_state
.ccd_destaddr
,
237 sizeof(dbg_state
.ccd_destaddr
));
238 dbg_state
.ccd_destport
= ntohs(inp
->inp_fport
);
/* Snapshot of the connection's window, RTT and send-buffer state. */
240 dbg_state
.ccd_snd_cwnd
= tp
->snd_cwnd
;
241 dbg_state
.ccd_snd_wnd
= tp
->snd_wnd
;
242 dbg_state
.ccd_snd_ssthresh
= tp
->snd_ssthresh
;
243 dbg_state
.ccd_pipeack
= tp
->t_pipeack
;
244 dbg_state
.ccd_rttcur
= tp
->t_rttcur
;
245 dbg_state
.ccd_rxtcur
= tp
->t_rxtcur
;
/* t_srtt is shifted right by TCP_RTT_SHIFT before it is stored. */
246 dbg_state
.ccd_srtt
= tp
->t_srtt
>> TCP_RTT_SHIFT
;
247 dbg_state
.ccd_event
= event
;
248 dbg_state
.ccd_sndcc
= inp
->inp_socket
->so_snd
.sb_cc
;
249 dbg_state
.ccd_sndhiwat
= inp
->inp_socket
->so_snd
.sb_hiwat
;
250 dbg_state
.ccd_bytes_acked
= tp
->t_bytes_acked
;
251 dbg_state
.ccd_cc_index
= tp
->tcp_cc_index
;
/* Algorithm-specific part of the record, chosen by the congestion-control index. */
252 switch (tp
->tcp_cc_index
) {
253 case TCP_CC_ALGO_CUBIC_INDEX
:
/* CUBIC: copy the cub_* values out of tp->t_ccstate. */
254 dbg_state
.u
.cubic_state
.ccd_last_max
=
255 tp
->t_ccstate
->cub_last_max
;
256 dbg_state
.u
.cubic_state
.ccd_tcp_win
=
257 tp
->t_ccstate
->cub_tcp_win
;
258 dbg_state
.u
.cubic_state
.ccd_target_win
=
259 tp
->t_ccstate
->cub_target_win
;
260 dbg_state
.u
.cubic_state
.ccd_avg_lastmax
=
261 tp
->t_ccstate
->cub_avg_lastmax
;
262 dbg_state
.u
.cubic_state
.ccd_mean_deviation
=
263 tp
->t_ccstate
->cub_mean_dev
;
265 case TCP_CC_ALGO_BACKGROUND_INDEX
:
/* Background transport: the value assigned here is not visible in this view. */
266 dbg_state
.u
.ledbat_state
.led_base_rtt
=
/* Hand the completed record to the debug control socket for unit tcp_ccdbg_unit. */
273 ctl_enqueuedata(tcp_ccdbg_ctlref
, tcp_ccdbg_unit
,
274 &dbg_state
, sizeof(dbg_state
), 0);
/* DTrace probe "cc" carrying inp, tp, th and event. */
276 DTRACE_TCP5(cc
, void, NULL
, struct inpcb
*, inp
,
277 struct tcpcb
*, tp
, struct tcphdr
*, th
, int32_t, event
);
/*
 * Flags an auto-sized send socket buffer for trimming when its high-water
 * mark exceeds the connection's slow-start threshold.
 *
 * tp - connection whose send buffer (tp->t_inpcb->inp_socket->so_snd) is examined.
 */
281 tcp_cc_resize_sndbuf(struct tcpcb
*tp
)
285 * If the send socket buffer size is bigger than ssthresh,
286 * it is time to trim it because we do not want to hold
287 * too many mbufs in the socket buffer
289 sb
= &tp
->t_inpcb
->inp_socket
->so_snd
;
/* Only buffers with SB_AUTOSIZE set are considered. */
290 if (sb
->sb_hiwat
> tp
->snd_ssthresh
&&
291 (sb
->sb_flags
& SB_AUTOSIZE
)) {
/* Recompute the ideal size only when it currently exceeds ssthresh. */
292 if (sb
->sb_idealsize
> tp
->snd_ssthresh
) {
293 SET_SNDSB_IDEAL_SIZE(sb
, tp
->snd_ssthresh
);
/* Mark the buffer as needing a trim. */
295 sb
->sb_flags
|= SB_TRIM
;
/*
 * Undoes a pending send-buffer trim: for a buffer with both SB_TRIM and
 * SB_AUTOSIZE set, it recomputes the ideal size, reserves that size when
 * the current high-water mark does not exceed it, and clears SB_TRIM.
 *
 * tp - connection whose send buffer (tp->t_inpcb->inp_socket->so_snd) is adjusted.
 */
300 tcp_bad_rexmt_fix_sndbuf(struct tcpcb
*tp
)
303 sb
= &tp
->t_inpcb
->inp_socket
->so_snd
;
/* Act only when both flags are set. */
304 if ((sb
->sb_flags
& (SB_TRIM
| SB_AUTOSIZE
)) == (SB_TRIM
| SB_AUTOSIZE
)) {
306 * If there was a retransmission that was not necessary
307 * then the size of socket buffer can be restored to
310 SET_SNDSB_IDEAL_SIZE(sb
, tp
->snd_ssthresh
);
/* Reserve the ideal size only when the high-water mark does not exceed it. */
311 if (sb
->sb_hiwat
<= sb
->sb_idealsize
) {
312 sbreserve(sb
, sb
->sb_idealsize
);
313 sb
->sb_flags
&= ~SB_TRIM
;
319 * Calculate initial cwnd according to RFC3390.
321 * Keep the old ss_fltsz sysctl for ABI compatibility reasons,
322 * but it is overridden by the tcp_do_rfc3390 sysctl when that is set.
/*
 * Sets tp->snd_cwnd to its initial value, expressed in bytes as a
 * multiple of tp->t_maxseg.
 *
 * tp - connection whose congestion window is (re)initialized.
 */
325 tcp_cc_cwnd_init_or_reset(struct tcpcb
*tp
)
/* Connections flagged TF_LOCAL use ss_fltsz_local segments. */
327 if (tp
->t_flags
& TF_LOCAL
) {
328 tp
->snd_cwnd
= tp
->t_maxseg
* ss_fltsz_local
;
330 /* initial congestion window according to RFC 3390 */
331 if (tcp_do_rfc3390
) {
/* TCP_CC_CWND_INIT_BYTES limited to the range of 2 to 4 segments. */
332 tp
->snd_cwnd
= min(4 * tp
->t_maxseg
,
333 max(2 * tp
->t_maxseg
, TCP_CC_CWND_INIT_BYTES
));
/* Legacy sizing from the ss_fltsz tunable (presumably the else branch of the test above). */
335 tp
->snd_cwnd
= tp
->t_maxseg
* ss_fltsz
;
341 * Indicate whether this ack should be delayed.
342 * Here is the explanation for different settings of tcp_delack_enabled:
343 * - when set to 1, the behavior is the same as when set to 2. We kept this
344 * for binary compatibility.
345 * - when set to 2, will "ack every other packet"
346 * - if our last ack wasn't a 0-sized window.
347 * - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245).
348 * If TH_PUSH is set, take this as a clue that we need to ACK
349 * with no delay. This helps higher level protocols who
350 * won't send us more data even if the window is open
351 * because their last "segment" hasn't been ACKed
352 * - when set to 3, will do "streaming detection"
353 * - if we receive more than "maxseg_unacked" full packets
355 * - if the connection is not in slow-start or idle or
356 * loss/recovery states
357 * - if those criteria aren't met, it will ack every other packet.
/*
 * Evaluates the delayed-ACK criteria selected by tcp_delack_enabled.
 * The case labels and the values returned are not visible in this view.
 *
 * tp - connection state (t_flags, t_unacksegs).
 * th - header of the received segment (th_flags).
 */
360 tcp_cc_delay_ack(struct tcpcb
*tp
, struct tcphdr
*th
)
362 switch (tcp_delack_enabled
) {
/*
 * First visible test: TF_RXWIN0SENT is clear, the segment does not carry
 * TH_PUSH, and exactly one segment is unacknowledged.
 */
365 if ((tp
->t_flags
& TF_RXWIN0SENT
) == 0 &&
366 (th
->th_flags
& TH_PUSH
) == 0 &&
367 (tp
->t_unacksegs
== 1)) {
/*
 * Second visible test: same first two conditions, and either exactly one
 * segment is unacknowledged or TF_STRETCHACK is set with fewer than
 * maxseg_unacked segments unacknowledged.
 */
372 if ((tp
->t_flags
& TF_RXWIN0SENT
) == 0 &&
373 (th
->th_flags
& TH_PUSH
) == 0 &&
374 ((tp
->t_unacksegs
== 1) ||
375 ((tp
->t_flags
& TF_STRETCHACK
) != 0 &&
376 tp
->t_unacksegs
< (maxseg_unacked
)))) {
/*
 * Allocates the per-connection congestion-control state for a connection
 * using the CUBIC index that does not have one yet.
 *
 * tp - connection; tp->t_ccstate and possibly tp->tcp_cc_index are updated.
 */
385 tcp_cc_allocate_state(struct tcpcb
*tp
)
387 if (tp
->tcp_cc_index
== TCP_CC_ALGO_CUBIC_INDEX
&&
388 tp
->t_ccstate
== NULL
) {
389 tp
->t_ccstate
= (struct tcp_ccstate
*)zalloc(tcp_cc_zone
);
392 * If we could not allocate memory for congestion control
393 * state, revert to using TCP NewReno as it does not
/* Allocation failed: switch the connection to the NewReno index. */
396 if (tp
->t_ccstate
== NULL
) {
397 tp
->tcp_cc_index
= TCP_CC_ALGO_NEWRENO_INDEX
;
/* Zero the newly allocated state (presumably the else branch of the NULL test above). */
399 bzero(tp
->t_ccstate
, sizeof(*tp
->t_ccstate
));
405 * If stretch ack was disabled automatically on long standing connections,
406 * re-evaluate the situation after 15 minutes to enable it.
/* 15 * 60 seconds expressed in TCP_RETRANSHZ ticks. */
408 #define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ)
/*
 * Clears TF_DISABLE_STRETCHACK and resets the stretch-ack state once the
 * time since tp->rcv_nostrack_ts exceeds TCP_STRETCHACK_DISABLE_WIN.
 *
 * tp - connection whose stretch-ack state is re-evaluated.
 */
410 tcp_cc_after_idle_stretchack(struct tcpcb
*tp
)
/* Branch taken when stretch ack is not disabled; its body is not visible here (presumably an early return). */
414 if (!(tp
->t_flagsext
& TF_DISABLE_STRETCHACK
)) {
/* Difference between tcp_now and tp->rcv_nostrack_ts as computed by timer_diff(). */
418 tdiff
= timer_diff(tcp_now
, 0, tp
->rcv_nostrack_ts
, 0);
423 if (tdiff
> TCP_STRETCHACK_DISABLE_WIN
) {
424 tp
->t_flagsext
&= ~TF_DISABLE_STRETCHACK
;
425 tp
->t_stretchack_delayed
= 0;
427 tcp_reset_stretch_ack(tp
);
432 * Detect if the congestion window is non-validated according to
433 * draft-ietf-tcpm-newcwv-07
/*
 * Updates TF_CWND_NONVALIDATED in tp->t_flagsext and reports it.
 *
 * tp - connection to examine.
 * The final visible statement returns tp->t_flagsext masked with
 * TF_CWND_NONVALIDATED, i.e. non-zero when the flag is set.
 */
437 tcp_cc_is_cwnd_nonvalidated(struct tcpcb
*tp
)
439 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
/* With no pipeack sample, or with the check turned off, the flag is cleared. */
440 if (tp
->t_pipeack
== 0 || tcp_check_cwnd_nonvalidated
== 0) {
441 tp
->t_flagsext
&= ~TF_CWND_NONVALIDATED
;
446 * The congestion window is validated if the number of bytes acked
447 * is more than half of the current window or if there is more
448 * data to send in the send socket buffer
/* "Half of the current window" is snd_cwnd >> 1; queued data is so_snd.sb_cc. */
450 if (tp
->t_pipeack
>= (tp
->snd_cwnd
>> 1) ||
451 (so
!= NULL
&& so
->so_snd
.sb_cc
> tp
->snd_cwnd
)) {
452 tp
->t_flagsext
&= ~TF_CWND_NONVALIDATED
;
/* Otherwise the window is marked non-validated (presumably the else branch). */
454 tp
->t_flagsext
|= TF_CWND_NONVALIDATED
;
456 return tp
->t_flagsext
& TF_CWND_NONVALIDATED
;
460 * Adjust congestion window in response to congestion in non-validated
/*
 * Recomputes tp->snd_cwnd from the pipeack samples and clears
 * TF_CWND_NONVALIDATED.
 *
 * tp - connection whose congestion window is adjusted.
 */
464 tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb
*tp
)
/* Collapse the samples into their maximum, then reset the sampling state. */
466 tp
->t_pipeack
= tcp_get_max_pipeack(tp
);
467 tcp_clear_pipeack_state(tp
);
/*
 * NOTE(review): tcp_clear_pipeack_state() has just set
 * tp->t_lossflightsize to 0, so the max() below always selects
 * tp->t_pipeack. Confirm whether t_lossflightsize was meant to be read
 * before the state is cleared.
 */
468 tp
->snd_cwnd
= (max(tp
->t_pipeack
, tp
->t_lossflightsize
) >> 1);
/* Do not go below TCP_CC_CWND_INIT_BYTES. */
469 tp
->snd_cwnd
= max(tp
->snd_cwnd
, TCP_CC_CWND_INIT_BYTES
);
/* Add tcprexmtthresh segments on top of the halved window. */
470 tp
->snd_cwnd
+= tp
->t_maxseg
* tcprexmtthresh
;
471 tp
->t_flagsext
&= ~TF_CWND_NONVALIDATED
;
475 * Return maximum of all the pipeack samples. Since the number of samples
476 * TCP_PIPEACK_SAMPLE_COUNT is 3 at this time, it will be simpler to do
477 * a comparison. We should change this if the number of samples increases.
/*
 * Computes the largest of tp->t_pipeack_sample[0..2] into max_pipeack.
 * The return statement is not visible here; presumably max_pipeack is
 * returned.
 *
 * tp - connection whose pipeack samples are compared.
 */
480 tcp_get_max_pipeack(struct tcpcb
*tp
)
482 u_int32_t max_pipeack
= 0;
/* Larger of the first two samples. */
483 max_pipeack
= (tp
->t_pipeack_sample
[0] > tp
->t_pipeack_sample
[1]) ?
484 tp
->t_pipeack_sample
[0] : tp
->t_pipeack_sample
[1];
/* Compare the running maximum with the third sample. */
485 max_pipeack
= (tp
->t_pipeack_sample
[2] > max_pipeack
) ?
486 tp
->t_pipeack_sample
[2] : max_pipeack
;
/*
 * Resets the pipeack sampling state: zeroes every entry of
 * tp->t_pipeack_sample and sets t_pipeack_ind and t_lossflightsize to 0.
 * tp->t_pipeack itself is not modified here.
 *
 * tp - connection whose sampling state is cleared.
 */
492 tcp_clear_pipeack_state(struct tcpcb
*tp
)
494 bzero(tp
->t_pipeack_sample
, sizeof(tp
->t_pipeack_sample
));
495 tp
->t_pipeack_ind
= 0;
496 tp
->t_lossflightsize
= 0;