/*
 * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/kern_control.h>
#include <sys/domain.h>

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#include <mach/sdt.h>
#include <libkern/OSAtomic.h>

/* net.inet.tcp.cc_debug: enable collection of per-event CC debug records */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_cc_debug, 0, "Enable debug data collection");

/* Read-only counters of how many sockets use each CC algorithm */
extern struct tcp_cc_algo tcp_cc_newreno;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_newreno.num_sockets,
    0, "Number of sockets using newreno");

extern struct tcp_cc_algo tcp_cc_ledbat;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_ledbat.num_sockets,
    0, "Number of sockets using background transport");

extern struct tcp_cc_algo tcp_cc_cubic;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_sockets,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_cubic.num_sockets,
    0, "Number of sockets using cubic");

/* net.inet.tcp.use_newreno: force NewReno as the default CC algorithm */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, use_newreno,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_use_newreno, 0,
    "Use TCP NewReno by default");

/* Gate for the draft-ietf-tcpm-newcwv non-validated-cwnd check below;
 * exposed as a sysctl only on DEBUG/DEVELOPMENT kernels. */
static int tcp_check_cwnd_nonvalidated = 1;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, cwnd_nonvalidated,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_check_cwnd_nonvalidated, 0,
    "Check if congestion window is non-validated");
#endif /* (DEBUG || DEVELOPMENT) */

/*
 * Clamp the send socket buffer's ideal size to the range
 * [max(tcp_sendspace, size), tcp_autosndbuf_max].
 *
 * Fix: the original expansion ignored its `size` argument and read
 * tp->snd_ssthresh straight out of the caller's scope.  Every caller in
 * this file passes tp->snd_ssthresh, so honoring (size) is
 * behavior-identical while making the macro's inputs explicit.  Also
 * wrapped in do/while(0) and parenthesized to be safe in any statement
 * context.
 */
#define SET_SNDSB_IDEAL_SIZE(sndsb, size) do {                          \
	(sndsb)->sb_idealsize = min(max(tcp_sendspace, (size)),         \
	    tcp_autosndbuf_max);                                        \
} while (0)

/* Array containing pointers to currently implemented TCP CC algorithms */
struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
/* Zone backing per-connection CC state allocations (tcp_cc_allocate_state) */
struct zone *tcp_cc_zone;

/* Sentinel meaning "no debug client connected to the kernel control" */
#define TCP_CCDBG_NOUNIT 0xffffffff
static kern_ctl_ref tcp_ccdbg_ctlref = NULL;
/* Unit of the single connected debug client; updated via compare-and-swap */
volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT;

void tcp_cc_init(void);
static void tcp_cc_control_register(void);
static errno_t tcp_ccdbg_control_connect(kern_ctl_ref kctl,
    struct sockaddr_ctl *sac, void **uinfo);
static errno_t tcp_ccdbg_control_disconnect(kern_ctl_ref kctl,
    u_int32_t unit, void *uinfo);
/* Placeholder algorithm (all callbacks NULL) for TCP_CC_ALGO_NONE */
static struct tcp_cc_algo tcp_cc_algo_none;
93 | /* | |
94 | * Initialize TCP congestion control algorithms. | |
95 | */ | |
96 | ||
97 | void | |
98 | tcp_cc_init(void) | |
99 | { | |
100 | bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); | |
101 | bzero(&tcp_cc_algo_none, sizeof(tcp_cc_algo_none)); | |
102 | ||
103 | tcp_cc_algo_list[TCP_CC_ALGO_NONE] = &tcp_cc_algo_none; | |
104 | tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; | |
105 | tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; | |
106 | tcp_cc_algo_list[TCP_CC_ALGO_CUBIC_INDEX] = &tcp_cc_cubic; | |
107 | ||
108 | tcp_cc_control_register(); | |
109 | } | |
110 | ||
111 | static void | |
112 | tcp_cc_control_register(void) | |
113 | { | |
114 | struct kern_ctl_reg ccdbg_control; | |
115 | errno_t err; | |
116 | ||
117 | bzero(&ccdbg_control, sizeof(ccdbg_control)); | |
118 | strlcpy(ccdbg_control.ctl_name, TCP_CC_CONTROL_NAME, | |
119 | sizeof(ccdbg_control.ctl_name)); | |
120 | ccdbg_control.ctl_connect = tcp_ccdbg_control_connect; | |
121 | ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect; | |
122 | ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED; | |
123 | ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM; | |
124 | ccdbg_control.ctl_sendsize = 32 * 1024; | |
125 | ||
126 | err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref); | |
127 | if (err != 0) { | |
128 | log(LOG_ERR, "failed to register tcp_cc debug control"); | |
129 | } | |
130 | } | |
131 | ||
132 | /* Allow only one socket to connect at any time for debugging */ | |
133 | static errno_t | |
134 | tcp_ccdbg_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, | |
135 | void **uinfo) | |
136 | { | |
137 | #pragma unused(kctl) | |
138 | #pragma unused(uinfo) | |
139 | ||
140 | UInt32 old_value = TCP_CCDBG_NOUNIT; | |
141 | UInt32 new_value = sac->sc_unit; | |
142 | ||
143 | if (tcp_ccdbg_unit != old_value) { | |
144 | return EALREADY; | |
145 | } | |
146 | ||
147 | if (OSCompareAndSwap(old_value, new_value, &tcp_ccdbg_unit)) { | |
148 | return 0; | |
149 | } else { | |
150 | return EALREADY; | |
151 | } | |
152 | } | |
153 | ||
/*
 * Disconnect callback: release the debug slot if it is held by `unit`.
 * Always returns 0; a failed compare-and-swap is only logged, since a
 * concurrent disconnect/reconnect may legitimately change the slot.
 */
static errno_t
tcp_ccdbg_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo)
{
#pragma unused(kctl, unit, uinfo)

	if (unit == tcp_ccdbg_unit) {
		UInt32 old_value = tcp_ccdbg_unit;
		UInt32 new_value = TCP_CCDBG_NOUNIT;
		/* Re-read of the volatile: slot already released elsewhere. */
		if (tcp_ccdbg_unit == new_value) {
			return 0;
		}

		if (!OSCompareAndSwap(old_value, new_value,
		    &tcp_ccdbg_unit)) {
			log(LOG_DEBUG,
			    "failed to disconnect tcp_cc debug control");
		}
	}
	return 0;
}

/*
 * Emit a congestion-control debug record for this connection.
 *
 * If debug collection is enabled (tcp_cc_debug sysctl) and a debug
 * client is connected (tcp_ccdbg_unit), snapshot the connection's CC
 * state into a tcp_cc_debug_state record and enqueue it on the kernel
 * control.  The DTrace probe at the end fires unconditionally.
 */
inline void
tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event)
{
#if !CONFIG_DTRACE
#pragma unused(th)
#endif /* !CONFIG_DTRACE */
	struct inpcb *inp = tp->t_inpcb;

	if (tcp_cc_debug && tcp_ccdbg_unit > 0) {
		struct tcp_cc_debug_state dbg_state;
		struct timespec tv;

		bzero(&dbg_state, sizeof(dbg_state));

		nanotime(&tv);
		/* Timestamp in nanoseconds (the expression scales seconds
		 * to ns; the old "Take time in seconds" comment was wrong) */
		dbg_state.ccd_tsns = (tv.tv_sec * 1000000000) + tv.tv_nsec;
		/* Local/foreign endpoints as printable strings; the v4 or
		 * v6 address field is chosen by the socket's domain. */
		inet_ntop(SOCK_DOM(inp->inp_socket),
		    ((SOCK_DOM(inp->inp_socket) == PF_INET) ?
		    (void *)&inp->inp_laddr.s_addr :
		    (void *)&inp->in6p_laddr), dbg_state.ccd_srcaddr,
		    sizeof(dbg_state.ccd_srcaddr));
		dbg_state.ccd_srcport = ntohs(inp->inp_lport);
		inet_ntop(SOCK_DOM(inp->inp_socket),
		    ((SOCK_DOM(inp->inp_socket) == PF_INET) ?
		    (void *)&inp->inp_faddr.s_addr :
		    (void *)&inp->in6p_faddr), dbg_state.ccd_destaddr,
		    sizeof(dbg_state.ccd_destaddr));
		dbg_state.ccd_destport = ntohs(inp->inp_fport);

		/* Generic congestion-control state at event time */
		dbg_state.ccd_snd_cwnd = tp->snd_cwnd;
		dbg_state.ccd_snd_wnd = tp->snd_wnd;
		dbg_state.ccd_snd_ssthresh = tp->snd_ssthresh;
		dbg_state.ccd_pipeack = tp->t_pipeack;
		dbg_state.ccd_rttcur = tp->t_rttcur;
		dbg_state.ccd_rxtcur = tp->t_rxtcur;
		dbg_state.ccd_srtt = tp->t_srtt >> TCP_RTT_SHIFT;
		dbg_state.ccd_event = event;
		dbg_state.ccd_sndcc = inp->inp_socket->so_snd.sb_cc;
		dbg_state.ccd_sndhiwat = inp->inp_socket->so_snd.sb_hiwat;
		dbg_state.ccd_bytes_acked = tp->t_bytes_acked;
		dbg_state.ccd_cc_index = tp->tcp_cc_index;
		/* Algorithm-specific state (union member by cc index) */
		switch (tp->tcp_cc_index) {
		case TCP_CC_ALGO_CUBIC_INDEX:
			dbg_state.u.cubic_state.ccd_last_max =
			    tp->t_ccstate->cub_last_max;
			dbg_state.u.cubic_state.ccd_tcp_win =
			    tp->t_ccstate->cub_tcp_win;
			dbg_state.u.cubic_state.ccd_target_win =
			    tp->t_ccstate->cub_target_win;
			dbg_state.u.cubic_state.ccd_avg_lastmax =
			    tp->t_ccstate->cub_avg_lastmax;
			dbg_state.u.cubic_state.ccd_mean_deviation =
			    tp->t_ccstate->cub_mean_dev;
			break;
		case TCP_CC_ALGO_BACKGROUND_INDEX:
			dbg_state.u.ledbat_state.led_base_rtt =
			    get_base_rtt(tp);
			break;
		default:
			break;
		}

		/* Best effort: enqueue failure is deliberately ignored */
		ctl_enqueuedata(tcp_ccdbg_ctlref, tcp_ccdbg_unit,
		    &dbg_state, sizeof(dbg_state), 0);
	}
	DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
	    struct tcpcb *, tp, struct tcphdr *, th, int32_t, event);
}

245 | void | |
246 | tcp_cc_resize_sndbuf(struct tcpcb *tp) | |
247 | { | |
248 | struct sockbuf *sb; | |
249 | /* | |
250 | * If the send socket buffer size is bigger than ssthresh, | |
251 | * it is time to trim it because we do not want to hold | |
252 | * too many mbufs in the socket buffer | |
253 | */ | |
254 | sb = &tp->t_inpcb->inp_socket->so_snd; | |
255 | if (sb->sb_hiwat > tp->snd_ssthresh && | |
256 | (sb->sb_flags & SB_AUTOSIZE)) { | |
257 | if (sb->sb_idealsize > tp->snd_ssthresh) { | |
258 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); | |
259 | } | |
260 | sb->sb_flags |= SB_TRIM; | |
261 | } | |
262 | } | |
263 | ||
264 | void | |
265 | tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) | |
266 | { | |
267 | struct sockbuf *sb; | |
268 | sb = &tp->t_inpcb->inp_socket->so_snd; | |
269 | if ((sb->sb_flags & (SB_TRIM | SB_AUTOSIZE)) == (SB_TRIM | SB_AUTOSIZE)) { | |
270 | /* | |
271 | * If there was a retransmission that was not necessary | |
272 | * then the size of socket buffer can be restored to | |
273 | * what it was before | |
274 | */ | |
275 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); | |
276 | if (sb->sb_hiwat <= sb->sb_idealsize) { | |
277 | sbreserve(sb, sb->sb_idealsize); | |
278 | sb->sb_flags &= ~SB_TRIM; | |
279 | } | |
280 | } | |
281 | } | |
282 | ||
283 | /* | |
284 | * Calculate initial cwnd according to RFC3390. | |
285 | * | |
286 | * Keep the old ss_fltsz sysctl for ABI compabitility issues. | |
287 | * but it will be overriden if tcp_do_rfc3390 sysctl when it is set. | |
288 | */ | |
289 | void | |
290 | tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) | |
291 | { | |
292 | if (tp->t_flags & TF_LOCAL) { | |
293 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; | |
294 | } else { | |
295 | /* initial congestion window according to RFC 3390 */ | |
296 | if (tcp_do_rfc3390) { | |
297 | tp->snd_cwnd = min(4 * tp->t_maxseg, | |
298 | max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES)); | |
299 | } else { | |
300 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz; | |
301 | } | |
302 | } | |
303 | } | |
304 | ||
305 | /* | |
306 | * Indicate whether this ack should be delayed. | |
307 | * Here is the explanation for different settings of tcp_delack_enabled: | |
308 | * - when set to 1, the behavior is same as when set to 2. We kept this | |
309 | * for binary compatibility. | |
310 | * - when set to 2, will "ack every other packet" | |
311 | * - if our last ack wasn't a 0-sized window. | |
312 | * - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245). | |
313 | * If TH_PUSH is set, take this as a clue that we need to ACK | |
314 | * with no delay. This helps higher level protocols who | |
315 | * won't send us more data even if the window is open | |
316 | * because their last "segment" hasn't been ACKed | |
317 | * - when set to 3, will do "streaming detection" | |
318 | * - if we receive more than "maxseg_unacked" full packets | |
319 | * in the last 100ms | |
320 | * - if the connection is not in slow-start or idle or | |
321 | * loss/recovery states | |
322 | * - if those criteria aren't met, it will ack every other packet. | |
323 | */ | |
324 | int | |
325 | tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) | |
326 | { | |
327 | switch (tcp_delack_enabled) { | |
328 | case 1: | |
329 | case 2: | |
330 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && | |
331 | (th->th_flags & TH_PUSH) == 0 && | |
332 | (tp->t_unacksegs == 1)) { | |
333 | return 1; | |
334 | } | |
335 | break; | |
336 | case 3: | |
337 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && | |
338 | (th->th_flags & TH_PUSH) == 0 && | |
339 | ((tp->t_unacksegs == 1) || | |
340 | ((tp->t_flags & TF_STRETCHACK) && | |
341 | tp->t_unacksegs < maxseg_unacked))) { | |
342 | return 1; | |
343 | } | |
344 | break; | |
345 | } | |
346 | return 0; | |
347 | } | |
348 | ||
349 | void | |
350 | tcp_cc_allocate_state(struct tcpcb *tp) | |
351 | { | |
352 | if (tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX && | |
353 | tp->t_ccstate == NULL) { | |
354 | tp->t_ccstate = (struct tcp_ccstate *)zalloc(tcp_cc_zone); | |
355 | ||
356 | /* | |
357 | * If we could not allocate memory for congestion control | |
358 | * state, revert to using TCP NewReno as it does not | |
359 | * require any state | |
360 | */ | |
361 | if (tp->t_ccstate == NULL) { | |
362 | tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; | |
363 | } else { | |
364 | bzero(tp->t_ccstate, sizeof(*tp->t_ccstate)); | |
365 | } | |
366 | } | |
367 | } | |
368 | ||
369 | /* | |
370 | * If stretch ack was disabled automatically on long standing connections, | |
371 | * re-evaluate the situation after 15 minutes to enable it. | |
372 | */ | |
373 | #define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ) | |
374 | void | |
375 | tcp_cc_after_idle_stretchack(struct tcpcb *tp) | |
376 | { | |
377 | int32_t tdiff; | |
378 | ||
379 | if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) { | |
380 | return; | |
381 | } | |
382 | ||
383 | tdiff = timer_diff(tcp_now, 0, tp->rcv_nostrack_ts, 0); | |
384 | if (tdiff < 0) { | |
385 | tdiff = -tdiff; | |
386 | } | |
387 | ||
388 | if (tdiff > TCP_STRETCHACK_DISABLE_WIN) { | |
389 | tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; | |
390 | tp->t_stretchack_delayed = 0; | |
391 | ||
392 | tcp_reset_stretch_ack(tp); | |
393 | } | |
394 | } | |
395 | ||
396 | /* | |
397 | * Detect if the congestion window is non-vlidated according to | |
398 | * draft-ietf-tcpm-newcwv-07 | |
399 | */ | |
400 | ||
401 | inline uint32_t | |
402 | tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp) | |
403 | { | |
404 | struct socket *so = tp->t_inpcb->inp_socket; | |
405 | if (tp->t_pipeack == 0 || tcp_check_cwnd_nonvalidated == 0) { | |
406 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; | |
407 | return 0; | |
408 | } | |
409 | ||
410 | /* | |
411 | * The congestion window is validated if the number of bytes acked | |
412 | * is more than half of the current window or if there is more | |
413 | * data to send in the send socket buffer | |
414 | */ | |
415 | if (tp->t_pipeack >= (tp->snd_cwnd >> 1) || | |
416 | (so != NULL && so->so_snd.sb_cc > tp->snd_cwnd)) { | |
417 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; | |
418 | } else { | |
419 | tp->t_flagsext |= TF_CWND_NONVALIDATED; | |
420 | } | |
421 | return tp->t_flagsext & TF_CWND_NONVALIDATED; | |
422 | } | |
423 | ||
/*
 * Adjust the congestion window in response to congestion while in the
 * non-validated phase (draft-ietf-tcpm-newcwv): set cwnd to half of the
 * larger of the best pipeack sample and the flight size at loss,
 * floored at the initial window, plus headroom for the fast-retransmit
 * threshold, then leave the non-validated phase.
 */
inline void
tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp)
{
	/* Consume the best pipeack sample and reset sampling state. */
	tp->t_pipeack = tcp_get_max_pipeack(tp);
	tcp_clear_pipeack_state(tp);
	tp->snd_cwnd = (max(tp->t_pipeack, tp->t_lossflightsize) >> 1);
	tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
	tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh;
	tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
}

439 | /* | |
440 | * Return maximum of all the pipeack samples. Since the number of samples | |
441 | * TCP_PIPEACK_SAMPLE_COUNT is 3 at this time, it will be simpler to do | |
442 | * a comparision. We should change ths if the number of samples increases. | |
443 | */ | |
444 | inline u_int32_t | |
445 | tcp_get_max_pipeack(struct tcpcb *tp) | |
446 | { | |
447 | u_int32_t max_pipeack = 0; | |
448 | max_pipeack = (tp->t_pipeack_sample[0] > tp->t_pipeack_sample[1]) ? | |
449 | tp->t_pipeack_sample[0] : tp->t_pipeack_sample[1]; | |
450 | max_pipeack = (tp->t_pipeack_sample[2] > max_pipeack) ? | |
451 | tp->t_pipeack_sample[2] : max_pipeack; | |
452 | ||
453 | return max_pipeack; | |
454 | } | |
455 | ||
456 | inline void | |
457 | tcp_clear_pipeack_state(struct tcpcb *tp) | |
458 | { | |
459 | bzero(tp->t_pipeack_sample, sizeof(tp->t_pipeack_sample)); | |
460 | tp->t_pipeack_ind = 0; | |
461 | tp->t_lossflightsize = 0; | |
462 | } |