]>
Commit | Line | Data |
---|---|---|
fe8ab488 A |
1 | /* |
2 | * Copyright (c) 2013-2014 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | #include <sys/param.h> | |
30 | #include <sys/systm.h> | |
31 | #include <sys/kernel.h> | |
32 | #include <sys/syslog.h> | |
33 | #include <sys/protosw.h> | |
34 | #include <sys/socketvar.h> | |
35 | #include <sys/kern_control.h> | |
36 | #include <sys/domain.h> | |
37 | ||
38 | #include <netinet/in.h> | |
39 | #include <netinet/tcp.h> | |
40 | #include <netinet/tcp_var.h> | |
41 | #include <netinet/tcp_cc.h> | |
42 | #include <mach/sdt.h> | |
43 | #include <libkern/OSAtomic.h> | |
44 | ||
/*
 * Snapshot of a connection's congestion-control state.  One record is
 * filled in by tcp_ccdbg_trace() and handed to the debug control socket.
 * The field order and widths define the record layout; do not reorder.
 */
struct tcp_cc_debug_state {
	uint64_t	ccd_tsns;			/* timestamp, nanoseconds */
	char		ccd_srcaddr[INET6_ADDRSTRLEN];	/* local address, text form */
	uint16_t	ccd_srcport;			/* local port, host order */
	char		ccd_destaddr[INET6_ADDRSTRLEN];	/* foreign address, text form */
	uint16_t	ccd_destport;			/* foreign port, host order */
	uint32_t	ccd_snd_cwnd;			/* tp->snd_cwnd */
	uint32_t	ccd_snd_wnd;			/* tp->snd_wnd */
	uint32_t	ccd_snd_ssthresh;		/* tp->snd_ssthresh */
	uint32_t	ccd_rttcur;			/* tp->t_rttcur */
	uint32_t	ccd_rxtcur;			/* tp->t_rxtcur */
	uint32_t	ccd_srtt;			/* tp->t_srtt >> TCP_RTT_SHIFT */
	uint32_t	ccd_event;			/* event code passed by the caller */
	uint32_t	ccd_sndcc;			/* send buffer sb_cc */
	uint32_t	ccd_sndhiwat;			/* send buffer sb_hiwat */
	uint32_t	ccd_bytes_acked;		/* tp->t_bytes_acked */
	/* Algorithm-specific portion; only CUBIC fills it in at present */
	union {
		struct {
			uint32_t	ccd_last_max;
			uint32_t	ccd_tcp_win;
			uint32_t	ccd_target_win;
			uint32_t	ccd_avg_lastmax;
			uint32_t	ccd_mean_deviation;
		} cubic_state;
	} u;
};
71 | ||
72 | int tcp_cc_debug = 0; | |
73 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED, | |
74 | &tcp_cc_debug, 0, "Enable debug data collection"); | |
75 | ||
76 | extern struct tcp_cc_algo tcp_cc_newreno; | |
77 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, | |
78 | CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_newreno.num_sockets, | |
79 | 0, "Number of sockets using newreno"); | |
80 | ||
81 | extern struct tcp_cc_algo tcp_cc_ledbat; | |
82 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, | |
83 | CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_ledbat.num_sockets, | |
84 | 0, "Number of sockets using background transport"); | |
85 | ||
86 | extern struct tcp_cc_algo tcp_cc_cubic; | |
87 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_sockets, | |
88 | CTLFLAG_RD | CTLFLAG_LOCKED,&tcp_cc_cubic.num_sockets, | |
89 | 0, "Number of sockets using cubic"); | |
90 | ||
91 | int tcp_use_newreno = 0; | |
92 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, use_newreno, | |
93 | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_newreno, 0, | |
94 | "Use TCP NewReno by default"); | |
95 | ||
/*
 * Set the ideal size of send socket buffer `sndsb` to `size`, clamped to
 * the range [tcp_sendspace, tcp_autosndbuf_max].
 *
 * The previous definition ignored `size` and read tp->snd_ssthresh from
 * whatever `tp` happened to be in the caller's scope.  Both callers in
 * this file pass tp->snd_ssthresh, so honouring the argument leaves them
 * unchanged.  The do/while(0) wrapper makes the macro a single statement
 * that is safe in unbraced if/else bodies.
 */
#define SET_SNDSB_IDEAL_SIZE(sndsb, size)				\
	do {								\
		(sndsb)->sb_idealsize =					\
		    min(max(tcp_sendspace, (size)), tcp_autosndbuf_max); \
	} while (0)
99 | ||
100 | /* Array containing pointers to currently implemented TCP CC algorithms */ | |
101 | struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; | |
102 | struct zone *tcp_cc_zone; | |
103 | ||
104 | /* Information for colelcting TCP debug information using control socket */ | |
105 | #define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug" | |
106 | #define TCP_CCDBG_NOUNIT 0xffffffff | |
107 | static kern_ctl_ref tcp_ccdbg_ctlref = NULL; | |
108 | volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT; | |
109 | ||
110 | void tcp_cc_init(void); | |
111 | static void tcp_cc_control_register(void); | |
112 | static errno_t tcp_ccdbg_control_connect(kern_ctl_ref kctl, | |
113 | struct sockaddr_ctl *sac, void **uinfo); | |
114 | static errno_t tcp_ccdbg_control_disconnect(kern_ctl_ref kctl, | |
115 | u_int32_t unit, void *uinfo); | |
116 | static struct tcp_cc_algo tcp_cc_algo_none; | |
117 | /* | |
118 | * Initialize TCP congestion control algorithms. | |
119 | */ | |
120 | ||
121 | void | |
122 | tcp_cc_init(void) | |
123 | { | |
124 | bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); | |
125 | bzero(&tcp_cc_algo_none, sizeof(tcp_cc_algo_none)); | |
126 | ||
127 | tcp_cc_algo_list[TCP_CC_ALGO_NONE] = &tcp_cc_algo_none; | |
128 | tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; | |
129 | tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; | |
130 | tcp_cc_algo_list[TCP_CC_ALGO_CUBIC_INDEX] = &tcp_cc_cubic; | |
131 | ||
132 | tcp_cc_control_register(); | |
133 | } | |
134 | ||
135 | static void | |
136 | tcp_cc_control_register(void) | |
137 | { | |
138 | struct kern_ctl_reg ccdbg_control; | |
139 | errno_t err; | |
140 | ||
141 | bzero(&ccdbg_control, sizeof(ccdbg_control)); | |
142 | strlcpy(ccdbg_control.ctl_name, TCP_CCDEBUG_CONTROL_NAME, | |
143 | sizeof(ccdbg_control.ctl_name)); | |
144 | ccdbg_control.ctl_connect = tcp_ccdbg_control_connect; | |
145 | ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect; | |
146 | ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED; | |
147 | ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM; | |
148 | ||
149 | err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref); | |
150 | if (err != 0) { | |
151 | log(LOG_ERR, "failed to register tcp_cc debug control"); | |
152 | } | |
153 | } | |
154 | ||
155 | /* Allow only one socket to connect at any time for debugging */ | |
156 | static errno_t | |
157 | tcp_ccdbg_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, | |
158 | void **uinfo) | |
159 | { | |
160 | #pragma unused(kctl) | |
161 | #pragma unused(uinfo) | |
162 | ||
163 | UInt32 old_value = TCP_CCDBG_NOUNIT; | |
164 | UInt32 new_value = sac->sc_unit; | |
165 | ||
166 | if (tcp_ccdbg_unit != old_value) | |
167 | return (EALREADY); | |
168 | ||
169 | if (OSCompareAndSwap(old_value, new_value, &tcp_ccdbg_unit)) | |
170 | return (0); | |
171 | else | |
172 | return (EALREADY); | |
173 | } | |
174 | ||
175 | static errno_t | |
176 | tcp_ccdbg_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo) | |
177 | { | |
178 | #pragma unused(kctl, unit, uinfo) | |
179 | ||
180 | if (unit == tcp_ccdbg_unit) { | |
181 | UInt32 old_value = tcp_ccdbg_unit; | |
182 | UInt32 new_value = TCP_CCDBG_NOUNIT; | |
183 | if (tcp_ccdbg_unit == new_value) | |
184 | return (0); | |
185 | ||
186 | if (!OSCompareAndSwap(old_value, new_value, | |
187 | &tcp_ccdbg_unit)) | |
188 | log(LOG_DEBUG, | |
189 | "failed to disconnect tcp_cc debug control"); | |
190 | } | |
191 | return (0); | |
192 | } | |
193 | ||
194 | inline void | |
195 | tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event) | |
196 | { | |
197 | #if !CONFIG_DTRACE | |
198 | #pragma unused(th) | |
199 | #endif /* !CONFIG_DTRACE */ | |
200 | struct inpcb *inp = tp->t_inpcb; | |
201 | ||
202 | if (tcp_cc_debug && tcp_ccdbg_unit > 0) { | |
203 | struct tcp_cc_debug_state dbg_state; | |
204 | struct timespec tv; | |
205 | ||
206 | bzero(&dbg_state, sizeof(dbg_state)); | |
207 | ||
208 | nanotime(&tv); | |
209 | /* Take time in seconds */ | |
210 | dbg_state.ccd_tsns = (tv.tv_sec * 1000000000) + tv.tv_nsec; | |
211 | inet_ntop(SOCK_DOM(inp->inp_socket), | |
212 | ((SOCK_DOM(inp->inp_socket) == PF_INET) ? | |
213 | (void *)&inp->inp_laddr.s_addr : | |
214 | (void *)&inp->in6p_laddr), dbg_state.ccd_srcaddr, | |
215 | sizeof(dbg_state.ccd_srcaddr)); | |
216 | dbg_state.ccd_srcport = ntohs(inp->inp_lport); | |
217 | inet_ntop(SOCK_DOM(inp->inp_socket), | |
218 | ((SOCK_DOM(inp->inp_socket) == PF_INET) ? | |
219 | (void *)&inp->inp_faddr.s_addr : | |
220 | (void *)&inp->in6p_faddr), dbg_state.ccd_destaddr, | |
221 | sizeof(dbg_state.ccd_destaddr)); | |
222 | dbg_state.ccd_destport = ntohs(inp->inp_fport); | |
223 | ||
224 | dbg_state.ccd_snd_cwnd = tp->snd_cwnd; | |
225 | dbg_state.ccd_snd_wnd = tp->snd_wnd; | |
226 | dbg_state.ccd_snd_ssthresh = tp->snd_ssthresh; | |
227 | dbg_state.ccd_rttcur = tp->t_rttcur; | |
228 | dbg_state.ccd_rxtcur = tp->t_rxtcur; | |
229 | dbg_state.ccd_srtt = tp->t_srtt >> TCP_RTT_SHIFT; | |
230 | dbg_state.ccd_event = event; | |
231 | dbg_state.ccd_sndcc = inp->inp_socket->so_snd.sb_cc; | |
232 | dbg_state.ccd_sndhiwat = inp->inp_socket->so_snd.sb_hiwat; | |
233 | dbg_state.ccd_bytes_acked = tp->t_bytes_acked; | |
234 | switch (tp->tcp_cc_index) { | |
235 | case TCP_CC_ALGO_CUBIC_INDEX: | |
236 | dbg_state.u.cubic_state.ccd_last_max = | |
237 | tp->t_ccstate->cub_last_max; | |
238 | dbg_state.u.cubic_state.ccd_tcp_win = | |
239 | tp->t_ccstate->cub_tcp_win; | |
240 | dbg_state.u.cubic_state.ccd_target_win = | |
241 | tp->t_ccstate->cub_target_win; | |
242 | dbg_state.u.cubic_state.ccd_avg_lastmax = | |
243 | tp->t_ccstate->cub_avg_lastmax; | |
244 | dbg_state.u.cubic_state.ccd_mean_deviation = | |
245 | tp->t_ccstate->cub_mean_dev; | |
246 | break; | |
247 | default: | |
248 | break; | |
249 | } | |
250 | ||
251 | ctl_enqueuedata(tcp_ccdbg_ctlref, tcp_ccdbg_unit, | |
252 | &dbg_state, sizeof(dbg_state), 0); | |
253 | } | |
254 | DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, | |
255 | struct tcpcb *, tp, struct tcphdr *, th, int32_t, event); | |
256 | } | |
257 | ||
258 | void tcp_cc_resize_sndbuf(struct tcpcb *tp) | |
259 | { | |
260 | struct sockbuf *sb; | |
261 | /* | |
262 | * If the send socket buffer size is bigger than ssthresh, | |
263 | * it is time to trim it because we do not want to hold | |
264 | * too many mbufs in the socket buffer | |
265 | */ | |
266 | sb = &tp->t_inpcb->inp_socket->so_snd; | |
267 | if (sb->sb_hiwat > tp->snd_ssthresh && | |
268 | (sb->sb_flags & SB_AUTOSIZE)) { | |
269 | if (sb->sb_idealsize > tp->snd_ssthresh) { | |
270 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); | |
271 | } | |
272 | sb->sb_flags |= SB_TRIM; | |
273 | } | |
274 | } | |
275 | ||
276 | void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) | |
277 | { | |
278 | struct sockbuf *sb; | |
279 | sb = &tp->t_inpcb->inp_socket->so_snd; | |
280 | if ((sb->sb_flags & (SB_TRIM|SB_AUTOSIZE)) == (SB_TRIM|SB_AUTOSIZE)) { | |
281 | /* | |
282 | * If there was a retransmission that was not necessary | |
283 | * then the size of socket buffer can be restored to | |
284 | * what it was before | |
285 | */ | |
286 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); | |
287 | if (sb->sb_hiwat <= sb->sb_idealsize) { | |
288 | sbreserve(sb, sb->sb_idealsize); | |
289 | sb->sb_flags &= ~SB_TRIM; | |
290 | } | |
291 | } | |
292 | } | |
293 | ||
294 | /* | |
295 | * Calculate initial cwnd according to RFC3390. | |
296 | * | |
297 | * Keep the old ss_fltsz sysctl for ABI compabitility issues. | |
298 | * but it will be overriden if tcp_do_rfc3390 sysctl when it is set. | |
299 | */ | |
300 | void | |
301 | tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) | |
302 | { | |
303 | if (tp->t_flags & TF_LOCAL) { | |
304 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; | |
305 | } else { | |
306 | /* initial congestion window according to RFC 3390 */ | |
307 | if (tcp_do_rfc3390) | |
308 | tp->snd_cwnd = min(4 * tp->t_maxseg, | |
309 | max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES)); | |
310 | else | |
311 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz; | |
312 | } | |
313 | } | |
314 | ||
315 | /* | |
316 | * Indicate whether this ack should be delayed. | |
317 | * Here is the explanation for different settings of tcp_delack_enabled: | |
318 | * - when set to 1, the bhavior is same as when set to 2. We kept this | |
319 | * for binary compatibility. | |
320 | * - when set to 2, will "ack every other packet" | |
321 | * - if our last ack wasn't a 0-sized window. | |
322 | * - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245). | |
323 | * If TH_PUSH is set, take this as a clue that we need to ACK | |
324 | * with no delay. This helps higher level protocols who | |
325 | * won't send us more data even if the window is open | |
326 | * because their last "segment" hasn't been ACKed | |
327 | * - when set to 3, will do "streaming detection" | |
328 | * - if we receive more than "maxseg_unacked" full packets | |
329 | * in the last 100ms | |
330 | * - if the connection is not in slow-start or idle or | |
331 | * loss/recovery states | |
332 | * - if those criteria aren't met, it will ack every other packet. | |
333 | */ | |
334 | int | |
335 | tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) | |
336 | { | |
337 | /* If any flags other than TH_ACK is set, set "end-of-write" bit */ | |
338 | if ((th->th_flags & ~TH_ACK)) | |
339 | tp->t_flagsext |= TF_STREAMEOW; | |
340 | else | |
341 | tp->t_flagsext &= ~(TF_STREAMEOW); | |
342 | ||
343 | switch (tcp_delack_enabled) { | |
344 | case 1: | |
345 | case 2: | |
346 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && | |
347 | (th->th_flags & TH_PUSH) == 0 && | |
348 | (tp->t_unacksegs == 1)) | |
349 | return(1); | |
350 | break; | |
351 | case 3: | |
352 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && | |
353 | (th->th_flags & TH_PUSH) == 0 && | |
354 | ((tp->t_unacksegs == 1) || | |
355 | ((tp->t_flags & TF_STRETCHACK) != 0 && | |
356 | tp->t_unacksegs < (maxseg_unacked)))) | |
357 | return(1); | |
358 | break; | |
359 | } | |
360 | return(0); | |
361 | } | |
362 | ||
363 | void | |
364 | tcp_cc_allocate_state(struct tcpcb *tp) | |
365 | { | |
366 | if (tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX && | |
367 | tp->t_ccstate == NULL) { | |
368 | tp->t_ccstate = (struct tcp_ccstate *)zalloc(tcp_cc_zone); | |
369 | ||
370 | /* | |
371 | * If we could not allocate memory for congestion control | |
372 | * state, revert to using TCP NewReno as it does not | |
373 | * require any state | |
374 | */ | |
375 | if (tp->t_ccstate == NULL) | |
376 | tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; | |
377 | else | |
378 | bzero(tp->t_ccstate, sizeof(*tp->t_ccstate)); | |
379 | } | |
380 | } | |
381 | ||
382 | /* | |
383 | * If stretch ack was disabled automatically on long standing connections, | |
384 | * re-evaluate the situation after 15 minutes to enable it. | |
385 | */ | |
386 | #define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ) | |
387 | void | |
388 | tcp_cc_after_idle_stretchack(struct tcpcb *tp) | |
389 | { | |
390 | int32_t tdiff; | |
391 | ||
392 | if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) | |
393 | return; | |
394 | ||
395 | tdiff = timer_diff(tcp_now, 0, tp->rcv_nostrack_ts, 0); | |
396 | if (tdiff < 0) | |
397 | tdiff = -tdiff; | |
398 | ||
399 | if (tdiff > TCP_STRETCHACK_DISABLE_WIN) { | |
400 | tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; | |
401 | tp->t_stretchack_delayed = 0; | |
402 | ||
403 | tcp_reset_stretch_ack(tp); | |
404 | } | |
405 | } |