]>
Commit | Line | Data |
---|---|---|
fe8ab488 | 1 | /* |
f427ee49 | 2 | * Copyright (c) 2013-2020 Apple Inc. All rights reserved. |
fe8ab488 A |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | #include <sys/param.h> | |
29 | #include <sys/systm.h> | |
30 | #include <sys/kernel.h> | |
31 | #include <sys/protosw.h> | |
32 | #include <sys/socketvar.h> | |
33 | #include <sys/syslog.h> | |
34 | ||
35 | #include <net/route.h> | |
36 | #include <netinet/in.h> | |
37 | #include <netinet/in_systm.h> | |
38 | #include <netinet/ip.h> | |
39 | ||
fe8ab488 | 40 | #include <netinet/ip6.h> |
fe8ab488 A |
41 | |
42 | #include <netinet/ip_var.h> | |
43 | #include <netinet/tcp.h> | |
44 | #include <netinet/tcp_timer.h> | |
45 | #include <netinet/tcp_var.h> | |
46 | #include <netinet/tcp_fsm.h> | |
47 | #include <netinet/tcp_var.h> | |
48 | #include <netinet/tcp_cc.h> | |
49 | #include <netinet/tcpip.h> | |
50 | #include <netinet/tcp_seq.h> | |
51 | #include <kern/task.h> | |
52 | #include <libkern/OSAtomic.h> | |
53 | ||
/*
 * Forward declarations for the CUBIC congestion-control callbacks,
 * registered with the framework via the tcp_cc_cubic table below.
 */
static int tcp_cubic_init(struct tcpcb *tp);
static int tcp_cubic_cleanup(struct tcpcb *tp);
static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp);
static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_pre_fr(struct tcpcb *tp);
static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_after_timeout(struct tcpcb *tp);
static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_switch_cc(struct tcpcb *tp, u_int16_t old_index);
static uint32_t tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt);
static inline void tcp_cubic_clear_state(struct tcpcb *tp);


/* Cube root; used to compute the epoch period K in tcp_cubic_update() */
extern float cbrtf(float x);
/*
 * Callback table that plugs CUBIC into the TCP congestion-control
 * framework. Note that after_idle reuses the cwnd-init path.
 */
struct tcp_cc_algo tcp_cc_cubic = {
	.name = "cubic",
	.init = tcp_cubic_init,
	.cleanup = tcp_cubic_cleanup,
	.cwnd_init = tcp_cubic_cwnd_init_or_reset,
	.congestion_avd = tcp_cubic_congestion_avd,
	.ack_rcvd = tcp_cubic_ack_rcvd,
	.pre_fr = tcp_cubic_pre_fr,
	.post_fr = tcp_cubic_post_fr,
	.after_idle = tcp_cubic_cwnd_init_or_reset,
	.after_timeout = tcp_cubic_after_timeout,
	.delay_ack = tcp_cubic_delay_ack,
	.switch_to = tcp_cubic_switch_cc
};
84 | ||
/*
 * Tunable constants shared by all CUBIC connections. tcp_cubic_backoff,
 * tcp_cubic_fast_convergence_factor and tcp_cubic_beta are re-selected on
 * every tcp_cubic_init() call depending on the cubic_rfc_compliant sysctl
 * (RFC 8312 values vs. the legacy values below).
 */
static float tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */
static float tcp_cubic_coeff = 0.4f; /* cubic scaling constant "C" */
static float tcp_cubic_fast_convergence_factor = 0.875f;

static float tcp_cubic_beta = 0.8f; /* AIMD increase-rate input, see RFC 8312 sec 4.2 */

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_tcp_friendliness, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_tcp_friendliness, 0, "Enable TCP friendliness");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_fast_convergence, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_fast_convergence, 0, "Enable fast convergence");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_use_minrtt, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_use_minrtt, 0, "use a min of 5 sec rtt");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_minor_fixes, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_cubic_minor_fixes, 1, "Minor fixes to TCP Cubic");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_rfc_compliant, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_cubic_rfc_compliant, 1, "RFC Compliance for TCP Cubic");
0a7de745 A |
106 | static int |
107 | tcp_cubic_init(struct tcpcb *tp) | |
fe8ab488 A |
108 | { |
109 | OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets); | |
110 | ||
f427ee49 A |
111 | if (tcp_cubic_rfc_compliant) { |
112 | tcp_cubic_backoff = 0.3f; /* multiplicative decrease factor */ | |
113 | tcp_cubic_fast_convergence_factor = 0.85f; | |
114 | tcp_cubic_beta = 0.7f; | |
115 | } else { | |
116 | tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */ | |
117 | tcp_cubic_fast_convergence_factor = 0.875f; | |
118 | tcp_cubic_beta = 0.8f; | |
119 | } | |
120 | ||
fe8ab488 A |
121 | VERIFY(tp->t_ccstate != NULL); |
122 | tcp_cubic_clear_state(tp); | |
0a7de745 | 123 | return 0; |
fe8ab488 A |
124 | } |
125 | ||
/* Per-connection teardown: drop the refcount taken in tcp_cubic_init(). */
static int
tcp_cubic_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	OSDecrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);
	return 0;
}
133 | ||
134 | /* | |
0a7de745 | 135 | * Initialize the congestion window at the beginning of a connection or |
fe8ab488 A |
136 | * after idle time |
137 | */ | |
0a7de745 A |
138 | static void |
139 | tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp) | |
fe8ab488 | 140 | { |
0a7de745 | 141 | VERIFY(tp->t_ccstate != NULL); |
fe8ab488 A |
142 | |
143 | tcp_cubic_clear_state(tp); | |
144 | tcp_cc_cwnd_init_or_reset(tp); | |
3e170ce0 A |
145 | tp->t_pipeack = 0; |
146 | tcp_clear_pipeack_state(tp); | |
147 | ||
148 | /* Start counting bytes for RFC 3465 again */ | |
149 | tp->t_bytes_acked = 0; | |
fe8ab488 A |
150 | |
151 | /* | |
152 | * slow start threshold could get initialized to a lower value | |
153 | * when there is a cached value in the route metrics. In this case, | |
154 | * the connection can enter congestion avoidance without any packet | |
155 | * loss and Cubic will enter steady-state too early. It is better | |
156 | * to always probe to find the initial slow-start threshold. | |
157 | */ | |
f427ee49 A |
158 | if (tp->t_inpcb->inp_stat->txbytes <= tcp_initial_cwnd(tp) && |
159 | tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) { | |
fe8ab488 | 160 | tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
0a7de745 | 161 | } |
fe8ab488 A |
162 | |
163 | /* Initialize cubic last max to be same as ssthresh */ | |
164 | tp->t_ccstate->cub_last_max = tp->snd_ssthresh; | |
fe8ab488 A |
165 | } |
166 | ||
/*
 * Compute the target congestion window for the next RTT according to
 * cubic equation when an ack is received.
 *
 * W(t) = C(t-K)^3 + W(last_max)
 *
 * where K (cub_epoch_period) is the time the cubic curve takes to climb
 * back to the window at which the previous loss occurred (cub_last_max).
 */
static uint32_t
tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt)
{
	float K, var;
	u_int32_t elapsed_time, win;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_ccstate->cub_last_max == 0) {
		tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
	}

	if (tp->t_ccstate->cub_epoch_start == 0) {
		/*
		 * This is the beginning of a new epoch, initialize some of
		 * the variables that we need to use for computing the
		 * congestion window later.
		 */
		tp->t_ccstate->cub_epoch_start = tcp_now;
		/* 0 means "no epoch", so avoid it if tcp_now wrapped to 0 */
		if (tp->t_ccstate->cub_epoch_start == 0) {
			tp->t_ccstate->cub_epoch_start = 1;
		}
		if (win < tp->t_ccstate->cub_last_max) {
			/*
			 * Compute cubic epoch period, this is the time
			 * period that the window will take to increase to
			 * last_max again after backoff due to loss.
			 */
			if (tcp_cubic_minor_fixes) {
				/* float cast avoids unsigned underflow in the subtraction */
				K = ((float)tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff;
			} else {
				K = (tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff;
			}
			K = cbrtf(K);
			tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ;
			/* Origin point */
			tp->t_ccstate->cub_origin_point = tp->t_ccstate->cub_last_max;
		} else {
			/* Already past last_max: grow from the current window */
			tp->t_ccstate->cub_epoch_period = 0;
			tp->t_ccstate->cub_origin_point = win;
		}
	}

	VERIFY(tp->t_ccstate->cub_origin_point > 0);
	/*
	 * Compute the target window for the next RTT using smoothed RTT
	 * as an estimate for next RTT.
	 */
	elapsed_time = timer_diff(tcp_now, 0, tp->t_ccstate->cub_epoch_start, 0);

	if (tcp_cubic_use_minrtt) {
		elapsed_time += max(tcp_cubic_use_minrtt, rtt);
	} else {
		elapsed_time += rtt;
	}
	/* var = C * (t - K)^3 in bytes; negative before the curve reaches K */
	var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ;
	var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg);

	return (u_int32_t)(tp->t_ccstate->cub_origin_point + var);
}
232 | ||
/*
 * Standard TCP utilizes bandwidth well in low RTT and low BDP connections
 * even when there is some packet loss. Enabling TCP mode will help Cubic
 * to achieve this kind of utilization.
 *
 * But if there is a bottleneck link in the path with a fixed size queue
 * and fixed bandwidth, TCP Cubic will help to reduce packet loss at this
 * link because of the steady-state behavior. Using average and mean
 * absolute deviation of W(lastmax), we try to detect if the congestion
 * window is close to the bottleneck bandwidth. In that case, disabling
 * TCP mode will help to minimize packet loss at this link.
 *
 * Disable TCP mode if the W(lastmax) (the window where previous packet
 * loss happened) is within a small range from the average last max
 * calculated.
 *
 * Fix: the macro body previously referenced the caller's variable `tp`
 * directly (tp->t_maxseg) instead of its parameter, which only worked
 * because every call site named its tcpcb `tp`. Use (_tp_) throughout
 * so the macro is hygienic.
 */
#define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \
	((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \
	(_tp_)->t_ccstate->cub_mean_dev > ((_tp_)->t_maxseg << 1)) ? 1 : 0)

/*
 * Compute the window growth if standard TCP (AIMD) was used with
 * a backoff of 0.5 and additive increase of 1 packet per RTT.
 *
 * TCP window at time t can be calculated using the following equation
 * with tcp_beta_cubic
 *
 * W(t) <- Wmax * tcp_beta_cubic + 3 * ((1 - tcp_beta_cubic)/(1 + tcp_beta_cubic)) * t/RTT
 *
 */
static uint32_t
tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->t_ccstate->cub_tcp_win == 0) {
		/* Start of the epoch, we set the tcp_win to whatever Cubic decided
		 * at the beginning of the epoch.
		 */
		tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd);
		if (tcp_cubic_minor_fixes) {
			tp->t_ccstate->cub_tcp_bytes_acked = BYTES_ACKED(th, tp);
		} else {
			tp->t_ccstate->cub_tcp_bytes_acked = 0;
		}
	} else {
		tp->t_ccstate->cub_tcp_bytes_acked += BYTES_ACKED(th, tp);

		if (tcp_cubic_minor_fixes) {
			/*
			 * Increase by ai_factor * MSS, once per RTT. Counting bytes_acked
			 * against the snd_cwnd represents exactly one RTT at full rate.
			 */
			while (tp->t_ccstate->cub_tcp_bytes_acked >= tp->snd_cwnd) {
				/* Enough bytes have been ACK'd for TCP to do AIMD*/
				tp->t_ccstate->cub_tcp_bytes_acked -= tp->snd_cwnd;

				if (tp->snd_cwnd >= tp->t_ccstate->cub_last_max || !tcp_cubic_rfc_compliant) {
					tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
				} else {
					/* Increase-rate from Section 4.2, RFC 8312 */
					float ai_factor = (float)3 * (1 - tcp_cubic_beta) / (1 + tcp_cubic_beta);

					tp->t_ccstate->cub_tcp_win += (uint32_t)(tp->t_maxseg * ai_factor);
				}
			}
		} else {
			/* Legacy behavior: 1 MSS per cub_tcp_win's worth of ACKed bytes */
			if (tp->t_ccstate->cub_tcp_bytes_acked >= tp->t_ccstate->cub_tcp_win) {
				tp->t_ccstate->cub_tcp_bytes_acked -= tp->t_ccstate->cub_tcp_win;
				tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
			}
		}
	}
	return tp->t_ccstate->cub_tcp_win;
}
306 | ||
f427ee49 A |
307 | static uint32_t |
308 | tcp_round_to(uint32_t val, uint32_t round) | |
309 | { | |
310 | if (tcp_cubic_minor_fixes) { | |
311 | /* | |
312 | * Round up or down based on the middle. Meaning, if we round upon a | |
313 | * multiple of 10, 16 will round to 20 and 14 will round to 10. | |
314 | */ | |
315 | return ((val + (round / 2)) / round) * round; | |
316 | } else { | |
317 | return (val / round) * round; | |
318 | } | |
319 | } | |
320 | ||
/*
 * Handle an in-sequence ack during congestion avoidance phase.
 * Grows snd_cwnd toward the larger of the cubic target window and the
 * AIMD ("TCP-friendly") window, one segment per incr_win bytes acked.
 */
static void
tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th)
{
	u_int32_t cubic_target_win, tcp_win, rtt;
	u_int64_t incr_win = UINT32_MAX;

	/* Do not increase congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	tp->t_bytes_acked += BYTES_ACKED(th, tp);

	rtt = get_base_rtt(tp);
	/*
	 * First compute cubic window. If cubic variables are not
	 * initialized (after coming out of recovery), this call will
	 * initialize them.
	 */
	cubic_target_win = tcp_cubic_update(tp, rtt);

	/* Compute TCP window if a multiplicative decrease of 0.2 is used */
	tcp_win = tcp_cubic_tcpwin(tp, th);

	/* Legacy (pre-minor-fixes) TCP-friendly path */
	if (tp->snd_cwnd < tcp_win && tcp_cubic_minor_fixes == 0 && TCP_CUBIC_ENABLE_TCPMODE(tp)) {
		/* this connection is in TCP-friendly region */
		if (tp->t_bytes_acked >= tp->snd_cwnd) {
			tp->t_bytes_acked -= tp->snd_cwnd;
			tp->snd_cwnd = min(tcp_win, TCP_MAXWIN << tp->snd_scale);
		}
	} else {
		if (cubic_target_win > tp->snd_cwnd) {
			/*
			 * The target win is computed for the next RTT.
			 * To reach this value, cwnd will have to be updated
			 * one segment at a time. Compute how many bytes
			 * need to be acknowledged before we can increase
			 * the cwnd by one segment.
			 */
			incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
			incr_win /= (cubic_target_win - tp->snd_cwnd);
			if (!tcp_cubic_minor_fixes) {
				if (incr_win > 0 &&
				    tp->t_bytes_acked >= incr_win) {
					tp->t_bytes_acked -= incr_win;
					tp->snd_cwnd =
					    min((tp->snd_cwnd + tp->t_maxseg),
					    TCP_MAXWIN << tp->snd_scale);
				}
			}
		}
	}

	if (tcp_cubic_minor_fixes) {
		/* Take the faster-growing of the cubic and AIMD schedules */
		tcp_win = tcp_round_to(tcp_win, tp->t_maxseg);

		if (tp->snd_cwnd < tcp_win) {
			uint64_t tcp_incr_win;

			tcp_incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
			tcp_incr_win /= (tcp_win - tp->snd_cwnd);

			if (tcp_incr_win < incr_win) {
				/* this connection is in TCP-friendly region */
				incr_win = tcp_incr_win;
			}
		}

		if (incr_win > 0 && tp->t_bytes_acked >= incr_win) {
			tp->t_bytes_acked -= incr_win;
			tp->snd_cwnd = min(tp->snd_cwnd + tp->t_maxseg, TCP_MAXWIN << tp->snd_scale);
		}
	}
}
398 | ||
/*
 * ACK handler: dispatch to congestion avoidance once cwnd reaches
 * ssthresh, otherwise perform slow-start growth with an ABC-style
 * limit on the per-ACK increment.
 */
static void
tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th)
{
	/* Do not increase the congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	if (tp->snd_cwnd >= tp->snd_ssthresh) {
		/* Congestion avoidance phase */
		tcp_cubic_congestion_avd(tp, th);
	} else {
		/*
		 * Use 2*SMSS as limit on increment as suggested
		 * by RFC 3465 section 2.3
		 */
		uint32_t acked, abc_lim, incr;

		acked = BYTES_ACKED(th, tp);
		if (tcp_cubic_minor_fixes) {
			/*
			 * Maximum burst-size is limited to the initial congestion-window.
			 * We know that the network can survive this kind of burst.
			 */
			abc_lim = tcp_initial_cwnd(tp);
		} else {
			abc_lim = (tp->snd_nxt == tp->snd_max) ? 2 * tp->t_maxseg : tp->t_maxseg;
		}
		incr = min(acked, abc_lim);

		tp->snd_cwnd += incr;
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	}
}
433 | ||
/*
 * Entering fast recovery: record the window at which loss occurred,
 * update the last-max average/deviation statistics, and back off
 * ssthresh by the CUBIC multiplicative-decrease factor.
 */
static void
tcp_cubic_pre_fr(struct tcpcb *tp)
{
	u_int32_t win, avg;
	int32_t dev;
	/* A new cubic epoch starts after recovery */
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
		/* cwnd was never validated; estimate from pipe-ack/flight size */
		tp->t_lossflightsize = tp->snd_max - tp->snd_una;
		if (tcp_flow_control_response) {
			win = max(tp->t_pipeack, tp->t_lossflightsize);
		} else {
			win = (max(tp->t_pipeack, tp->t_lossflightsize)) >> 1;
		}
	} else {
		tp->t_lossflightsize = 0;
	}
	/*
	 * Note the congestion window at which packet loss occurred as
	 * cub_last_max.
	 *
	 * If the congestion window is less than the last max window when
	 * loss occurred, it indicates that capacity available in the
	 * network has gone down. This can happen if a new flow has started
	 * and it is capturing some of the bandwidth. To reach convergence
	 * quickly, backoff a little more.
	 */
	if (win < tp->t_ccstate->cub_last_max && tcp_cubic_minor_fixes) {
		tp->t_ccstate->cub_last_max = (uint32_t)((float)win * tcp_cubic_fast_convergence_factor);
	} else {
		tp->t_ccstate->cub_last_max = win;
	}

	if (tp->t_ccstate->cub_last_max == 0) {
		/*
		 * If last_max is zero because snd_wnd is zero or for
		 * any other reason, initialize it to the amount of data
		 * in flight
		 */
		tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una;
	}

	/*
	 * Compute average and mean absolute deviation of the
	 * window at which packet loss occurred.
	 */
	if (tp->t_ccstate->cub_avg_lastmax == 0) {
		tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max;
	} else {
		/*
		 * Average is computed by taking 63 parts of
		 * history and one part of the most recent value
		 */
		avg = tp->t_ccstate->cub_avg_lastmax;
		avg = (avg << 6) - avg;
		tp->t_ccstate->cub_avg_lastmax =
		    (avg + tp->t_ccstate->cub_last_max) >> 6;
	}

	/* calculate deviation from average */
	dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max;

	/* Take the absolute value */
	if (dev < 0) {
		dev = -dev;
	}

	/* Exponentially weighted mean deviation, 15/16 history */
	if (tp->t_ccstate->cub_mean_dev == 0) {
		tp->t_ccstate->cub_mean_dev = dev;
	} else {
		dev = dev + ((tp->t_ccstate->cub_mean_dev << 4)
		    - tp->t_ccstate->cub_mean_dev);
		tp->t_ccstate->cub_mean_dev = dev >> 4;
	}

	/* Backoff congestion window by tcp_cubic_backoff factor */
	win = (u_int32_t)(win - (win * tcp_cubic_backoff));
	win = tcp_round_to(win, tp->t_maxseg);
	/* Never let ssthresh fall below two segments */
	if (win < 2 * tp->t_maxseg) {
		win = 2 * tp->t_maxseg;
	}
	tp->snd_ssthresh = win;
	tcp_cc_resize_sndbuf(tp);
}
521 | ||
/*
 * Exiting fast recovery: deflate the congestion window back to
 * approximately the amount of data in flight (bounded below by two
 * segments per RFC 6582) and reset the AIMD epoch state.
 * th may be NULL; then snd_una is used as the cumulative ACK point.
 */
static void
tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t flight_size = 0;
	uint32_t ack;

	if (th != NULL) {
		ack = th->th_ack;
	} else {
		ack = tp->snd_una;
	}

	if (SEQ_LEQ(ack, tp->snd_max) && (!tcp_cubic_minor_fixes || tcp_flow_control_response)) {
		flight_size = tp->snd_max - ack;
	} else if (tcp_cubic_minor_fixes) {
		/*
		 * Cubic Minor Fixes: snd_max - th_ack is a very very bad estimate
		 * of the flight size. Either the app is sending at full speed and
		 * flight_size *is* snd_sshtresh, or the app is not sending at full
		 * speed and congestion-window validation would have kicked in earlier.
		 *
		 * Except that for the latter, snd_ssthresh is way too high.
		 * When we exit recovery we will burst a lot of data out...
		 *
		 * So, tcp_flow_control_response brings us back to the old behavior.
		 * Too many feature-flags...
		 */
		flight_size = tp->snd_ssthresh;
	}

	/*
	 * Cubic Minor Fixes: t_lossflightsize is always 0, because of
	 * EXIT_FASTRECOVERY. This here is basically dead code...
	 */
	if (SACK_ENABLED(tp) && tp->t_lossflightsize > 0 && !tcp_cubic_minor_fixes) {
		u_int32_t total_rxt_size = 0, ncwnd;
		/*
		 * When SACK is enabled, the number of retransmitted bytes
		 * can be counted more accurately.
		 */
		total_rxt_size = tcp_rxtseg_total_size(tp);
		ncwnd = max(tp->t_pipeack, tp->t_lossflightsize);
		if (total_rxt_size <= ncwnd) {
			ncwnd = ncwnd - total_rxt_size;
		}

		/*
		 * To avoid sending a large burst at the end of recovery
		 * set a max limit on ncwnd
		 */
		ncwnd = min(ncwnd, (tp->t_maxseg << 6));
		ncwnd = ncwnd >> 1;
		flight_size = max(ncwnd, flight_size);
	}
	/*
	 * Complete ack. The current window was inflated for fast recovery.
	 * It has to be deflated post recovery.
	 *
	 * Window inflation should have left us with approx snd_ssthresh
	 * outstanding data. If the flight size is zero or one segment,
	 * make congestion window to be at least as big as 2 segments to
	 * avoid delayed acknowledgements. This is according to RFC 6582.
	 */
	if (flight_size < tp->snd_ssthresh) {
		tp->snd_cwnd = max(flight_size, tp->t_maxseg)
		    + tp->t_maxseg;
	} else {
		tp->snd_cwnd = tp->snd_ssthresh;
	}
	/* Restart the AIMD (TCP-friendly) epoch */
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
}
594 | ||
/*
 * Retransmission timeout: treat as severe congestion. Record the loss
 * (via the pre-FR path) if not already in fast recovery, then collapse
 * the congestion window to a single segment.
 */
static void
tcp_cubic_after_timeout(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	/*
	 * Avoid adjusting congestion window due to SYN retransmissions.
	 * If more than one byte (SYN) is outstanding then it is still
	 * needed to adjust the window.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    ((int)(tp->snd_max - tp->snd_una) <= 1)) {
		return;
	}

	if (!IN_FASTRECOVERY(tp)) {
		tcp_cubic_clear_state(tp);
		tcp_cubic_pre_fr(tp);
	}

	/*
	 * Close the congestion window down to one segment as a retransmit
	 * timeout might indicate severe congestion.
	 */
	tp->snd_cwnd = tp->t_maxseg;
}
621 | ||
/* Delayed-ACK decision: CUBIC defers to the common CC policy. */
static int
tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	return tcp_cc_delay_ack(tp, th);
}
627 | ||
/*
 * When switching from a different CC it is better for Cubic to start
 * fresh. The state required for Cubic calculation might be stale and it
 * might not represent the current state of the network. If it starts as
 * a new connection it will probe and learn the existing network conditions.
 */
static void
tcp_cubic_switch_cc(struct tcpcb *tp, uint16_t old_cc_index)
{
#pragma unused(old_cc_index)
	tcp_cubic_cwnd_init_or_reset(tp);

	/* This connection now counts against cubic's socket total */
	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);
}
642 | ||
0a7de745 A |
643 | static inline void |
644 | tcp_cubic_clear_state(struct tcpcb *tp) | |
fe8ab488 A |
645 | { |
646 | tp->t_ccstate->cub_last_max = 0; | |
647 | tp->t_ccstate->cub_epoch_start = 0; | |
648 | tp->t_ccstate->cub_origin_point = 0; | |
649 | tp->t_ccstate->cub_tcp_win = 0; | |
650 | tp->t_ccstate->cub_tcp_bytes_acked = 0; | |
651 | tp->t_ccstate->cub_epoch_period = 0; | |
fe8ab488 | 652 | } |