/*
 * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* TCP-cache to store and retrieve TCP-related information */

#include <net/flowhash.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_cache.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <kern/locks.h>
#include <sys/queue.h>
#include <dev/random/randomdev.h>

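/*
 * Key used to index the heuristics table. Depending on what
 * tcp_cache_hash_src() finds, it holds either the outgoing interface's
 * network signature or the connection's local IPv4/IPv6 address.
 */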
struct tcp_heuristic_key {
    union {
        uint8_t thk_net_signature[IFNET_SIGNATURELEN];
        union {
            struct in_addr addr;
            struct in6_addr addr6;
        } thk_ip;
    };
    sa_family_t thk_family;
};

struct tcp_heuristic {
    SLIST_ENTRY(tcp_heuristic) list;

    u_int32_t th_last_access;

    struct tcp_heuristic_key th_key;

    char th_val_start[0]; /* Marker for memsetting to 0 */

    u_int8_t th_tfo_cookie_loss; /* The number of times a SYN+cookie has been lost */
    u_int8_t th_mptcp_loss; /* The number of times a SYN+MP_CAPABLE has been lost */
    u_int8_t th_ecn_loss; /* The number of times a SYN+ECN has been lost */
    u_int8_t th_ecn_aggressive; /* The number of times we did an aggressive fallback */
    u_int8_t th_ecn_droprst; /* The number of times ECN connections received a RST after the first data pkt */
    u_int8_t th_ecn_droprxmt; /* The number of times ECN connections were dropped after multiple retransmits */
    u_int32_t th_tfo_fallback_trials; /* Number of times we did not try out TFO due to SYN-loss */
    u_int32_t th_tfo_cookie_backoff; /* Time until when we should not try out TFO */
    u_int32_t th_mptcp_backoff; /* Time until when we should not try out MPTCP */
    u_int32_t th_ecn_backoff; /* Time until when we should not try out ECN */

    u_int8_t th_tfo_in_backoff:1, /* Are we avoiding TFO due to the backoff timer? */
        th_tfo_aggressive_fallback:1, /* Aggressive fallback due to a nasty middlebox */
        th_tfo_snd_middlebox_supp:1, /* We are sure that the network supports TFO in upstream direction */
        th_tfo_rcv_middlebox_supp:1, /* We are sure that the network supports TFO in downstream direction */
        th_mptcp_in_backoff:1; /* Are we avoiding MPTCP due to the backoff timer? */

    char th_val_end[0]; /* Marker for memsetting to 0 */
};
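
/*
 * Note that everything between th_val_start and th_val_end is wiped with a
 * single bzero() when an entry is recycled in tcp_getheuristic_with_lock();
 * new per-destination counters presumably belong between those two markers.
 */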

struct tcp_heuristics_head {
    SLIST_HEAD(tcp_heur_bucket, tcp_heuristic) tcp_heuristics;

    /* Per-hashbucket lock to avoid lock-contention */
    lck_mtx_t thh_mtx;
};

struct tcp_cache_key {
    sa_family_t tck_family;

    struct tcp_heuristic_key tck_src;
    union {
        struct in_addr addr;
        struct in6_addr addr6;
    } tck_dst;
};

struct tcp_cache {
    SLIST_ENTRY(tcp_cache) list;

    u_int32_t tc_last_access;

    struct tcp_cache_key tc_key;

    u_int8_t tc_tfo_cookie[TFO_COOKIE_LEN_MAX];
    u_int8_t tc_tfo_cookie_len;
};
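
/*
 * Unlike the heuristics, the TFO-cookie cache is keyed on both the local
 * source (interface signature or local address, see tcp_heuristic_key) and
 * the peer's address, since a cookie is only valid for a particular
 * destination.
 */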

struct tcp_cache_head {
    SLIST_HEAD(tcp_cache_bucket, tcp_cache) tcp_caches;

    /* Per-hashbucket lock to avoid lock-contention */
    lck_mtx_t tch_mtx;
};

static u_int32_t tcp_cache_hash_seed;

size_t tcp_cache_size;

/*
 * The maximum depth of a hash-bucket. This way we limit the tcp_cache to
 * TCP_CACHE_BUCKET_SIZE * tcp_cache_size entries and get "natural" garbage
 * collection.
 */
#define TCP_CACHE_BUCKET_SIZE 5
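
/*
 * When a bucket is already at TCP_CACHE_BUCKET_SIZE, tcp_getcache_with_lock()
 * and tcp_getheuristic_with_lock() recycle the least recently accessed entry
 * of that bucket instead of growing it.
 */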

static struct tcp_cache_head *tcp_cache;

decl_lck_mtx_data(, tcp_cache_mtx);

static lck_attr_t *tcp_cache_mtx_attr;
static lck_grp_t *tcp_cache_mtx_grp;
static lck_grp_attr_t *tcp_cache_mtx_grp_attr;

static struct tcp_heuristics_head *tcp_heuristics;

decl_lck_mtx_data(, tcp_heuristics_mtx);

static lck_attr_t *tcp_heuristic_mtx_attr;
static lck_grp_t *tcp_heuristic_mtx_grp;
static lck_grp_attr_t *tcp_heuristic_mtx_grp_attr;

static int tcp_ecn_timeout = 60;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_timeout, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_ecn_timeout, 0, "Initial minutes to wait before re-trying ECN");

static int disable_tcp_heuristics = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_tcp_heuristics, CTLFLAG_RW | CTLFLAG_LOCKED,
    &disable_tcp_heuristics, 0, "Set to 1 to disable all TCP heuristics (TFO, ECN, MPTCP)");

/*
 * This number is coupled with tcp_ecn_timeout, because we want to prevent
 * integer overflow. We still need an inexpensive way to prevent overflow
 * while allowing the sysctl to change dynamically.
 */
#define TCP_CACHE_OVERFLOW_PROTECT 9

/* Number of SYN-losses we accept */
#define TFO_MAX_COOKIE_LOSS 2
#define ECN_MAX_SYN_LOSS 2
#define MPTCP_MAX_SYN_LOSS 2
#define ECN_MAX_DROPRST 2
#define ECN_MAX_DROPRXMT 4

/* Flags for setting/unsetting loss-heuristics, limited to 1 byte */
#define TCPCACHE_F_TFO 0x01
#define TCPCACHE_F_ECN 0x02
#define TCPCACHE_F_MPTCP 0x04
#define TCPCACHE_F_ECN_DROPRST 0x08
#define TCPCACHE_F_ECN_DROPRXMT 0x10

/* Always retry ECN after backing off to this level for some heuristics */
#define ECN_RETRY_LIMIT 9
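
/*
 * ECN_RETRY_LIMIT is the same value as TCP_CACHE_OVERFLOW_PROTECT: once
 * th_ecn_droprst/th_ecn_droprxmt have saturated at that cap,
 * tcp_heuristic_do_ecn() resets them after the ECN backoff has expired, so
 * ECN gets re-evaluated periodically.
 */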

/*
 * Round up to the next higher power of 2. See "Bit Twiddling Hacks".
 *
 * Might be worth moving this to a library so that others
 * (e.g., scale_to_powerof2()) can use it as well instead of a while-loop.
 */
static u_int32_t tcp_cache_roundup2(u_int32_t a)
{
    a--;
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    a++;

    return a;
}
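
/*
 * For example, tcp_cache_roundup2(33) returns 64, an exact power of two is
 * returned unchanged, and an input of 0 yields 0 because the initial
 * decrement wraps to all ones and the final increment wraps back.
 */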

static void tcp_cache_hash_src(struct inpcb *inp, struct tcp_heuristic_key *key)
{
    struct ifnet *ifn = inp->inp_last_outifp;
    uint8_t len = sizeof(key->thk_net_signature);
    uint16_t flags;

    if (inp->inp_vflag & INP_IPV6) {
        int ret;

        key->thk_family = AF_INET6;
        ret = ifnet_get_netsignature(ifn, AF_INET6, &len, &flags,
            key->thk_net_signature);

        /*
         * ifnet_get_netsignature only returns EINVAL if ifn is NULL
         * (we made sure that in the other cases it does not). So,
         * in this case we should take the connection's address.
         */
        if (ret == ENOENT || ret == EINVAL)
            memcpy(&key->thk_ip.addr6, &inp->in6p_laddr, sizeof(struct in6_addr));
    } else {
        int ret;

        key->thk_family = AF_INET;
        ret = ifnet_get_netsignature(ifn, AF_INET, &len, &flags,
            key->thk_net_signature);

        /*
         * ifnet_get_netsignature only returns EINVAL if ifn is NULL
         * (we made sure that in the other cases it does not). So,
         * in this case we should take the connection's address.
         */
        if (ret == ENOENT || ret == EINVAL)
            memcpy(&key->thk_ip.addr, &inp->inp_laddr, sizeof(struct in_addr));
    }
}

static u_int16_t tcp_cache_hash(struct inpcb *inp, struct tcp_cache_key *key)
{
    u_int32_t hash;

    bzero(key, sizeof(struct tcp_cache_key));

    tcp_cache_hash_src(inp, &key->tck_src);

    if (inp->inp_vflag & INP_IPV6) {
        key->tck_family = AF_INET6;
        memcpy(&key->tck_dst.addr6, &inp->in6p_faddr,
            sizeof(struct in6_addr));
    } else {
        key->tck_family = AF_INET;
        memcpy(&key->tck_dst.addr, &inp->inp_faddr,
            sizeof(struct in_addr));
    }

    hash = net_flowhash(key, sizeof(struct tcp_cache_key),
        tcp_cache_hash_seed);

    return (hash & (tcp_cache_size - 1));
}

static void tcp_cache_unlock(struct tcp_cache_head *head)
{
    lck_mtx_unlock(&head->tch_mtx);
}

/*
 * Make sure that everything that happens after tcp_getcache_with_lock()
 * is short enough to justify that you hold the per-bucket lock!!!
 *
 * Otherwise, better build another lookup function that does not hold the
 * lock and copies out the bits and bytes.
 *
 * That's why we provide the head as a "return"-pointer so that the caller
 * can give it back to us for tcp_cache_unlock().
 */
static struct tcp_cache *tcp_getcache_with_lock(struct tcpcb *tp, int create,
    struct tcp_cache_head **headarg)
{
    struct inpcb *inp = tp->t_inpcb;
    struct tcp_cache *tpcache = NULL;
    struct tcp_cache_head *head;
    struct tcp_cache_key key;
    u_int16_t hash;
    int i = 0;

    hash = tcp_cache_hash(inp, &key);
    head = &tcp_cache[hash];

    lck_mtx_lock(&head->tch_mtx);

    /*** First step: Look for the tcp_cache in our bucket ***/
    SLIST_FOREACH(tpcache, &head->tcp_caches, list) {
        if (memcmp(&tpcache->tc_key, &key, sizeof(key)) == 0)
            break;

        i++;
    }

    /*** Second step: If it's not there, create/recycle it ***/
    if ((tpcache == NULL) && create) {
        if (i >= TCP_CACHE_BUCKET_SIZE) {
            struct tcp_cache *oldest_cache = NULL;
            u_int32_t max_age = 0;

            /* Look for the oldest tcp_cache in the bucket */
            SLIST_FOREACH(tpcache, &head->tcp_caches, list) {
                u_int32_t age = tcp_now - tpcache->tc_last_access;
                if (age > max_age) {
                    max_age = age;
                    oldest_cache = tpcache;
                }
            }
            VERIFY(oldest_cache != NULL);

            tpcache = oldest_cache;

            /* We recycle, thus let's indicate that there is no cookie */
            tpcache->tc_tfo_cookie_len = 0;
        } else {
            /* Create a new cache and add it to the list */
            tpcache = _MALLOC(sizeof(struct tcp_cache), M_TEMP,
                M_NOWAIT | M_ZERO);
            if (tpcache == NULL)
                goto out_null;

            SLIST_INSERT_HEAD(&head->tcp_caches, tpcache, list);
        }

        memcpy(&tpcache->tc_key, &key, sizeof(key));
    }

    if (tpcache == NULL)
        goto out_null;

    /* Update timestamp for garbage collection purposes */
    tpcache->tc_last_access = tcp_now;
    *headarg = head;

    return (tpcache);

out_null:
    tcp_cache_unlock(head);
    return (NULL);
}
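
/*
 * The intended usage pattern (mirrored by the callers below) is a short
 * critical section bracketed by the lookup and tcp_cache_unlock(); on a NULL
 * return the bucket has already been unlocked:
 *
 *    struct tcp_cache_head *head;
 *    struct tcp_cache *tpcache = tcp_getcache_with_lock(tp, 1, &head);
 *
 *    if (tpcache != NULL) {
 *        ... touch a few fields of *tpcache ...
 *        tcp_cache_unlock(head);
 *    }
 */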

void tcp_cache_set_cookie(struct tcpcb *tp, u_char *cookie, u_int8_t len)
{
    struct tcp_cache_head *head;
    struct tcp_cache *tpcache;

    /* Call lookup/create function */
    tpcache = tcp_getcache_with_lock(tp, 1, &head);
    if (tpcache == NULL)
        return;

    tpcache->tc_tfo_cookie_len = len;
    memcpy(tpcache->tc_tfo_cookie, cookie, len);

    tcp_cache_unlock(head);
}

/*
 * Get the cookie related to 'tp', and copy it into 'cookie', provided that len
 * is big enough (len designates the available memory).
 * Upon return, 'len' is set to the cookie's length.
 *
 * Returns 0 if we should request a cookie.
 * Returns 1 if the cookie has been found and written.
 */
int tcp_cache_get_cookie(struct tcpcb *tp, u_char *cookie, u_int8_t *len)
{
    struct tcp_cache_head *head;
    struct tcp_cache *tpcache;

    /* Call lookup/create function */
    tpcache = tcp_getcache_with_lock(tp, 1, &head);
    if (tpcache == NULL)
        return (0);

    if (tpcache->tc_tfo_cookie_len == 0) {
        tcp_cache_unlock(head);
        return (0);
    }

    /*
     * Not enough space - this should never happen as it has been checked
     * in tcp_tfo_check. So, fail here!
     */
    VERIFY(tpcache->tc_tfo_cookie_len <= *len);

    memcpy(cookie, tpcache->tc_tfo_cookie, tpcache->tc_tfo_cookie_len);
    *len = tpcache->tc_tfo_cookie_len;

    tcp_cache_unlock(head);

    return (1);
}

unsigned int tcp_cache_get_cookie_len(struct tcpcb *tp)
{
    struct tcp_cache_head *head;
    struct tcp_cache *tpcache;
    unsigned int cookie_len;

    /* Call lookup/create function */
    tpcache = tcp_getcache_with_lock(tp, 1, &head);
    if (tpcache == NULL)
        return (0);

    cookie_len = tpcache->tc_tfo_cookie_len;

    tcp_cache_unlock(head);

    return cookie_len;
}

static u_int16_t tcp_heuristics_hash(struct inpcb *inp,
    struct tcp_heuristic_key *key)
{
    u_int32_t hash;

    bzero(key, sizeof(struct tcp_heuristic_key));

    tcp_cache_hash_src(inp, key);

    hash = net_flowhash(key, sizeof(struct tcp_heuristic_key),
        tcp_cache_hash_seed);

    return (hash & (tcp_cache_size - 1));
}

static void tcp_heuristic_unlock(struct tcp_heuristics_head *head)
{
    lck_mtx_unlock(&head->thh_mtx);
}

/*
 * Make sure that everything that happens after tcp_getheuristic_with_lock()
 * is short enough to justify that you hold the per-bucket lock!!!
 *
 * Otherwise, better build another lookup function that does not hold the
 * lock and copies out the bits and bytes.
 *
 * That's why we provide the head as a "return"-pointer so that the caller
 * can give it back to us for tcp_heuristic_unlock().
 *
 * ToDo - way too much code-duplication. We should create an interface to
 * handle bucketized hashtables with recycling of the oldest element.
 */
static struct tcp_heuristic *tcp_getheuristic_with_lock(struct tcpcb *tp,
    int create, struct tcp_heuristics_head **headarg)
{
    struct inpcb *inp = tp->t_inpcb;
    struct tcp_heuristic *tpheur = NULL;
    struct tcp_heuristics_head *head;
    struct tcp_heuristic_key key;
    u_int16_t hash;
    int i = 0;

    hash = tcp_heuristics_hash(inp, &key);
    head = &tcp_heuristics[hash];

    lck_mtx_lock(&head->thh_mtx);

    /*** First step: Look for the tcp_heuristic in our bucket ***/
    SLIST_FOREACH(tpheur, &head->tcp_heuristics, list) {
        if (memcmp(&tpheur->th_key, &key, sizeof(key)) == 0)
            break;

        i++;
    }

    /*** Second step: If it's not there, create/recycle it ***/
    if ((tpheur == NULL) && create) {
        if (i >= TCP_CACHE_BUCKET_SIZE) {
            struct tcp_heuristic *oldest_heur = NULL;
            u_int32_t max_age = 0;

            /* Look for the oldest tcp_heuristic in the bucket */
            SLIST_FOREACH(tpheur, &head->tcp_heuristics, list) {
                u_int32_t age = tcp_now - tpheur->th_last_access;
                if (age > max_age) {
                    max_age = age;
                    oldest_heur = tpheur;
                }
            }
            VERIFY(oldest_heur != NULL);

            tpheur = oldest_heur;

            /* We recycle - set everything to 0 */
            bzero(tpheur->th_val_start,
                tpheur->th_val_end - tpheur->th_val_start);
        } else {
            /* Create a new heuristic and add it to the list */
            tpheur = _MALLOC(sizeof(struct tcp_heuristic), M_TEMP,
                M_NOWAIT | M_ZERO);
            if (tpheur == NULL)
                goto out_null;

            SLIST_INSERT_HEAD(&head->tcp_heuristics, tpheur, list);
        }

        /*
         * Set to tcp_now, to make sure it won't be greater than tcp_now
         * in the near future.
         */
        tpheur->th_ecn_backoff = tcp_now;
        tpheur->th_tfo_cookie_backoff = tcp_now;
        tpheur->th_mptcp_backoff = tcp_now;

        memcpy(&tpheur->th_key, &key, sizeof(key));
    }

    if (tpheur == NULL)
        goto out_null;

    /* Update timestamp for garbage collection purposes */
    tpheur->th_last_access = tcp_now;
    *headarg = head;

    return (tpheur);

out_null:
    tcp_heuristic_unlock(head);
    return (NULL);
}

static void tcp_heuristic_reset_loss(struct tcpcb *tp, u_int8_t flags)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;

    /*
     * Don't attempt to create it! Keep the heuristics clean if the
     * server does not support TFO. This reduces the lookup-cost on
     * our side.
     */
    tpheur = tcp_getheuristic_with_lock(tp, 0, &head);
    if (tpheur == NULL)
        return;

    if (flags & TCPCACHE_F_TFO)
        tpheur->th_tfo_cookie_loss = 0;

    if (flags & TCPCACHE_F_ECN)
        tpheur->th_ecn_loss = 0;

    if (flags & TCPCACHE_F_MPTCP)
        tpheur->th_mptcp_loss = 0;

    tcp_heuristic_unlock(head);
}

void tcp_heuristic_tfo_success(struct tcpcb *tp)
{
    tcp_heuristic_reset_loss(tp, TCPCACHE_F_TFO);
}

void tcp_heuristic_mptcp_success(struct tcpcb *tp)
{
    tcp_heuristic_reset_loss(tp, TCPCACHE_F_MPTCP);
}

void tcp_heuristic_ecn_success(struct tcpcb *tp)
{
    tcp_heuristic_reset_loss(tp, TCPCACHE_F_ECN);
}

void tcp_heuristic_tfo_rcv_good(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;

    struct tcp_heuristic *tpheur = tcp_getheuristic_with_lock(tp, 1, &head);
    if (tpheur == NULL)
        return;

    tpheur->th_tfo_rcv_middlebox_supp = 1;

    tcp_heuristic_unlock(head);

    tp->t_tfo_flags |= TFO_F_NO_RCVPROBING;
}

void tcp_heuristic_tfo_snd_good(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;

    struct tcp_heuristic *tpheur = tcp_getheuristic_with_lock(tp, 1, &head);
    if (tpheur == NULL)
        return;

    tpheur->th_tfo_snd_middlebox_supp = 1;

    tcp_heuristic_unlock(head);

    tp->t_tfo_flags |= TFO_F_NO_SNDPROBING;
}

static void tcp_heuristic_inc_loss(struct tcpcb *tp, u_int8_t flags)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;

    tpheur = tcp_getheuristic_with_lock(tp, 1, &head);
    if (tpheur == NULL)
        return;

    /* Limit to prevent integer-overflow during exponential backoff */
    if ((flags & TCPCACHE_F_TFO) &&
        tpheur->th_tfo_cookie_loss < TCP_CACHE_OVERFLOW_PROTECT)
        tpheur->th_tfo_cookie_loss++;

    if ((flags & TCPCACHE_F_ECN) &&
        tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT) {
        tpheur->th_ecn_loss++;
        if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) {
            tcpstat.tcps_ecn_fallback_synloss++;
            INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_synloss);
            tpheur->th_ecn_backoff = tcp_now +
                ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) <<
                (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS));
        }
    }

    if ((flags & TCPCACHE_F_MPTCP) &&
        tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT) {
        tpheur->th_mptcp_loss++;
        if (tpheur->th_mptcp_loss >= MPTCP_MAX_SYN_LOSS) {
            /*
             * Yes, we take tcp_ecn_timeout, to avoid adding yet
             * another sysctl that is just used for testing.
             */
            tpheur->th_mptcp_backoff = tcp_now +
                ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) <<
                (tpheur->th_mptcp_loss - MPTCP_MAX_SYN_LOSS));
        }
    }

    if ((flags & TCPCACHE_F_ECN_DROPRST) &&
        tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT) {
        tpheur->th_ecn_droprst++;
        if (tpheur->th_ecn_droprst >= ECN_MAX_DROPRST) {
            tcpstat.tcps_ecn_fallback_droprst++;
            INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_droprst);
            tpheur->th_ecn_backoff = tcp_now +
                ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) <<
                (tpheur->th_ecn_droprst - ECN_MAX_DROPRST));
        }
    }

    if ((flags & TCPCACHE_F_ECN_DROPRXMT) &&
        tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT) {
        tpheur->th_ecn_droprxmt++;
        if (tpheur->th_ecn_droprxmt >= ECN_MAX_DROPRXMT) {
            tcpstat.tcps_ecn_fallback_droprxmt++;
            INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_droprxmt);
            tpheur->th_ecn_backoff = tcp_now +
                ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) <<
                (tpheur->th_ecn_droprxmt - ECN_MAX_DROPRXMT));
        }
    }
    tcp_heuristic_unlock(head);
}
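
/*
 * To make the backoff arithmetic above concrete: with the default
 * tcp_ecn_timeout of 60 minutes, the second lost ECN-setup SYN backs ECN off
 * for 60 minutes worth of ticks (shift of 0), the third for 120 minutes, and
 * so on, doubling until the counter is capped at TCP_CACHE_OVERFLOW_PROTECT.
 */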

void tcp_heuristic_tfo_loss(struct tcpcb *tp)
{
    tcp_heuristic_inc_loss(tp, TCPCACHE_F_TFO);
}

void tcp_heuristic_mptcp_loss(struct tcpcb *tp)
{
    tcp_heuristic_inc_loss(tp, TCPCACHE_F_MPTCP);
}

void tcp_heuristic_ecn_loss(struct tcpcb *tp)
{
    tcp_heuristic_inc_loss(tp, TCPCACHE_F_ECN);
}

void tcp_heuristic_ecn_droprst(struct tcpcb *tp)
{
    tcp_heuristic_inc_loss(tp, TCPCACHE_F_ECN_DROPRST);
}

void tcp_heuristic_ecn_droprxmt(struct tcpcb *tp)
{
    tcp_heuristic_inc_loss(tp, TCPCACHE_F_ECN_DROPRXMT);
}

void tcp_heuristic_tfo_middlebox(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;

    tpheur = tcp_getheuristic_with_lock(tp, 1, &head);
    if (tpheur == NULL)
        return;

    tpheur->th_tfo_aggressive_fallback = 1;

    tcp_heuristic_unlock(head);
}

void tcp_heuristic_ecn_aggressive(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;

    tpheur = tcp_getheuristic_with_lock(tp, 1, &head);
    if (tpheur == NULL)
        return;

    /*
     * Must be done before the increment below, otherwise we would start
     * off with exponential backoff right away.
     */
    tpheur->th_ecn_backoff = tcp_now +
        ((tcp_ecn_timeout * 60 * TCP_RETRANSHZ) << (tpheur->th_ecn_aggressive));

    /* Cap the counter to prevent integer overflow during exponential backoff */
    if (tpheur->th_ecn_aggressive < TCP_CACHE_OVERFLOW_PROTECT)
        tpheur->th_ecn_aggressive++;

    tcp_heuristic_unlock(head);
}

boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;

    if (disable_tcp_heuristics)
        return (TRUE);

    /* Get the tcp-heuristic. */
    tpheur = tcp_getheuristic_with_lock(tp, 0, &head);
    if (tpheur == NULL)
        return (TRUE);

    if (tpheur->th_tfo_aggressive_fallback) {
        /* Aggressive fallback - don't do TFO anymore... :'( */
        tcp_heuristic_unlock(head);
        return (FALSE);
    }

    if (tpheur->th_tfo_cookie_loss >= TFO_MAX_COOKIE_LOSS &&
        (tpheur->th_tfo_fallback_trials < tcp_tfo_fallback_min ||
        TSTMP_GT(tpheur->th_tfo_cookie_backoff, tcp_now))) {
        /*
         * So, when we are in SYN-loss mode we try to stop using TFO
         * for the next 'tcp_tfo_fallback_min' connections. That way,
         * we are sure that never more than 1 out of tcp_tfo_fallback_min
         * connections will suffer from our nice little middlebox.
         *
         * After that we first wait for 2 minutes. If we fail again,
         * we wait for yet another 60 minutes.
         */
        tpheur->th_tfo_fallback_trials++;
        if (tpheur->th_tfo_fallback_trials >= tcp_tfo_fallback_min &&
            !tpheur->th_tfo_in_backoff) {
            if (tpheur->th_tfo_cookie_loss == TFO_MAX_COOKIE_LOSS)
                /* Backoff for 2 minutes */
                tpheur->th_tfo_cookie_backoff = tcp_now + (60 * 2 * TCP_RETRANSHZ);
            else
                /* Backoff for 60 minutes */
                tpheur->th_tfo_cookie_backoff = tcp_now + (60 * 60 * TCP_RETRANSHZ);

            tpheur->th_tfo_in_backoff = 1;
        }

        tcp_heuristic_unlock(head);
        return (FALSE);
    }

    /*
     * We give it a new shot, set trials back to 0. This allows us to
     * start counting again from zero in case we get yet another SYN-loss.
     */
    tpheur->th_tfo_fallback_trials = 0;
    tpheur->th_tfo_in_backoff = 0;

    if (tpheur->th_tfo_rcv_middlebox_supp)
        tp->t_tfo_flags |= TFO_F_NO_RCVPROBING;
    if (tpheur->th_tfo_snd_middlebox_supp)
        tp->t_tfo_flags |= TFO_F_NO_SNDPROBING;

    tcp_heuristic_unlock(head);

    return (TRUE);
}

boolean_t tcp_heuristic_do_mptcp(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;
    boolean_t ret = TRUE;

    if (disable_tcp_heuristics)
        return (TRUE);

    /* Get the tcp-heuristic. */
    tpheur = tcp_getheuristic_with_lock(tp, 0, &head);
    if (tpheur == NULL)
        return ret;

    if (TSTMP_GT(tpheur->th_mptcp_backoff, tcp_now))
        ret = FALSE;

    tcp_heuristic_unlock(head);

    return (ret);
}

boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp)
{
    struct tcp_heuristics_head *head;
    struct tcp_heuristic *tpheur;
    boolean_t ret = TRUE;

    if (disable_tcp_heuristics)
        return (TRUE);

    /* Get the tcp-heuristic. */
    tpheur = tcp_getheuristic_with_lock(tp, 0, &head);
    if (tpheur == NULL)
        return ret;

    if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) {
        ret = FALSE;
    } else {
        /* Reset the following counters to start re-evaluating */
        if (tpheur->th_ecn_droprst >= ECN_RETRY_LIMIT)
            tpheur->th_ecn_droprst = 0;
        if (tpheur->th_ecn_droprxmt >= ECN_RETRY_LIMIT)
            tpheur->th_ecn_droprxmt = 0;
    }

    tcp_heuristic_unlock(head);

    return (ret);
}

static void sysctl_cleartfocache(void)
{
    int i;

    for (i = 0; i < tcp_cache_size; i++) {
        struct tcp_cache_head *head = &tcp_cache[i];
        struct tcp_cache *tpcache, *tmp;
        struct tcp_heuristics_head *hhead = &tcp_heuristics[i];
        struct tcp_heuristic *tpheur, *htmp;

        lck_mtx_lock(&head->tch_mtx);
        SLIST_FOREACH_SAFE(tpcache, &head->tcp_caches, list, tmp) {
            SLIST_REMOVE(&head->tcp_caches, tpcache, tcp_cache, list);
            _FREE(tpcache, M_TEMP);
        }
        lck_mtx_unlock(&head->tch_mtx);

        lck_mtx_lock(&hhead->thh_mtx);
        SLIST_FOREACH_SAFE(tpheur, &hhead->tcp_heuristics, list, htmp) {
            SLIST_REMOVE(&hhead->tcp_heuristics, tpheur, tcp_heuristic, list);
            _FREE(tpheur, M_TEMP);
        }
        lck_mtx_unlock(&hhead->thh_mtx);
    }
}

/* This sysctl is useful for testing purposes only */
static int tcpcleartfo = 0;

static int sysctl_cleartfo SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
    int error = 0, val, oldval = tcpcleartfo;

    val = oldval;
    error = sysctl_handle_int(oidp, &val, 0, req);
    if (error || !req->newptr)
        return (error);

    /*
     * The actual value does not matter. If the value is set, it triggers
     * the clearing of the TFO cache. If a future implementation does not
     * use the route entry to hold the TFO cache, replace the route sysctl.
     */

    if (val != oldval)
        sysctl_cleartfocache();

    tcpcleartfo = val;

    return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, clear_tfocache, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_LOCKED, &tcpcleartfo, 0, &sysctl_cleartfo, "I",
    "Toggle to clear the TFO destination based heuristic cache");
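
/*
 * For testing, the caches can thus be flushed from userspace by writing a
 * value different from the current one, e.g. (assuming the standard
 * sysctl(8) tool):
 *
 *    sysctl -w net.inet.tcp.clear_tfocache=1
 */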

void tcp_cache_init(void)
{
    uint64_t sane_size_meg = sane_size / 1024 / 1024;
    int i;

    /*
     * On machines with <100MB of memory this will result in a (full) cache-size
     * of 32 entries, thus 32 * 5 * 64 bytes = 10KB (about 0.01 %).
     * On machines with > 4GB of memory, we have a cache-size of 1024 entries,
     * thus about 327KB.
     *
     * Side-note: we convert to u_int32_t. If sane_size is more than
     * 16000 TB, we lose precision. But, who cares? :)
     */
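    /*
     * For example, on a machine with 8GB of RAM, sane_size_meg >> 2 is 2048,
     * which tcp_cache_roundup2() leaves at 2048 and the clamp below then
     * reduces to the 1024-entry maximum.
     */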
    tcp_cache_size = tcp_cache_roundup2((u_int32_t)(sane_size_meg >> 2));
    if (tcp_cache_size < 32)
        tcp_cache_size = 32;
    else if (tcp_cache_size > 1024)
        tcp_cache_size = 1024;

    tcp_cache = _MALLOC(sizeof(struct tcp_cache_head) * tcp_cache_size,
        M_TEMP, M_ZERO);
    if (tcp_cache == NULL)
        panic("Allocating tcp_cache failed at boot-time!");

    tcp_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
    tcp_cache_mtx_grp = lck_grp_alloc_init("tcpcache", tcp_cache_mtx_grp_attr);
    tcp_cache_mtx_attr = lck_attr_alloc_init();

    tcp_heuristics = _MALLOC(sizeof(struct tcp_heuristics_head) * tcp_cache_size,
        M_TEMP, M_ZERO);
    if (tcp_heuristics == NULL)
        panic("Allocating tcp_heuristic failed at boot-time!");

    tcp_heuristic_mtx_grp_attr = lck_grp_attr_alloc_init();
    tcp_heuristic_mtx_grp = lck_grp_alloc_init("tcpheuristic", tcp_heuristic_mtx_grp_attr);
    tcp_heuristic_mtx_attr = lck_attr_alloc_init();

    for (i = 0; i < tcp_cache_size; i++) {
        lck_mtx_init(&tcp_cache[i].tch_mtx, tcp_cache_mtx_grp,
            tcp_cache_mtx_attr);
        SLIST_INIT(&tcp_cache[i].tcp_caches);

        lck_mtx_init(&tcp_heuristics[i].thh_mtx, tcp_heuristic_mtx_grp,
            tcp_heuristic_mtx_attr);
        SLIST_INIT(&tcp_heuristics[i].tcp_heuristics);
    }

    tcp_cache_hash_seed = RandomULong();
}