1 /*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
61 */
62 /*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/callout.h>
72 #include <sys/kernel.h>
73 #include <sys/sysctl.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/domain.h>
77 #include <sys/proc.h>
78 #include <sys/kauth.h>
79 #include <sys/socket.h>
80 #include <sys/socketvar.h>
81 #include <sys/protosw.h>
82 #include <sys/random.h>
83 #include <sys/syslog.h>
84 #include <sys/mcache.h>
85 #include <kern/locks.h>
86 #include <kern/zalloc.h>
87
88 #include <dev/random/randomdev.h>
89
90 #include <net/route.h>
91 #include <net/if.h>
92 #include <net/content_filter.h>
93
94 #define tcp_minmssoverload fring
95 #define _IP_VHL
96 #include <netinet/in.h>
97 #include <netinet/in_systm.h>
98 #include <netinet/ip.h>
99 #include <netinet/ip_icmp.h>
100 #if INET6
101 #include <netinet/ip6.h>
102 #include <netinet/icmp6.h>
103 #endif
104 #include <netinet/in_pcb.h>
105 #if INET6
106 #include <netinet6/in6_pcb.h>
107 #endif
108 #include <netinet/in_var.h>
109 #include <netinet/ip_var.h>
110 #include <netinet/icmp_var.h>
111 #if INET6
112 #include <netinet6/ip6_var.h>
113 #endif
114 #include <netinet/mptcp_var.h>
115 #include <netinet/tcp.h>
116 #include <netinet/tcp_fsm.h>
117 #include <netinet/tcp_seq.h>
118 #include <netinet/tcp_timer.h>
119 #include <netinet/tcp_var.h>
120 #include <netinet/tcp_cc.h>
121 #include <netinet/tcp_cache.h>
122 #include <kern/thread_call.h>
123
124 #if INET6
125 #include <netinet6/tcp6_var.h>
126 #endif
127 #include <netinet/tcpip.h>
128 #if TCPDEBUG
129 #include <netinet/tcp_debug.h>
130 #endif
131 #include <netinet6/ip6protosw.h>
132
133 #if IPSEC
134 #include <netinet6/ipsec.h>
135 #if INET6
136 #include <netinet6/ipsec6.h>
137 #endif
138 #endif /* IPSEC */
139
140 #if NECP
141 #include <net/necp.h>
142 #endif /* NECP */
143
144 #undef tcp_minmssoverload
145
146 #if CONFIG_MACF_NET
147 #include <security/mac_framework.h>
148 #endif /* CONFIG_MACF_NET */
149
150 #include <corecrypto/ccaes.h>
151 #include <libkern/crypto/aes.h>
152 #include <libkern/crypto/md5.h>
153 #include <sys/kdebug.h>
154 #include <mach/sdt.h>
155
156 #include <netinet/lro_ext.h>
157
158 #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
159
160 static tcp_cc tcp_ccgen;
161 extern int tcp_lq_overflow;
162
163 extern struct tcptimerlist tcp_timer_list;
164 extern struct tcptailq tcp_tw_tailq;
165
166 SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
167 int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");
168
169 #if INET6
170 SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
171 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
172 "Default TCP Maximum Segment Size for IPv6");
173 #endif
174
175 int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int,
176 struct sysctl_req *);
177 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR,
178 0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
179
180 /* Current count of half-open TFO connections */
181 int tcp_tfo_halfcnt = 0;
182
183 /* Maximum of half-open TFO connection backlog */
184 SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
185 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_tfo_backlog, 10,
186 "Backlog queue for half-open TFO connections");
187
188 SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED,
189 int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER,
190 "Enable TCP Fastopen (RFC 7413)");
191
192 SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED,
193 uint32_t, tcp_now_init, 0, "Initial tcp now value");
194
195 SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED,
196 uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in microseconds");
197
198 /*
199 * Minimum MSS we accept and use. This prevents DoS attacks where
200 * we are forced to a ridiculously low MSS like 20 and send hundreds
201 * of packets instead of one. The effect scales with the available
202 * bandwidth and quickly saturates the CPU and network interface
203 * with packet generation and sending. Set to zero to disable MINMSS
204 * checking. This setting prevents us from sending too small packets.
205 */
206 SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
207 int, tcp_minmss, TCP_MINMSS, "Minimum TCP Maximum Segment Size");
208 int tcp_do_rfc1323 = 1;
209 #if (DEVELOPMENT || DEBUG)
210 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323,
211 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1323, 0,
212 "Enable rfc1323 (high performance TCP) extensions");
213 #endif /* (DEVELOPMENT || DEBUG) */
214
215 // Not used
216 static int tcp_do_rfc1644 = 0;
217 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644,
218 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1644, 0,
219 "Enable rfc1644 (TTCP) extensions");
220
221 SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED,
222 static int, do_tcpdrain, 0,
223 "Enable tcp_drain routine for extra help when low on mbufs");
224
225 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
226 &tcbinfo.ipi_count, 0, "Number of active PCBs");
227
228 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
229 &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state");
230
231 SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED,
232 static int, icmp_may_rst, 1,
233 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
234
235 static int tcp_strict_rfc1948 = 0;
236 static int tcp_isn_reseed_interval = 0;
237 #if (DEVELOPMENT || DEBUG)
238 SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED,
239 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
240
241 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval,
242 CTLFLAG_RW | CTLFLAG_LOCKED,
243 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
244 #endif /* (DEVELOPMENT || DEBUG) */
245
246 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
247 int, tcp_TCPTV_MIN, 100, "min rtt value allowed");
248
249 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
250 int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");
251
252 SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
253 __private_extern__ int , tcp_use_randomport, 0,
254 "Randomize TCP port numbers");
255
256 SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
257 __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");
258
259 static void tcp_cleartaocache(void);
260 static void tcp_notify(struct inpcb *, int);
261
262 struct zone *sack_hole_zone;
263 struct zone *tcp_reass_zone;
264 struct zone *tcp_bwmeas_zone;
265 struct zone *tcp_rxt_seg_zone;
266
267 extern int slowlink_wsize; /* window correction for slow links */
268 extern int path_mtu_discovery;
269
270 static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);
271
272 #define TCP_BWMEAS_BURST_MINSIZE 6
273 #define TCP_BWMEAS_BURST_MAXSIZE 25
274
275 static uint32_t bwmeas_elm_size;
276
277 /*
278 * Target size of TCP PCB hash tables. Must be a power of two.
279 *
280 * Note that this can be overridden by the kernel environment
281 * variable net.inet.tcp.tcbhashsize
282 */
283 #ifndef TCBHASHSIZE
284 #define TCBHASHSIZE CONFIG_TCBHASHSIZE
285 #endif
286
287 __private_extern__ int tcp_tcbhashsize = TCBHASHSIZE;
288 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
289 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
290
291 /*
292 * This is the actual shape of what we allocate using the zone
293 * allocator. Doing it this way allows us to protect both structures
294 * using the same generation count, and also eliminates the overhead
295 * of allocating tcpcbs separately. By hiding the structure here,
296 * we avoid changing most of the rest of the code (although it needs
297 * to be changed, eventually, for greater efficiency).
298 */
299 #define ALIGNMENT 32
300 struct inp_tp {
301 struct inpcb inp;
302 struct tcpcb tcb __attribute__((aligned(ALIGNMENT)));
303 };
304 #undef ALIGNMENT
305
306 int get_inpcb_str_size(void);
307 int get_tcp_str_size(void);
308
309 static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
310
311 static lck_attr_t *tcp_uptime_mtx_attr = NULL;
312 static lck_grp_t *tcp_uptime_mtx_grp = NULL;
313 static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL;
314 int tcp_notsent_lowat_check(struct socket *so);
315 static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
316 struct if_lim_perf_stat *stat);
317 static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
318 struct if_tcp_ecn_perf_stat *stat);
319
320 static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */
321
322 void
323 tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out, size_t blk_size)
324 {
325 u_char in[CCAES_BLOCK_SIZE];
326 #if INET6
327 int isipv6 = inp->inp_vflag & INP_IPV6;
328 #endif
329
330 VERIFY(blk_size == CCAES_BLOCK_SIZE);
331
332 bzero(&in[0], CCAES_BLOCK_SIZE);
333 bzero(&out[0], CCAES_BLOCK_SIZE);
334
335 #if INET6
336 if (isipv6)
337 memcpy(in, &inp->in6p_faddr, sizeof(struct in6_addr));
338 else
339 #endif /* INET6 */
340 memcpy(in, &inp->inp_faddr, sizeof(struct in_addr));
341
342 aes_encrypt_cbc(in, NULL, 1, out, &tfo_ctx);
343 }
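/*
 * In other words, the TFO cookie is a single AES-128 block: the peer
 * address (zero-padded to CCAES_BLOCK_SIZE bytes) encrypted under the
 * key held in tfo_ctx.  A minimal caller sketch (hypothetical, not part
 * of this file) would be:
 *
 *	u_char cookie[CCAES_BLOCK_SIZE];
 *	tcp_tfo_gen_cookie(inp, cookie, sizeof(cookie));
 */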
344
345 __private_extern__ int
346 tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1,
347 __unused int arg2, struct sysctl_req *req)
348 {
349 int error = 0;
350 /*
351 * TFO-key is expressed as a string in hex format
352 * (+1 to account for \0 char)
353 */
354 char keystring[TCP_FASTOPEN_KEYLEN * 2 + 1];
355 u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
356 int i;
357
358 /* -1, because newlen is len without the terminating \0 character */
359 if (req->newlen != (sizeof(keystring) - 1)) {
360 error = EINVAL;
361 goto exit;
362 }
363
364 /*
365 * sysctl_io_string copies keystring into the oldptr of the sysctl_req.
366 * Make sure everything is zero, to avoid putting garbage in there or
367 * leaking the stack.
368 */
369 bzero(keystring, sizeof(keystring));
370
371 error = sysctl_io_string(req, keystring, sizeof(keystring), 0, NULL);
372 if (error)
373 goto exit;
374
375 for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
376 /*
377 * We step through the keystring in 8-character chunks, i.e.
378 * 4 key bytes encoded as hex per iteration.
379 */
380 if (sscanf(&keystring[i * 8], "%8x", &key[i]) != 1) {
381 error = EINVAL;
382 goto exit;
383 }
384 }
385
386 aes_encrypt_key128((u_char *)key, &tfo_ctx);
387
388 exit:
389 return (error);
390 }
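/*
 * The handler above expects exactly 2 * TCP_FASTOPEN_KEYLEN hex digits
 * with no "0x" prefix, i.e. 32 characters for the 128-bit key that is
 * fed to aes_encrypt_key128().  An illustrative key install from user
 * space (the key value below is made up) would look like:
 *
 *	sysctl net.inet.tcp.fastopen_key=000102030405060708090a0b0c0d0e0f
 */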
391
392 int
393 get_inpcb_str_size(void)
394 {
395 return (sizeof(struct inpcb));
396 }
397
398 int
399 get_tcp_str_size(void)
400 {
401 return (sizeof(struct tcpcb));
402 }
403
404 int tcp_freeq(struct tcpcb *tp);
405
406 static int scale_to_powerof2(int size);
407
408 /*
409 * This helper routine returns one of the following scaled values of size:
410 * 1. The power of two rounded down from size, if size is not a power
411 * of two and rounding it up would overflow.
412 * OR
413 * 2. The power of two rounded up from size, if size is not a power
414 * of two and rounding it up does not overflow.
415 * OR
416 * 3. size itself, if it is already a power of two.
417 */
418 static int
419 scale_to_powerof2(int size) {
420 /* Handle special case of size = 0 */
421 int ret = size ? size : 1;
422
423 if (!powerof2(ret)) {
424 while (!powerof2(size)) {
425 /*
426 * Clear the least significant set bit
427 * until only the highest set bit of
428 * size remains, at which point size is
429 * the power of two rounded down.
430 */
431 size = size & (size -1);
432 }
433
434 /* Check for overflow when rounding up */
435 if (0 == (size << 1)) {
436 ret = size;
437 } else {
438 ret = size << 1;
439 }
440 }
441
442 return (ret);
443 }
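/*
 * Example behavior (a sketch of the cases described above, not
 * exercised in this file): scale_to_powerof2(512) returns 512 as-is,
 * scale_to_powerof2(600) rounds up to 1024, scale_to_powerof2(0) maps
 * to 1, and an input whose rounded-up value would overflow is instead
 * rounded down to its highest set bit.
 */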
444
445 static void
446 tcp_tfo_init(void)
447 {
448 u_char key[TCP_FASTOPEN_KEYLEN];
449
450 read_frandom(key, sizeof(key));
451 aes_encrypt_key128(key, &tfo_ctx);
452 }
453
454 /*
455 * Tcp initialization
456 */
457 void
458 tcp_init(struct protosw *pp, struct domain *dp)
459 {
460 #pragma unused(dp)
461 static int tcp_initialized = 0;
462 vm_size_t str_size;
463 struct inpcbinfo *pcbinfo;
464
465 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
466
467 if (tcp_initialized)
468 return;
469 tcp_initialized = 1;
470
471 tcp_ccgen = 1;
472 tcp_cleartaocache();
473
474 tcp_keepinit = TCPTV_KEEP_INIT;
475 tcp_keepidle = TCPTV_KEEP_IDLE;
476 tcp_keepintvl = TCPTV_KEEPINTVL;
477 tcp_keepcnt = TCPTV_KEEPCNT;
478 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
479 tcp_msl = TCPTV_MSL;
480
481 microuptime(&tcp_uptime);
482 read_frandom(&tcp_now, sizeof(tcp_now));
483
484 /* Start the tcp internal clock at a random value */
485 tcp_now = tcp_now & 0x3fffffff;
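	/*
	 * Presumably the mask keeps the random starting value below 2^30 so
	 * the 32-bit tcp_now tick counter has ample headroom before it wraps;
	 * this rationale is an assumption, as the mask is not documented here.
	 */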
486
487 /* expose initial uptime/now via sysctl so utcp can keep time in sync */
488 tcp_now_init = tcp_now;
489 tcp_microuptime_init = tcp_uptime.tv_sec * 1000 + tcp_uptime.tv_usec;
490 SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
491 SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);
492
493 tcp_tfo_init();
494
495 LIST_INIT(&tcb);
496 tcbinfo.ipi_listhead = &tcb;
497
498 pcbinfo = &tcbinfo;
499 /*
500 * allocate lock group attribute and group for tcp pcb mutexes
501 */
502 pcbinfo->ipi_lock_grp_attr = lck_grp_attr_alloc_init();
503 pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb",
504 pcbinfo->ipi_lock_grp_attr);
505
506 /*
507 * allocate the lock attribute for tcp pcb mutexes
508 */
509 pcbinfo->ipi_lock_attr = lck_attr_alloc_init();
510
511 if ((pcbinfo->ipi_lock = lck_rw_alloc_init(pcbinfo->ipi_lock_grp,
512 pcbinfo->ipi_lock_attr)) == NULL) {
513 panic("%s: unable to allocate PCB lock\n", __func__);
514 /* NOTREACHED */
515 }
516
517 if (tcp_tcbhashsize == 0) {
518 /* Set to default */
519 tcp_tcbhashsize = 512;
520 }
521
522 if (!powerof2(tcp_tcbhashsize)) {
523 int old_hash_size = tcp_tcbhashsize;
524 tcp_tcbhashsize = scale_to_powerof2(tcp_tcbhashsize);
525 /* Lower limit of 16 */
526 if (tcp_tcbhashsize < 16) {
527 tcp_tcbhashsize = 16;
528 }
529 printf("WARNING: TCB hash size not a power of 2, "
530 "scaled from %d to %d.\n",
531 old_hash_size,
532 tcp_tcbhashsize);
533 }
534
535 tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB,
536 &tcbinfo.ipi_hashmask);
537 tcbinfo.ipi_porthashbase = hashinit(tcp_tcbhashsize, M_PCB,
538 &tcbinfo.ipi_porthashmask);
539 str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t));
540 tcbinfo.ipi_zone = zinit(str_size, 120000*str_size, 8192, "tcpcb");
541 zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE);
542 zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE);
543
544 tcbinfo.ipi_gc = tcp_gc;
545 tcbinfo.ipi_timer = tcp_itimer;
546 in_pcbinfo_attach(&tcbinfo);
547
548 str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t));
549 sack_hole_zone = zinit(str_size, 120000*str_size, 8192,
550 "sack_hole zone");
551 zone_change(sack_hole_zone, Z_CALLERACCT, FALSE);
552 zone_change(sack_hole_zone, Z_EXPAND, TRUE);
553
554 str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t));
555 tcp_reass_zone = zinit(str_size, (nmbclusters >> 4) * str_size,
556 0, "tcp_reass_zone");
557 if (tcp_reass_zone == NULL) {
558 panic("%s: failed allocating tcp_reass_zone", __func__);
559 /* NOTREACHED */
560 }
561 zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE);
562 zone_change(tcp_reass_zone, Z_EXPAND, TRUE);
563
564 bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t));
565 tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0,
566 "tcp_bwmeas_zone");
567 if (tcp_bwmeas_zone == NULL) {
568 panic("%s: failed allocating tcp_bwmeas_zone", __func__);
569 /* NOTREACHED */
570 }
571 zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE);
572 zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE);
573
574 str_size = P2ROUNDUP(sizeof(struct tcp_ccstate), sizeof(u_int64_t));
575 tcp_cc_zone = zinit(str_size, 20000 * str_size, 0, "tcp_cc_zone");
576 zone_change(tcp_cc_zone, Z_CALLERACCT, FALSE);
577 zone_change(tcp_cc_zone, Z_EXPAND, TRUE);
578
579 str_size = P2ROUNDUP(sizeof(struct tcp_rxt_seg), sizeof(u_int64_t));
580 tcp_rxt_seg_zone = zinit(str_size, 10000 * str_size, 0,
581 "tcp_rxt_seg_zone");
582 zone_change(tcp_rxt_seg_zone, Z_CALLERACCT, FALSE);
583 zone_change(tcp_rxt_seg_zone, Z_EXPAND, TRUE);
584
585 #if INET6
586 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
587 #else /* INET6 */
588 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
589 #endif /* INET6 */
590 if (max_protohdr < TCP_MINPROTOHDR) {
591 _max_protohdr = TCP_MINPROTOHDR;
592 _max_protohdr = max_protohdr; /* round it up */
593 }
594 if (max_linkhdr + max_protohdr > MCLBYTES)
595 panic("tcp_init");
596 #undef TCP_MINPROTOHDR
597
598 /* Initialize time wait and timer lists */
599 TAILQ_INIT(&tcp_tw_tailq);
600
601 bzero(&tcp_timer_list, sizeof(tcp_timer_list));
602 LIST_INIT(&tcp_timer_list.lhead);
603 /*
604 * allocate lock group attribute, group and attribute for
605 * the tcp timer list
606 */
607 tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init();
608 tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist",
609 tcp_timer_list.mtx_grp_attr);
610 tcp_timer_list.mtx_attr = lck_attr_alloc_init();
611 if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp,
612 tcp_timer_list.mtx_attr)) == NULL) {
613 panic("failed to allocate memory for tcp_timer_list.mtx\n");
614 }
615 tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL);
616 if (tcp_timer_list.call == NULL) {
617 panic("failed to allocate call entry 1 in tcp_init\n");
618 }
619
620 /*
621 * allocate lock group attribute, group and attribute for
622 * tcp_uptime_lock
623 */
624 tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init();
625 tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime",
626 tcp_uptime_mtx_grp_attr);
627 tcp_uptime_mtx_attr = lck_attr_alloc_init();
628 tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp,
629 tcp_uptime_mtx_attr);
630
631 /* Initialize TCP LRO data structures */
632 tcp_lro_init();
633
634 /* Initialize TCP Cache */
635 tcp_cache_init();
636
637 /*
638 * If more than 60 MB of mbuf pool is available, increase the
639 * maximum allowed receive and send socket buffer size.
640 */
641 if (nmbclusters > 30720) {
642 #if CONFIG_EMBEDDED
643 tcp_autorcvbuf_max = 2 * 1024 * 1024;
644 tcp_autosndbuf_max = 2 * 1024 * 1024;
645 #else
646 tcp_autorcvbuf_max = 1024 * 1024;
647 tcp_autosndbuf_max = 1024 * 1024;
648 #endif /* CONFIG_EMBEDDED */
649 SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
650 SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
651
652 /*
653 * Receive buffer max for cellular interfaces supporting
654 * Carrier Aggregation is higher
655 */
656 tcp_autorcvbuf_max_ca = 2 * 1024 * 1024;
657 }
658 }
659
660 /*
661 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
662 * tcp_template used to store this data in mbufs, but we now recopy it out
663 * of the tcpcb each time to conserve mbufs.
664 */
665 void
666 tcp_fillheaders(struct tcpcb *tp, void *ip_ptr, void *tcp_ptr)
667 {
668 struct inpcb *inp = tp->t_inpcb;
669 struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
670
671 #if INET6
672 if ((inp->inp_vflag & INP_IPV6) != 0) {
673 struct ip6_hdr *ip6;
674
675 ip6 = (struct ip6_hdr *)ip_ptr;
676 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
677 (inp->inp_flow & IPV6_FLOWINFO_MASK);
678 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
679 (IPV6_VERSION & IPV6_VERSION_MASK);
680 ip6->ip6_plen = htons(sizeof(struct tcphdr));
681 ip6->ip6_nxt = IPPROTO_TCP;
682 ip6->ip6_hlim = 0;
683 ip6->ip6_src = inp->in6p_laddr;
684 ip6->ip6_dst = inp->in6p_faddr;
685 tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
686 htonl(sizeof (struct tcphdr) + IPPROTO_TCP));
687 } else
688 #endif
689 {
690 struct ip *ip = (struct ip *) ip_ptr;
691
692 ip->ip_vhl = IP_VHL_BORING;
693 ip->ip_tos = 0;
694 ip->ip_len = 0;
695 ip->ip_id = 0;
696 ip->ip_off = 0;
697 ip->ip_ttl = 0;
698 ip->ip_sum = 0;
699 ip->ip_p = IPPROTO_TCP;
700 ip->ip_src = inp->inp_laddr;
701 ip->ip_dst = inp->inp_faddr;
702 tcp_hdr->th_sum =
703 in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
704 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
705 }
706
707 tcp_hdr->th_sport = inp->inp_lport;
708 tcp_hdr->th_dport = inp->inp_fport;
709 tcp_hdr->th_seq = 0;
710 tcp_hdr->th_ack = 0;
711 tcp_hdr->th_x2 = 0;
712 tcp_hdr->th_off = 5;
713 tcp_hdr->th_flags = 0;
714 tcp_hdr->th_win = 0;
715 tcp_hdr->th_urp = 0;
716 }
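/*
 * Note that th_sum is left holding only the pseudo-header checksum
 * (source/destination addresses, protocol and TCP length); the caller
 * is expected to fold the TCP header and payload into it, or to hand
 * the remainder to checksum offload, before the segment goes out.
 */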
717
718 /*
719 * Create template to be used to send tcp packets on a connection.
720 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
721 * use for this function is in keepalives, which use tcp_respond.
722 */
723 struct tcptemp *
724 tcp_maketemplate(struct tcpcb *tp)
725 {
726 struct mbuf *m;
727 struct tcptemp *n;
728
729 m = m_get(M_DONTWAIT, MT_HEADER);
730 if (m == NULL)
731 return (0);
732 m->m_len = sizeof(struct tcptemp);
733 n = mtod(m, struct tcptemp *);
734
735 tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
736 return (n);
737 }
738
739 /*
740 * Send a single message to the TCP at address specified by
741 * the given TCP/IP header. If m == 0, then we make a copy
742 * of the tcpiphdr at ti and send directly to the addressed host.
743 * This is used to force keep alive messages out using the TCP
744 * template for a connection. If flags are given then we send
745 * a message back to the TCP which originated the segment ti,
746 * and discard the mbuf containing it and any other attached mbufs.
747 *
748 * In any case the ack and sequence number of the transmitted
749 * segment are as specified by the parameters.
750 *
751 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
752 */
753 void
754 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
755 tcp_seq ack, tcp_seq seq, int flags, struct tcp_respond_args *tra)
756 {
757 int tlen;
758 int win = 0;
759 struct route *ro = 0;
760 struct route sro;
761 struct ip *ip;
762 struct tcphdr *nth;
763 #if INET6
764 struct route_in6 *ro6 = 0;
765 struct route_in6 sro6;
766 struct ip6_hdr *ip6;
767 int isipv6;
768 #endif /* INET6 */
769 struct ifnet *outif;
770 int sotc = SO_TC_UNSPEC;
771
772 #if INET6
773 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
774 ip6 = ipgen;
775 #endif /* INET6 */
776 ip = ipgen;
777
778 if (tp) {
779 if (!(flags & TH_RST)) {
780 win = tcp_sbspace(tp);
781 if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale)
782 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
783 }
784 #if INET6
785 if (isipv6)
786 ro6 = &tp->t_inpcb->in6p_route;
787 else
788 #endif /* INET6 */
789 ro = &tp->t_inpcb->inp_route;
790 } else {
791 #if INET6
792 if (isipv6) {
793 ro6 = &sro6;
794 bzero(ro6, sizeof(*ro6));
795 } else
796 #endif /* INET6 */
797 {
798 ro = &sro;
799 bzero(ro, sizeof(*ro));
800 }
801 }
802 if (m == 0) {
803 m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */
804 if (m == NULL)
805 return;
806 tlen = 0;
807 m->m_data += max_linkhdr;
808 #if INET6
809 if (isipv6) {
810 VERIFY((MHLEN - max_linkhdr) >=
811 (sizeof (*ip6) + sizeof (*nth)));
812 bcopy((caddr_t)ip6, mtod(m, caddr_t),
813 sizeof(struct ip6_hdr));
814 ip6 = mtod(m, struct ip6_hdr *);
815 nth = (struct tcphdr *)(void *)(ip6 + 1);
816 } else
817 #endif /* INET6 */
818 {
819 VERIFY((MHLEN - max_linkhdr) >=
820 (sizeof (*ip) + sizeof (*nth)));
821 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
822 ip = mtod(m, struct ip *);
823 nth = (struct tcphdr *)(void *)(ip + 1);
824 }
825 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
826 #if MPTCP
827 if ((tp) && (tp->t_mpflags & TMPF_RESET))
828 flags = (TH_RST | TH_ACK);
829 else
830 #endif
831 flags = TH_ACK;
832 } else {
833 m_freem(m->m_next);
834 m->m_next = 0;
835 m->m_data = (caddr_t)ipgen;
836 /* m_len is set later */
837 tlen = 0;
838 #define xchg(a, b, type) { type t; t = a; a = b; b = t; }
839 #if INET6
840 if (isipv6) {
841 /* Expect 32-bit aligned IP on strict-align platforms */
842 IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
843 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
844 nth = (struct tcphdr *)(void *)(ip6 + 1);
845 } else
846 #endif /* INET6 */
847 {
848 /* Expect 32-bit aligned IP on strict-align platforms */
849 IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
850 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
851 nth = (struct tcphdr *)(void *)(ip + 1);
852 }
853 if (th != nth) {
854 /*
855 * this is usually a case when an extension header
856 * exists between the IPv6 header and the
857 * TCP header.
858 */
859 nth->th_sport = th->th_sport;
860 nth->th_dport = th->th_dport;
861 }
862 xchg(nth->th_dport, nth->th_sport, n_short);
863 #undef xchg
864 }
865 #if INET6
866 if (isipv6) {
867 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
868 tlen));
869 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
870 } else
871 #endif
872 {
873 tlen += sizeof (struct tcpiphdr);
874 ip->ip_len = tlen;
875 ip->ip_ttl = ip_defttl;
876 }
877 m->m_len = tlen;
878 m->m_pkthdr.len = tlen;
879 m->m_pkthdr.rcvif = 0;
880 #if CONFIG_MACF_NET
881 if (tp != NULL && tp->t_inpcb != NULL) {
882 /*
883 * Packet is associated with a socket, so allow the
884 * label of the response to reflect the socket label.
885 */
886 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
887 } else {
888 /*
889 * Packet is not associated with a socket, so possibly
890 * update the label in place.
891 */
892 mac_netinet_tcp_reply(m);
893 }
894 #endif
895
896 nth->th_seq = htonl(seq);
897 nth->th_ack = htonl(ack);
898 nth->th_x2 = 0;
899 nth->th_off = sizeof (struct tcphdr) >> 2;
900 nth->th_flags = flags;
901 if (tp)
902 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
903 else
904 nth->th_win = htons((u_short)win);
905 nth->th_urp = 0;
906 #if INET6
907 if (isipv6) {
908 nth->th_sum = 0;
909 nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
910 htonl((tlen - sizeof (struct ip6_hdr)) + IPPROTO_TCP));
911 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
912 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
913 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
914 ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
915 } else
916 #endif /* INET6 */
917 {
918 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
919 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
920 m->m_pkthdr.csum_flags = CSUM_TCP;
921 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
922 }
923 #if TCPDEBUG
924 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
925 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
926 #endif
927
928 #if NECP
929 necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0);
930 #endif /* NECP */
931
932 #if IPSEC
933 if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
934 ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
935 m_freem(m);
936 return;
937 }
938 #endif
939
940 if (tp != NULL) {
941 u_int32_t svc_flags = 0;
942 if (isipv6) {
943 svc_flags |= PKT_SCF_IPV6;
944 }
945 sotc = tp->t_inpcb->inp_socket->so_traffic_class;
946 set_packet_service_class(m, tp->t_inpcb->inp_socket,
947 sotc, svc_flags);
948
949 /* Embed flowhash and flow control flags */
950 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
951 m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
952 m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
953 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
954 }
955
956 #if INET6
957 if (isipv6) {
958 struct ip6_out_args ip6oa = { tra->ifscope, { 0 },
959 IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR, 0,
960 SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC};
961
962 if (tra->ifscope != IFSCOPE_NONE)
963 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
964 if (tra->nocell)
965 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
966 if (tra->noexpensive)
967 ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
968 if (tra->awdl_unrestricted)
969 ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
970 if (tra->intcoproc_allowed)
971 ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
972 ip6oa.ip6oa_sotc = sotc;
973 if (tp != NULL) {
974 if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED))
975 ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
976 ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
977 }
978 (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
979 NULL, &ip6oa);
980
981 if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
982 (outif = ro6->ro_rt->rt_ifp) !=
983 tp->t_inpcb->in6p_last_outifp) {
984 tp->t_inpcb->in6p_last_outifp = outif;
985 }
986
987 if (ro6 == &sro6)
988 ROUTE_RELEASE(ro6);
989 } else
990 #endif /* INET6 */
991 {
992 struct ip_out_args ipoa = { tra->ifscope, { 0 },
993 IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR, 0,
994 SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC };
995
996 if (tra->ifscope != IFSCOPE_NONE)
997 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
998 if (tra->nocell)
999 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
1000 if (tra->noexpensive)
1001 ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
1002 if (tra->awdl_unrestricted)
1003 ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
1004 ipoa.ipoa_sotc = sotc;
1005 if (tp != NULL) {
1006 if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED))
1007 ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1008 ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
1009 }
1010 if (ro != &sro) {
1011 /* Copy the cached route and take an extra reference */
1012 inp_route_copyout(tp->t_inpcb, &sro);
1013 }
1014 /*
1015 * For consistency, pass a local route copy.
1016 */
1017 (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
1018
1019 if (tp != NULL && sro.ro_rt != NULL &&
1020 (outif = sro.ro_rt->rt_ifp) !=
1021 tp->t_inpcb->inp_last_outifp) {
1022 tp->t_inpcb->inp_last_outifp = outif;
1023
1024 }
1025 if (ro != &sro) {
1026 /* Synchronize cached PCB route */
1027 inp_route_copyin(tp->t_inpcb, &sro);
1028 } else {
1029 ROUTE_RELEASE(&sro);
1030 }
1031 }
1032 }
1033
1034 /*
1035 * Create a new TCP control block, making an
1036 * empty reassembly queue and hooking it to the argument
1037 * protocol control block. The `inp' parameter must have
1038 * come from the zone allocator set up in tcp_init().
1039 */
1040 struct tcpcb *
1041 tcp_newtcpcb(struct inpcb *inp)
1042 {
1043 struct inp_tp *it;
1044 struct tcpcb *tp;
1045 struct socket *so = inp->inp_socket;
1046 #if INET6
1047 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1048 #endif /* INET6 */
1049
1050 calculate_tcp_clock();
1051
1052 if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
1053 it = (struct inp_tp *)(void *)inp;
1054 tp = &it->tcb;
1055 } else {
1056 tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
1057 }
1058
1059 bzero((char *) tp, sizeof(struct tcpcb));
1060 LIST_INIT(&tp->t_segq);
1061 tp->t_maxseg = tp->t_maxopd =
1062 #if INET6
1063 isipv6 ? tcp_v6mssdflt :
1064 #endif /* INET6 */
1065 tcp_mssdflt;
1066
1067 if (tcp_do_rfc1323)
1068 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
1069 if (tcp_do_sack)
1070 tp->t_flagsext |= TF_SACK_ENABLE;
1071
1072 TAILQ_INIT(&tp->snd_holes);
1073 SLIST_INIT(&tp->t_rxt_segments);
1074 SLIST_INIT(&tp->t_notify_ack);
1075 tp->t_inpcb = inp;
1076 /*
1077 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
1078 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
1079 * reasonable initial retransmit time.
1080 */
1081 tp->t_srtt = TCPTV_SRTTBASE;
1082 tp->t_rttvar =
1083 ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1084 tp->t_rttmin = tcp_TCPTV_MIN;
1085 tp->t_rxtcur = TCPTV_RTOBASE;
1086
1087 if (tcp_use_newreno)
1088 /* use newreno by default */
1089 tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
1090 else
1091 tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
1092
1093 tcp_cc_allocate_state(tp);
1094
1095 if (CC_ALGO(tp)->init != NULL)
1096 CC_ALGO(tp)->init(tp);
1097
1098 tp->snd_cwnd = TCP_CC_CWND_INIT_BYTES;
1099 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1100 tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1101 tp->t_rcvtime = tcp_now;
1102 tp->tentry.timer_start = tcp_now;
1103 tp->t_persist_timeout = tcp_max_persist_timeout;
1104 tp->t_persist_stop = 0;
1105 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1106 tp->t_rexmtthresh = tcprexmtthresh;
1107
1108 /* Enable bandwidth measurement on this connection */
1109 tp->t_flagsext |= TF_MEASURESNDBW;
1110 if (tp->t_bwmeas == NULL) {
1111 tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1112 if (tp->t_bwmeas == NULL)
1113 tp->t_flagsext &= ~TF_MEASURESNDBW;
1114 }
1115
1116 /* Clear time wait tailq entry */
1117 tp->t_twentry.tqe_next = NULL;
1118 tp->t_twentry.tqe_prev = NULL;
1119
1120 /*
1121 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
1122 * because the socket may be bound to an IPv6 wildcard address,
1123 * which may match an IPv4-mapped IPv6 address.
1124 */
1125 inp->inp_ip_ttl = ip_defttl;
1126 inp->inp_ppcb = (caddr_t)tp;
1127 return (tp); /* XXX */
1128 }
1129
1130 /*
1131 * Drop a TCP connection, reporting
1132 * the specified error. If connection is synchronized,
1133 * then send a RST to peer.
1134 */
1135 struct tcpcb *
1136 tcp_drop(struct tcpcb *tp, int errno)
1137 {
1138 struct socket *so = tp->t_inpcb->inp_socket;
1139 #if CONFIG_DTRACE
1140 struct inpcb *inp = tp->t_inpcb;
1141 #endif
1142
1143 if (TCPS_HAVERCVDSYN(tp->t_state)) {
1144 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1145 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1146 tp->t_state = TCPS_CLOSED;
1147 (void) tcp_output(tp);
1148 tcpstat.tcps_drops++;
1149 } else
1150 tcpstat.tcps_conndrops++;
1151 if (errno == ETIMEDOUT && tp->t_softerror)
1152 errno = tp->t_softerror;
1153 so->so_error = errno;
1154 return (tcp_close(tp));
1155 }
1156
1157 void
1158 tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
1159 {
1160 u_int32_t rtt = rt->rt_rmx.rmx_rtt;
1161 int isnetlocal = (tp->t_flags & TF_LOCAL);
1162
1163 if (rtt != 0) {
1164 /*
1165 * XXX the lock bit for RTT indicates that the value
1166 * is also a minimum value; this is subject to time.
1167 */
1168 if (rt->rt_rmx.rmx_locks & RTV_RTT)
1169 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
1170 else
1171 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN :
1172 TCPTV_REXMTMIN;
1173 tp->t_srtt =
1174 rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1175 tcpstat.tcps_usedrtt++;
1176 if (rt->rt_rmx.rmx_rttvar) {
1177 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1178 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1179 tcpstat.tcps_usedrttvar++;
1180 } else {
1181 /* default variation is +- 1 rtt */
1182 tp->t_rttvar =
1183 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1184 }
1185 TCPT_RANGESET(tp->t_rxtcur,
1186 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
1187 tp->t_rttmin, TCPTV_REXMTMAX,
1188 TCP_ADD_REXMTSLOP(tp));
1189 }
1190 }
1191
1192 static inline void
1193 tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
1194 struct ifnet_stats_per_flow *ifs)
1195 {
1196 struct inpcb *inp;
1197 struct socket *so;
1198 if (tp == NULL || ifs == NULL)
1199 return;
1200
1201 bzero(ifs, sizeof(*ifs));
1202 inp = tp->t_inpcb;
1203 so = inp->inp_socket;
1204
1205 ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? 0 : 1;
1206 ifs->local = (tp->t_flags & TF_LOCAL) ? 1 : 0;
1207 ifs->connreset = (so->so_error == ECONNRESET) ? 1 : 0;
1208 ifs->conntimeout = (so->so_error == ETIMEDOUT) ? 1 : 0;
1209 ifs->ecn_flags = tp->ecn_flags;
1210 ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
1211 ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1212 ifs->rxmitpkts = tp->t_stat.rxmitpkts;
1213 ifs->rcvoopack = tp->t_rcvoopack;
1214 ifs->pawsdrop = tp->t_pawsdrop;
1215 ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
1216 ifs->reordered_pkts = tp->t_reordered_pkts;
1217 ifs->dsack_sent = tp->t_dsack_sent;
1218 ifs->dsack_recvd = tp->t_dsack_recvd;
1219 ifs->srtt = tp->t_srtt;
1220 ifs->rttupdated = tp->t_rttupdated;
1221 ifs->rttvar = tp->t_rttvar;
1222 ifs->rttmin = get_base_rtt(tp);
1223 if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > 0) {
1224 ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
1225 } else {
1226 ifs->bw_sndbw_max = 0;
1227 }
1228 if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > 0) {
1229 ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
1230 } else {
1231 ifs->bw_rcvbw_max = 0;
1232 }
1233 ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
1234 ifs->txpackets = inp->inp_stat->txpackets;
1235 ifs->rxpackets = inp->inp_stat->rxpackets;
1236 }
1237
1238 static inline void
1239 tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
1240 struct if_tcp_ecn_perf_stat *stat)
1241 {
1242 u_int64_t curval, oldval;
1243 stat->total_txpkts += ifs->txpackets;
1244 stat->total_rxpkts += ifs->rxpackets;
1245 stat->total_rxmitpkts += ifs->rxmitpkts;
1246 stat->total_oopkts += ifs->rcvoopack;
1247 stat->total_reorderpkts += (ifs->reordered_pkts +
1248 ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);
1249
1250 /* Average RTT */
1251 curval = ifs->srtt >> TCP_RTT_SHIFT;
1252 if (curval > 0 && ifs->rttupdated >= 16) {
1253 if (stat->rtt_avg == 0) {
1254 stat->rtt_avg = curval;
1255 } else {
1256 oldval = stat->rtt_avg;
1257 stat->rtt_avg = ((oldval << 4) - oldval + curval) >> 4;
1258 }
1259 }
1260
1261 /* RTT variance */
1262 curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1263 if (curval > 0 && ifs->rttupdated >= 16) {
1264 if (stat->rtt_var == 0) {
1265 stat->rtt_var = curval;
1266 } else {
1267 oldval = stat->rtt_var;
1268 stat->rtt_var =
1269 ((oldval << 4) - oldval + curval) >> 4;
1270 }
1271 }
1272
1273 /* SACK episodes */
1274 stat->sack_episodes += ifs->sack_recovery_episodes;
1275 if (ifs->connreset)
1276 stat->rst_drop++;
1277 }
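/*
 * The RTT fields above are smoothed with a fixed-point EWMA with gain
 * 1/16: new = (15 * old + sample) / 16, written as
 * ((oldval << 4) - oldval + curval) >> 4.  Samples are only folded in
 * once the flow has accumulated at least 16 RTT updates
 * (ifs->rttupdated >= 16); tcp_flow_lim_stats() below applies the same
 * smoothing to its lim_rtt_average and lim_rtt_variance fields.
 */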
1278
1279 static inline void
1280 tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
1281 struct if_lim_perf_stat *stat)
1282 {
1283 u_int64_t curval, oldval;
1284
1285 stat->lim_total_txpkts += ifs->txpackets;
1286 stat->lim_total_rxpkts += ifs->rxpackets;
1287 stat->lim_total_retxpkts += ifs->rxmitpkts;
1288 stat->lim_total_oopkts += ifs->rcvoopack;
1289
1290 if (ifs->bw_sndbw_max > 0) {
1291 /* convert from bytes per ms to bits per second */
1292 ifs->bw_sndbw_max *= 8000;
1293 stat->lim_ul_max_bandwidth = max(stat->lim_ul_max_bandwidth,
1294 ifs->bw_sndbw_max);
1295 }
1296
1297 if (ifs->bw_rcvbw_max > 0) {
1298 /* convert from bytes per ms to bits per second */
1299 ifs->bw_rcvbw_max *= 8000;
1300 stat->lim_dl_max_bandwidth = max(stat->lim_dl_max_bandwidth,
1301 ifs->bw_rcvbw_max);
1302 }
1303
1304 /* Average RTT */
1305 curval = ifs->srtt >> TCP_RTT_SHIFT;
1306 if (curval > 0 && ifs->rttupdated >= 16) {
1307 if (stat->lim_rtt_average == 0) {
1308 stat->lim_rtt_average = curval;
1309 } else {
1310 oldval = stat->lim_rtt_average;
1311 stat->lim_rtt_average =
1312 ((oldval << 4) - oldval + curval) >> 4;
1313 }
1314 }
1315
1316 /* RTT variance */
1317 curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1318 if (curval > 0 && ifs->rttupdated >= 16) {
1319 if (stat->lim_rtt_variance == 0) {
1320 stat->lim_rtt_variance = curval;
1321 } else {
1322 oldval = stat->lim_rtt_variance;
1323 stat->lim_rtt_variance =
1324 ((oldval << 4) - oldval + curval) >> 4;
1325 }
1326 }
1327
1328 if (stat->lim_rtt_min == 0) {
1329 stat->lim_rtt_min = ifs->rttmin;
1330 } else {
1331 stat->lim_rtt_min = min(stat->lim_rtt_min, ifs->rttmin);
1332 }
1333
1334 /* connection timeouts */
1335 stat->lim_conn_attempts++;
1336 if (ifs->conntimeout)
1337 stat->lim_conn_timeouts++;
1338
1339 /* bytes sent using background delay-based algorithms */
1340 stat->lim_bk_txpkts += ifs->bk_txpackets;
1341
1342 }
1343
1344 /*
1345 * Close a TCP control block:
1346 * discard all space held by the tcp
1347 * discard internet protocol block
1348 * wake up any sleepers
1349 */
1350 struct tcpcb *
1351 tcp_close(struct tcpcb *tp)
1352 {
1353 struct inpcb *inp = tp->t_inpcb;
1354 struct socket *so = inp->inp_socket;
1355 #if INET6
1356 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1357 #endif /* INET6 */
1358 struct route *ro;
1359 struct rtentry *rt;
1360 int dosavessthresh;
1361 struct ifnet_stats_per_flow ifs;
1362
1363 /* tcp_close was called previously, bail */
1364 if (inp->inp_ppcb == NULL)
1365 return (NULL);
1366
1367 tcp_canceltimers(tp);
1368 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);
1369
1370 /*
1371 * If another thread for this tcp is currently in ip (indicated by
1372 * the TF_SENDINPROG flag), defer the cleanup until after it returns
1373 * back to tcp. This is done to serialize the close until after all
1374 * pending output is finished, in order to avoid having the PCB be
1375 * detached and the cached route cleaned, only for ip to cache the
1376 * route back into the PCB again. Note that we've cleared all the
1377 * timers at this point. Set TF_CLOSING to indicate to tcp_output()
1378 * that it should call us again once it returns from ip; at that
1379 * point both flags should be cleared and we can proceed further
1380 * with the cleanup.
1381 */
1382 if ((tp->t_flags & TF_CLOSING) ||
1383 inp->inp_sndinprog_cnt > 0) {
1384 tp->t_flags |= TF_CLOSING;
1385 return (NULL);
1386 }
1387
1388 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1389 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1390
1391 #if INET6
1392 ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
1393 #else
1394 ro = &inp->inp_route;
1395 #endif
1396 rt = ro->ro_rt;
1397 if (rt != NULL)
1398 RT_LOCK_SPIN(rt);
1399
1400 /*
1401 * If we got enough samples through the srtt filter,
1402 * save the rtt and rttvar in the routing entry.
1403 * 'Enough' is arbitrarily defined as 16 samples.
1404 * 16 samples is enough for the srtt filter to converge
1405 * to within 5% of the correct value; fewer samples and
1406 * we could save a very bogus rtt.
1407 *
1408 * Don't update the default route's characteristics and don't
1409 * update anything that the user "locked".
1410 */
1411 if (tp->t_rttupdated >= 16) {
1412 u_int32_t i = 0;
1413
1414 #if INET6
1415 if (isipv6) {
1416 struct sockaddr_in6 *sin6;
1417
1418 if (rt == NULL)
1419 goto no_valid_rt;
1420 sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt);
1421 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
1422 goto no_valid_rt;
1423 }
1424 else
1425 #endif /* INET6 */
1426 if (ROUTE_UNUSABLE(ro) ||
1427 SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
1428 DTRACE_TCP4(state__change, void, NULL,
1429 struct inpcb *, inp, struct tcpcb *, tp,
1430 int32_t, TCPS_CLOSED);
1431 tp->t_state = TCPS_CLOSED;
1432 goto no_valid_rt;
1433 }
1434
1435 RT_LOCK_ASSERT_HELD(rt);
1436 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1437 i = tp->t_srtt *
1438 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1439 if (rt->rt_rmx.rmx_rtt && i)
1440 /*
1441 * filter this update to half the old & half
1442 * the new values, converting scale.
1443 * See route.h and tcp_var.h for a
1444 * description of the scaling constants.
1445 */
1446 rt->rt_rmx.rmx_rtt =
1447 (rt->rt_rmx.rmx_rtt + i) / 2;
1448 else
1449 rt->rt_rmx.rmx_rtt = i;
1450 tcpstat.tcps_cachedrtt++;
1451 }
1452 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1453 i = tp->t_rttvar *
1454 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1455 if (rt->rt_rmx.rmx_rttvar && i)
1456 rt->rt_rmx.rmx_rttvar =
1457 (rt->rt_rmx.rmx_rttvar + i) / 2;
1458 else
1459 rt->rt_rmx.rmx_rttvar = i;
1460 tcpstat.tcps_cachedrttvar++;
1461 }
1462 /*
1463 * The old comment here said:
1464 * update the pipelimit (ssthresh) if it has been updated
1465 * already or if a pipesize was specified & the threshold
1466 * got below half the pipesize. I.e., wait for bad news
1467 * before we start updating, then update on both good
1468 * and bad news.
1469 *
1470 * But we want to save the ssthresh even if no pipesize is
1471 * specified explicitly in the route, because such
1472 * connections still have an implicit pipesize specified
1473 * by the global tcp_sendspace. In the absence of a reliable
1474 * way to calculate the pipesize, it will have to do.
1475 */
1476 i = tp->snd_ssthresh;
1477 if (rt->rt_rmx.rmx_sendpipe != 0)
1478 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
1479 else
1480 dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
1481 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1482 i != 0 && rt->rt_rmx.rmx_ssthresh != 0) ||
1483 dosavessthresh) {
1484 /*
1485 * convert the limit from user data bytes to
1486 * packets then to packet data bytes.
1487 */
1488 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
1489 if (i < 2)
1490 i = 2;
1491 i *= (u_int32_t)(tp->t_maxseg +
1492 #if INET6
1493 isipv6 ? sizeof (struct ip6_hdr) +
1494 sizeof (struct tcphdr) :
1495 #endif /* INET6 */
1496 sizeof (struct tcpiphdr));
1497 if (rt->rt_rmx.rmx_ssthresh)
1498 rt->rt_rmx.rmx_ssthresh =
1499 (rt->rt_rmx.rmx_ssthresh + i) / 2;
1500 else
1501 rt->rt_rmx.rmx_ssthresh = i;
1502 tcpstat.tcps_cachedssthresh++;
1503 }
1504 }
1505
1506 /*
1507 * Mark route for deletion if no information is cached.
1508 */
1509 if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) {
1510 if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1511 rt->rt_rmx.rmx_rtt == 0) {
1512 rt->rt_flags |= RTF_DELCLONE;
1513 }
1514 }
1515
1516 no_valid_rt:
1517 if (rt != NULL)
1518 RT_UNLOCK(rt);
1519
1520 /* free the reassembly queue, if any */
1521 (void) tcp_freeq(tp);
1522
1523 /* performance stats per interface */
1524 tcp_create_ifnet_stats_per_flow(tp, &ifs);
1525 tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);
1526
1527 tcp_free_sackholes(tp);
1528 tcp_notify_ack_free(tp);
1529
1530 inp_decr_sndbytes_allunsent(so, tp->snd_una);
1531
1532 if (tp->t_bwmeas != NULL) {
1533 tcp_bwmeas_free(tp);
1534 }
1535 tcp_rxtseg_clean(tp);
1536 /* Free the packet list */
1537 if (tp->t_pktlist_head != NULL)
1538 m_freem_list(tp->t_pktlist_head);
1539 TCP_PKTLIST_CLEAR(tp);
1540
1541 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER)
1542 inp->inp_saved_ppcb = (caddr_t) tp;
1543
1544 tp->t_state = TCPS_CLOSED;
1545
1546 /*
1547 * Issue a wakeup before detach so that we don't miss
1548 * a wakeup
1549 */
1550 sodisconnectwakeup(so);
1551
1552 /*
1553 * Clean up any LRO state
1554 */
1555 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
1556 tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
1557 inp->inp_lport, inp->inp_fport);
1558 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1559 }
1560
1561 /*
1562 * If this is a socket that does not want to wake up the device
1563 * for its traffic, the application might need to know that the
1564 * socket is closed, so send a notification.
1565 */
1566 if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
1567 inp->inp_state != INPCB_STATE_DEAD &&
1568 !(inp->inp_flags2 & INP2_TIMEWAIT))
1569 socket_post_kev_msg_closed(so);
1570
1571 if (CC_ALGO(tp)->cleanup != NULL) {
1572 CC_ALGO(tp)->cleanup(tp);
1573 }
1574
1575 if (tp->t_ccstate != NULL) {
1576 zfree(tcp_cc_zone, tp->t_ccstate);
1577 tp->t_ccstate = NULL;
1578 }
1579 tp->tcp_cc_index = TCP_CC_ALGO_NONE;
1580
1581 /* Can happen if we close the socket before receiving the third ACK */
1582 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
1583 OSDecrementAtomic(&tcp_tfo_halfcnt);
1584
1585 /* Panic if something has gone terribly wrong. */
1586 VERIFY(tcp_tfo_halfcnt >= 0);
1587
1588 tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
1589 }
1590
1591 #if INET6
1592 if (SOCK_CHECK_DOM(so, PF_INET6))
1593 in6_pcbdetach(inp);
1594 else
1595 #endif /* INET6 */
1596 in_pcbdetach(inp);
1597
1598 /*
1599 * Call soisdisconnected after detach because it might unlock the socket
1600 */
1601 soisdisconnected(so);
1602 tcpstat.tcps_closed++;
1603 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
1604 tcpstat.tcps_closed, 0, 0, 0, 0);
1605 return (NULL);
1606 }
1607
1608 int
1609 tcp_freeq(struct tcpcb *tp)
1610 {
1611 struct tseg_qent *q;
1612 int rv = 0;
1613
1614 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1615 LIST_REMOVE(q, tqe_q);
1616 m_freem(q->tqe_m);
1617 zfree(tcp_reass_zone, q);
1618 rv = 1;
1619 }
1620 tp->t_reassqlen = 0;
1621 return (rv);
1622 }
1623
1624
1625 /*
1626 * Walk the tcpcbs, if any exist, and flush each reassembly queue
1627 * when do_tcpdrain is enabled.
1628 * Also defunct any extended-background-idle sockets.
1629 * Try again next time if the pcbinfo lock is in use.
1630 */
1631 void
1632 tcp_drain(void)
1633 {
1634 struct inpcb *inp;
1635 struct tcpcb *tp;
1636
1637 if (!lck_rw_try_lock_exclusive(tcbinfo.ipi_lock))
1638 return;
1639
1640 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1641 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
1642 WNT_STOPUSING) {
1643 socket_lock(inp->inp_socket, 1);
1644 if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
1645 == WNT_STOPUSING) {
1646 /* lost a race, try the next one */
1647 socket_unlock(inp->inp_socket, 1);
1648 continue;
1649 }
1650 tp = intotcpcb(inp);
1651
1652 if (do_tcpdrain)
1653 tcp_freeq(tp);
1654
1655 so_drain_extended_bk_idle(inp->inp_socket);
1656
1657 socket_unlock(inp->inp_socket, 1);
1658 }
1659 }
1660 lck_rw_done(tcbinfo.ipi_lock);
1661
1662 }
1663
1664 /*
1665 * Notify a tcp user of an asynchronous error;
1666 * store error as soft error, but wake up user
1667 * (for now, won't do anything until can select for soft error).
1668 *
1669 * Do not wake up user since there currently is no mechanism for
1670 * reporting soft errors (yet - a kqueue filter may be added).
1671 */
1672 static void
1673 tcp_notify(struct inpcb *inp, int error)
1674 {
1675 struct tcpcb *tp;
1676
1677 if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD))
1678 return; /* pcb is gone already */
1679
1680 tp = (struct tcpcb *)inp->inp_ppcb;
1681
1682 VERIFY(tp != NULL);
1683 /*
1684 * Ignore some errors if we are hooked up.
1685 * If connection hasn't completed, has retransmitted several times,
1686 * and receives a second error, give up now. This is better
1687 * than waiting a long time to establish a connection that
1688 * can never complete.
1689 */
1690 if (tp->t_state == TCPS_ESTABLISHED &&
1691 (error == EHOSTUNREACH || error == ENETUNREACH ||
1692 error == EHOSTDOWN)) {
1693 if (inp->inp_route.ro_rt) {
1694 rtfree(inp->inp_route.ro_rt);
1695 inp->inp_route.ro_rt = (struct rtentry *)NULL;
1696 }
1697 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1698 tp->t_softerror)
1699 tcp_drop(tp, error);
1700 else
1701 tp->t_softerror = error;
1702 #if 0
1703 wakeup((caddr_t) &so->so_timeo);
1704 sorwakeup(so);
1705 sowwakeup(so);
1706 #endif
1707 }
1708
1709 struct bwmeas *
1710 tcp_bwmeas_alloc(struct tcpcb *tp)
1711 {
1712 struct bwmeas *elm;
1713 elm = zalloc(tcp_bwmeas_zone);
1714 if (elm == NULL)
1715 return (elm);
1716
1717 bzero(elm, bwmeas_elm_size);
1718 elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1719 elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1720 return (elm);
1721 }
1722
1723 void
1724 tcp_bwmeas_free(struct tcpcb *tp)
1725 {
1726 zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1727 tp->t_bwmeas = NULL;
1728 tp->t_flagsext &= ~(TF_MEASURESNDBW);
1729 }
1730
1731 int
1732 get_tcp_inp_list(struct inpcb **inp_list, int n, inp_gen_t gencnt)
1733 {
1734 struct tcpcb *tp;
1735 struct inpcb *inp;
1736 int i = 0;
1737
1738 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1739 if (inp->inp_gencnt <= gencnt &&
1740 inp->inp_state != INPCB_STATE_DEAD)
1741 inp_list[i++] = inp;
1742 if (i >= n)
1743 break;
1744 }
1745
1746 TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
1747 inp = tp->t_inpcb;
1748 if (inp->inp_gencnt <= gencnt &&
1749 inp->inp_state != INPCB_STATE_DEAD)
1750 inp_list[i++] = inp;
1751 if (i >= n)
1752 break;
1753 }
1754 return (i);
1755 }
1756
1757 /*
1758 * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1759 * The otcpcb data structure is passed to user space and must not change.
1760 */
1761 static void
1762 tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1763 {
1764 otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
1765 otp->t_dupacks = tp->t_dupacks;
1766 otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1767 otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1768 otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1769 otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1770 otp->t_inpcb =
1771 (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRPERM(tp->t_inpcb);
1772 otp->t_state = tp->t_state;
1773 otp->t_flags = tp->t_flags;
1774 otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1775 otp->snd_una = tp->snd_una;
1776 otp->snd_max = tp->snd_max;
1777 otp->snd_nxt = tp->snd_nxt;
1778 otp->snd_up = tp->snd_up;
1779 otp->snd_wl1 = tp->snd_wl1;
1780 otp->snd_wl2 = tp->snd_wl2;
1781 otp->iss = tp->iss;
1782 otp->irs = tp->irs;
1783 otp->rcv_nxt = tp->rcv_nxt;
1784 otp->rcv_adv = tp->rcv_adv;
1785 otp->rcv_wnd = tp->rcv_wnd;
1786 otp->rcv_up = tp->rcv_up;
1787 otp->snd_wnd = tp->snd_wnd;
1788 otp->snd_cwnd = tp->snd_cwnd;
1789 otp->snd_ssthresh = tp->snd_ssthresh;
1790 otp->t_maxopd = tp->t_maxopd;
1791 otp->t_rcvtime = tp->t_rcvtime;
1792 otp->t_starttime = tp->t_starttime;
1793 otp->t_rtttime = tp->t_rtttime;
1794 otp->t_rtseq = tp->t_rtseq;
1795 otp->t_rxtcur = tp->t_rxtcur;
1796 otp->t_maxseg = tp->t_maxseg;
1797 otp->t_srtt = tp->t_srtt;
1798 otp->t_rttvar = tp->t_rttvar;
1799 otp->t_rxtshift = tp->t_rxtshift;
1800 otp->t_rttmin = tp->t_rttmin;
1801 otp->t_rttupdated = tp->t_rttupdated;
1802 otp->max_sndwnd = tp->max_sndwnd;
1803 otp->t_softerror = tp->t_softerror;
1804 otp->t_oobflags = tp->t_oobflags;
1805 otp->t_iobc = tp->t_iobc;
1806 otp->snd_scale = tp->snd_scale;
1807 otp->rcv_scale = tp->rcv_scale;
1808 otp->request_r_scale = tp->request_r_scale;
1809 otp->requested_s_scale = tp->requested_s_scale;
1810 otp->ts_recent = tp->ts_recent;
1811 otp->ts_recent_age = tp->ts_recent_age;
1812 otp->last_ack_sent = tp->last_ack_sent;
1813 otp->cc_send = 0;
1814 otp->cc_recv = 0;
1815 otp->snd_recover = tp->snd_recover;
1816 otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1817 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1818 otp->t_badrxtwin = 0;
1819 }
1820
1821 static int
1822 tcp_pcblist SYSCTL_HANDLER_ARGS
1823 {
1824 #pragma unused(oidp, arg1, arg2)
1825 int error, i = 0, n;
1826 struct inpcb **inp_list;
1827 inp_gen_t gencnt;
1828 struct xinpgen xig;
1829
1830 /*
1831 * The process of preparing the TCB list is too time-consuming and
1832 * resource-intensive to repeat twice on every request.
1833 */
1834 lck_rw_lock_shared(tcbinfo.ipi_lock);
1835 if (req->oldptr == USER_ADDR_NULL) {
1836 n = tcbinfo.ipi_count;
1837 req->oldidx = 2 * (sizeof(xig))
1838 + (n + n/8) * sizeof(struct xtcpcb);
1839 lck_rw_done(tcbinfo.ipi_lock);
1840 return (0);
1841 }
1842
1843 if (req->newptr != USER_ADDR_NULL) {
1844 lck_rw_done(tcbinfo.ipi_lock);
1845 return (EPERM);
1846 }
1847
1848 /*
1849 * OK, now we're committed to doing something.
1850 */
1851 gencnt = tcbinfo.ipi_gencnt;
1852 n = tcbinfo.ipi_count;
1853
1854 bzero(&xig, sizeof(xig));
1855 xig.xig_len = sizeof(xig);
1856 xig.xig_count = n;
1857 xig.xig_gen = gencnt;
1858 xig.xig_sogen = so_gencnt;
1859 error = SYSCTL_OUT(req, &xig, sizeof(xig));
1860 if (error) {
1861 lck_rw_done(tcbinfo.ipi_lock);
1862 return (error);
1863 }
1864 /*
1865 * We are done if there is no pcb
1866 */
1867 if (n == 0) {
1868 lck_rw_done(tcbinfo.ipi_lock);
1869 return (0);
1870 }
1871
1872 inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK);
1873 if (inp_list == 0) {
1874 lck_rw_done(tcbinfo.ipi_lock);
1875 return (ENOMEM);
1876 }
1877
1878 n = get_tcp_inp_list(inp_list, n, gencnt);
1879
1880 error = 0;
1881 for (i = 0; i < n; i++) {
1882 struct xtcpcb xt;
1883 caddr_t inp_ppcb;
1884 struct inpcb *inp;
1885
1886 inp = inp_list[i];
1887
1888 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
1889 continue;
1890 socket_lock(inp->inp_socket, 1);
1891 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1892 socket_unlock(inp->inp_socket, 1);
1893 continue;
1894 }
1895 if (inp->inp_gencnt > gencnt) {
1896 socket_unlock(inp->inp_socket, 1);
1897 continue;
1898 }
1899
1900 bzero(&xt, sizeof(xt));
1901 xt.xt_len = sizeof(xt);
1902 /* XXX should avoid extra copy */
1903 inpcb_to_compat(inp, &xt.xt_inp);
1904 inp_ppcb = inp->inp_ppcb;
1905 if (inp_ppcb != NULL) {
1906 tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb,
1907 &xt.xt_tp);
1908 } else {
1909 bzero((char *) &xt.xt_tp, sizeof(xt.xt_tp));
1910 }
1911 if (inp->inp_socket)
1912 sotoxsocket(inp->inp_socket, &xt.xt_socket);
1913
1914 socket_unlock(inp->inp_socket, 1);
1915
1916 error = SYSCTL_OUT(req, &xt, sizeof(xt));
1917 }
1918 if (!error) {
1919 /*
1920 * Give the user an updated idea of our state.
1921 * If the generation differs from what we told
1922 * her before, she knows that something happened
1923 * while we were processing this request, and it
1924 * might be necessary to retry.
1925 */
1926 bzero(&xig, sizeof(xig));
1927 xig.xig_len = sizeof(xig);
1928 xig.xig_gen = tcbinfo.ipi_gencnt;
1929 xig.xig_sogen = so_gencnt;
1930 xig.xig_count = tcbinfo.ipi_count;
1931 error = SYSCTL_OUT(req, &xig, sizeof(xig));
1932 }
1933 FREE(inp_list, M_TEMP);
1934 lck_rw_done(tcbinfo.ipi_lock);
1935 return (error);
1936 }
1937
1938 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
1939 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1940 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
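
/*
 * The stream emitted by tcp_pcblist() above is length-prefixed: a leading
 * struct xinpgen, one struct xtcpcb per connection, then a trailing
 * struct xinpgen.  A minimal userspace sketch of walking it -- not part of
 * this file; it assumes only sysctlbyname(3) and the fact, visible above,
 * that every record starts with a u_int32_t length field (xig_len/xt_len):
 *
 *    #include <sys/types.h>
 *    #include <sys/sysctl.h>
 *    #include <stdio.h>
 *    #include <stdlib.h>
 *    #include <string.h>
 *
 *    int
 *    main(void)
 *    {
 *        size_t len = 0;
 *        char *buf, *p, *end;
 *        u_int32_t hdrlen, reclen;
 *        int n = 0;
 *
 *        // First call sizes the buffer (the req->oldptr == NULL path).
 *        if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) < 0)
 *            return (1);
 *        if ((buf = malloc(len)) == NULL)
 *            return (1);
 *        if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) < 0)
 *            return (1);
 *
 *        memcpy(&hdrlen, buf, sizeof (hdrlen));   // xig_len of the header
 *        p = buf + hdrlen;
 *        end = buf + len;
 *        while (p + sizeof (reclen) <= end) {
 *            memcpy(&reclen, p, sizeof (reclen)); // xt_len of this record
 *            if (reclen <= hdrlen)                // trailing xinpgen ends it
 *                break;
 *            n++;
 *            p += reclen;
 *        }
 *        printf("%d TCP pcbs\n", n);
 *        free(buf);
 *        return (0);
 *    }
 */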
1941
1942 #if !CONFIG_EMBEDDED
1943
1944 static void
1945 tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
1946 {
1947 otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
1948 otp->t_dupacks = tp->t_dupacks;
1949 otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1950 otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1951 otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1952 otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1953 otp->t_state = tp->t_state;
1954 otp->t_flags = tp->t_flags;
1955 otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1956 otp->snd_una = tp->snd_una;
1957 otp->snd_max = tp->snd_max;
1958 otp->snd_nxt = tp->snd_nxt;
1959 otp->snd_up = tp->snd_up;
1960 otp->snd_wl1 = tp->snd_wl1;
1961 otp->snd_wl2 = tp->snd_wl2;
1962 otp->iss = tp->iss;
1963 otp->irs = tp->irs;
1964 otp->rcv_nxt = tp->rcv_nxt;
1965 otp->rcv_adv = tp->rcv_adv;
1966 otp->rcv_wnd = tp->rcv_wnd;
1967 otp->rcv_up = tp->rcv_up;
1968 otp->snd_wnd = tp->snd_wnd;
1969 otp->snd_cwnd = tp->snd_cwnd;
1970 otp->snd_ssthresh = tp->snd_ssthresh;
1971 otp->t_maxopd = tp->t_maxopd;
1972 otp->t_rcvtime = tp->t_rcvtime;
1973 otp->t_starttime = tp->t_starttime;
1974 otp->t_rtttime = tp->t_rtttime;
1975 otp->t_rtseq = tp->t_rtseq;
1976 otp->t_rxtcur = tp->t_rxtcur;
1977 otp->t_maxseg = tp->t_maxseg;
1978 otp->t_srtt = tp->t_srtt;
1979 otp->t_rttvar = tp->t_rttvar;
1980 otp->t_rxtshift = tp->t_rxtshift;
1981 otp->t_rttmin = tp->t_rttmin;
1982 otp->t_rttupdated = tp->t_rttupdated;
1983 otp->max_sndwnd = tp->max_sndwnd;
1984 otp->t_softerror = tp->t_softerror;
1985 otp->t_oobflags = tp->t_oobflags;
1986 otp->t_iobc = tp->t_iobc;
1987 otp->snd_scale = tp->snd_scale;
1988 otp->rcv_scale = tp->rcv_scale;
1989 otp->request_r_scale = tp->request_r_scale;
1990 otp->requested_s_scale = tp->requested_s_scale;
1991 otp->ts_recent = tp->ts_recent;
1992 otp->ts_recent_age = tp->ts_recent_age;
1993 otp->last_ack_sent = tp->last_ack_sent;
1994 otp->cc_send = 0;
1995 otp->cc_recv = 0;
1996 otp->snd_recover = tp->snd_recover;
1997 otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1998 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1999 otp->t_badrxtwin = 0;
2000 }
2001
2002
2003 static int
2004 tcp_pcblist64 SYSCTL_HANDLER_ARGS
2005 {
2006 #pragma unused(oidp, arg1, arg2)
2007 int error, i = 0, n;
2008 struct inpcb **inp_list;
2009 inp_gen_t gencnt;
2010 struct xinpgen xig;
2011
2012 /*
2013 * The process of preparing the TCB list is too time-consuming and
2014 * resource-intensive to repeat twice on every request.
2015 */
2016 lck_rw_lock_shared(tcbinfo.ipi_lock);
2017 if (req->oldptr == USER_ADDR_NULL) {
2018 n = tcbinfo.ipi_count;
2019 req->oldidx = 2 * (sizeof(xig))
2020 + (n + n/8) * sizeof(struct xtcpcb64);
2021 lck_rw_done(tcbinfo.ipi_lock);
2022 return (0);
2023 }
2024
2025 if (req->newptr != USER_ADDR_NULL) {
2026 lck_rw_done(tcbinfo.ipi_lock);
2027 return (EPERM);
2028 }
2029
2030 /*
2031 * OK, now we're committed to doing something.
2032 */
2033 gencnt = tcbinfo.ipi_gencnt;
2034 n = tcbinfo.ipi_count;
2035
2036 bzero(&xig, sizeof(xig));
2037 xig.xig_len = sizeof(xig);
2038 xig.xig_count = n;
2039 xig.xig_gen = gencnt;
2040 xig.xig_sogen = so_gencnt;
2041 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2042 if (error) {
2043 lck_rw_done(tcbinfo.ipi_lock);
2044 return (error);
2045 }
2046 /*
2047 * We are done if there is no pcb
2048 */
2049 if (n == 0) {
2050 lck_rw_done(tcbinfo.ipi_lock);
2051 return (0);
2052 }
2053
2054 inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK);
2055 if (inp_list == 0) {
2056 lck_rw_done(tcbinfo.ipi_lock);
2057 return (ENOMEM);
2058 }
2059
2060 n = get_tcp_inp_list(inp_list, n, gencnt);
2061
2062 error = 0;
2063 for (i = 0; i < n; i++) {
2064 struct xtcpcb64 xt;
2065 struct inpcb *inp;
2066
2067 inp = inp_list[i];
2068
2069 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
2070 continue;
2071 socket_lock(inp->inp_socket, 1);
2072 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2073 socket_unlock(inp->inp_socket, 1);
2074 continue;
2075 }
2076 if (inp->inp_gencnt > gencnt) {
2077 socket_unlock(inp->inp_socket, 1);
2078 continue;
2079 }
2080
2081 bzero(&xt, sizeof(xt));
2082 xt.xt_len = sizeof(xt);
2083 inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
2084 xt.xt_inpcb.inp_ppcb =
2085 (uint64_t)VM_KERNEL_ADDRPERM(inp->inp_ppcb);
2086 if (inp->inp_ppcb != NULL)
2087 tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb,
2088 &xt);
2089 if (inp->inp_socket)
2090 sotoxsocket64(inp->inp_socket,
2091 &xt.xt_inpcb.xi_socket);
2092
2093 socket_unlock(inp->inp_socket, 1);
2094
2095 error = SYSCTL_OUT(req, &xt, sizeof(xt));
2096 }
2097 if (!error) {
2098 /*
2099 * Give the user an updated idea of our state.
2100 * If the generation differs from what we told
2101 * her before, she knows that something happened
2102 * while we were processing this request, and it
2103 * might be necessary to retry.
2104 */
2105 bzero(&xig, sizeof(xig));
2106 xig.xig_len = sizeof(xig);
2107 xig.xig_gen = tcbinfo.ipi_gencnt;
2108 xig.xig_sogen = so_gencnt;
2109 xig.xig_count = tcbinfo.ipi_count;
2110 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2111 }
2112 FREE(inp_list, M_TEMP);
2113 lck_rw_done(tcbinfo.ipi_lock);
2114 return (error);
2115 }
2116
2117 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
2118 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2119 tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
2120
2121 #endif /* !CONFIG_EMBEDDED */
2122
2123 static int
2124 tcp_pcblist_n SYSCTL_HANDLER_ARGS
2125 {
2126 #pragma unused(oidp, arg1, arg2)
2127 int error = 0;
2128
2129 error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
2130
2131 return (error);
2132 }
2133
2134
2135 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
2136 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2137 tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
2138
2139
2140 __private_extern__ void
2141 tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags,
2142 bitstr_t *bitfield)
2143 {
2144 inpcb_get_ports_used(ifindex, protocol, flags, bitfield,
2145 &tcbinfo);
2146 }
2147
2148 __private_extern__ uint32_t
2149 tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
2150 {
2151 return (inpcb_count_opportunistic(ifindex, &tcbinfo, flags));
2152 }
2153
2154 __private_extern__ uint32_t
2155 tcp_find_anypcb_byaddr(struct ifaddr *ifa)
2156 {
2157 return (inpcb_find_anypcb_byaddr(ifa, &tcbinfo));
2158 }
2159
2160 static void
2161 tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
2162 {
2163 struct rtentry *rt = NULL;
2164 u_short ifscope = IFSCOPE_NONE;
2165 int mtu;
2166 struct sockaddr_in icmpsrc = {
2167 sizeof (struct sockaddr_in),
2168 AF_INET, 0, { 0 },
2169 { 0, 0, 0, 0, 0, 0, 0, 0 } };
2170 struct icmp *icp = NULL;
2171
2172 icp = (struct icmp *)(void *)
2173 ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2174
2175 icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
2176
2177 /*
2178 * MTU discovery:
2179 * If we got a needfrag and there is a host route to the
2180 * original destination, and the MTU is not locked, then
2181 * set the MTU in the route to the suggested new value
2182 * (if given) and then notify as usual. The ULPs will
2183 * notice that the MTU has changed and adapt accordingly.
2184 * If no new MTU was suggested, then we guess a new one
2185 * less than the current value. If the new MTU is
2186 * unreasonably small (defined by sysctl tcp_minmss), then
2187 * we reset the MTU to the interface value and enable the
2188 * lock bit, indicating that we are no longer doing MTU
2189 * discovery.
2190 */
2191 if (ROUTE_UNUSABLE(&(inp->inp_route)) == false)
2192 rt = inp->inp_route.ro_rt;
2193
2194 /*
2195 * icmp6_mtudisc_update scopes the routing lookup
2196 * to the incoming interface (delivered from the mbuf
2197 * packet header).
2198 * That is mostly OK, but for asymmetric networks
2199 * it may be an issue.
2200 * Frag Needed or Packet Too Big really communicates the
2201 * MTU for the outgoing data path.
2202 * Take the interface scope from the cached route or
2203 * the last outgoing interface of the inp.
2204 */
2205 if (rt != NULL)
2206 ifscope = (rt->rt_ifp != NULL) ?
2207 rt->rt_ifp->if_index : IFSCOPE_NONE;
2208 else
2209 ifscope = (inp->inp_last_outifp != NULL) ?
2210 inp->inp_last_outifp->if_index : IFSCOPE_NONE;
2211
2212 if ((rt == NULL) ||
2213 !(rt->rt_flags & RTF_HOST) ||
2214 (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) {
2215 rt = rtalloc1_scoped((struct sockaddr *)&icmpsrc, 0,
2216 RTF_CLONING | RTF_PRCLONING, ifscope);
2217 } else if (rt) {
2218 RT_LOCK(rt);
2219 rtref(rt);
2220 RT_UNLOCK(rt);
2221 }
2222
2223 if (rt != NULL) {
2224 RT_LOCK(rt);
2225 if ((rt->rt_flags & RTF_HOST) &&
2226 !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
2227 mtu = ntohs(icp->icmp_nextmtu);
2228 /*
2229 * XXX Stock BSD has changed the following
2230 * to compare with icp->icmp_ip.ip_len
2231 * to converge faster when sent packet
2232 * < route's MTU. We may want to adopt
2233 * that change.
2234 */
2235 if (mtu == 0)
2236 mtu = ip_next_mtu(rt->rt_rmx.
2237 rmx_mtu, 1);
2238 #if DEBUG_MTUDISC
2239 printf("MTU for %s reduced to %d\n",
2240 inet_ntop(AF_INET,
2241 &icmpsrc.sin_addr, ipv4str,
2242 sizeof (ipv4str)), mtu);
2243 #endif
2244 if (mtu < max(296, (tcp_minmss +
2245 sizeof (struct tcpiphdr)))) {
2246 rt->rt_rmx.rmx_locks |= RTV_MTU;
2247 } else if (rt->rt_rmx.rmx_mtu > mtu) {
2248 rt->rt_rmx.rmx_mtu = mtu;
2249 }
2250 }
2251 RT_UNLOCK(rt);
2252 rtfree(rt);
2253 }
2254 }
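
/*
 * The clamping rule applied above, restated compactly (a sketch only;
 * "icmp_next", "route_mtu" and "floor_mtu" are shorthand for
 * icp->icmp_nextmtu, rt->rt_rmx.rmx_mtu and
 * max(296, tcp_minmss + sizeof (struct tcpiphdr))):
 *
 *    if (icmp_next == 0)
 *        icmp_next = ip_next_mtu(route_mtu, 1);  // guess a smaller MTU
 *    if (icmp_next < floor_mtu)
 *        lock RTV_MTU and keep route_mtu;        // PMTU discovery stops
 *    else if (route_mtu > icmp_next)
 *        route_mtu = icmp_next;                  // adopt the new value
 */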
2255
2256 void
2257 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip, __unused struct ifnet *ifp)
2258 {
2259 tcp_seq icmp_tcp_seq;
2260 struct ip *ip = vip;
2261 struct in_addr faddr;
2262 struct inpcb *inp;
2263 struct tcpcb *tp;
2264 struct tcphdr *th;
2265 struct icmp *icp;
2266 void (*notify)(struct inpcb *, int) = tcp_notify;
2267
2268 faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr;
2269 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
2270 return;
2271
2272 if ((unsigned)cmd >= PRC_NCMDS)
2273 return;
2274
2275 /* Source quench is deprecated */
2276 if (cmd == PRC_QUENCH)
2277 return;
2278
2279 if (cmd == PRC_MSGSIZE)
2280 notify = tcp_mtudisc;
2281 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2282 cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
2283 cmd == PRC_TIMXCEED_INTRANS) && ip)
2284 notify = tcp_drop_syn_sent;
2285 /*
2286 * Hostdead is ugly because it goes linearly through all PCBs.
2287 * XXX: We never get this from ICMP, otherwise it would make an
2288 * excellent DoS attack on machines with many connections.
2289 */
2290 else if (cmd == PRC_HOSTDEAD)
2291 ip = NULL;
2292 else if (inetctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd))
2293 return;
2294
2295
2296 if (ip == NULL) {
2297 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
2298 return;
2299 }
2300
2301 icp = (struct icmp *)(void *)
2302 ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2303 th = (struct tcphdr *)(void *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2));
2304 icmp_tcp_seq = ntohl(th->th_seq);
2305
2306 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
2307 ip->ip_src, th->th_sport, 0, NULL);
2308
2309 if (inp == NULL ||
2310 inp->inp_socket == NULL) {
2311 return;
2312 }
2313
2314 socket_lock(inp->inp_socket, 1);
2315 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2316 WNT_STOPUSING) {
2317 socket_unlock(inp->inp_socket, 1);
2318 return;
2319 }
2320
2321 if (PRC_IS_REDIRECT(cmd)) {
2322 /* signal EHOSTDOWN, as it flushes the cached route */
2323 (*notify)(inp, EHOSTDOWN);
2324 } else {
2325 tp = intotcpcb(inp);
2326 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2327 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2328 if (cmd == PRC_MSGSIZE)
2329 tcp_handle_msgsize(ip, inp);
2330
2331 (*notify)(inp, inetctlerrmap[cmd]);
2332 }
2333 }
2334 socket_unlock(inp->inp_socket, 1);
2335 }
2336
2337 #if INET6
2338 void
2339 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
2340 {
2341 tcp_seq icmp_tcp_seq;
2342 struct in6_addr *dst;
2343 struct tcphdr *th;
2344 void (*notify)(struct inpcb *, int) = tcp_notify;
2345 struct ip6_hdr *ip6;
2346 struct mbuf *m;
2347 struct inpcb *inp;
2348 struct tcpcb *tp;
2349 struct icmp6_hdr *icmp6;
2350 struct ip6ctlparam *ip6cp = NULL;
2351 const struct sockaddr_in6 *sa6_src = NULL;
2352 unsigned int mtu;
2353 unsigned int off;
2354
2355 if (sa->sa_family != AF_INET6 ||
2356 sa->sa_len != sizeof(struct sockaddr_in6))
2357 return;
2358
2359 /* Source quench is deprecated */
2360 if (cmd == PRC_QUENCH)
2361 return;
2362
2363 if ((unsigned)cmd >= PRC_NCMDS)
2364 return;
2365
2366 /* if the parameter is from icmp6, decode it. */
2367 if (d != NULL) {
2368 ip6cp = (struct ip6ctlparam *)d;
2369 icmp6 = ip6cp->ip6c_icmp6;
2370 m = ip6cp->ip6c_m;
2371 ip6 = ip6cp->ip6c_ip6;
2372 off = ip6cp->ip6c_off;
2373 sa6_src = ip6cp->ip6c_src;
2374 dst = ip6cp->ip6c_finaldst;
2375 } else {
2376 m = NULL;
2377 ip6 = NULL;
2378 off = 0; /* fool gcc */
2379 sa6_src = &sa6_any;
2380 dst = NULL;
2381 }
2382
2383 if (cmd == PRC_MSGSIZE)
2384 notify = tcp_mtudisc;
2385 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2386 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
2387 ip6 != NULL)
2388 notify = tcp_drop_syn_sent;
2389 /*
2390 * Hostdead is ugly because it goes linearly through all PCBs.
2391 * XXX: We never get this from ICMP, otherwise it would make an
2392 * excellent DoS attack on machines with many connections.
2393 */
2394 else if (cmd == PRC_HOSTDEAD)
2395 ip6 = NULL;
2396 else if (inet6ctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd))
2397 return;
2398
2399
2400 if (ip6 == NULL) {
2401 in6_pcbnotify(&tcbinfo, sa, 0, (struct sockaddr *)(size_t)sa6_src,
2402 0, cmd, NULL, notify);
2403 return;
2404 }
2405
2406 if (m == NULL ||
2407 (m->m_pkthdr.len < (int32_t) (off + offsetof(struct tcphdr, th_seq))))
2408 return;
2409
2410 th = (struct tcphdr *)(void *)mtodo(m, off);
2411 icmp_tcp_seq = ntohl(th->th_seq);
2412
2413 if (cmd == PRC_MSGSIZE) {
2414 mtu = ntohl(icmp6->icmp6_mtu);
2415 /*
2416 * If no alternative MTU was proposed, or the proposed
2417 * MTU was too small, set to the min.
2418 */
2419 if (mtu < IPV6_MMTU)
2420 mtu = IPV6_MMTU - 8;
2421 }
2422
2423 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, th->th_dport,
2424 &ip6->ip6_src, th->th_sport, 0, NULL);
2425
2426 if (inp == NULL ||
2427 inp->inp_socket == NULL) {
2428 return;
2429 }
2430
2431 socket_lock(inp->inp_socket, 1);
2432 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2433 WNT_STOPUSING) {
2434 socket_unlock(inp->inp_socket, 1);
2435 return;
2436 }
2437
2438 if (PRC_IS_REDIRECT(cmd)) {
2439 /* signal EHOSTDOWN, as it flushes the cached route */
2440 (*notify)(inp, EHOSTDOWN);
2441 } else {
2442 tp = intotcpcb(inp);
2443 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2444 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2445 if (cmd == PRC_MSGSIZE) {
2446 /*
2447 * Only process the offered MTU if it
2448 * is smaller than the current one.
2449 */
2450 if (mtu < tp->t_maxseg +
2451 (sizeof (*th) + sizeof (*ip6)))
2452 (*notify)(inp, inetctlerrmap[cmd]);
2453 } else
2454 (*notify)(inp, inetctlerrmap[cmd]);
2455 }
2456 }
2457 socket_unlock(inp->inp_socket, 1);
2458 }
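
/*
 * For example (illustrative numbers): with tp->t_maxseg == 1440 the
 * acceptance threshold above is 1440 + sizeof (*th) + sizeof (*ip6) =
 * 1440 + 20 + 40 = 1500, so a Packet Too Big reporting an MTU of 1280
 * is passed to tcp_mtudisc(), while one reporting 9000 is ignored.
 * Reported values below IPV6_MMTU (1280) are first clamped to 1272.
 */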
2459 #endif /* INET6 */
2460
2461
2462 /*
2463 * Following is where TCP initial sequence number generation occurs.
2464 *
2465 * There are two places where we must use initial sequence numbers:
2466 * 1. In SYN-ACK packets.
2467 * 2. In SYN packets.
2468 *
2469 * The ISNs in SYN-ACK packets have no monotonicity requirement,
2470 * and should be as unpredictable as possible to avoid the possibility
2471 * of spoofing and/or connection hijacking. To satisfy this
2472 * requirement, SYN-ACK ISNs are generated via the arc4random()
2473 * function. If exact RFC 1948 compliance is requested via sysctl,
2474 * these ISNs will be generated just like those in SYN packets.
2475 *
2476 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
2477 * depends on this property. In addition, these ISNs should be
2478 * unguessable so as to prevent connection hijacking. To satisfy
2479 * the requirements of this situation, the algorithm outlined in
2480 * RFC 1948 is used to generate sequence numbers.
2481 *
2482 * For more information on the theory of operation, please see
2483 * RFC 1948.
2484 *
2485 * Implementation details:
2486 *
2487 * Time is based off the system timer, and is corrected so that it
2488 * increases by one megabyte per second. This allows for proper
2489 * recycling on high speed LANs while still leaving over an hour
2490 * before rollover.
2491 *
2492 * Two sysctls control the generation of ISNs:
2493 *
2494 * net.inet.tcp.isn_reseed_interval controls the number of seconds
2495 * between seeding of isn_secret. This is normally set to zero,
2496 * as reseeding should not be necessary.
2497 *
2498 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
2499 * strictly. When strict compliance is requested, reseeding is
2500 * disabled and SYN-ACKs will be generated in the same manner as
2501 * SYNs. Strict mode is disabled by default.
2502 *
2503 */
2504
2505 #define ISN_BYTES_PER_SECOND 1048576
2506
2507 tcp_seq
2508 tcp_new_isn(struct tcpcb *tp)
2509 {
2510 u_int32_t md5_buffer[4];
2511 tcp_seq new_isn;
2512 struct timeval timenow;
2513 u_char isn_secret[32];
2514 int isn_last_reseed = 0;
2515 MD5_CTX isn_ctx;
2516
2517 /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
2518 if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) &&
2519 tcp_strict_rfc1948 == 0)
2520 #ifdef __APPLE__
2521 return (RandomULong());
2522 #else
2523 return (arc4random());
2524 #endif
2525 getmicrotime(&timenow);
2526
2527 /* Seed if this is the first use, reseed if requested. */
2528 if ((isn_last_reseed == 0) ||
2529 ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
2530 (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
2531 < (u_int)timenow.tv_sec))) {
2532 #ifdef __APPLE__
2533 read_frandom(&isn_secret, sizeof(isn_secret));
2534 #else
2535 read_random_unlimited(&isn_secret, sizeof(isn_secret));
2536 #endif
2537 isn_last_reseed = timenow.tv_sec;
2538 }
2539
2540 /* Compute the md5 hash and return the ISN. */
2541 MD5Init(&isn_ctx);
2542 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
2543 sizeof(u_short));
2544 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
2545 sizeof(u_short));
2546 #if INET6
2547 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
2548 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
2549 sizeof(struct in6_addr));
2550 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
2551 sizeof(struct in6_addr));
2552 } else
2553 #endif
2554 {
2555 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
2556 sizeof(struct in_addr));
2557 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
2558 sizeof(struct in_addr));
2559 }
2560 MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
2561 MD5Final((u_char *) &md5_buffer, &isn_ctx);
2562 new_isn = (tcp_seq) md5_buffer[0];
2563 new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
2564 return (new_isn);
2565 }
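
/*
 * In other words, the value computed above is (a sketch, using the
 * identifiers from tcp_new_isn(); first32() denotes the first 32-bit
 * word of the digest):
 *
 *    new_isn = first32(MD5(fport || lport || faddr || laddr || isn_secret))
 *              + timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
 *
 * The hash term makes the sequence unguessable (RFC 1948); the time term
 * provides the monotonic advance that TIME_WAIT recycling depends on.
 */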
2566
2567
2568 /*
2569 * When a specific ICMP unreachable message is received and the
2570 * connection state is SYN-SENT, drop the connection. This behavior
2571 * is controlled by the icmp_may_rst sysctl.
2572 */
2573 void
2574 tcp_drop_syn_sent(struct inpcb *inp, int errno)
2575 {
2576 struct tcpcb *tp = intotcpcb(inp);
2577
2578 if (tp && tp->t_state == TCPS_SYN_SENT)
2579 tcp_drop(tp, errno);
2580 }
2581
2582 /*
2583 * When `need fragmentation' ICMP is received, update our idea of the MSS
2584 * based on the new value in the route. Also nudge TCP to send something,
2585 * since we know the packet we just sent was dropped.
2586 * This duplicates some code in the tcp_mss() function in tcp_input.c.
2587 */
2588 void
2589 tcp_mtudisc(
2590 struct inpcb *inp,
2591 __unused int errno
2592 )
2593 {
2594 struct tcpcb *tp = intotcpcb(inp);
2595 struct rtentry *rt;
2596 struct rmxp_tao *taop;
2597 struct socket *so = inp->inp_socket;
2598 int offered;
2599 int mss;
2600 u_int32_t mtu;
2601 u_int32_t protoHdrOverhead = sizeof (struct tcpiphdr);
2602 #if INET6
2603 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
2604
2605 if (isipv6)
2606 protoHdrOverhead = sizeof(struct ip6_hdr) +
2607 sizeof(struct tcphdr);
2608 #endif /* INET6 */
2609
2610 if (tp) {
2611 #if INET6
2612 if (isipv6)
2613 rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2614 else
2615 #endif /* INET6 */
2616 rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2617 if (!rt || !rt->rt_rmx.rmx_mtu) {
2618 tp->t_maxopd = tp->t_maxseg =
2619 #if INET6
2620 isipv6 ? tcp_v6mssdflt :
2621 #endif /* INET6 */
2622 tcp_mssdflt;
2623
2624 /* Route locked during lookup above */
2625 if (rt != NULL)
2626 RT_UNLOCK(rt);
2627 return;
2628 }
2629 taop = rmx_taop(rt->rt_rmx);
2630 offered = taop->tao_mssopt;
2631 mtu = rt->rt_rmx.rmx_mtu;
2632
2633 /* Route locked during lookup above */
2634 RT_UNLOCK(rt);
2635
2636 #if NECP
2637 // Adjust MTU if necessary.
2638 mtu = necp_socket_get_effective_mtu(inp, mtu);
2639 #endif /* NECP */
2640 mss = mtu - protoHdrOverhead;
2641
2642 if (offered)
2643 mss = min(mss, offered);
2644 /*
2645 * XXX - The above conditional probably violates the TCP
2646 * spec. The problem is that, since we don't know the
2647 * other end's MSS, we are supposed to use a conservative
2648 * default. But, if we do that, then MTU discovery will
2649 * never actually take place, because the conservative
2650 * default is much less than the MTUs typically seen
2651 * on the Internet today. For the moment, we'll sweep
2652 * this under the carpet.
2653 *
2654 * The conservative default might not actually be a problem
2655 * if the only case this occurs is when sending an initial
2656 * SYN with options and data to a host we've never talked
2657 * to before. Then, they will reply with an MSS value which
2658 * will get recorded and the new parameters should get
2659 * recomputed. For Further Study.
2660 */
2661 if (tp->t_maxopd <= mss)
2662 return;
2663 tp->t_maxopd = mss;
2664
2665 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2666 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2667 mss -= TCPOLEN_TSTAMP_APPA;
2668
2669 #if MPTCP
2670 mss -= mptcp_adj_mss(tp, TRUE);
2671 #endif
2672 if (so->so_snd.sb_hiwat < mss)
2673 mss = so->so_snd.sb_hiwat;
2674
2675 tp->t_maxseg = mss;
2676
2677 /*
2678 * Reset the slow-start flight size, as it may depend on the
2679 * new MSS.
2680 */
2681 if (CC_ALGO(tp)->cwnd_init != NULL)
2682 CC_ALGO(tp)->cwnd_init(tp);
2683 tcpstat.tcps_mturesent++;
2684 tp->t_rtttime = 0;
2685 tp->snd_nxt = tp->snd_una;
2686 tcp_output(tp);
2687 }
2688 }
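
/*
 * Worked example (illustrative numbers): an IPv4 route reports
 * rmx_mtu == 1400, so mss = 1400 - sizeof (struct tcpiphdr) = 1360.
 * If the peer had offered an MSS of 1300 (taop->tao_mssopt), mss is
 * clamped to 1300 and t_maxopd becomes 1300; with timestamps in use
 * another TCPOLEN_TSTAMP_APPA (12) bytes come off, leaving
 * t_maxseg == 1288 before any MPTCP or send-buffer clamping.
 */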
2689
2690 /*
2691 * Look up the routing entry to the peer of this inpcb. If no route
2692 * is found and it cannot be allocated, then NULL is returned. This routine
2693 * is called by TCP routines that access the rmx structure and by tcp_mss
2694 * to get the interface MTU. If a route is found, this routine will
2695 * hold the rtentry lock; the caller is responsible for unlocking.
2696 */
2697 struct rtentry *
2698 tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope)
2699 {
2700 struct route *ro;
2701 struct rtentry *rt;
2702 struct tcpcb *tp;
2703
2704 LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2705
2706 ro = &inp->inp_route;
2707 if ((rt = ro->ro_rt) != NULL)
2708 RT_LOCK(rt);
2709
2710 if (ROUTE_UNUSABLE(ro)) {
2711 if (rt != NULL) {
2712 RT_UNLOCK(rt);
2713 rt = NULL;
2714 }
2715 ROUTE_RELEASE(ro);
2716 /* No route yet, so try to acquire one */
2717 if (inp->inp_faddr.s_addr != INADDR_ANY) {
2718 unsigned int ifscope;
2719
2720 ro->ro_dst.sa_family = AF_INET;
2721 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2722 ((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
2723 inp->inp_faddr;
2724
2725 /*
2726 * If the socket was bound to an interface, then
2727 * the bound-to-interface takes precedence over
2728 * the inbound interface passed in by the caller
2729 * (if we get here as part of the output path then
2730 * input_ifscope is IFSCOPE_NONE).
2731 */
2732 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2733 inp->inp_boundifp->if_index : input_ifscope;
2734
2735 rtalloc_scoped(ro, ifscope);
2736 if ((rt = ro->ro_rt) != NULL)
2737 RT_LOCK(rt);
2738 }
2739 }
2740 if (rt != NULL)
2741 RT_LOCK_ASSERT_HELD(rt);
2742
2743 /*
2744 * Update MTU discovery determination. Don't do it if:
2745 * 1) it is disabled via the sysctl
2746 * 2) the route isn't up
2747 * 3) the MTU is locked (if it is, then discovery has been
2748 * disabled)
2749 */
2750
2751 tp = intotcpcb(inp);
2752
2753 if (!path_mtu_discovery || ((rt != NULL) &&
2754 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2755 tp->t_flags &= ~TF_PMTUD;
2756 else
2757 tp->t_flags |= TF_PMTUD;
2758
2759 if (rt != NULL && rt->rt_ifp != NULL) {
2760 somultipages(inp->inp_socket,
2761 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2762 tcp_set_tso(tp, rt->rt_ifp);
2763 soif2kcl(inp->inp_socket,
2764 (rt->rt_ifp->if_eflags & IFEF_2KCL));
2765 tcp_set_ecn(tp, rt->rt_ifp);
2766 if (inp->inp_last_outifp == NULL) {
2767 inp->inp_last_outifp = rt->rt_ifp;
2768
2769 }
2770 }
2771
2772 /* Note if the peer is local */
2773 if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
2774 (rt->rt_gateway->sa_family == AF_LINK ||
2775 rt->rt_ifp->if_flags & IFF_LOOPBACK ||
2776 in_localaddr(inp->inp_faddr))) {
2777 tp->t_flags |= TF_LOCAL;
2778 }
2779
2780 /*
2781 * Caller needs to call RT_UNLOCK(rt).
2782 */
2783 return (rt);
2784 }
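
/*
 * Expected calling pattern (a sketch; it mirrors tcp_mtudisc() above):
 * the route, if any, comes back locked, the caller unlocks it, and the
 * reference stays with the inpcb's cached route.
 *
 *    struct rtentry *rt;
 *
 *    rt = tcp_rtlookup(inp, IFSCOPE_NONE);
 *    if (rt != NULL) {
 *        u_int32_t mtu = rt->rt_rmx.rmx_mtu;  // read what is needed
 *        RT_UNLOCK(rt);
 *        // ... use mtu ...
 *    }
 */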
2785
2786 #if INET6
2787 struct rtentry *
2788 tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
2789 {
2790 struct route_in6 *ro6;
2791 struct rtentry *rt;
2792 struct tcpcb *tp;
2793
2794 LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2795
2796 ro6 = &inp->in6p_route;
2797 if ((rt = ro6->ro_rt) != NULL)
2798 RT_LOCK(rt);
2799
2800 if (ROUTE_UNUSABLE(ro6)) {
2801 if (rt != NULL) {
2802 RT_UNLOCK(rt);
2803 rt = NULL;
2804 }
2805 ROUTE_RELEASE(ro6);
2806 /* No route yet, so try to acquire one */
2807 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
2808 struct sockaddr_in6 *dst6;
2809 unsigned int ifscope;
2810
2811 dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
2812 dst6->sin6_family = AF_INET6;
2813 dst6->sin6_len = sizeof(*dst6);
2814 dst6->sin6_addr = inp->in6p_faddr;
2815
2816 /*
2817 * If the socket was bound to an interface, then
2818 * the bound-to-interface takes precedence over
2819 * the inbound interface passed in by the caller
2820 * (if we get here as part of the output path then
2821 * input_ifscope is IFSCOPE_NONE).
2822 */
2823 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2824 inp->inp_boundifp->if_index : input_ifscope;
2825
2826 rtalloc_scoped((struct route *)ro6, ifscope);
2827 if ((rt = ro6->ro_rt) != NULL)
2828 RT_LOCK(rt);
2829 }
2830 }
2831 if (rt != NULL)
2832 RT_LOCK_ASSERT_HELD(rt);
2833
2834 /*
2835 * Update the path MTU discovery determination while looking up
2836 * the route; PMTU discovery stays enabled only if:
2837 * 1) we have a valid route to the destination
2838 * 2) the MTU is not locked (if it is, then discovery has been
2839 * disabled)
2840 */
2841
2842
2843 tp = intotcpcb(inp);
2844
2845 /*
2846 * Update MTU discovery determination. Don't do it if:
2847 * 1) it is disabled via the sysctl
2848 * 2) the route isn't up
2849 * 3) the MTU is locked (if it is, then discovery has been
2850 * disabled)
2851 */
2852
2853 if (!path_mtu_discovery || ((rt != NULL) &&
2854 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2855 tp->t_flags &= ~TF_PMTUD;
2856 else
2857 tp->t_flags |= TF_PMTUD;
2858
2859 if (rt != NULL && rt->rt_ifp != NULL) {
2860 somultipages(inp->inp_socket,
2861 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2862 tcp_set_tso(tp, rt->rt_ifp);
2863 soif2kcl(inp->inp_socket,
2864 (rt->rt_ifp->if_eflags & IFEF_2KCL));
2865 tcp_set_ecn(tp, rt->rt_ifp);
2866 if (inp->inp_last_outifp == NULL) {
2867 inp->inp_last_outifp = rt->rt_ifp;
2868 }
2869 }
2870
2871 /* Note if the peer is local */
2872 if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
2873 (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
2874 IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
2875 rt->rt_gateway->sa_family == AF_LINK ||
2876 in6_localaddr(&inp->in6p_faddr))) {
2877 tp->t_flags |= TF_LOCAL;
2878 }
2879
2880 /*
2881 * Caller needs to call RT_UNLOCK(rt).
2882 */
2883 return (rt);
2884 }
2885 #endif /* INET6 */
2886
2887 #if IPSEC
2888 /* compute ESP/AH header size for TCP, including outer IP header. */
2889 size_t
2890 ipsec_hdrsiz_tcp(struct tcpcb *tp)
2891 {
2892 struct inpcb *inp;
2893 struct mbuf *m;
2894 size_t hdrsiz;
2895 struct ip *ip;
2896 #if INET6
2897 struct ip6_hdr *ip6 = NULL;
2898 #endif /* INET6 */
2899 struct tcphdr *th;
2900
2901 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
2902 return (0);
2903 MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */
2904 if (!m)
2905 return (0);
2906
2907 #if INET6
2908 if ((inp->inp_vflag & INP_IPV6) != 0) {
2909 ip6 = mtod(m, struct ip6_hdr *);
2910 th = (struct tcphdr *)(void *)(ip6 + 1);
2911 m->m_pkthdr.len = m->m_len =
2912 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
2913 tcp_fillheaders(tp, ip6, th);
2914 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2915 } else
2916 #endif /* INET6 */
2917 {
2918 ip = mtod(m, struct ip *);
2919 th = (struct tcphdr *)(ip + 1);
2920 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
2921 tcp_fillheaders(tp, ip, th);
2922 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2923 }
2924 m_free(m);
2925 return (hdrsiz);
2926 }
2927 #endif /* IPSEC */
2928
2929 /*
2930 * Return a pointer to the cached information about the remote host.
2931 * The cached information is stored in the protocol specific part of
2932 * the route metrics.
2933 */
2934 struct rmxp_tao *
2935 tcp_gettaocache(struct inpcb *inp)
2936 {
2937 struct rtentry *rt;
2938 struct rmxp_tao *taop;
2939
2940 #if INET6
2941 if ((inp->inp_vflag & INP_IPV6) != 0)
2942 rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2943 else
2944 #endif /* INET6 */
2945 rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2946
2947 /* Make sure this is a host route and is up. */
2948 if (rt == NULL ||
2949 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) {
2950 /* Route locked during lookup above */
2951 if (rt != NULL)
2952 RT_UNLOCK(rt);
2953 return (NULL);
2954 }
2955
2956 taop = rmx_taop(rt->rt_rmx);
2957 /* Route locked during lookup above */
2958 RT_UNLOCK(rt);
2959 return (taop);
2960 }
2961
2962 /*
2963 * Clear all the TAO cache entries, called from tcp_init.
2964 *
2965 * XXX
2966 * This routine is just an empty one, because we assume that the
2967 * routing tables are initialized at the same time as TCP, so there is
2968 * nothing left over in the cache.
2969 */
2970 static void
2971 tcp_cleartaocache(void)
2972 {
2973 }
2974
2975 int
2976 tcp_lock(struct socket *so, int refcount, void *lr)
2977 {
2978 void *lr_saved;
2979
2980 if (lr == NULL)
2981 lr_saved = __builtin_return_address(0);
2982 else
2983 lr_saved = lr;
2984
2985 retry:
2986 if (so->so_pcb != NULL) {
2987 if (so->so_flags & SOF_MP_SUBFLOW) {
2988 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
2989 VERIFY(mp_tp);
2990
2991 mpte_lock_assert_notheld(mp_tp->mpt_mpte);
2992
2993 mpte_lock(mp_tp->mpt_mpte);
2994
2995 /*
2996 * Check if we became non-MPTCP while waiting for the lock.
2997 * If yes, we have to retry to grab the right lock.
2998 */
2999 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
3000 mpte_unlock(mp_tp->mpt_mpte);
3001 goto retry;
3002 }
3003 } else {
3004 lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3005
3006 if (so->so_flags & SOF_MP_SUBFLOW) {
3007 /*
3008 * While waiting for the lock, we might have
3009 * become MPTCP-enabled (see mptcp_subflow_socreate).
3010 */
3011 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3012 goto retry;
3013 }
3014 }
3015 } else {
3016 panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n",
3017 so, lr_saved, solockhistory_nr(so));
3018 /* NOTREACHED */
3019 }
3020
3021 if (so->so_usecount < 0) {
3022 panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n",
3023 so, so->so_pcb, lr_saved, so->so_usecount,
3024 solockhistory_nr(so));
3025 /* NOTREACHED */
3026 }
3027 if (refcount)
3028 so->so_usecount++;
3029 so->lock_lr[so->next_lock_lr] = lr_saved;
3030 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
3031 return (0);
3032 }
3033
3034 int
3035 tcp_unlock(struct socket *so, int refcount, void *lr)
3036 {
3037 void *lr_saved;
3038
3039 if (lr == NULL)
3040 lr_saved = __builtin_return_address(0);
3041 else
3042 lr_saved = lr;
3043
3044 #ifdef MORE_TCPLOCK_DEBUG
3045 printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
3046 "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
3047 (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
3048 (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
3049 so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
3050 #endif
3051 if (refcount)
3052 so->so_usecount--;
3053
3054 if (so->so_usecount < 0) {
3055 panic("tcp_unlock: so=%p usecount=%x lrh= %s\n",
3056 so, so->so_usecount, solockhistory_nr(so));
3057 /* NOTREACHED */
3058 }
3059 if (so->so_pcb == NULL) {
3060 panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n",
3061 so, so->so_usecount, lr_saved, solockhistory_nr(so));
3062 /* NOTREACHED */
3063 } else {
3064 so->unlock_lr[so->next_unlock_lr] = lr_saved;
3065 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3066
3067 if (so->so_flags & SOF_MP_SUBFLOW) {
3068 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3069
3070 VERIFY(mp_tp);
3071 mpte_lock_assert_held(mp_tp->mpt_mpte);
3072
3073 mpte_unlock(mp_tp->mpt_mpte);
3074 } else {
3075 LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3076 LCK_MTX_ASSERT_OWNED);
3077 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3078 }
3079 }
3080 return (0);
3081 }
3082
3083 lck_mtx_t *
3084 tcp_getlock(struct socket *so, int flags)
3085 {
3086 struct inpcb *inp = sotoinpcb(so);
3087
3088 if (so->so_pcb) {
3089 if (so->so_usecount < 0)
3090 panic("tcp_getlock: so=%p usecount=%x lrh= %s\n",
3091 so, so->so_usecount, solockhistory_nr(so));
3092
3093 if (so->so_flags & SOF_MP_SUBFLOW) {
3094 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3095
3096 return (mpte_getlock(mp_tp->mpt_mpte, flags));
3097 } else {
3098 return (&inp->inpcb_mtx);
3099 }
3100 } else {
3101 panic("tcp_getlock: so=%p NULL so_pcb %s\n",
3102 so, solockhistory_nr(so));
3103 return (so->so_proto->pr_domain->dom_mtx);
3104 }
3105 }
3106
3107 /*
3108 * Determine if we can grow the receive socket buffer to avoid sending
3109 * a zero window update to the peer. We allow even socket buffers that
3110 * have fixed size (set by the application) to grow if the resource
3111 * constraints are met. They will also be trimmed after the application
3112 * reads data.
3113 */
3114 static void
3115 tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb)
3116 {
3117 u_int32_t rcvbufinc = tp->t_maxseg << 4;
3118 u_int32_t rcvbuf = sb->sb_hiwat;
3119 struct socket *so = tp->t_inpcb->inp_socket;
3120
3121 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so))
3122 return;
3123 /*
3124 * If message delivery is enabled, do not count
3125 * unordered bytes in receive buffer towards hiwat
3126 */
3127 if (so->so_flags & SOF_ENABLE_MSGS)
3128 rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes;
3129
3130 if (tcp_do_autorcvbuf == 1 &&
3131 tcp_cansbgrow(sb) &&
3132 (tp->t_flags & TF_SLOWLINK) == 0 &&
3133 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
3134 (rcvbuf - sb->sb_cc) < rcvbufinc &&
3135 rcvbuf < tcp_autorcvbuf_max &&
3136 (sb->sb_idealsize > 0 &&
3137 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
3138 sbreserve(sb,
3139 min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
3140 }
3141 }
3142
3143 int32_t
3144 tcp_sbspace(struct tcpcb *tp)
3145 {
3146 struct socket *so = tp->t_inpcb->inp_socket;
3147 struct sockbuf *sb = &so->so_rcv;
3148 u_int32_t rcvbuf;
3149 int32_t space;
3150 int32_t pending = 0;
3151
3152 tcp_sbrcv_grow_rwin(tp, sb);
3153
3154 /* hiwat might have changed */
3155 rcvbuf = sb->sb_hiwat;
3156
3157 /*
3158 * If message delivery is enabled, do not count
3159 * unordered bytes in receive buffer towards hiwat mark.
3160 * This value is used to return correct rwnd that does
3161 * not reflect the extra unordered bytes added to the
3162 * receive socket buffer.
3163 */
3164 if (so->so_flags & SOF_ENABLE_MSGS)
3165 rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes;
3166
3167 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
3168 (sb->sb_mbmax - sb->sb_mbcnt)));
3169 if (space < 0)
3170 space = 0;
3171
3172 #if CONTENT_FILTER
3173 /* Compensate for data being processed by content filters */
3174 pending = cfil_sock_data_space(sb);
3175 #endif /* CONTENT_FILTER */
3176 if (pending > space)
3177 space = 0;
3178 else
3179 space -= pending;
3180
3181 /*
3182 * Avoid increasing window size if the current window
3183 * is already very low; we could be in "persist" mode and
3184 * we could break some apps (see rdar://5409343)
3185 */
3186
3187 if (space < tp->t_maxseg)
3188 return (space);
3189
3190 /* Clip window size for slower link */
3191
3192 if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0)
3193 return (imin(space, slowlink_wsize));
3194
3195 return (space);
3196 }
3197 /*
3198 * Checks TCP Segment Offloading capability for a given connection
3199 * and interface pair.
3200 */
3201 void
3202 tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp)
3203 {
3204 #if INET6
3205 struct inpcb *inp;
3206 int isipv6;
3207 #endif /* INET6 */
3208 #if MPTCP
3209 /*
3210 * We can't use TSO if this tcpcb belongs to an MPTCP session.
3211 */
3212 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
3213 tp->t_flags &= ~TF_TSO;
3214 return;
3215 }
3216 #endif
3217 #if INET6
3218 inp = tp->t_inpcb;
3219 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
3220
3221 if (isipv6) {
3222 if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV6)) {
3223 tp->t_flags |= TF_TSO;
3224 if (ifp->if_tso_v6_mtu != 0)
3225 tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
3226 else
3227 tp->tso_max_segment_size = TCP_MAXWIN;
3228 } else
3229 tp->t_flags &= ~TF_TSO;
3230
3231 } else
3232 #endif /* INET6 */
3233
3234 {
3235 if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV4)) {
3236 tp->t_flags |= TF_TSO;
3237 if (ifp->if_tso_v4_mtu != 0)
3238 tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
3239 else
3240 tp->tso_max_segment_size = TCP_MAXWIN;
3241 } else
3242 tp->t_flags &= ~TF_TSO;
3243 }
3244 }
3245
3246 #define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + \
3247 (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC)
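
/*
 * With TCP_RETRANSHZ at its usual value of 1000 (one tick per
 * millisecond), TIMEVAL_TO_TCPHZ() is simply a timeval-to-milliseconds
 * conversion, e.g. { .tv_sec = 2, .tv_usec = 345000 } maps to
 * 2 * 1000 + 345000 / 1000 = 2345 ticks.
 */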
3248
3249 /*
3250 * Function to calculate the tcp clock. The tcp clock will get updated
3251 * at the boundaries of the tcp layer. This is done at 3 places:
3252 * 1. Right before processing an input tcp packet
3253 * 2. Whenever a connection wants to access the network using tcp_usrreqs
3254 * 3. When a tcp timer fires or before tcp slow timeout
3255 *
3256 */
3257
3258 void
3259 calculate_tcp_clock(void)
3260 {
3261 struct timeval tv = tcp_uptime;
3262 struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC};
3263 struct timeval now, hold_now;
3264 uint32_t incr = 0;
3265
3266 microuptime(&now);
3267
3268 /*
3269 * Update coarse-grained networking timestamp (in sec.); the idea
3270 * is to update the counter returnable via net_uptime() when
3271 * we read time.
3272 */
3273 net_update_uptime_with_time(&now);
3274
3275 timevaladd(&tv, &interval);
3276 if (timevalcmp(&now, &tv, >)) {
3277 /* time to update the clock */
3278 lck_spin_lock(tcp_uptime_lock);
3279 if (timevalcmp(&tcp_uptime, &now, >=)) {
3280 /* clock got updated while waiting for the lock */
3281 lck_spin_unlock(tcp_uptime_lock);
3282 return;
3283 }
3284
3285 microuptime(&now);
3286 hold_now = now;
3287 tv = tcp_uptime;
3288 timevalsub(&now, &tv);
3289
3290 incr = TIMEVAL_TO_TCPHZ(now);
3291 if (incr > 0) {
3292 tcp_uptime = hold_now;
3293 tcp_now += incr;
3294 }
3295
3296 lck_spin_unlock(tcp_uptime_lock);
3297 }
3298 }
3299
3300 /*
3301 * Compute receive window scaling that we are going to request
3302 * for this connection based on sb_hiwat. Try to leave some
3303 * room to potentially increase the window size up to a maximum
3304 * defined by the constant tcp_autorcvbuf_max.
3305 */
3306 void
3307 tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so,
3308 u_int32_t rcvbuf_max)
3309 {
3310 u_int32_t maxsockbufsize;
3311 if (!tcp_do_rfc1323) {
3312 tp->request_r_scale = 0;
3313 return;
3314 }
3315
3316 tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
3317 maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
3318 so->so_rcv.sb_hiwat : rcvbuf_max;
3319
3320 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3321 (TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize)
3322 tp->request_r_scale++;
3323 tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT);
3324
3325 }
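
/*
 * Example: with a maximum receive buffer of 4 MB (4194304 bytes) the
 * loop above stops at request_r_scale == 7, since
 * TCP_MAXWIN << 6 == 65535 * 64 == 4194240 is still smaller than the
 * buffer while TCP_MAXWIN << 7 covers it; the result is always capped
 * at TCP_MAX_WINSHIFT (14).
 */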
3326
3327 int
3328 tcp_notsent_lowat_check(struct socket *so) {
3329 struct inpcb *inp = sotoinpcb(so);
3330 struct tcpcb *tp = NULL;
3331 int notsent = 0;
3332 if (inp != NULL) {
3333 tp = intotcpcb(inp);
3334 }
3335
3336 notsent = so->so_snd.sb_cc -
3337 (tp->snd_nxt - tp->snd_una);
3338
3339 /*
3340 * When we send a FIN or SYN, not_sent can be negative.
3341 * In that case we also need to send a write event to the
3342 * process if it is waiting. In the FIN case, it will
3343 * get an error from send because cantsendmore will be set.
3344 */
3345 if (notsent <= tp->t_notsent_lowat) {
3346 return (1);
3347 }
3348
3349 /*
3350 * When Nagle's algorithm is not disabled, it is better
3351 * to wake up the client until there is at least one
3352 * maxseg of data to write.
3353 */
3354 if ((tp->t_flags & TF_NODELAY) == 0 &&
3355 notsent > 0 && notsent < tp->t_maxseg) {
3356 return (1);
3357 }
3358 return (0);
3359 }
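
/*
 * Example: with 10000 bytes in the send buffer and 8000 of them already
 * in flight (snd_nxt - snd_una), notsent is 2000; a t_notsent_lowat of
 * 2048 therefore reports the socket writable.  With Nagle enabled, a
 * residue smaller than one maxseg (say 500 bytes) is reported writable
 * as well, per the second check above.
 */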
3360
3361 void
3362 tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3363 {
3364 struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL;
3365 u_int32_t rxcount = 0;
3366
3367 if (SLIST_EMPTY(&tp->t_rxt_segments))
3368 tp->t_dsack_lastuna = tp->snd_una;
3369 /*
3370 * First check if there is a segment already existing for this
3371 * sequence space.
3372 */
3373
3374 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3375 if (SEQ_GT(rxseg->rx_start, start))
3376 break;
3377 prev = rxseg;
3378 }
3379 next = rxseg;
3380
3381 /* check if prev seg is for this sequence */
3382 if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3383 SEQ_GEQ(prev->rx_end, end)) {
3384 prev->rx_count++;
3385 return;
3386 }
3387
3388 /*
3389 * There are a couple of possibilities at this point.
3390 * 1. prev overlaps with the beginning of this sequence
3391 * 2. next overlaps with the end of this sequence
3392 * 3. there is no overlap.
3393 */
3394
3395 if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3396 if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3397 start = prev->rx_end + 1;
3398 prev->rx_count++;
3399 } else {
3400 prev->rx_end = (start - 1);
3401 rxcount = prev->rx_count;
3402 }
3403 }
3404
3405 if (next != NULL && SEQ_LT(next->rx_start, end)) {
3406 if (SEQ_LEQ(next->rx_end, end)) {
3407 end = next->rx_start - 1;
3408 next->rx_count++;
3409 } else {
3410 next->rx_start = end + 1;
3411 rxcount = next->rx_count;
3412 }
3413 }
3414 if (!SEQ_LT(start, end))
3415 return;
3416
3417 rxseg = (struct tcp_rxt_seg *) zalloc(tcp_rxt_seg_zone);
3418 if (rxseg == NULL) {
3419 return;
3420 }
3421 bzero(rxseg, sizeof(*rxseg));
3422 rxseg->rx_start = start;
3423 rxseg->rx_end = end;
3424 rxseg->rx_count = rxcount + 1;
3425
3426 if (prev != NULL) {
3427 SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3428 } else {
3429 SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3430 }
3431 }
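
/*
 * The common case above is cheap: retransmitting a range that is already
 * wholly covered by an existing entry (prev contains [start, end]) only
 * increments that entry's rx_count; new list nodes are allocated only
 * when the retransmission extends into previously untracked sequence
 * space.
 */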
3432
3433 struct tcp_rxt_seg *
3434 tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3435 {
3436 struct tcp_rxt_seg *rxseg;
3437 if (SLIST_EMPTY(&tp->t_rxt_segments))
3438 return (NULL);
3439
3440 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3441 if (SEQ_LEQ(rxseg->rx_start, start) &&
3442 SEQ_GEQ(rxseg->rx_end, end))
3443 return (rxseg);
3444 if (SEQ_GT(rxseg->rx_start, start))
3445 break;
3446 }
3447 return (NULL);
3448 }
3449
3450 void
3451 tcp_rxtseg_clean(struct tcpcb *tp)
3452 {
3453 struct tcp_rxt_seg *rxseg, *next;
3454
3455 SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3456 SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3457 tcp_rxt_seg, rx_link);
3458 zfree(tcp_rxt_seg_zone, rxseg);
3459 }
3460 tp->t_dsack_lastuna = tp->snd_max;
3461 }
3462
3463 boolean_t
3464 tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3465 {
3466 boolean_t bad_rexmt;
3467 struct tcp_rxt_seg *rxseg;
3468
3469 if (SLIST_EMPTY(&tp->t_rxt_segments))
3470 return (FALSE);
3471
3472 /*
3473 * If all of the segments in this window are not cumulatively
3474 * acknowledged, then there can still be undetected packet loss.
3475 * Do not restore congestion window in that case.
3476 */
3477 if (SEQ_LT(th_ack, tp->snd_recover))
3478 return (FALSE);
3479
3480 bad_rexmt = TRUE;
3481 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3482 if (rxseg->rx_count > 1 ||
3483 !(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3484 bad_rexmt = FALSE;
3485 break;
3486 }
3487 }
3488 return (bad_rexmt);
3489 }
3490
3491 boolean_t
3492 tcp_rxtseg_dsack_for_tlp(struct tcpcb *tp)
3493 {
3494 boolean_t dsack_for_tlp = FALSE;
3495 struct tcp_rxt_seg *rxseg;
3496 if (SLIST_EMPTY(&tp->t_rxt_segments))
3497 return (FALSE);
3498
3499 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3500 if (rxseg->rx_count == 1 &&
3501 SLIST_NEXT(rxseg, rx_link) == NULL &&
3502 (rxseg->rx_flags & TCP_RXT_DSACK_FOR_TLP)) {
3503 dsack_for_tlp = TRUE;
3504 break;
3505 }
3506 }
3507 return (dsack_for_tlp);
3508 }
3509
3510 u_int32_t
3511 tcp_rxtseg_total_size(struct tcpcb *tp)
3512 {
3513 struct tcp_rxt_seg *rxseg;
3514 u_int32_t total_size = 0;
3515
3516 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3517 total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
3518 }
3519 return (total_size);
3520 }
3521
3522 void
3523 tcp_get_connectivity_status(struct tcpcb *tp,
3524 struct tcp_conn_status *connstatus)
3525 {
3526 if (tp == NULL || connstatus == NULL)
3527 return;
3528 bzero(connstatus, sizeof(*connstatus));
3529 if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
3530 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
3531 connstatus->write_probe_failed = 1;
3532 } else {
3533 connstatus->conn_probe_failed = 1;
3534 }
3535 }
3536 if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX)
3537 connstatus->read_probe_failed = 1;
3538 if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
3539 (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY))
3540 connstatus->probe_activated = 1;
3541 }
3542
3543 boolean_t
3544 tfo_enabled(const struct tcpcb *tp)
3545 {
3546 return ((tp->t_flagsext & TF_FASTOPEN)? TRUE : FALSE);
3547 }
3548
3549 void
3550 tcp_disable_tfo(struct tcpcb *tp)
3551 {
3552 tp->t_flagsext &= ~TF_FASTOPEN;
3553 }
3554
3555 static struct mbuf *
3556 tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp,
3557 boolean_t is_probe)
3558 {
3559 struct inpcb *inp = tp->t_inpcb;
3560 struct tcphdr *th;
3561 u_int8_t *data;
3562 int win = 0;
3563 struct mbuf *m;
3564
3565 /*
3566 * The code assumes the IP + TCP headers fit in an mbuf packet header
3567 */
3568 _CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
3569 _CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
3570
3571 MGETHDR(m, M_WAIT, MT_HEADER);
3572 if (m == NULL) {
3573 return (NULL);
3574 }
3575 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
3576
3577 data = mbuf_datastart(m);
3578
3579 if (inp->inp_vflag & INP_IPV4) {
3580 bzero(data, sizeof(struct ip) + sizeof(struct tcphdr));
3581 th = (struct tcphdr *)(void *) (data + sizeof(struct ip));
3582 m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
3583 m->m_pkthdr.len = m->m_len;
3584 } else {
3585 VERIFY(inp->inp_vflag & INP_IPV6);
3586
3587 bzero(data, sizeof(struct ip6_hdr)
3588 + sizeof(struct tcphdr));
3589 th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr));
3590 m->m_len = sizeof(struct ip6_hdr) +
3591 sizeof(struct tcphdr);
3592 m->m_pkthdr.len = m->m_len;
3593 }
3594
3595 tcp_fillheaders(tp, data, th);
3596
3597 if (inp->inp_vflag & INP_IPV4) {
3598 struct ip *ip;
3599
3600 ip = (__typeof__(ip))(void *)data;
3601
3602 ip->ip_id = rfc6864 ? 0 : ip_randomid();
3603 ip->ip_off = htons(IP_DF);
3604 ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
3605 ip->ip_ttl = inp->inp_ip_ttl;
3606 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
3607 ip->ip_sum = in_cksum_hdr(ip);
3608 } else {
3609 struct ip6_hdr *ip6;
3610
3611 ip6 = (__typeof__(ip6))(void *)data;
3612
3613 ip6->ip6_plen = htons(sizeof(struct tcphdr));
3614 ip6->ip6_hlim = in6_selecthlim(inp, ifp);
3615 ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
3616
3617 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
3618 ip6->ip6_src.s6_addr16[1] = 0;
3619 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
3620 ip6->ip6_dst.s6_addr16[1] = 0;
3621 }
3622 th->th_flags = TH_ACK;
3623
3624 win = tcp_sbspace(tp);
3625 if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale))
3626 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
3627 th->th_win = htons((u_short) (win >> tp->rcv_scale));
3628
3629 if (is_probe) {
3630 th->th_seq = htonl(tp->snd_una - 1);
3631 } else {
3632 th->th_seq = htonl(tp->snd_una);
3633 }
3634 th->th_ack = htonl(tp->rcv_nxt);
3635
3636 /* Force recompute TCP checksum to be the final value */
3637 th->th_sum = 0;
3638 if (inp->inp_vflag & INP_IPV4) {
3639 th->th_sum = inet_cksum(m, IPPROTO_TCP,
3640 sizeof(struct ip), sizeof(struct tcphdr));
3641 } else {
3642 th->th_sum = inet6_cksum(m, IPPROTO_TCP,
3643 sizeof(struct ip6_hdr), sizeof(struct tcphdr));
3644 }
3645
3646 return (m);
3647 }
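
/*
 * The probe variant deliberately uses snd_una - 1: from the peer's point
 * of view that sequence number is already acknowledged, so the segment
 * carries no new data but still obliges the peer to answer with an ACK,
 * which is the liveness signal a keepalive probe is after.  The non-probe
 * variant (snd_una, rcv_nxt) serves as the template for replying to the
 * peer's own probes.
 */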
3648
3649 void
3650 tcp_fill_keepalive_offload_frames(ifnet_t ifp,
3651 struct ifnet_keepalive_offload_frame *frames_array,
3652 u_int32_t frames_array_count, size_t frame_data_offset,
3653 u_int32_t *used_frames_count)
3654 {
3655 struct inpcb *inp;
3656 inp_gen_t gencnt;
3657 u_int32_t frame_index = *used_frames_count;
3658
3659 if (ifp == NULL || frames_array == NULL ||
3660 frames_array_count == 0 ||
3661 frame_index >= frames_array_count ||
3662 frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE)
3663 return;
3664
3665 /*
3666 * This function is called outside the regular TCP processing
3667 * so we need to update the TCP clock.
3668 */
3669 calculate_tcp_clock();
3670
3671 lck_rw_lock_shared(tcbinfo.ipi_lock);
3672 gencnt = tcbinfo.ipi_gencnt;
3673 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
3674 struct socket *so;
3675 struct ifnet_keepalive_offload_frame *frame;
3676 struct mbuf *m = NULL;
3677 struct tcpcb *tp = intotcpcb(inp);
3678
3679 if (frame_index >= frames_array_count)
3680 break;
3681
3682 if (inp->inp_gencnt > gencnt ||
3683 inp->inp_state == INPCB_STATE_DEAD)
3684 continue;
3685
3686 if ((so = inp->inp_socket) == NULL ||
3687 (so->so_state & SS_DEFUNCT))
3688 continue;
3689 /*
3690 * check for keepalive offload flag without socket
3691 * lock to avoid a deadlock
3692 */
3693 if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
3694 continue;
3695 }
3696
3697 if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
3698 continue;
3699 }
3700 if (inp->inp_ppcb == NULL ||
3701 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
3702 continue;
3703 socket_lock(so, 1);
3704 /* Release the want count */
3705 if (inp->inp_ppcb == NULL ||
3706 (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
3707 socket_unlock(so, 1);
3708 continue;
3709 }
3710 if ((inp->inp_vflag & INP_IPV4) &&
3711 (inp->inp_laddr.s_addr == INADDR_ANY ||
3712 inp->inp_faddr.s_addr == INADDR_ANY)) {
3713 socket_unlock(so, 1);
3714 continue;
3715 }
3716 if ((inp->inp_vflag & INP_IPV6) &&
3717 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
3718 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
3719 socket_unlock(so, 1);
3720 continue;
3721 }
3722 if (inp->inp_lport == 0 || inp->inp_fport == 0) {
3723 socket_unlock(so, 1);
3724 continue;
3725 }
3726 if (inp->inp_last_outifp == NULL ||
3727 inp->inp_last_outifp->if_index != ifp->if_index) {
3728 socket_unlock(so, 1);
3729 continue;
3730 }
3731 if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
3732 sizeof(struct ip) + sizeof(struct tcphdr) >
3733 IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
3734 socket_unlock(so, 1);
3735 continue;
3736 } else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
3737 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
3738 IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
3739 socket_unlock(so, 1);
3740 continue;
3741 }
3742 /*
3743 * There is no point in waking up the device for connections
3744 * that are not established. Long-lived connections are meant
3745 * for processes that will send and receive data.
3746 */
3747 if (tp->t_state != TCPS_ESTABLISHED) {
3748 socket_unlock(so, 1);
3749 continue;
3750 }
3751 /*
3752 * This inp has all the information that is needed to
3753 * generate an offload frame.
3754 */
3755 frame = &frames_array[frame_index];
3756 frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
3757 frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
3758 IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
3759 IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
3760 frame->interval = tp->t_keepidle > 0 ? tp->t_keepidle :
3761 tcp_keepidle;
3762 frame->keep_cnt = TCP_CONN_KEEPCNT(tp);
3763 frame->keep_retry = TCP_CONN_KEEPINTVL(tp);
3764 frame->local_port = ntohs(inp->inp_lport);
3765 frame->remote_port = ntohs(inp->inp_fport);
3766 frame->local_seq = tp->snd_nxt;
3767 frame->remote_seq = tp->rcv_nxt;
3768 if (inp->inp_vflag & INP_IPV4) {
3769 frame->length = frame_data_offset +
3770 sizeof(struct ip) + sizeof(struct tcphdr);
3771 frame->reply_length = frame->length;
3772
3773 frame->addr_length = sizeof(struct in_addr);
3774 bcopy(&inp->inp_laddr, frame->local_addr,
3775 sizeof(struct in_addr));
3776 bcopy(&inp->inp_faddr, frame->remote_addr,
3777 sizeof(struct in_addr));
3778 } else {
3779 struct in6_addr *ip6;
3780
3781 frame->length = frame_data_offset +
3782 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
3783 frame->reply_length = frame->length;
3784
3785 frame->addr_length = sizeof(struct in6_addr);
3786 ip6 = (struct in6_addr *)(void *)frame->local_addr;
3787 bcopy(&inp->in6p_laddr, ip6, sizeof(struct in6_addr));
3788 if (IN6_IS_SCOPE_EMBED(ip6))
3789 ip6->s6_addr16[1] = 0;
3790
3791 ip6 = (struct in6_addr *)(void *)frame->remote_addr;
3792 bcopy(&inp->in6p_faddr, ip6, sizeof(struct in6_addr));
3793 if (IN6_IS_SCOPE_EMBED(ip6))
3794 ip6->s6_addr16[1] = 0;
3795 }
3796
3797 /*
3798 * First, generate the probe packet
3799 */
3800 m = tcp_make_keepalive_frame(tp, ifp, TRUE);
3801 if (m == NULL) {
3802 socket_unlock(so, 1);
3803 continue;
3804 }
3805 bcopy(m->m_data, frame->data + frame_data_offset,
3806 m->m_len);
3807 m_freem(m);
3808
3809 /*
3810 * Now generate the response packet for incoming probes
3811 */
3812 m = tcp_make_keepalive_frame(tp, ifp, FALSE);
3813 if (m == NULL) {
3814 socket_unlock(so, 1);
3815 continue;
3816 }
3817 bcopy(m->m_data, frame->reply_data + frame_data_offset,
3818 m->m_len);
3819 m_freem(m);
3820
3821 frame_index++;
3822 socket_unlock(so, 1);
3823 }
3824 lck_rw_done(tcbinfo.ipi_lock);
3825 *used_frames_count = frame_index;
3826 }
3827
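/*
 * Check whether a notification id may be added for this connection: the
 * send buffer must not be empty, the id must not already be in use, and
 * no existing marker may cover the same send-buffer position.
 */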
3828 errno_t
3829 tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
3830 u_int32_t notify_id)
3831 {
3832 struct tcp_notify_ack_marker *elm;
3833
3834 if (so->so_snd.sb_cc == 0)
3835 return (ENOBUFS);
3836
3837 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
3838 /* Duplicate id is not allowed */
3839 if (elm->notify_id == notify_id)
3840 return (EINVAL);
3841 /* Duplicate position is not allowed */
3842 if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc)
3843 return (EINVAL);
3844 }
3845 return (0);
3846 }
3847
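/*
 * Allocate a notify-ack marker for the sequence number of the last byte
 * currently in the send buffer (snd_una + sb_cc) and link it into the
 * t_notify_ack list: after the first existing marker whose position the
 * new marker exceeds, or at the head when the list is empty.
 */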
3848 errno_t
3849 tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
3850 {
3851 struct tcp_notify_ack_marker *nm, *elm = NULL;
3852 struct socket *so = tp->t_inpcb->inp_socket;
3853
3854 MALLOC(nm, struct tcp_notify_ack_marker *, sizeof (*nm),
3855 M_TEMP, M_WAIT | M_ZERO);
3856 if (nm == NULL)
3857 return (ENOMEM);
3858 nm->notify_id = notify_id;
3859 nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;
3860
3861 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
3862 if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una))
3863 break;
3864 }
3865
3866 if (elm == NULL) {
3867 VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
3868 SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
3869 } else {
3870 SLIST_INSERT_AFTER(elm, nm, notify_next);
3871 }
3872 tp->t_notify_ack_count++;
3873 return (0);
3874 }
3875
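/*
 * Release every pending notify-ack marker on the connection and reset
 * the marker count.
 */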
3876 void
3877 tcp_notify_ack_free(struct tcpcb *tp)
3878 {
3879 struct tcp_notify_ack_marker *elm, *next;
3880 if (SLIST_EMPTY(&tp->t_notify_ack))
3881 return;
3882
3883 SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
3884 SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
3885 notify_next);
3886 FREE(elm, M_TEMP);
3887 }
3888 SLIST_INIT(&tp->t_notify_ack);
3889 tp->t_notify_ack_count = 0;
3890 }
3891
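/*
 * Post a SO_FILT_HINT_NOTIFY_ACK event on the socket once snd_una has
 * reached the position recorded by the first notify-ack marker.  Callers
 * are expected to ensure t_notify_ack is not empty, as the first element
 * is dereferenced without a NULL check.
 */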
3892 inline void
3893 tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so)
3894 {
3895 struct tcp_notify_ack_marker *elm;
3896
3897 elm = SLIST_FIRST(&tp->t_notify_ack);
3898 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
3899 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK);
3900 }
3901 }
3902
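/*
 * Report how many notify-ack markers have completed (snd_una has reached
 * their position) and how many are still pending; the completed count is
 * capped at TCP_MAX_NOTIFY_ACK.
 */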
3903 void
3904 tcp_get_notify_ack_count(struct tcpcb *tp,
3905 struct tcp_notify_ack_complete *retid)
3906 {
3907 struct tcp_notify_ack_marker *elm;
3908 size_t complete = 0;
3909
3910 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
3911 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una))
3912 complete++;
3913 else
3914 break;
3915 }
3916 retid->notify_pending = tp->t_notify_ack_count - complete;
3917 retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, complete);
3918 }
3919
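/*
 * Copy the ids of completed notify-ack markers into retid, up to the
 * previously computed notify_complete_count, unlinking and freeing each
 * marker as its id is reported.
 */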
3920 void
3921 tcp_get_notify_ack_ids(struct tcpcb *tp,
3922 struct tcp_notify_ack_complete *retid)
3923 {
3924 size_t i = 0;
3925 struct tcp_notify_ack_marker *elm, *next;
3926
3927 SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
3928 if (i >= retid->notify_complete_count)
3929 break;
3930 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
3931 retid->notify_complete_id[i++] = elm->notify_id;
3932 SLIST_REMOVE(&tp->t_notify_ack, elm,
3933 tcp_notify_ack_marker, notify_next);
3934 FREE(elm, M_TEMP);
3935 tp->t_notify_ack_count--;
3936 } else {
3937 break;
3938 }
3939 }
3940 }
3941
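/*
 * Return true when a TCP socket has at least one notify-ack marker whose
 * position has already been acknowledged, i.e. a notification is ready
 * to be delivered.
 */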
3942 bool
3943 tcp_notify_ack_active(struct socket *so)
3944 {
3945 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
3946 SOCK_TYPE(so) == SOCK_STREAM) {
3947 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
3948
3949 if (!SLIST_EMPTY(&tp->t_notify_ack)) {
3950 struct tcp_notify_ack_marker *elm;
3951 elm = SLIST_FIRST(&tp->t_notify_ack);
3952 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una))
3953 return (true);
3954 }
3955 }
3956 return (false);
3957 }
3958
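/*
 * Given the cumulative ACK in th_ack, return the number of bytes in the
 * send buffer that have not yet been transmitted: the send-buffer byte
 * count minus the bytes between th_ack and snd_max (excluding a sent
 * FIN).  Returns 0 unless the socket accounts send bytes (SB_SNDBYTE_CNT).
 */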
3959 inline int32_t
3960 inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
3961 {
3962 struct inpcb *inp = sotoinpcb(so);
3963 struct tcpcb *tp = intotcpcb(inp);
3964
3965 if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
3966 so->so_snd.sb_cc > 0) {
3967 int32_t unsent, sent;
3968 sent = tp->snd_max - th_ack;
3969 if (tp->t_flags & TF_SENTFIN)
3970 sent--;
3971 unsent = so->so_snd.sb_cc - sent;
3972 return (unsent);
3973 }
3974 return (0);
3975 }
3976
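/*
 * IFP_PER_FLOW_STAT() bumps the named counter in the interface's IPv4 or
 * IPv6 TCP stats block depending on the flow's address family; for
 * example, IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup) expands to
 * ifp->if_ipv4_stat->ecn_client_setup++ when ifs->ipv4 is non-zero and
 * to the if_ipv6_stat counterpart otherwise.  FLOW_ECN_ENABLED() tests
 * whether all the TE_ECN_ON bits are set, i.e. ECN was negotiated for
 * the flow.
 */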
3977 #define IFP_PER_FLOW_STAT(_ipv4_, _stat_) { \
3978 if (_ipv4_) { \
3979 ifp->if_ipv4_stat->_stat_++; \
3980 } else { \
3981 ifp->if_ipv6_stat->_stat_++; \
3982 } \
3983 }
3984
3985 #define FLOW_ECN_ENABLED(_flags_) \
3986 ((_flags_ & (TE_ECN_ON)) == (TE_ECN_ON))
3987
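/*
 * Fold a connection's per-flow statistics into the per-interface
 * counters: ECN negotiation outcome, received congestion markings,
 * ECN fallback events, the ECN on/off performance buckets and the
 * stats collected by tcp_flow_lim_stats().  Local flows contribute
 * only to the ECN counters; the remaining per-interface stats are
 * skipped for them.
 */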
3988 void tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
3989 struct ifnet *ifp)
3990 {
3991 if (ifp == NULL || !IF_FULLY_ATTACHED(ifp))
3992 return;
3993
3994 ifnet_lock_shared(ifp);
3995 if (ifs->ecn_flags & TE_SETUPSENT) {
3996 if (ifs->ecn_flags & TE_CLIENT_SETUP) {
3997 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
3998 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
3999 IFP_PER_FLOW_STAT(ifs->ipv4,
4000 ecn_client_success);
4001 } else if (ifs->ecn_flags & TE_LOST_SYN) {
4002 IFP_PER_FLOW_STAT(ifs->ipv4,
4003 ecn_syn_lost);
4004 } else {
4005 IFP_PER_FLOW_STAT(ifs->ipv4,
4006 ecn_peer_nosupport);
4007 }
4008 } else {
4009 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
4010 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4011 IFP_PER_FLOW_STAT(ifs->ipv4,
4012 ecn_server_success);
4013 } else if (ifs->ecn_flags & TE_LOST_SYN) {
4014 IFP_PER_FLOW_STAT(ifs->ipv4,
4015 ecn_synack_lost);
4016 } else {
4017 IFP_PER_FLOW_STAT(ifs->ipv4,
4018 ecn_peer_nosupport);
4019 }
4020 }
4021 } else {
4022 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
4023 }
4024 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4025 if (ifs->ecn_flags & TE_RECV_ECN_CE) {
4026 tcpstat.tcps_ecn_conn_recv_ce++;
4027 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
4028 }
4029 if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
4030 tcpstat.tcps_ecn_conn_recv_ece++;
4031 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
4032 }
4033 if (ifs->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) {
4034 if (ifs->txretransmitbytes > 0 ||
4035 ifs->rxoutoforderbytes > 0) {
4036 tcpstat.tcps_ecn_conn_pl_ce++;
4037 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
4038 } else {
4039 tcpstat.tcps_ecn_conn_nopl_ce++;
4040 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
4041 }
4042 } else {
4043 if (ifs->txretransmitbytes > 0 ||
4044 ifs->rxoutoforderbytes > 0) {
4045 tcpstat.tcps_ecn_conn_plnoce++;
4046 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
4047 }
4048 }
4049 }
4050
4051 /* Other stats are interesting for non-local connections only */
4052 if (ifs->local) {
4053 ifnet_lock_done(ifp);
4054 return;
4055 }
4056
4057 if (ifs->ipv4) {
4058 ifp->if_ipv4_stat->timestamp = net_uptime();
4059 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4060 tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_on);
4061 } else {
4062 tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_off);
4063 }
4064 } else {
4065 ifp->if_ipv6_stat->timestamp = net_uptime();
4066 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4067 tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_on);
4068 } else {
4069 tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_off);
4070 }
4071 }
4072
4073 if (ifs->rxmit_drop) {
4074 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4075 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
4076 } else {
4077 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
4078 }
4079 }
4080 if (ifs->ecn_fallback_synloss)
4081 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
4082 if (ifs->ecn_fallback_droprst)
4083 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
4084 if (ifs->ecn_fallback_droprxmt)
4085 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
4086 if (ifs->ecn_fallback_ce)
4087 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
4088 if (ifs->ecn_fallback_reorder)
4089 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
4090 if (ifs->ecn_recv_ce > 0)
4091 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
4092 if (ifs->ecn_recv_ece > 0)
4093 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);
4094
4095 tcp_flow_lim_stats(ifs, &ifp->if_lim_stat);
4096 ifnet_lock_done(ifp);
4097 }