1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/sysctl.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/proc.h> /* for proc0 declaration */
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/syslog.h>
81 #include <sys/mcache.h>
82 #include <sys/kasl.h>
83 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
84
85 #include <machine/endian.h>
86
87 #include <net/if.h>
88 #include <net/if_types.h>
89 #include <net/route.h>
90 #include <net/ntstat.h>
91 #include <net/dlil.h>
92
93 #include <netinet/in.h>
94 #include <netinet/in_systm.h>
95 #include <netinet/ip.h>
96 #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
97 #include <netinet/in_var.h>
98 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
99 #include <netinet/in_pcb.h>
100 #include <netinet/ip_var.h>
101 #include <mach/sdt.h>
102 #if INET6
103 #include <netinet/ip6.h>
104 #include <netinet/icmp6.h>
105 #include <netinet6/nd6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet6/in6_pcb.h>
108 #endif
109 #include <netinet/tcp.h>
110 #include <netinet/tcp_cache.h>
111 #include <netinet/tcp_fsm.h>
112 #include <netinet/tcp_seq.h>
113 #include <netinet/tcp_timer.h>
114 #include <netinet/tcp_var.h>
115 #include <netinet/tcp_cc.h>
116 #include <dev/random/randomdev.h>
117 #include <kern/zalloc.h>
118 #if INET6
119 #include <netinet6/tcp6_var.h>
120 #endif
121 #include <netinet/tcpip.h>
122 #if TCPDEBUG
123 #include <netinet/tcp_debug.h>
124 u_char tcp_saveipgen[40]; /* must be the size of the max IP header, currently IPv6 */
125 struct tcphdr tcp_savetcp;
126 #endif /* TCPDEBUG */
127
128 #if IPSEC
129 #include <netinet6/ipsec.h>
130 #if INET6
131 #include <netinet6/ipsec6.h>
132 #endif
133 #include <netkey/key.h>
134 #endif /*IPSEC*/
135
136 #if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
137 #include <security/mac_framework.h>
138 #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
139
140 #include <sys/kdebug.h>
141 #include <netinet/lro_ext.h>
142 #if MPTCP
143 #include <netinet/mptcp_var.h>
144 #include <netinet/mptcp.h>
145 #include <netinet/mptcp_opt.h>
146 #endif /* MPTCP */
147
148 #include <corecrypto/ccaes.h>
149
150 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
151 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
152 #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
153 #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
154
155 #define TCP_RTT_HISTORY_EXPIRE_TIME (60 * TCP_RETRANSHZ)
156 #define TCP_RECV_THROTTLE_WIN (5 * TCP_RETRANSHZ)
157 #define TCP_STRETCHACK_ENABLE_PKTCNT 2000
158
159 tcp_cc tcp_ccgen;
160
161 struct tcpstat tcpstat;
162
163 static int log_in_vain = 0;
164 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
165 CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0,
166 "Log all incoming TCP connections");
167
168 static int blackhole = 0;
169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
170 CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0,
171 "Do not send RST when dropping refused connections");
172
173 int tcp_delack_enabled = 3;
174 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack,
175 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_delack_enabled, 0,
176 "Delay ACK to try and piggyback it onto a data packet");
177
178 int tcp_lq_overflow = 1;
179 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow,
180 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_lq_overflow, 0,
181 "Listen Queue Overflow");
182
183 int tcp_recv_bg = 0;
184 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
185 &tcp_recv_bg, 0, "Receive background");
186
187 #if TCP_DROP_SYNFIN
188 static int drop_synfin = 1;
189 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin,
190 CTLFLAG_RW | CTLFLAG_LOCKED, &drop_synfin, 0,
191 "Drop TCP packets with SYN+FIN set");
192 #endif
193
194 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
195 "TCP Segment Reassembly Queue");
196
197 static int tcp_reass_overflows = 0;
198 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
199 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
200 "Global number of TCP Segment Reassembly Queue Overflows");
201
202
203 __private_extern__ int slowlink_wsize = 8192;
204 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize,
205 CTLFLAG_RW | CTLFLAG_LOCKED,
206 &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
207
208 int maxseg_unacked = 8;
209 SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked,
210 CTLFLAG_RW | CTLFLAG_LOCKED, &maxseg_unacked, 0,
211 "Maximum number of outstanding segments left unacked");
212
213 int tcp_do_rfc3465 = 1;
214 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &tcp_do_rfc3465, 0, "");
216
217 int tcp_do_rfc3465_lim2 = 1;
218 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2,
219 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465_lim2, 0,
220 "Appropriate bytes counting w/ L=2*SMSS");
221
222 int rtt_samples_per_slot = 20;
223
224 int tcp_allowed_iaj = ALLOWED_IAJ;
225 int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
226 u_int32_t tcp_autorcvbuf_inc_shift = 3;
227 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj,
228 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_allowed_iaj, 0,
229 "Allowed inter-packet arrival jiter");
230 #if (DEVELOPMENT || DEBUG)
231 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh,
232 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_high_thresh, 0,
233 "Used in calculating maximum accumulated IAJ");
234
235 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift,
236 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_inc_shift, 0,
237 "Shift for increment in receive socket buffer size");
238 #endif /* (DEVELOPMENT || DEBUG) */
239
240 u_int32_t tcp_do_autorcvbuf = 1;
241 SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf,
242 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autorcvbuf, 0,
243 "Enable automatic socket buffer tuning");
244
245 u_int32_t tcp_autorcvbuf_max = 512 * 1024;
246 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax,
247 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max, 0,
248 "Maximum receive socket buffer size");
249
250 u_int32_t tcp_autorcvbuf_max_ca = 512 * 1024;
251 #if (DEBUG || DEVELOPMENT)
252 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmaxca,
253 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max_ca, 0,
254 "Maximum receive socket buffer size");
255 #endif /* (DEBUG || DEVELOPMENT) */
256
257 int sw_lro = 0;
258 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
259 &sw_lro, 0, "Used to coalesce TCP packets");
260
261 int lrodebug = 0;
262 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg,
263 CTLFLAG_RW | CTLFLAG_LOCKED, &lrodebug, 0,
264 "Used to debug SW LRO");
265
266 int lro_start = 4;
267 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &lro_start, 0,
269 "Segments for starting LRO computed as power of 2");
270
271 extern int tcp_do_autosendbuf;
272
273 int limited_txmt = 1;
274 int early_rexmt = 1;
275 int sack_ackadv = 1;
276 int tcp_dsack_enable = 1;
277
278 #if (DEVELOPMENT || DEBUG)
279 SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit,
280 CTLFLAG_RW | CTLFLAG_LOCKED, &limited_txmt, 0,
281 "Enable limited transmit");
282
283 SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &early_rexmt, 0,
285 "Enable Early Retransmit");
286
287 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv,
288 CTLFLAG_RW | CTLFLAG_LOCKED, &sack_ackadv, 0,
289 "Use SACK with cumulative ack advancement as a dupack");
290
291 SYSCTL_INT(_net_inet_tcp, OID_AUTO, dsack_enable,
292 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_dsack_enable, 0,
293 "use DSACK TCP option to report duplicate segments");
294 #endif /* (DEVELOPMENT || DEBUG) */
295
296 extern int tcp_TCPTV_MIN;
297 extern int tcp_acc_iaj_high;
298 extern int tcp_acc_iaj_react_limit;
299
300 int tcprexmtthresh = 3;
301
302 u_int32_t tcp_now;
303 struct timeval tcp_uptime; /* uptime when tcp_now was last updated */
304 lck_spin_t *tcp_uptime_lock; /* Used to synchronize updates to tcp_now */
305
306 struct inpcbhead tcb;
307 #define tcb6 tcb /* for KAME src sync over BSD*'s */
308 struct inpcbinfo tcbinfo;
309
310 static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
311 struct tcpopt *);
312 static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
313 static void tcp_pulloutofband(struct socket *,
314 struct tcphdr *, struct mbuf *, int);
315 static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
316 struct ifnet *);
317 static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
318 static inline unsigned int tcp_maxmtu(struct rtentry *);
319 static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags);
320 static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
321
322 #if TRAFFIC_MGT
323 static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
324 int reset_size);
325 void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
326 static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
327 #endif /* TRAFFIC_MGT */
328
329 #if INET6
330 static inline unsigned int tcp_maxmtu6(struct rtentry *);
331 #endif
332
333 static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
334 struct tcpopt *to, u_int32_t tlen, u_int32_t rcvbuf_max);
335 void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
336 static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
337 static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
338 static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
339 u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max);
340 static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
341 static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
342 struct tcphdr *th);
343 static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
344 static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
345 struct tcpopt *to);
346 /*
347 * Constants used for resizing receive socket buffer
348 * when timestamps are not supported
349 */
350 #define TCPTV_RCVNOTS_QUANTUM 100
351 #define TCP_RCVNOTS_BYTELEVEL 204800
352
353 /*
354 * Constants used for limiting early retransmits
355 * to 10 per minute.
356 */
357 #define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
358 #define TCP_EARLY_REXMT_LIMIT 10
359
360 extern void ipfwsyslog( int level, const char *format,...);
361 extern int fw_verbose;
362
363 #if IPFIREWALL
364 extern void ipfw_stealth_stats_incr_tcp(void);
365
366 #define log_in_vain_log( a ) { \
367 if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
368 ipfwsyslog a ; \
369 } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) { \
370 ipfw_stealth_stats_incr_tcp(); \
371 } \
372 else log a ; \
373 }
374 #else
375 #define log_in_vain_log( a ) { log a; }
376 #endif
377
378 int tcp_rcvunackwin = TCPTV_UNACKWIN;
379 int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
380 int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
381 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
382 &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");
383
384 #define DELAY_ACK(tp, th) \
385 (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
386
387 static int tcp_dropdropablreq(struct socket *head);
388 static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
389 static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
390 void tcp_set_background_cc(struct socket *so);
391 void tcp_set_foreground_cc(struct socket *so);
392 static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
393 static void tcp_bwmeas_check(struct tcpcb *tp);
394
395 #if TRAFFIC_MGT
396 void
397 reset_acc_iaj(struct tcpcb *tp)
398 {
399 tp->acc_iaj = 0;
400 CLEAR_IAJ_STATE(tp);
401 }
402
403 static inline void
404 update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
405 {
406 if (rst_size > 0)
407 tp->iaj_size = 0;
408 if (tp->iaj_size == 0 || size >= tp->iaj_size) {
409 tp->iaj_size = size;
410 tp->iaj_rcv_ts = tcp_now;
411 tp->iaj_small_pkt = 0;
412 }
413 }
414
415 /* For every 32-bit unsigned integer (v), this function will find the
416 * largest integer n such that (n*n <= v). This takes at most 16 iterations
417 * irrespective of the value of v and does not involve multiplications.
418 */
419 static inline int
420 isqrt(unsigned int val)
421 {
422 unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
423 unsigned int temp, g=0, b=0x8000, bshft=15;
424 if ( val <= 100) {
425 for (g = 0; g <= 10; ++g) {
426 if (sqrt_cache[g] > val) {
427 g--;
428 break;
429 } else if (sqrt_cache[g] == val) {
430 break;
431 }
432 }
433 } else {
434 do {
435 temp = (((g << 1) + b) << (bshft--));
436 if (val >= temp) {
437 g += b;
438 val -= temp;
439 }
440 b >>= 1;
441 } while ( b > 0 && val > 0);
442 }
443 return(g);
444 }
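/*
 * Worked example (illustrative, not part of the original source):
 * isqrt(77) takes the table path since 77 <= 100; the loop stops at
 * g = 9 because sqrt_cache[9] = 81 > 77, then backs up to return 8
 * (8*8 = 64 <= 77 < 81). Exact squares such as isqrt(100) return
 * their root directly via the equality branch.
 */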
445
446 /*
447 * With LRO, roughly estimate the inter-arrival time between
448 * each sub-coalesced packet as an average. Take cur_iaj to be
449 * the delay between the last packet received and the first
450 * packet of the LRO stream. Due to round-off errors cur_iaj
451 * may be the same as lro_delay_factor. Averaging has round-off
452 * errors too. lro_delay_factor may be close to 0 in steady
453 * state, leading to lower values fed to compute_iaj_meat.
454 */
455 void
456 compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
457 {
458 uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
459 uint32_t timediff = 0;
460
461 if (cur_iaj >= lro_delay_factor) {
462 cur_iaj = cur_iaj - lro_delay_factor;
463 }
464
465 compute_iaj_meat(tp, cur_iaj);
466
467 if (nlropkts <= 1)
468 return;
469
470 nlropkts--;
471
472 timediff = lro_delay_factor/nlropkts;
473
474 while (nlropkts > 0)
475 {
476 compute_iaj_meat(tp, timediff);
477 nlropkts--;
478 }
479 }
480
481 static
482 void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
483 {
484 /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
485 * throttle the receive window to a minimum of MIN_IAJ_WIN packets
486 */
487 #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
488 #define IAJ_DIV_SHIFT 4
489 #define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
490
491 uint32_t allowed_iaj, acc_iaj = 0;
492
493 uint32_t mean, temp;
494 int32_t cur_iaj_dev;
495
496 cur_iaj_dev = (cur_iaj - tp->avg_iaj);
497
498 /* Allow a jitter of "allowed_iaj" milliseconds. Some connections
499 * may have a constant jitter of more than that. We detect this by
500 * using the standard deviation.
501 */
502 allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
503 if (allowed_iaj < tcp_allowed_iaj)
504 allowed_iaj = tcp_allowed_iaj;
505
506 /* Initially, when the connection starts, the sender's congestion
507 * window is small. During this period we avoid throttling a
508 * connection because we do not have a good starting point for
509 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
510 * the first few packets.
511 */
512 if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
513 if ( cur_iaj <= allowed_iaj ) {
514 if (tp->acc_iaj >= 2)
515 acc_iaj = tp->acc_iaj - 2;
516 else
517 acc_iaj = 0;
518
519 } else {
520 acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
521 }
522
523 if (acc_iaj > MAX_ACC_IAJ)
524 acc_iaj = MAX_ACC_IAJ;
525 tp->acc_iaj = acc_iaj;
526 }
527
528 /* Compute a weighted average where the history has a weight of
529 * 15 out of 16 and the current value has a weight of 1 out of 16,
530 * so short-term fluctuations are smoothed out.
531 *
532 * The addition of 8 (IAJ_ROUNDUP_CONST) rounds the value up
533 * instead of down.
534 */
535 tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
536 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
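/*
 * Worked example (illustrative, hypothetical values): with
 * IAJ_DIV_SHIFT = 4 this computes (15*avg_iaj + cur_iaj + 8) / 16.
 * For avg_iaj = 20 ms and cur_iaj = 36 ms the new average is
 * (300 + 36 + 8) >> 4 = 21 ms, i.e. the average moves only 1/16 of
 * the way toward the new sample.
 */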
537
538 /* Compute the root-mean-square of the deviation, where the mean is
539 * a weighted average as described above.
540 */
541 temp = tp->std_dev_iaj * tp->std_dev_iaj;
542 mean = (((temp << IAJ_DIV_SHIFT) - temp)
543 + (cur_iaj_dev * cur_iaj_dev)
544 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
545
546 tp->std_dev_iaj = isqrt(mean);
547
548 DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
549 uint32_t, allowed_iaj);
550
551 return;
552 }
553 #endif /* TRAFFIC_MGT */
554
555 /* Check whether enough data has been acknowledged since the
556 * bandwidth measurement was started.
557 */
558 static void
559 tcp_bwmeas_check(struct tcpcb *tp)
560 {
561 int32_t bw_meas_bytes;
562 uint32_t bw, bytes, elapsed_time;
563 bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
564 if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
565 bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
566 bytes = bw_meas_bytes;
567 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
568 if (elapsed_time > 0) {
569 bw = bytes / elapsed_time;
570 if ( bw > 0) {
571 if (tp->t_bwmeas->bw_sndbw > 0) {
572 tp->t_bwmeas->bw_sndbw =
573 (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
574 } else {
575 tp->t_bwmeas->bw_sndbw = bw;
576 }
577 }
578 }
579 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
580 }
581 }
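/*
 * Worked example (illustrative, hypothetical values): bw is measured in
 * bytes per tcp_now tick. The update above keeps 7/8 of the history:
 * with a previous bw_sndbw of 1000 and a new sample bw of 2000, the
 * result is ((1000 << 3) - 1000 + 2000) >> 3 = 9000 >> 3 = 1125.
 */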
582
583 static int
584 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
585 struct ifnet *ifp)
586 {
587 struct tseg_qent *q;
588 struct tseg_qent *p = NULL;
589 struct tseg_qent *nq;
590 struct tseg_qent *te = NULL;
591 struct inpcb *inp = tp->t_inpcb;
592 struct socket *so = inp->inp_socket;
593 int flags = 0;
594 int dowakeup = 0;
595 struct mbuf *oodata = NULL;
596 int copy_oodata = 0;
597 u_int16_t qlimit;
598 boolean_t cell = IFNET_IS_CELLULAR(ifp);
599 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
600 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
601 boolean_t dsack_set = FALSE;
602
603 /*
604 * Call with th==0 after becoming established to
605 * force pre-ESTABLISHED data up to the user socket.
606 */
607 if (th == NULL)
608 goto present;
609
610 /*
611 * If the reassembly queue already has entries or if we are going
612 * to add a new one, then the connection has reached a loss state.
613 * Reset the stretch-ack algorithm at this point.
614 */
615 tcp_reset_stretch_ack(tp);
616
617 #if TRAFFIC_MGT
618 if (tp->acc_iaj > 0)
619 reset_acc_iaj(tp);
620 #endif /* TRAFFIC_MGT */
621
622 /*
623 * Limit the number of segments in the reassembly queue to prevent
624 * holding on to too many segments (and thus running out of mbufs).
625 * Make sure to let through the missing segment that caused this
626 * queue to form. Always keep one global queue entry spare to be
627 * able to process the missing segment.
628 */
629 qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
630 (TCP_AUTORCVBUF_MAX(ifp) >> 10));
631 if (th->th_seq != tp->rcv_nxt &&
632 (tp->t_reassqlen + 1) >= qlimit) {
633 tcp_reass_overflows++;
634 tcpstat.tcps_rcvmemdrop++;
635 m_freem(m);
636 *tlenp = 0;
637 return (0);
638 }
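/*
 * Worked example (illustrative, hypothetical values): with a receive
 * high-water mark of 128 KB, sb_hiwat >> 10 = 128, so qlimit =
 * min(max(100, 128), TCP_AUTORCVBUF_MAX(ifp) >> 10) = 128 entries when
 * the autorcvbuf maximum is the 512 KB default declared above. An
 * out-of-order segment arriving once the queue is near 128 entries is
 * dropped here rather than exhausting mbufs.
 */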
639
640 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
641 te = (struct tseg_qent *) zalloc(tcp_reass_zone);
642 if (te == NULL) {
643 tcpstat.tcps_rcvmemdrop++;
644 m_freem(m);
645 return (0);
646 }
647 tp->t_reassqlen++;
648
649 /*
650 * Find a segment which begins after this one does.
651 */
652 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
653 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
654 break;
655 p = q;
656 }
657
658 /*
659 * If there is a preceding segment, it may provide some of
660 * our data already. If so, drop the data from the incoming
661 * segment. If it provides all of our data, drop us.
662 */
663 if (p != NULL) {
664 int i;
665 /* conversion to int (in i) handles seq wraparound */
666 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
667 if (i > 0) {
668 if (TCP_DSACK_ENABLED(tp) && i > 1) {
669 /*
670 * Note duplicate data sequence numbers
671 * to report in DSACK option
672 */
673 tp->t_dsack_lseq = th->th_seq;
674 tp->t_dsack_rseq = th->th_seq +
675 min(i, *tlenp);
676
677 /*
678 * Report only the first part of partial/
679 * non-contiguous duplicate sequence space
680 */
681 dsack_set = TRUE;
682 }
683 if (i >= *tlenp) {
684 tcpstat.tcps_rcvduppack++;
685 tcpstat.tcps_rcvdupbyte += *tlenp;
686 if (nstat_collect) {
687 nstat_route_rx(inp->inp_route.ro_rt,
688 1, *tlenp,
689 NSTAT_RX_FLAG_DUPLICATE);
690 INP_ADD_STAT(inp, cell, wifi, wired,
691 rxpackets, 1);
692 INP_ADD_STAT(inp, cell, wifi, wired,
693 rxbytes, *tlenp);
694 tp->t_stat.rxduplicatebytes += *tlenp;
695 }
696 m_freem(m);
697 zfree(tcp_reass_zone, te);
698 te = NULL;
699 tp->t_reassqlen--;
700 /*
701 * Try to present any queued data
702 * at the left window edge to the user.
703 * This is needed after the 3-WHS
704 * completes.
705 */
706 goto present;
707 }
708 m_adj(m, i);
709 *tlenp -= i;
710 th->th_seq += i;
711 }
712 }
713 tp->t_rcvoopack++;
714 tcpstat.tcps_rcvoopack++;
715 tcpstat.tcps_rcvoobyte += *tlenp;
716 if (nstat_collect) {
717 nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
718 NSTAT_RX_FLAG_OUT_OF_ORDER);
719 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
720 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
721 tp->t_stat.rxoutoforderbytes += *tlenp;
722 }
723
724 /*
725 * While we overlap succeeding segments trim them or,
726 * if they are completely covered, dequeue them.
727 */
728 while (q) {
729 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
730 if (i <= 0)
731 break;
732
733 /*
734 * Report only the first part of partial/non-contiguous
735 * duplicate segment in dsack option. The variable
736 * dsack_set will be true if a previous entry has some of
737 * the duplicate sequence space.
738 */
739 if (TCP_DSACK_ENABLED(tp) && i > 1 && !dsack_set) {
740 if (tp->t_dsack_lseq == 0) {
741 tp->t_dsack_lseq = q->tqe_th->th_seq;
742 tp->t_dsack_rseq =
743 tp->t_dsack_lseq + min(i, q->tqe_len);
744 } else {
745 /*
746 * this segment overlaps data in multiple
747 * entries in the reassembly queue, move
748 * the right sequence number further.
749 */
750 tp->t_dsack_rseq =
751 tp->t_dsack_rseq + min(i, q->tqe_len);
752 }
753 }
754 if (i < q->tqe_len) {
755 q->tqe_th->th_seq += i;
756 q->tqe_len -= i;
757 m_adj(q->tqe_m, i);
758 break;
759 }
760
761 nq = LIST_NEXT(q, tqe_q);
762 LIST_REMOVE(q, tqe_q);
763 m_freem(q->tqe_m);
764 zfree(tcp_reass_zone, q);
765 tp->t_reassqlen--;
766 q = nq;
767 }
768
769 /* Insert the new segment queue entry into place. */
770 te->tqe_m = m;
771 te->tqe_th = th;
772 te->tqe_len = *tlenp;
773
774 if (p == NULL) {
775 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
776 } else {
777 LIST_INSERT_AFTER(p, te, tqe_q);
778 }
779
780 /*
781 * New out-of-order data exists, and is pointed to by
782 * queue entry te. Set copy_oodata to 1 so out-of-order data
783 * can be copied off to sockbuf after in-order data
784 * is copied off.
785 */
786 if (!(so->so_state & SS_CANTRCVMORE))
787 copy_oodata = 1;
788
789 present:
790 /*
791 * Present data to user, advancing rcv_nxt through
792 * completed sequence space.
793 */
794 if (!TCPS_HAVEESTABLISHED(tp->t_state))
795 return (0);
796 q = LIST_FIRST(&tp->t_segq);
797 if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
798 /* Stop using LRO once out of order packets arrive */
799 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
800 tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
801 th->th_dport, th->th_sport);
802 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
803 }
804
805 /*
806 * continue processing if out-of-order data
807 * can be delivered
808 */
809 if (q && (so->so_flags & SOF_ENABLE_MSGS))
810 goto msg_unordered_delivery;
811
812 return (0);
813 }
814
815 /*
816 * If there is already another thread doing reassembly for this
817 * connection, it is better to let it finish the job --
818 * (radar 16316196)
819 */
820 if (tp->t_flagsext & TF_REASS_INPROG)
821 return (0);
822
823 tp->t_flagsext |= TF_REASS_INPROG;
824 /* lost packet was recovered, so ooo data can be returned */
825 tcpstat.tcps_recovered_pkts++;
826
827 do {
828 tp->rcv_nxt += q->tqe_len;
829 flags = q->tqe_th->th_flags & TH_FIN;
830 LIST_REMOVE(q, tqe_q);
831 if (so->so_state & SS_CANTRCVMORE) {
832 m_freem(q->tqe_m);
833 } else {
834 so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
835 if (so->so_flags & SOF_ENABLE_MSGS) {
836 /*
837 * Append the inorder data as a message to the
838 * receive socket buffer. Also check to see if
839 * the data we are about to deliver is the same
840 * data that we wanted to pass up to the user
841 * out of order. If so, reset copy_oodata --
842 * the received data filled a gap, and
843 * is now in order!
844 */
845 if (q == te)
846 copy_oodata = 0;
847 }
848 if (sbappendstream_rcvdemux(so, q->tqe_m,
849 q->tqe_th->th_seq - (tp->irs + 1), 0))
850 dowakeup = 1;
851 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
852 tcp_update_lro_seq(tp->rcv_nxt,
853 inp->inp_laddr, inp->inp_faddr,
854 th->th_dport, th->th_sport);
855 }
856 }
857 zfree(tcp_reass_zone, q);
858 tp->t_reassqlen--;
859 q = LIST_FIRST(&tp->t_segq);
860 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
861 tp->t_flagsext &= ~TF_REASS_INPROG;
862
863 #if INET6
864 if ((inp->inp_vflag & INP_IPV6) != 0) {
865
866 KERNEL_DEBUG(DBG_LAYER_BEG,
867 ((inp->inp_fport << 16) | inp->inp_lport),
868 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
869 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
870 0,0,0);
871 }
872 else
873 #endif
874 {
875 KERNEL_DEBUG(DBG_LAYER_BEG,
876 ((inp->inp_fport << 16) | inp->inp_lport),
877 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
878 (inp->inp_faddr.s_addr & 0xffff)),
879 0,0,0);
880 }
881
882 msg_unordered_delivery:
883 /* Deliver out-of-order data as a message */
884 if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
885 /*
886 * make a copy of the mbuf to be delivered up to
887 * the user, and add it to the sockbuf
888 */
889 oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
890 if (oodata != NULL) {
891 if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
892 te->tqe_th->th_seq - (tp->irs + 1), 1)) {
893 dowakeup = 1;
894 tcpstat.tcps_msg_unopkts++;
895 } else {
896 tcpstat.tcps_msg_unoappendfail++;
897 }
898 }
899 }
900
901 if (dowakeup)
902 sorwakeup(so); /* done with socket lock held */
903 return (flags);
904 }
905
906 /*
907 * Reduce congestion window -- used when ECN is seen or when a tail loss
908 * probe recovers the last packet.
909 */
910 static void
911 tcp_reduce_congestion_window(
912 struct tcpcb *tp)
913 {
914 /*
915 * If the current tcp cc module has
916 * defined a hook for tasks to run
917 * before entering FR, call it
918 */
919 if (CC_ALGO(tp)->pre_fr != NULL)
920 CC_ALGO(tp)->pre_fr(tp);
921 ENTER_FASTRECOVERY(tp);
922 if (tp->t_flags & TF_SENTFIN)
923 tp->snd_recover = tp->snd_max - 1;
924 else
925 tp->snd_recover = tp->snd_max;
926 tp->t_timer[TCPT_REXMT] = 0;
927 tp->t_timer[TCPT_PTO] = 0;
928 tp->t_rtttime = 0;
929 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
930 tcp_cc_adjust_nonvalidated_cwnd(tp);
931 } else {
932 tp->snd_cwnd = tp->snd_ssthresh +
933 tp->t_maxseg * tcprexmtthresh;
934 }
935 }
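/*
 * Worked example (illustrative, hypothetical values): outside of the
 * non-validated-cwnd case, the window collapses to
 * snd_ssthresh + tcprexmtthresh * t_maxseg. With ssthresh = 20 * 1448
 * = 28960 bytes and t_maxseg = 1448, the new cwnd is 28960 + 3 * 1448
 * = 33304 bytes (tcprexmtthresh defaults to 3 above).
 */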
936
937 /*
938 * This function is called upon reception of data on a socket. Its purpose is
939 * to handle the adaptive keepalive timers that monitor whether the connection
940 * is making progress. First the adaptive read-timer, second the TFO probe-timer.
941 *
942 * The application wants to get an event if there is a stall during read.
943 * Set the initial keepalive timeout to be equal to twice RTO.
944 *
945 * If the outgoing interface is in marginal conditions, we need to
946 * enable read probes for that too.
947 */
948 static inline void
949 tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
950 {
951 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
952
953 if ((tp->t_adaptive_rtimo > 0 ||
954 (outifp != NULL &&
955 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
956 && tlen > 0 &&
957 tp->t_state == TCPS_ESTABLISHED) {
958 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
959 (TCP_REXMTVAL(tp) << 1));
960 tp->t_flagsext |= TF_DETECT_READSTALL;
961 tp->t_rtimo_probes = 0;
962 }
963 }
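/*
 * Illustrative note (not part of the original source): TCP_REXMTVAL(tp) << 1
 * arms the keepalive timer at twice the current retransmission timeout,
 * so with an RTO of 600 ms the first read-stall probe would fire roughly
 * 1.2 seconds later, unless tcp_keepalive_reset() rearms the timer first.
 */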
964
965 inline void
966 tcp_keepalive_reset(struct tcpcb *tp)
967 {
968 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
969 TCP_CONN_KEEPIDLE(tp));
970 tp->t_flagsext &= ~(TF_DETECT_READSTALL);
971 tp->t_rtimo_probes = 0;
972 }
973
974 /*
975 * TCP input routine, follows pages 65-76 of the
976 * protocol specification dated September, 1981 very closely.
977 */
978 #if INET6
979 int
980 tcp6_input(struct mbuf **mp, int *offp, int proto)
981 {
982 #pragma unused(proto)
983 struct mbuf *m = *mp;
984 uint32_t ia6_flags;
985 struct ifnet *ifp = m->m_pkthdr.rcvif;
986
987 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
988
989 /* Expect 32-bit aligned data pointer on strict-align platforms */
990 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
991
992 /*
993 * draft-itojun-ipv6-tcp-to-anycast
994 * better place to put this in?
995 */
996 if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
997 if (ia6_flags & IN6_IFF_ANYCAST) {
998 struct ip6_hdr *ip6;
999
1000 ip6 = mtod(m, struct ip6_hdr *);
1001 icmp6_error(m, ICMP6_DST_UNREACH,
1002 ICMP6_DST_UNREACH_ADDR,
1003 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
1004
1005 IF_TCP_STATINC(ifp, icmp6unreach);
1006
1007 return (IPPROTO_DONE);
1008 }
1009 }
1010
1011 tcp_input(m, *offp);
1012 return (IPPROTO_DONE);
1013 }
1014 #endif
1015
1016 /* Depending on the usage of mbuf space in the system, this function
1017 * will return true or false. This is used to determine if a socket
1018 * buffer can take more memory from the system for auto-tuning or not.
1019 */
1020 u_int8_t
1021 tcp_cansbgrow(struct sockbuf *sb)
1022 {
1023 /* Calculate the host level space limit in terms of MSIZE buffers.
1024 * We can use a maximum of half of the available mbuf space for
1025 * socket buffers.
1026 */
1027 u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
1028
1029 /* Calculate per sb limit in terms of bytes. We optimize this limit
1030 * for up to 16 socket buffers.
1031 */
1032
1033 u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
1034
1035 if ((total_sbmb_cnt < mblim) &&
1036 (sb->sb_hiwat < sbspacelim)) {
1037 return(1);
1038 } else {
1039 OSIncrementAtomic64(&sbmb_limreached);
1040 }
1041 return(0);
1042 }
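/*
 * Worked example (illustrative; assumes the common MCLSHIFT = 11 and
 * MSIZESHIFT = 8, i.e. 2 KB clusters and 256-byte mbufs): with
 * nmbclusters = 65536, mblim = 32768 << 3 = 262144 MSIZE buffers may be
 * used for socket buffers system-wide, and sbspacelim = 4096 << 11 =
 * 8 MB caps any single buffer's high-water mark before growth is denied.
 */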
1043
1044 static void
1045 tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
1046 u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max)
1047 {
1048 /* newsize should not exceed max */
1049 newsize = min(newsize, rcvbuf_max);
1050
1051 /* The receive window scale negotiated at the
1052 * beginning of the connection will also set a
1053 * limit on the socket buffer size
1054 */
1055 newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
1056
1057 /* Set new socket buffer size */
1058 if (newsize > sbrcv->sb_hiwat &&
1059 (sbreserve(sbrcv, newsize) == 1)) {
1060 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
1061 (idealsize != 0) ? idealsize : newsize), rcvbuf_max);
1062
1063 /* Again check the limit set by the advertised
1064 * window scale
1065 */
1066 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
1067 TCP_MAXWIN << tp->rcv_scale);
1068 }
1069 }
1070
1071 /*
1072 * This function is used to grow a receive socket buffer. It
1073 * will take into account system-level memory usage and the
1074 * bandwidth available on the link to make a decision.
1075 */
1076 static void
1077 tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
1078 struct tcpopt *to, u_int32_t pktlen, u_int32_t rcvbuf_max)
1079 {
1080 struct socket *so = sbrcv->sb_so;
1081
1082 /*
1083 * Do not grow the receive socket buffer if
1084 * - auto resizing is disabled, globally or on this socket
1085 * - the high water mark already reached the maximum
1086 * - the stream is in background and receive side is being
1087 * throttled
1088 * - if there are segments in reassembly queue indicating loss,
1089 * do not need to increase recv window during recovery as more
1090 * data is not going to be sent. A duplicate ack sent during
1091 * recovery should not change the receive window
1092 */
1093 if (tcp_do_autorcvbuf == 0 ||
1094 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1095 tcp_cansbgrow(sbrcv) == 0 ||
1096 sbrcv->sb_hiwat >= rcvbuf_max ||
1097 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1098 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
1099 !LIST_EMPTY(&tp->t_segq)) {
1100 /* Cannot resize the socket buffer, just return */
1101 goto out;
1102 }
1103
1104 if (TSTMP_GT(tcp_now,
1105 tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1106 /* If there has been an idle period in the
1107 * connection, just restart the measurement
1108 */
1109 goto out;
1110 }
1111
1112 if (!TSTMP_SUPPORTED(tp)) {
1113 /*
1114 * Timestamp option is not supported on this connection.
1115 * If the connection reached a state to indicate that
1116 * the receive socket buffer needs to grow, increase
1117 * the high water mark.
1118 */
1119 if (TSTMP_GEQ(tcp_now,
1120 tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1121 if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1122 tcp_sbrcv_reserve(tp, sbrcv,
1123 tcp_autorcvbuf_max, 0,
1124 tcp_autorcvbuf_max);
1125 }
1126 goto out;
1127 } else {
1128 tp->rfbuf_cnt += pktlen;
1129 return;
1130 }
1131 } else if (to->to_tsecr != 0) {
1132 /*
1133 * If the timestamp shows that one RTT has
1134 * completed, we can stop counting the
1135 * bytes. Here we consider increasing
1136 * the socket buffer if the bandwidth measured in the
1137 * last RTT is more than half of sb_hiwat; this helps
1138 * scale the buffer according to the bandwidth
1139 * on the link.
1140 */
1141 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1142 if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1143 (sbrcv->sb_hiwat >> 1))) {
1144 int32_t rcvbuf_inc, min_incr;
1145 /*
1146 * Increment the receive window by a
1147 * multiple of maximum sized segments.
1148 * This will prevent a connection from
1149 * sending smaller segments on wire if it
1150 * is limited by the receive window.
1151 *
1152 * Set the ideal size based on current
1153 * bandwidth measurements. We set the
1154 * ideal size on receive socket buffer to
1155 * be twice the bandwidth delay product.
1156 */
1157 rcvbuf_inc = (tp->rfbuf_cnt << 1)
1158 - sbrcv->sb_hiwat;
1159
1160 /*
1161 * Make the increment equal to 8 segments
1162 * at least
1163 */
1164 min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1165 if (rcvbuf_inc < min_incr)
1166 rcvbuf_inc = min_incr;
1167
1168 rcvbuf_inc =
1169 (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1170 tcp_sbrcv_reserve(tp, sbrcv,
1171 sbrcv->sb_hiwat + rcvbuf_inc,
1172 (tp->rfbuf_cnt * 2), rcvbuf_max);
1173 }
1174 goto out;
1175 } else {
1176 tp->rfbuf_cnt += pktlen;
1177 return;
1178 }
1179 }
1180 out:
1181 /* Restart the measurement */
1182 tp->rfbuf_ts = 0;
1183 tp->rfbuf_cnt = 0;
1184 return;
1185 }
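/*
 * Worked example (illustrative, hypothetical values, t_maxseg = 1448):
 * with sb_hiwat = 128 KB and rfbuf_cnt = 96 KB received in the last RTT
 * (more than half of sb_hiwat), rcvbuf_inc = 2 * 96 KB - 128 KB = 64 KB,
 * which exceeds the 8-segment minimum of 1448 << 3 = 11584 bytes. After
 * rounding down to a multiple of t_maxseg (45 * 1448 = 65160), the
 * buffer is grown toward ~192 KB with an ideal size of twice the
 * per-RTT byte count, still subject to rcvbuf_max and the window scale.
 */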
1186
1187 /* This function will trim the excess space added to the socket buffer
1188 * to help a slow-reading app. The ideal-size of a socket buffer depends
1189 * on the link bandwidth or it is set by an application and we aim to
1190 * reach that size.
1191 */
1192 void
1193 tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv)
1194 {
1195 if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1196 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1197 int32_t trim;
1198 /* compute the difference between ideal and current sizes */
1199 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1200
1201 /* Compute the maximum advertised window for
1202 * this connection.
1203 */
1204 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1205
1206 /* How much can we trim the receive socket buffer?
1207 * 1. it can not be trimmed beyond the max rcv win advertised
1208 * 2. if possible, leave 1/16 of bandwidth*delay to
1209 * avoid closing the win completely
1210 */
1211 u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1212
1213 /* Sometimes leave can be zero, in that case leave at least
1214 * a few segments worth of space.
1215 */
1216 if (leave == 0)
1217 leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1218
1219 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1220 trim = imin(trim, (int32_t)diff);
1221
1222 if (trim > 0)
1223 sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1224 }
1225 }
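/*
 * Worked example (illustrative, hypothetical values): with
 * sb_hiwat = 512 KB, sb_idealsize = 256 KB, 64 KB queued (sb_cc) and a
 * 200 KB advertised window, leave = max(200 KB, 256 KB >> 4) = 200 KB,
 * so trim = 512 KB - (64 KB + 200 KB) = 248 KB, which is within the
 * 256 KB difference from the ideal size; the buffer shrinks to 264 KB.
 */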
1226
1227 /* We may need to trim the send socket buffer size for two reasons:
1228 * 1. if the rtt seen on the connection is climbing up, we do not
1229 * want to fill the buffers any more.
1230 * 2. if the congestion win on the socket backed off, there is no need
1231 * to hold more mbufs for that connection than what the cwnd will allow.
1232 */
1233 void
1234 tcp_sbsnd_trim(struct sockbuf *sbsnd)
1235 {
1236 if (tcp_do_autosendbuf == 1 &&
1237 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1238 (SB_AUTOSIZE | SB_TRIM)) &&
1239 (sbsnd->sb_idealsize > 0) &&
1240 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1241 u_int32_t trim = 0;
1242 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1243 trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1244 } else {
1245 trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1246 }
1247 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1248 }
1249 if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1250 sbsnd->sb_flags &= ~(SB_TRIM);
1251 }
1252
1253 /*
1254 * If timestamp option was not negotiated on this connection
1255 * and this connection is on the receiving side of a stream
1256 * then we can not measure the delay on the link accurately.
1257 * Instead of enabling automatic receive socket buffer
1258 * resizing, just give more space to the receive socket buffer.
1259 */
1260 static inline void
1261 tcp_sbrcv_tstmp_check(struct tcpcb *tp)
1262 {
1263 struct socket *so = tp->t_inpcb->inp_socket;
1264 u_int32_t newsize = 2 * tcp_recvspace;
1265 struct sockbuf *sbrcv = &so->so_rcv;
1266
1267 if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1268 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1269 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1270 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize);
1271 }
1272 }
1273
1274 /* A receiver will evaluate the flow of packets on a connection
1275 * to see if it can reduce ack traffic. The receiver will start
1276 * stretching acks if all of the following conditions are met:
1277 * 1. tcp_delack_enabled is set to 3
1278 * 2. If the bytes received in the last 100ms are greater than a threshold
1279 * defined by maxseg_unacked
1280 * 3. If the connection has not been idle for the tcp_maxrcvidle period.
1281 * 4. If the connection has seen enough packets to let the slow-start
1282 * finish after connection establishment or after some packet loss.
1283 *
1284 * The receiver will stop stretching acks if there is congestion/reordering
1285 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1286 * timer fires while stretching acks, it means that the packet flow has gone
1287 * below the threshold defined by maxseg_unacked and the receiver will stop
1288 * stretching acks. The receiver gets no indication when slow-start is completed
1289 * or when the connection reaches an idle state. That is why we use
1290 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1291 * state.
1292 */
1293 static inline int
1294 tcp_stretch_ack_enable(struct tcpcb *tp, int thflags)
1295 {
1296 if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1297 TSTMP_GEQ(tp->rcv_unackwin, tcp_now))
1298 tp->t_flags |= TF_STREAMING_ON;
1299 else
1300 tp->t_flags &= ~TF_STREAMING_ON;
1301
1302 /* If there has been an idle time, reset streaming detection */
1303 if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle))
1304 tp->t_flags &= ~TF_STREAMING_ON;
1305
1306 /*
1307 * If there are flags other than TH_ACK set, reset streaming
1308 * detection
1309 */
1310 if (thflags & ~TH_ACK)
1311 tp->t_flags &= ~TF_STREAMING_ON;
1312
1313 if (tp->t_flagsext & TF_DISABLE_STRETCHACK) {
1314 if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) {
1315 tp->t_flagsext &= ~TF_DISABLE_STRETCHACK;
1316 tp->rcv_nostrack_pkts = 0;
1317 tp->rcv_nostrack_ts = 0;
1318 } else {
1319 tp->rcv_nostrack_pkts++;
1320 }
1321 }
1322
1323 if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
1324 (tp->t_flags & TF_STREAMING_ON) &&
1325 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1326 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1327 return(1);
1328 }
1329
1330 return(0);
1331 }
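/*
 * Illustrative note (hypothetical t_maxseg = 1448): with the
 * maxseg_unacked default of 8 declared above, stretching requires at
 * least 8 * 1448 = 11584 bytes to have arrived within the current
 * unacknowledged window (rcv_by_unackwin) before TF_STREAMING_ON is
 * set, and any non-ACK flag or an idle period clears it again.
 */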
1332
1333 /*
1334 * Reset the state related to stretch-ack algorithm. This will make
1335 * the receiver generate an ack every other packet. The receiver
1336 * will start re-evaluating the rate at which packets come to decide
1337 * if it can benefit by lowering the ack traffic.
1338 */
1339 void
1340 tcp_reset_stretch_ack(struct tcpcb *tp)
1341 {
1342 tp->t_flags &= ~(TF_STRETCHACK|TF_STREAMING_ON);
1343 tp->rcv_by_unackwin = 0;
1344 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1345
1346 /*
1347 * When there is packet loss or packet re-ordering or CWR due to
1348 * ECN, the sender's congestion window is reduced. In these states,
1349 * generate an ack for every other packet for some time to allow
1350 * the sender's congestion window to grow.
1351 */
1352 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1353 tp->rcv_waitforss = 0;
1354 }
1355
1356 /*
1357 * The last packet was a retransmission, check if this ack
1358 * indicates that the retransmission was spurious.
1359 *
1360 * If the connection supports timestamps, we use them to
1361 * detect whether the last retransmit was needed. Otherwise,
1362 * if the ACK arrived within an RTT/2 window, it was a
1363 * mistake to do the retransmit in the first place.
1364 *
1365 * This function will return 1 if it is a spurious retransmit,
1366 * 0 otherwise.
1367 */
1368 int
1369 tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1370 struct tcpopt *to, u_int32_t rxtime)
1371 {
1372 int32_t tdiff, bad_rexmt_win;
1373 bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1374
1375 /* If the ack has ECN CE bit, then cwnd has to be adjusted */
1376 if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))
1377 return (0);
1378 if (TSTMP_SUPPORTED(tp)) {
1379 if (rxtime > 0 && (to->to_flags & TOF_TS)
1380 && to->to_tsecr != 0
1381 && TSTMP_LT(to->to_tsecr, rxtime))
1382 return (1);
1383 } else {
1384 if ((tp->t_rxtshift == 1
1385 || (tp->t_flagsext & TF_SENT_TLPROBE))
1386 && rxtime > 0) {
1387 tdiff = (int32_t)(tcp_now - rxtime);
1388 if (tdiff < bad_rexmt_win)
1389 return(1);
1390 }
1391 }
1392 return(0);
1393 }
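/*
 * Worked example (illustrative; assumes the usual TCP_RTT_SHIFT of 5,
 * so t_srtt is the smoothed RTT scaled by 32): for a 100 ms smoothed
 * RTT, t_srtt ~= 3200 and bad_rexmt_win = 3200 >> 6 = 50 ms, i.e. half
 * the RTT. Without timestamps, an ACK for the retransmitted data that
 * arrives within 50 ms of the retransmission is treated as evidence
 * that the original segment was not actually lost.
 */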
1394
1395
1396 /*
1397 * Restore congestion window state if a spurious timeout
1398 * was detected.
1399 */
1400 static void
1401 tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1402 {
1403 if (TSTMP_SUPPORTED(tp)) {
1404 u_int32_t fsize, acked;
1405 fsize = tp->snd_max - th->th_ack;
1406 acked = BYTES_ACKED(th, tp);
1407
1408 /*
1409 * Implement bad retransmit recovery as
1410 * described in RFC 4015.
1411 */
1412 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1413
1414 /* Initialize cwnd to the initial window */
1415 if (CC_ALGO(tp)->cwnd_init != NULL)
1416 CC_ALGO(tp)->cwnd_init(tp);
1417
1418 tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1419
1420 } else {
1421 tp->snd_cwnd = tp->snd_cwnd_prev;
1422 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1423 if (tp->t_flags & TF_WASFRECOVERY)
1424 ENTER_FASTRECOVERY(tp);
1425
1426 /* Do not use the loss flight size in this case */
1427 tp->t_lossflightsize = 0;
1428 }
1429 tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
1430 tp->snd_recover = tp->snd_recover_prev;
1431 tp->snd_nxt = tp->snd_max;
1432
1433 /* Fix send socket buffer to reflect the change in cwnd */
1434 tcp_bad_rexmt_fix_sndbuf(tp);
1435
1436 /*
1437 * This RTT might reflect the extra delay induced
1438 * by the network. Skip using this sample for RTO
1439 * calculation and mark the connection so we can
1440 * recompute RTT when the next eligible sample is
1441 * found.
1442 */
1443 tp->t_flagsext |= TF_RECOMPUTE_RTT;
1444 tp->t_badrexmt_time = tcp_now;
1445 tp->t_rtttime = 0;
1446 }
1447
1448 /*
1449 * If the previous packet was sent in retransmission timer, and it was
1450 * not needed, then restore the congestion window to the state before that
1451 * transmission.
1452 *
1453 * If the last packet was sent in tail loss probe timeout, check if that
1454 * recovered the last packet. If so, that will indicate a real loss and
1455 * the congestion window needs to be lowered.
1456 */
1457 static void
1458 tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1459 {
1460 if (tp->t_rxtshift > 0 &&
1461 tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1462 ++tcpstat.tcps_sndrexmitbad;
1463 tcp_bad_rexmt_restore_state(tp, th);
1464 tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1465 } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
1466 && tp->t_tlphighrxt > 0
1467 && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
1468 && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1469 /*
1470 * check DSACK information also to make sure that
1471 * the TLP was indeed needed
1472 */
1473 if (tcp_rxtseg_dsack_for_tlp(tp)) {
1474 /*
1475 * received a DSACK to indicate that TLP was
1476 * not needed
1477 */
1478 tcp_rxtseg_clean(tp);
1479 goto out;
1480 }
1481
1482 /*
1483 * The tail loss probe recovered the last packet and
1484 * we need to adjust the congestion window to take
1485 * this loss into account.
1486 */
1487 ++tcpstat.tcps_tlp_recoverlastpkt;
1488 if (!IN_FASTRECOVERY(tp)) {
1489 tcp_reduce_congestion_window(tp);
1490 EXIT_FASTRECOVERY(tp);
1491 }
1492 tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1493 } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
1494 /*
1495 * All of the retransmitted segments were duplicated, this
1496 * can be an indication of bad fast retransmit.
1497 */
1498 tcpstat.tcps_dsack_badrexmt++;
1499 tcp_bad_rexmt_restore_state(tp, th);
1500 tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
1501 tcp_rxtseg_clean(tp);
1502 }
1503 out:
1504 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1505 tp->t_tlphighrxt = 0;
1506 tp->t_tlpstart = 0;
1507
1508 /*
1509 * check if the latest ack was for a segment sent during PMTU
1510 * blackhole detection. If the timestamp on the ack is before
1511 * PMTU blackhole detection, then revert the size of the max
1512 * segment to previous size.
1513 */
1514 if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1515 tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1516 if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1517 && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1518 tcp_pmtud_revert_segment_size(tp);
1519 }
1520 }
1521 if (tp->t_pmtud_start_ts > 0)
1522 tp->t_pmtud_start_ts = 0;
1523 }
1524
1525 /*
1526 * Check if early retransmit can be attempted according to RFC 5827.
1527 *
1528 * If packet reordering is detected on a connection, fast recovery will
1529 * be delayed until it is clear that the packet was lost and not reordered.
1530 * But reordering detection is done only when SACK is enabled.
1531 *
1532 * On connections that do not support SACK, there is a limit on the number
1533 * of early retransmits that can be done per minute. This limit is needed
1534 * to make sure that too many packets are not retransmitted when there is
1535 * packet reordering.
1536 */
1537 static void
1538 tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
1539 {
1540 u_int32_t obytes, snd_off;
1541 int32_t snd_len;
1542 struct socket *so = tp->t_inpcb->inp_socket;
1543
1544 if (early_rexmt && (SACK_ENABLED(tp) ||
1545 tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1546 SEQ_GT(tp->snd_max, tp->snd_una) &&
1547 (tp->t_dupacks == 1 ||
1548 (SACK_ENABLED(tp) &&
1549 !TAILQ_EMPTY(&tp->snd_holes)))) {
1550 /*
1551 * If there are only a few outstanding
1552 * segments on the connection, we might need
1553 * to lower the retransmit threshold. This
1554 * will allow us to do Early Retransmit as
1555 * described in RFC 5827.
1556 */
1557 if (SACK_ENABLED(tp) &&
1558 !TAILQ_EMPTY(&tp->snd_holes)) {
1559 obytes = (tp->snd_max - tp->snd_fack) +
1560 tp->sackhint.sack_bytes_rexmit;
1561 } else {
1562 obytes = (tp->snd_max - tp->snd_una);
1563 }
1564
1565 /*
1566 * In order to lower retransmit threshold the
1567 * following two conditions must be met.
1568 * 1. the amount of outstanding data is less
1569 * than 4*SMSS bytes
1570 * 2. there is no unsent data ready for
1571 * transmission or the advertised window
1572 * will limit sending new segments.
1573 */
1574 snd_off = tp->snd_max - tp->snd_una;
1575 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1576 if (obytes < (tp->t_maxseg << 2) &&
1577 snd_len <= 0) {
1578 u_int32_t osegs;
1579
1580 osegs = obytes / tp->t_maxseg;
1581 if ((osegs * tp->t_maxseg) < obytes)
1582 osegs++;
1583
1584 /*
1585 * Since the connection might have already
1586 * received some dupacks, we add them
1587 * to the outstanding segments count to get
1588 * the correct retransmit threshold.
1589 *
1590 * By checking for early retransmit after
1591 * receiving some duplicate acks when SACK
1592 * is supported, the connection will
1593 * enter fast recovery even if multiple
1594 * segments are lost in the same window.
1595 */
1596 osegs += tp->t_dupacks;
1597 if (osegs < 4) {
1598 tp->t_rexmtthresh =
1599 ((osegs - 1) > 1) ? (osegs - 1) : 1;
1600 tp->t_rexmtthresh =
1601 min(tp->t_rexmtthresh, tcprexmtthresh);
1602 tp->t_rexmtthresh =
1603 max(tp->t_rexmtthresh, tp->t_dupacks);
1604
1605 if (tp->t_early_rexmt_count == 0)
1606 tp->t_early_rexmt_win = tcp_now;
1607
1608 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1609 tcpstat.tcps_tlp_recovery++;
1610 tcp_ccdbg_trace(tp, th,
1611 TCP_CC_TLP_RECOVERY);
1612 } else {
1613 tcpstat.tcps_early_rexmt++;
1614 tp->t_early_rexmt_count++;
1615 tcp_ccdbg_trace(tp, th,
1616 TCP_CC_EARLY_RETRANSMIT);
1617 }
1618 }
1619 }
1620 }
1621
1622 /*
1623 * If we ever sent a TLP probe, the acknowledgement will trigger
1624 * early retransmit because the value of snd_fack will be close
1625 * to snd_max. This will take care of adjustments to the
1626 * congestion window. So we can reset the TF_SENT_TLPROBE flag.
1627 */
1628 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1629 tp->t_tlphighrxt = 0;
1630 tp->t_tlpstart = 0;
1631 }
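/*
 * Worked example (illustrative, hypothetical values): with two
 * unacknowledged segments in flight, no unsent data and one duplicate
 * ACK already received, osegs = 2 + 1 = 3 < 4, so t_rexmtthresh becomes
 * max(min(osegs - 1, tcprexmtthresh), t_dupacks) = 2. Fast recovery can
 * then start after two duplicate ACKs instead of the usual three, as
 * RFC 5827 intends for small flights.
 */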
1632
1633 static boolean_t
1634 tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to)
1635 {
1636 u_char out[CCAES_BLOCK_SIZE];
1637 unsigned char len;
1638
1639 if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
1640 !(tcp_fastopen & TCP_FASTOPEN_SERVER))
1641 return (FALSE);
1642
1643 if ((to->to_flags & TOF_TFOREQ)) {
1644 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1645
1646 tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
1647 tcpstat.tcps_tfo_cookie_req_rcv++;
1648 return (FALSE);
1649 }
1650
1651 /* Ok, then it must be an offered cookie. We need to check that ... */
1652 tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
1653
1654 len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1655 to->to_tfo++;
1656 if (memcmp(out, to->to_tfo, len)) {
1657 /* Cookies are different! Let's return and offer a new cookie */
1658 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1659
1660 tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
1661 tcpstat.tcps_tfo_cookie_invalid++;
1662 return (FALSE);
1663 }
1664
1665 if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
1666 /* Need to decrement again as we just increased it... */
1667 OSDecrementAtomic(&tcp_tfo_halfcnt);
1668 return (FALSE);
1669 }
1670
1671 tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
1672
1673 tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
1674 tcpstat.tcps_tfo_syn_data_rcv++;
1675
1676 return (TRUE);
1677 }
1678
1679 static void
1680 tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to)
1681 {
1682 if (to->to_flags & TOF_TFO) {
1683 unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1684
1685 /*
1686 * If this happens, things have gone terribly wrong. len should
1687 * have been checked in tcp_dooptions.
1688 */
1689 VERIFY(len <= TFO_COOKIE_LEN_MAX);
1690
1691 to->to_tfo++;
1692
1693 tcp_cache_set_cookie(tp, to->to_tfo, len);
1694 tcp_heuristic_tfo_success(tp);
1695
1696 tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
1697 tcpstat.tcps_tfo_cookie_rcv++;
1698 if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) {
1699 tcpstat.tcps_tfo_cookie_wrong++;
1700 tp->t_tfo_stats |= TFO_S_COOKIE_WRONG;
1701 }
1702 } else {
1703 /*
1704 * Thus, no cookie in the response, but we either asked for one
1705 * or sent SYN+DATA. Now, we need to check whether we had to
1706 * rexmit the SYN. If that's the case, it's better to start
1707 * backing off TFO-cookie requests.
1708 */
1709 if (tp->t_tfo_flags & TFO_F_SYN_LOSS) {
1710 tp->t_tfo_stats |= TFO_S_SYN_LOSS;
1711 tcpstat.tcps_tfo_syn_loss++;
1712
1713 tcp_heuristic_tfo_loss(tp);
1714 } else {
1715 if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) {
1716 tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV;
1717 tcpstat.tcps_tfo_no_cookie_rcv++;
1718 }
1719
1720 tcp_heuristic_tfo_success(tp);
1721 }
1722 }
1723 }
1724
1725 static void
1726 tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
1727 {
1728 if (tlen == 0) {
1729 tp->t_tfo_probe_state = TFO_PROBE_PROBING;
1730
1731 /*
1732 * We send the probe out rather quickly (after one RTO). It does not
1733 * really hurt that much; it's only one additional segment on the wire.
1734 */
1735 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
1736 } else {
1737 /* If SYN/ACK+data, don't probe. We got the data! */
1738 tcp_heuristic_tfo_rcv_good(tp);
1739 }
1740 }
1741
1742 static void
1743 tcp_tfo_rcv_data(struct tcpcb *tp)
1744 {
1745 /* Transition from PROBING to NONE as data has been received */
1746 if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
1747 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1748
1749 /* Data has been received - we are good to go! */
1750 tcp_heuristic_tfo_rcv_good(tp);
1751 }
1752 }
1753
1754 static void
1755 tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
1756 {
1757 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
1758 tp->t_tfo_probes > 0) {
1759 if (th->th_seq == tp->rcv_nxt) {
1760 /* No hole, so stop probing */
1761 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1762 } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1763 /* There is a hole! Wait a bit for data... */
1764 tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
1765 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1766 TCP_REXMTVAL(tp));
1767 }
1768 }
1769 }
1770
1771 /*
1772 * Update snd_wnd information.
1773 */
1774 static inline bool
1775 tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
1776 u_int32_t tiwin, int tlen)
1777 {
1778 /* Don't look at the window if there is no ACK flag */
1779 if ((thflags & TH_ACK) &&
1780 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1781 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1782 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1783 /* keep track of pure window updates */
1784 if (tlen == 0 &&
1785 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1786 tcpstat.tcps_rcvwinupd++;
1787 tp->snd_wnd = tiwin;
1788 tp->snd_wl1 = th->th_seq;
1789 tp->snd_wl2 = th->th_ack;
1790 if (tp->snd_wnd > tp->max_sndwnd)
1791 tp->max_sndwnd = tp->snd_wnd;
1792 return (true);
1793 }
1794 return (false);
1795 }
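/*
 * The test above is the classic BSD form of the RFC 793 window-update rule:
 * accept SEG.WND only from a segment that is not older than the one that
 * last updated the window, i.e. SND.WL1 < SEG.SEQ, or SND.WL1 == SEG.SEQ
 * with SND.WL2 < SEG.ACK, or the same (WL1, WL2) pair carrying a larger
 * window.  A rough sketch of the predicate, using the field names as
 * shorthand for the tp/th members referenced above:
 *
 *	accept = SEQ_LT(snd_wl1, seq) ||
 *	    (snd_wl1 == seq && (SEQ_LT(snd_wl2, ack) ||
 *	    (snd_wl2 == ack && win > snd_wnd)));
 *
 * This keeps a reordered, stale segment from rewinding snd_wnd.
 */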
1796
1797 void
1798 tcp_input(struct mbuf *m, int off0)
1799 {
1800 struct tcphdr *th;
1801 struct ip *ip = NULL;
1802 struct inpcb *inp;
1803 u_char *optp = NULL;
1804 int optlen = 0;
1805 int tlen, off;
1806 int drop_hdrlen;
1807 struct tcpcb *tp = 0;
1808 int thflags;
1809 struct socket *so = 0;
1810 int todrop, acked, ourfinisacked, needoutput = 0;
1811 struct in_addr laddr;
1812 #if INET6
1813 struct in6_addr laddr6;
1814 #endif
1815 int dropsocket = 0;
1816 int iss = 0, nosock = 0;
1817 u_int32_t tiwin, sack_bytes_acked = 0;
1818 struct tcpopt to; /* options in this segment */
1819 #if TCPDEBUG
1820 short ostate = 0;
1821 #endif
1822 #if IPFIREWALL
1823 struct sockaddr_in *next_hop = NULL;
1824 struct m_tag *fwd_tag;
1825 #endif /* IPFIREWALL */
1826 u_char ip_ecn = IPTOS_ECN_NOTECT;
1827 unsigned int ifscope;
1828 uint8_t isconnected, isdisconnected;
1829 struct ifnet *ifp = m->m_pkthdr.rcvif;
1830 int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1831 int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1832 int turnoff_lro = 0, win;
1833 #if MPTCP
1834 struct mptcb *mp_tp = NULL;
1835 #endif /* MPTCP */
1836 boolean_t cell = IFNET_IS_CELLULAR(ifp);
1837 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1838 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1839 boolean_t recvd_dsack = FALSE;
1840 struct tcp_respond_args tra;
1841
1842 #define TCP_INC_VAR(stat, npkts) do { \
1843 stat += npkts; \
1844 } while (0)
1845
1846 TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
1847 #if IPFIREWALL
1848 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
1849 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1850 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1851 KERNEL_TAG_TYPE_IPFORWARD, NULL);
1852 } else {
1853 fwd_tag = NULL;
1854 }
1855 if (fwd_tag != NULL) {
1856 struct ip_fwd_tag *ipfwd_tag =
1857 (struct ip_fwd_tag *)(fwd_tag+1);
1858
1859 next_hop = ipfwd_tag->next_hop;
1860 m_tag_delete(m, fwd_tag);
1861 }
1862 #endif /* IPFIREWALL */
1863
1864 #if INET6
1865 struct ip6_hdr *ip6 = NULL;
1866 int isipv6;
1867 #endif /* INET6 */
1868 int rstreason; /* For badport_bandlim accounting purposes */
1869 struct proc *proc0=current_proc();
1870
1871 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1872
1873 #if INET6
1874 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1875 #endif
1876 bzero((char *)&to, sizeof(to));
1877
1878 #if INET6
1879 if (isipv6) {
1880 /*
1881 * Expect 32-bit aligned data pointer on
1882 * strict-align platforms
1883 */
1884 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1885
1886 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1887 ip6 = mtod(m, struct ip6_hdr *);
1888 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1889 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1890
1891 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1892 goto dropnosock;
1893
1894 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1895 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1896 th->th_seq, th->th_ack, th->th_win);
1897 /*
1898 * Be proactive about an unspecified IPv6 address in the source.
1899 * As we use all-zero to indicate an unbound/unconnected pcb,
1900 * an unspecified IPv6 address can be used to confuse us.
1901 *
1902 * Note that packets with an unspecified IPv6 destination are
1903 * already dropped in ip6_input.
1904 */
1905 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1906 /* XXX stat */
1907 IF_TCP_STATINC(ifp, unspecv6);
1908 goto dropnosock;
1909 }
1910 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1911 struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1912 struct tcphdr *, th);
1913
1914 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
1915 } else
1916 #endif /* INET6 */
1917 {
1918 /*
1919 * Get IP and TCP header together in first mbuf.
1920 * Note: IP leaves IP header in first mbuf.
1921 */
1922 if (off0 > sizeof (struct ip)) {
1923 ip_stripoptions(m, (struct mbuf *)0);
1924 off0 = sizeof(struct ip);
1925 }
1926 if (m->m_len < sizeof (struct tcpiphdr)) {
1927 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1928 tcpstat.tcps_rcvshort++;
1929 return;
1930 }
1931 }
1932
1933 /* Expect 32-bit aligned data pointer on strict-align platforms */
1934 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1935
1936 ip = mtod(m, struct ip *);
1937 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1938 tlen = ip->ip_len;
1939
1940 if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
1941 goto dropnosock;
1942
1943 #if INET6
1944 /* Re-initialization for later version check */
1945 ip->ip_v = IPVERSION;
1946 #endif
1947 ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
1948
1949 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1950 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
1951
1952 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1953 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1954 th->th_seq, th->th_ack, th->th_win);
1955
1956 }
1957
1958 /*
1959 * Check that TCP offset makes sense,
1960 * pull out TCP options and adjust length.
1961 */
1962 off = th->th_off << 2;
1963 if (off < sizeof (struct tcphdr) || off > tlen) {
1964 tcpstat.tcps_rcvbadoff++;
1965 IF_TCP_STATINC(ifp, badformat);
1966 goto dropnosock;
1967 }
1968 tlen -= off; /* tlen is used instead of ti->ti_len */
1969 if (off > sizeof (struct tcphdr)) {
1970 #if INET6
1971 if (isipv6) {
1972 IP6_EXTHDR_CHECK(m, off0, off, return);
1973 ip6 = mtod(m, struct ip6_hdr *);
1974 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1975 } else
1976 #endif /* INET6 */
1977 {
1978 if (m->m_len < sizeof(struct ip) + off) {
1979 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1980 tcpstat.tcps_rcvshort++;
1981 return;
1982 }
1983 ip = mtod(m, struct ip *);
1984 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1985 }
1986 }
1987 optlen = off - sizeof (struct tcphdr);
1988 optp = (u_char *)(th + 1);
1989 /*
1990 * Do quick retrieval of timestamp options ("options
1991 * prediction?"). If timestamp is the only option and it's
1992 * formatted as recommended in RFC 1323 appendix A, we
1993 * quickly get the values now and not bother calling
1994 * tcp_dooptions(), etc.
1995 */
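/*
 * For reference, the layout recommended in RFC 1323 appendix A pads the
 * timestamp option with two NOPs so that the two 32-bit timestamps stay
 * aligned, giving the 12-byte pattern matched below with a single 32-bit
 * compare against TCPOPT_TSTAMP_HDR (assuming no other options precede it):
 *
 *	+--------+--------+--------+--------+
 *	|  NOP   |  NOP   | Kind=8 | Len=10 |
 *	+--------+--------+--------+--------+
 *	|       Timestamp value (TSval)     |
 *	+-----------------------------------+
 *	|     Timestamp echo reply (TSecr)  |
 *	+-----------------------------------+
 */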
1996 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1997 (optlen > TCPOLEN_TSTAMP_APPA &&
1998 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1999 *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
2000 (th->th_flags & TH_SYN) == 0) {
2001 to.to_flags |= TOF_TS;
2002 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
2003 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
2004 optp = NULL; /* we've parsed the options */
2005 }
2006 }
2007 thflags = th->th_flags;
2008
2009 #if TCP_DROP_SYNFIN
2010 /*
2011 * If the drop_synfin option is enabled, drop all packets with
2012 * both the SYN and FIN bits set. This prevents e.g. nmap from
2013 * identifying the TCP/IP stack.
2014 *
2015 * This is a violation of the TCP specification.
2016 */
2017 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
2018 IF_TCP_STATINC(ifp, synfin);
2019 goto dropnosock;
2020 }
2021 #endif
2022
2023 /*
2024 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
2025 * until after ip6_savecontrol() is called and before other functions
2026 * which don't want those proto headers.
2027 * Because ip6_savecontrol() is going to parse the mbuf to
2028 * search for data to be passed up to user-land, it wants mbuf
2029 * parameters to be unchanged.
2030 */
2031 drop_hdrlen = off0 + off;
2032
2033 /* Since this is an entry point for input processing of tcp packets, we
2034 * can update the tcp clock here.
2035 */
2036 calculate_tcp_clock();
2037
2038 /*
2039 * Record the interface on which this segment arrived; this does not
2040 * affect normal data output (for non-detached TCP), but provides a
2041 * hint about which route and interface to use for sending in the
2042 * absence of a PCB, when scoped routing (and thus source interface
2043 * selection) are enabled.
2044 */
2045 if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
2046 ifscope = IFSCOPE_NONE;
2047 else
2048 ifscope = m->m_pkthdr.rcvif->if_index;
2049
2050 /*
2051 * Convert TCP protocol specific fields to host format.
2052 */
2053
2054 #if BYTE_ORDER != BIG_ENDIAN
2055 NTOHL(th->th_seq);
2056 NTOHL(th->th_ack);
2057 NTOHS(th->th_win);
2058 NTOHS(th->th_urp);
2059 #endif
2060
2061 /*
2062 * Locate pcb for segment.
2063 */
2064 findpcb:
2065
2066 isconnected = FALSE;
2067 isdisconnected = FALSE;
2068
2069 #if IPFIREWALL_FORWARD
2070 if (next_hop != NULL
2071 #if INET6
2072 && isipv6 == 0 /* IPv6 support is not yet */
2073 #endif /* INET6 */
2074 ) {
2075 /*
2076 * Diverted. Pretend to be the destination.
2077 * already got one like this?
2078 */
2079 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2080 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
2081 if (!inp) {
2082 /*
2083 * No, then it's new. Try to find the ambushing socket.
2084 */
2085 if (!next_hop->sin_port) {
2086 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
2087 th->th_sport, next_hop->sin_addr,
2088 th->th_dport, 1, m->m_pkthdr.rcvif);
2089 } else {
2090 inp = in_pcblookup_hash(&tcbinfo,
2091 ip->ip_src, th->th_sport,
2092 next_hop->sin_addr,
2093 ntohs(next_hop->sin_port), 1,
2094 m->m_pkthdr.rcvif);
2095 }
2096 }
2097 } else
2098 #endif /* IPFIREWALL_FORWARD */
2099 {
2100 #if INET6
2101 if (isipv6)
2102 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
2103 &ip6->ip6_dst, th->th_dport, 1,
2104 m->m_pkthdr.rcvif);
2105 else
2106 #endif /* INET6 */
2107 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2108 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
2109 }
2110
2111 /*
2112 * Use the interface scope information from the PCB for outbound
2113 * segments. If the PCB isn't present and if scoped routing is
2114 * enabled, tcp_respond will use the scope of the interface where
2115 * the segment arrived on.
2116 */
2117 if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
2118 ifscope = inp->inp_boundifp->if_index;
2119
2120 /*
2121 * If the state is CLOSED (i.e., TCB does not exist) then
2122 * all data in the incoming segment is discarded.
2123 * If the TCB exists but is in CLOSED state, it is embryonic,
2124 * but should either do a listen or a connect soon.
2125 */
2126 if (inp == NULL) {
2127 if (log_in_vain) {
2128 #if INET6
2129 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
2130 #else /* INET6 */
2131 char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
2132 #endif /* INET6 */
2133
2134 #if INET6
2135 if (isipv6) {
2136 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
2137 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
2138 } else
2139 #endif
2140 {
2141 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
2142 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
2143 }
2144 switch (log_in_vain) {
2145 case 1:
2146 if(thflags & TH_SYN)
2147 log(LOG_INFO,
2148 "Connection attempt to TCP %s:%d from %s:%d\n",
2149 dbuf, ntohs(th->th_dport),
2150 sbuf,
2151 ntohs(th->th_sport));
2152 break;
2153 case 2:
2154 log(LOG_INFO,
2155 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
2156 dbuf, ntohs(th->th_dport), sbuf,
2157 ntohs(th->th_sport), thflags);
2158 break;
2159 case 3:
2160 case 4:
2161 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
2162 !(m->m_flags & (M_BCAST | M_MCAST)) &&
2163 #if INET6
2164 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
2165 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
2166 #else
2167 ip->ip_dst.s_addr != ip->ip_src.s_addr
2168 #endif
2169 )
2170 log_in_vain_log((LOG_INFO,
2171 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
2172 dbuf, ntohs(th->th_dport),
2173 sbuf,
2174 ntohs(th->th_sport)));
2175 break;
2176 default:
2177 break;
2178 }
2179 }
2180 if (blackhole) {
2181 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
2182
2183 switch (blackhole) {
2184 case 1:
2185 if (thflags & TH_SYN)
2186 goto dropnosock;
2187 break;
2188 case 2:
2189 goto dropnosock;
2190 default:
2191 goto dropnosock;
2192 }
2193 }
2194 rstreason = BANDLIM_RST_CLOSEDPORT;
2195 IF_TCP_STATINC(ifp, noconnnolist);
2196 goto dropwithresetnosock;
2197 }
2198 so = inp->inp_socket;
2199 if (so == NULL) {
2200 /* This case shouldn't happen as the socket shouldn't be null
2201 * if inp_state isn't set to INPCB_STATE_DEAD.
2202 * But just in case, we pretend we didn't find the socket if we hit this case,
2203 * as this isn't cause for a panic (the socket might be leaked, however)...
2204 */
2205 inp = NULL;
2206 #if TEMPDEBUG
2207 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
2208 #endif
2209 goto dropnosock;
2210 }
2211
2212 tcp_lock(so, 1, 0);
2213 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2214 tcp_unlock(so, 1, (void *)2);
2215 inp = NULL; // pretend we didn't find it
2216 goto dropnosock;
2217 }
2218
2219 #if NECP
2220 #if INET6
2221 if (isipv6) {
2222 if (!necp_socket_is_allowed_to_send_recv_v6(inp,
2223 th->th_dport, th->th_sport, &ip6->ip6_dst,
2224 &ip6->ip6_src, ifp, NULL, NULL)) {
2225 IF_TCP_STATINC(ifp, badformatipsec);
2226 goto drop;
2227 }
2228 } else
2229 #endif
2230 {
2231 if (!necp_socket_is_allowed_to_send_recv_v4(inp,
2232 th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
2233 ifp, NULL, NULL)) {
2234 IF_TCP_STATINC(ifp, badformatipsec);
2235 goto drop;
2236 }
2237 }
2238 #endif /* NECP */
2239
2240 tp = intotcpcb(inp);
2241 if (tp == 0) {
2242 rstreason = BANDLIM_RST_CLOSEDPORT;
2243 IF_TCP_STATINC(ifp, noconnlist);
2244 goto dropwithreset;
2245 }
2246 if (tp->t_state == TCPS_CLOSED)
2247 goto drop;
2248
2249 /* Unscale the window into a 32-bit value. */
2250 if ((thflags & TH_SYN) == 0)
2251 tiwin = th->th_win << tp->snd_scale;
2252 else
2253 tiwin = th->th_win;
2254
2255 #if CONFIG_MACF_NET
2256 if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
2257 goto drop;
2258 #endif
2259
2260 /* Avoid processing packets while closing a listen socket */
2261 if (tp->t_state == TCPS_LISTEN &&
2262 (so->so_options & SO_ACCEPTCONN) == 0)
2263 goto drop;
2264
2265 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
2266 #if TCPDEBUG
2267 if (so->so_options & SO_DEBUG) {
2268 ostate = tp->t_state;
2269 #if INET6
2270 if (isipv6)
2271 bcopy((char *)ip6, (char *)tcp_saveipgen,
2272 sizeof(*ip6));
2273 else
2274 #endif /* INET6 */
2275 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
2276 tcp_savetcp = *th;
2277 }
2278 #endif
2279 if (so->so_options & SO_ACCEPTCONN) {
2280 struct tcpcb *tp0 = tp;
2281 struct socket *so2;
2282 struct socket *oso;
2283 struct sockaddr_storage from;
2284 #if INET6
2285 struct inpcb *oinp = sotoinpcb(so);
2286 #endif /* INET6 */
2287 struct ifnet *head_ifscope;
2288 unsigned int head_nocell, head_recvanyif,
2289 head_noexpensive, head_awdl_unrestricted,
2290 head_intcoproc_allowed;
2291
2292 /* Get listener's bound-to-interface, if any */
2293 head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2294 inp->inp_boundifp : NULL;
2295 /* Get listener's no-cellular information, if any */
2296 head_nocell = INP_NO_CELLULAR(inp);
2297 /* Get listener's recv-any-interface, if any */
2298 head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
2299 /* Get listener's no-expensive information, if any */
2300 head_noexpensive = INP_NO_EXPENSIVE(inp);
2301 head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
2302 head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
2303
2304 /*
2305 * If the state is LISTEN then ignore segment if it contains an RST.
2306 * If the segment contains an ACK then it is bad and send a RST.
2307 * If it does not contain a SYN then it is not interesting; drop it.
2308 * If it is from this socket, drop it, it must be forged.
2309 */
2310 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
2311 IF_TCP_STATINC(ifp, listbadsyn);
2312
2313 if (thflags & TH_RST) {
2314 goto drop;
2315 }
2316 if (thflags & TH_ACK) {
2317 tp = NULL;
2318 tcpstat.tcps_badsyn++;
2319 rstreason = BANDLIM_RST_OPENPORT;
2320 goto dropwithreset;
2321 }
2322
2323 /* We come here if there is no SYN set */
2324 tcpstat.tcps_badsyn++;
2325 goto drop;
2326 }
2327 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
2328 if (th->th_dport == th->th_sport) {
2329 #if INET6
2330 if (isipv6) {
2331 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
2332 &ip6->ip6_src))
2333 goto drop;
2334 } else
2335 #endif /* INET6 */
2336 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
2337 goto drop;
2338 }
2339 /*
2340 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2341 * in_broadcast() should never return true on a received
2342 * packet with M_BCAST not set.
2343 *
2344 * Packets with a multicast source address should also
2345 * be discarded.
2346 */
2347 if (m->m_flags & (M_BCAST|M_MCAST))
2348 goto drop;
2349 #if INET6
2350 if (isipv6) {
2351 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2352 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2353 goto drop;
2354 } else
2355 #endif
2356 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2357 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2358 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2359 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2360 goto drop;
2361
2362
2363 #if INET6
2364 /*
2365 * If use of deprecated addresses is forbidden, we do not
2366 * accept a SYN sent to a deprecated interface address, to
2367 * prevent any new inbound connection from getting
2368 * established.
2369 * When we do not accept the SYN, we send a TCP RST with the
2370 * deprecated source address (instead of dropping the
2371 * segment). We accept this compromise because it is much
2372 * better for the peer to get a RST, and that RST will be
2373 * the final packet of the exchange.
2374 *
2375 * If we do not forbid deprecated addresses, we accept
2376 * the SYN packet. RFC 4862 forbids dropping SYN in
2377 * this case.
2378 */
2379 if (isipv6 && !ip6_use_deprecated) {
2380 uint32_t ia6_flags;
2381
2382 if (ip6_getdstifaddr_info(m, NULL,
2383 &ia6_flags) == 0) {
2384 if (ia6_flags & IN6_IFF_DEPRECATED) {
2385 tp = NULL;
2386 rstreason = BANDLIM_RST_OPENPORT;
2387 IF_TCP_STATINC(ifp, deprecate6);
2388 goto dropwithreset;
2389 }
2390 }
2391 }
2392 #endif
2393 if (so->so_filt) {
2394 #if INET6
2395 if (isipv6) {
2396 struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;
2397
2398 sin6->sin6_len = sizeof(*sin6);
2399 sin6->sin6_family = AF_INET6;
2400 sin6->sin6_port = th->th_sport;
2401 sin6->sin6_flowinfo = 0;
2402 sin6->sin6_addr = ip6->ip6_src;
2403 sin6->sin6_scope_id = 0;
2404 }
2405 else
2406 #endif
2407 {
2408 struct sockaddr_in *sin = (struct sockaddr_in*)&from;
2409
2410 sin->sin_len = sizeof(*sin);
2411 sin->sin_family = AF_INET;
2412 sin->sin_port = th->th_sport;
2413 sin->sin_addr = ip->ip_src;
2414 }
2415 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2416 } else {
2417 so2 = sonewconn(so, 0, NULL);
2418 }
2419 if (so2 == 0) {
2420 tcpstat.tcps_listendrop++;
2421 if (tcp_dropdropablreq(so)) {
2422 if (so->so_filt)
2423 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2424 else
2425 so2 = sonewconn(so, 0, NULL);
2426 }
2427 if (!so2)
2428 goto drop;
2429 }
2430
2431 /* Point "inp" and "tp" in tandem to new socket */
2432 inp = (struct inpcb *)so2->so_pcb;
2433 tp = intotcpcb(inp);
2434
2435 oso = so;
2436 tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
2437
2438 so = so2;
2439 tcp_lock(so, 1, 0);
2440 /*
2441 * Mark socket as temporary until we're
2442 * committed to keeping it. The code at
2443 * ``drop'' and ``dropwithreset'' checks the
2444 * flag dropsocket to see if the temporary
2445 * socket created here should be discarded.
2446 * We mark the socket as discardable until
2447 * we're committed to it below in TCPS_LISTEN.
2448 * There are some error conditions in which we
2449 * have to drop the temporary socket.
2450 */
2451 dropsocket++;
2452 /*
2453 * Inherit INP_BOUND_IF from listener; testing if
2454 * head_ifscope is non-NULL is sufficient, since it
2455 * can only be set to a non-zero value earlier if
2456 * the listener has such a flag set.
2457 */
2458 if (head_ifscope != NULL) {
2459 inp->inp_flags |= INP_BOUND_IF;
2460 inp->inp_boundifp = head_ifscope;
2461 } else {
2462 inp->inp_flags &= ~INP_BOUND_IF;
2463 }
2464 /*
2465 * Inherit restrictions from listener.
2466 */
2467 if (head_nocell)
2468 inp_set_nocellular(inp);
2469 if (head_noexpensive)
2470 inp_set_noexpensive(inp);
2471 if (head_awdl_unrestricted)
2472 inp_set_awdl_unrestricted(inp);
2473 if (head_intcoproc_allowed)
2474 inp_set_intcoproc_allowed(inp);
2475 /*
2476 * Inherit {IN,IN6}_RECV_ANYIF from listener.
2477 */
2478 if (head_recvanyif)
2479 inp->inp_flags |= INP_RECV_ANYIF;
2480 else
2481 inp->inp_flags &= ~INP_RECV_ANYIF;
2482 #if INET6
2483 if (isipv6)
2484 inp->in6p_laddr = ip6->ip6_dst;
2485 else {
2486 inp->inp_vflag &= ~INP_IPV6;
2487 inp->inp_vflag |= INP_IPV4;
2488 #endif /* INET6 */
2489 inp->inp_laddr = ip->ip_dst;
2490 #if INET6
2491 }
2492 #endif /* INET6 */
2493 inp->inp_lport = th->th_dport;
2494 if (in_pcbinshash(inp, 0) != 0) {
2495 /*
2496 * Undo the assignments above if we failed to
2497 * put the PCB on the hash lists.
2498 */
2499 #if INET6
2500 if (isipv6)
2501 inp->in6p_laddr = in6addr_any;
2502 else
2503 #endif /* INET6 */
2504 inp->inp_laddr.s_addr = INADDR_ANY;
2505 inp->inp_lport = 0;
2506 tcp_lock(oso, 0, 0); /* release ref on parent */
2507 tcp_unlock(oso, 1, 0);
2508 goto drop;
2509 }
2510 #if INET6
2511 if (isipv6) {
2512 /*
2513 * Inherit socket options from the listening
2514 * socket.
2515 * Note that in6p_inputopts is not (and even
2516 * should not be) copied, since it stores
2517 * previously received options and is used to
2518 * detect if each new option is different from
2519 * the previous one and hence should be passed
2520 * to a user.
2521 * If we copied in6p_inputopts, a user would
2522 * not be able to receive options just after
2523 * calling the accept system call.
2524 */
2525 inp->inp_flags |=
2526 oinp->inp_flags & INP_CONTROLOPTS;
2527 if (oinp->in6p_outputopts)
2528 inp->in6p_outputopts =
2529 ip6_copypktopts(oinp->in6p_outputopts,
2530 M_NOWAIT);
2531 } else
2532 #endif /* INET6 */
2533 {
2534 inp->inp_options = ip_srcroute();
2535 inp->inp_ip_tos = oinp->inp_ip_tos;
2536 }
2537 tcp_lock(oso, 0, 0);
2538 #if IPSEC
2539 /* copy old policy into new socket's */
2540 if (sotoinpcb(oso)->inp_sp)
2541 {
2542 int error = 0;
2543 /* Is it a security hole here to silently fail to copy the policy? */
2544 if (inp->inp_sp != NULL)
2545 error = ipsec_init_policy(so, &inp->inp_sp);
2546 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2547 printf("tcp_input: could not copy policy\n");
2548 }
2549 #endif
2550 /* inherit states from the listener */
2551 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2552 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2553 tp->t_state = TCPS_LISTEN;
2554 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
2555 tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT|TF_FASTOPEN));
2556 tp->t_keepinit = tp0->t_keepinit;
2557 tp->t_keepcnt = tp0->t_keepcnt;
2558 tp->t_keepintvl = tp0->t_keepintvl;
2559 tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2560 tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2561 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2562 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2563 tp->t_notsent_lowat = tp0->t_notsent_lowat;
2564 tp->t_inpcb->inp_flags2 |=
2565 tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD;
2566
2567 /* now drop the reference on the listener */
2568 tcp_unlock(oso, 1, 0);
2569
2570 tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(ifp));
2571
2572 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2573 }
2574 }
2575 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2576 LCK_MTX_ASSERT_OWNED);
2577
2578 if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2579 /*
2580 * Evaluate the rate of arrival of packets to see if the
2581 * receiver can reduce the ack traffic. The algorithm to
2582 * stretch acks will be enabled if the connection meets
2583 * certain criteria defined in the tcp_stretch_ack_enable() function.
2584 */
2585 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2586 TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
2587 }
2588 if (tcp_stretch_ack_enable(tp, thflags)) {
2589 tp->t_flags |= TF_STRETCHACK;
2590 tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2591 tp->rcv_waitforss = 0;
2592 } else {
2593 tp->t_flags &= ~(TF_STRETCHACK);
2594 }
2595 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2596 tp->rcv_by_unackwin += (tlen + off);
2597 } else {
2598 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2599 tp->rcv_by_unackwin = tlen + off;
2600 }
2601 }
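/*
 * rcv_unackwin / rcv_by_unackwin above act as a coarse arrival-rate estimate
 * for tcp_stretch_ack_enable(): bytes received within the current
 * tcp_rcvunackwin interval are summed, and only a sufficiently busy flow is
 * allowed to stretch its acks (one ack for several full-sized segments) to
 * reduce ack traffic.
 */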
2602
2603 /*
2604 * Keep track of how many bytes were received in the LRO packet
2605 */
2606 if ((pktf_sw_lro_pkt) && (nlropkts > 2)) {
2607 tp->t_lropktlen += tlen;
2608 }
2609 /*
2610 * Explicit Congestion Notification - Flag that we need to send ECE if
2611 * + The IP Congestion experienced flag was set.
2612 * + Socket is in established state
2613 * + We negotiated ECN in the TCP setup
2614 * + This isn't a pure ack (tlen > 0)
2615 * + The data is in the valid window
2616 *
2617 * TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2618 */
2619 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2620 TCP_ECN_ENABLED(tp) && tlen > 0 &&
2621 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2622 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2623 tp->t_ecn_recv_ce++;
2624 tcpstat.tcps_ecn_recv_ce++;
2625 INP_INC_IFNET_STAT(inp, ecn_recv_ce);
2626 /* Mark this connection as it received CE from network */
2627 tp->ecn_flags |= TE_RECV_ECN_CE;
2628 tp->ecn_flags |= TE_SENDECE;
2629 }
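/*
 * This is the receiver half of the RFC 3168 echo handshake: once CE is seen
 * on in-window data, ECE is echoed on every ack (TE_SENDECE) until the
 * sender answers with CWR, which is handled just below.  Schematically:
 *
 *	data, CE marked by a congested router  ---->
 *	<----  ACK with ECE set
 *	data with CWR set (sender reduced cwnd)  ---->
 *	<----  ACK, ECE cleared
 */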
2630
2631 /*
2632 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2633 * bother doing extensive checks for state and whatnot.
2634 */
2635 if (thflags & TH_CWR) {
2636 tp->ecn_flags &= ~TE_SENDECE;
2637 tp->t_ecn_recv_cwr++;
2638 }
2639
2640 /*
2641 * If we received an explicit notification of congestion in
2642 * the IP TOS ECN bits or via the CWR bit in the TCP header flags,
2643 * reset the ack-stretching state. We need to handle ECN
2644 * notifications as long as an ECN-setup SYN was sent at least once.
2645 */
2646 if (tp->t_state == TCPS_ESTABLISHED
2647 && (tp->ecn_flags & TE_SETUPSENT)
2648 && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) {
2649 tcp_reset_stretch_ack(tp);
2650 CLEAR_IAJ_STATE(tp);
2651 }
2652
2653 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2654 !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2655 tcpstat.tcps_ecn_fallback_ce++;
2656 tcp_heuristic_ecn_aggressive(tp);
2657 tp->ecn_flags |= TE_CEHEURI_SET;
2658 }
2659
2660 if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) &&
2661 ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2662 if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) {
2663 tp->t_ecn_recv_ce_pkt++;
2664 } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) {
2665 tcpstat.tcps_ecn_fallback_ce++;
2666 tcp_heuristic_ecn_aggressive(tp);
2667 tp->ecn_flags |= TE_CEHEURI_SET;
2668 INP_INC_IFNET_STAT(inp,ecn_fallback_ce);
2669 } else {
2670 /* We tracked the first ECN_MIN_CE_PROBES segments; we
2671 * now know that the path is good.
2672 */
2673 tp->ecn_flags |= TE_CEHEURI_SET;
2674 }
2675 }
2676
2677 /*
2678 * Try to determine if we are receiving a packet after a long time.
2679 * Use our own approximation of idle time to roughly measure the remote
2680 * end's idle time. Since slow start is used after an idle period,
2681 * we want to avoid doing LRO if the remote end is not up to date
2682 * on initial window support and starts with 1 or 2 packets as its IW.
2683 */
2684 if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2685 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2686 turnoff_lro = 1;
2687 }
2688
2689 /* Update rcvtime as a new segment was received on the connection */
2690 tp->t_rcvtime = tcp_now;
2691
2692 /*
2693 * Segment received on connection.
2694 * Reset idle time and keep-alive timer.
2695 */
2696 if (TCPS_HAVEESTABLISHED(tp->t_state))
2697 tcp_keepalive_reset(tp);
2698
2699 /*
2700 * Process options if not in LISTEN state,
2701 * else do it below (after getting remote address).
2702 */
2703 if (tp->t_state != TCPS_LISTEN && optp) {
2704 tcp_dooptions(tp, optp, optlen, th, &to);
2705 #if MPTCP
2706 if (mptcp_input_preproc(tp, m, drop_hdrlen) != 0) {
2707 tp->t_flags |= TF_ACKNOW;
2708 (void) tcp_output(tp);
2709 tcp_check_timer_state(tp);
2710 tcp_unlock(so, 1, 0);
2711 KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2712 DBG_FUNC_END,0,0,0,0,0);
2713 return;
2714 }
2715 #endif /* MPTCP */
2716 }
2717 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2718 if (!(thflags & TH_ACK) ||
2719 (SEQ_GT(th->th_ack, tp->iss) &&
2720 SEQ_LEQ(th->th_ack, tp->snd_max)))
2721 tcp_finalize_options(tp, &to, ifscope);
2722 }
2723
2724 #if TRAFFIC_MGT
2725 /*
2726 * Compute inter-packet arrival jitter. According to RFC 3550,
2727 * inter-packet arrival jitter is defined as the difference in
2728 * packet spacing at the receiver compared to the sender for a
2729 * pair of packets. When two packets of maximum segment size come
2730 * one after the other with consecutive sequence numbers, we
2731 * consider them as packets sent together at the sender and use
2732 * them as a pair to compute inter-packet arrival jitter. This
2733 * metric indicates the delay induced by the network components due
2734 * to queuing in edge/access routers.
2735 */
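/*
 * For comparison, RFC 3550 (section 6.4.1) computes interarrival jitter as
 * a running average of the spacing difference D between consecutive packet
 * pairs, where S is the send time and R the receive time:
 *
 *	D(i-1,i) = (R_i - R_{i-1}) - (S_i - S_{i-1})
 *	J(i)     = J(i-1) + (|D(i-1,i)| - J(i-1)) / 16
 *
 * compute_iaj() below keeps its own smoothed estimate along these lines;
 * the exact averaging may differ from the RTP formula above.
 */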
2736 if (tp->t_state == TCPS_ESTABLISHED &&
2737 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2738 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2739 ((to.to_flags & TOF_TS) == 0 ||
2740 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2741 th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
2742 int seg_size = tlen;
2743 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2744 TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
2745 }
2746
2747 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2748 seg_size = m->m_pkthdr.lro_pktlen;
2749 }
2750 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2751 (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2752 /*
2753 * State related to inter-arrival jitter is
2754 * uninitialized or we are trying to find a good
2755 * first packet to start computing the metric
2756 */
2757 update_iaj_state(tp, seg_size, 0);
2758 } else {
2759 if (seg_size == tp->iaj_size) {
2760 /*
2761 * Compute inter-arrival jitter taking
2762 * this packet as the second packet
2763 */
2764 if (pktf_sw_lro_pkt)
2765 compute_iaj(tp, nlropkts,
2766 m->m_pkthdr.lro_elapsed);
2767 else
2768 compute_iaj(tp, 1, 0);
2769 }
2770 if (seg_size < tp->iaj_size) {
2771 /*
2772 * There is a smaller packet in the stream.
2773 * Sometimes the maximum size supported
2774 * on a path can change if there is a new
2775 * link with smaller MTU. The receiver will
2776 * not know about this change. If there
2777 * are too many packets smaller than
2778 * iaj_size, we try to learn the iaj_size
2779 * again.
2780 */
2781 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
2782 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
2783 update_iaj_state(tp, seg_size, 1);
2784 } else {
2785 CLEAR_IAJ_STATE(tp);
2786 }
2787 } else {
2788 update_iaj_state(tp, seg_size, 0);
2789 }
2790 }
2791 } else {
2792 CLEAR_IAJ_STATE(tp);
2793 }
2794 #endif /* TRAFFIC_MGT */
2795
2796 /*
2797 * Header prediction: check for the two common cases
2798 * of a uni-directional data xfer. If the packet has
2799 * no control flags, is in-sequence, the window didn't
2800 * change and we're not retransmitting, it's a
2801 * candidate. If the length is zero and the ack moved
2802 * forward, we're the sender side of the xfer. Just
2803 * free the data acked & wake any higher level process
2804 * that was blocked waiting for space. If the length
2805 * is non-zero and the ack didn't move, we're the
2806 * receiver side. If we're getting packets in-order
2807 * (the reassembly queue is empty), add the data to
2808 * the socket buffer and note that we need a delayed ack.
2809 * Make sure that the hidden state-flags are also off.
2810 * Since we check for TCPS_ESTABLISHED above, it can only
2811 * be TH_NEEDSYN.
2812 */
2813 if (tp->t_state == TCPS_ESTABLISHED &&
2814 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR)) == TH_ACK &&
2815 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2816 ((to.to_flags & TOF_TS) == 0 ||
2817 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2818 th->th_seq == tp->rcv_nxt &&
2819 tiwin && tiwin == tp->snd_wnd &&
2820 tp->snd_nxt == tp->snd_max) {
2821
2822 /*
2823 * If last ACK falls within this segment's sequence numbers,
2824 * record the timestamp.
2825 * NOTE that the test is modified according to the latest
2826 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2827 */
2828 if ((to.to_flags & TOF_TS) != 0 &&
2829 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2830 tp->ts_recent_age = tcp_now;
2831 tp->ts_recent = to.to_tsval;
2832 }
2833
2834 if (tlen == 0) {
2835 if (SEQ_GT(th->th_ack, tp->snd_una) &&
2836 SEQ_LEQ(th->th_ack, tp->snd_max) &&
2837 tp->snd_cwnd >= tp->snd_ssthresh &&
2838 (!IN_FASTRECOVERY(tp) &&
2839 ((!(SACK_ENABLED(tp)) &&
2840 tp->t_dupacks < tp->t_rexmtthresh) ||
2841 (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2842 TAILQ_EMPTY(&tp->snd_holes))))) {
2843 /*
2844 * this is a pure ack for outstanding data.
2845 */
2846 ++tcpstat.tcps_predack;
2847
2848 tcp_bad_rexmt_check(tp, th, &to);
2849
2850 /* Recalculate the RTT */
2851 tcp_compute_rtt(tp, &to, th);
2852
2853 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
2854 acked = BYTES_ACKED(th, tp);
2855 tcpstat.tcps_rcvackpack++;
2856 tcpstat.tcps_rcvackbyte += acked;
2857
2858 /*
2859 * Handle an ack that is in sequence during
2860 * congestion avoidance phase. The
2861 * calculations in this function
2862 * assume that snd_una is not updated yet.
2863 */
2864 if (CC_ALGO(tp)->congestion_avd != NULL)
2865 CC_ALGO(tp)->congestion_avd(tp, th);
2866 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
2867 sbdrop(&so->so_snd, acked);
2868 if (so->so_flags & SOF_ENABLE_MSGS) {
2869 VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2870 so->so_msg_state->msg_serial_bytes -= acked;
2871 }
2872 tcp_sbsnd_trim(&so->so_snd);
2873
2874 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2875 SEQ_LEQ(th->th_ack, tp->snd_recover))
2876 tp->snd_recover = th->th_ack - 1;
2877 tp->snd_una = th->th_ack;
2878
2879 TCP_RESET_REXMT_STATE(tp);
2880
2881 /*
2882 * pull snd_wl2 up to prevent seq wrap relative
2883 * to th_ack.
2884 */
2885 tp->snd_wl2 = th->th_ack;
2886
2887 if (tp->t_dupacks > 0) {
2888 tp->t_dupacks = 0;
2889 tp->t_rexmtthresh = tcprexmtthresh;
2890 }
2891
2892 m_freem(m);
2893
2894 /*
2895 * If all outstanding data are acked, stop
2896 * retransmit timer, otherwise restart timer
2897 * using current (possibly backed-off) value.
2898 * If process is waiting for space,
2899 * wakeup/selwakeup/signal. If data
2900 * are ready to send, let tcp_output
2901 * decide between more output or persist.
2902 */
2903 if (tp->snd_una == tp->snd_max) {
2904 tp->t_timer[TCPT_REXMT] = 0;
2905 tp->t_timer[TCPT_PTO] = 0;
2906 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
2907 tp->t_timer[TCPT_REXMT] =
2908 OFFSET_FROM_START(tp,
2909 tp->t_rxtcur);
2910 }
2911 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
2912 !TCP_DSACK_SEQ_IN_WINDOW(tp,
2913 tp->t_dsack_lastuna, tp->snd_una))
2914 tcp_rxtseg_clean(tp);
2915
2916 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2917 tp->t_bwmeas != NULL)
2918 tcp_bwmeas_check(tp);
2919
2920 sowwakeup(so); /* has to be done with socket lock held */
2921 if (!SLIST_EMPTY(&tp->t_notify_ack))
2922 tcp_notify_acknowledgement(tp, so);
2923
2924 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
2925 (void) tcp_output(tp);
2926 }
2927
2928 tcp_tfo_rcv_ack(tp, th);
2929
2930 tcp_check_timer_state(tp);
2931 tcp_unlock(so, 1, 0);
2932 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2933 return;
2934 }
2935 } else if (th->th_ack == tp->snd_una &&
2936 LIST_EMPTY(&tp->t_segq) &&
2937 tlen <= tcp_sbspace(tp)) {
2938 /*
2939 * this is a pure, in-sequence data packet
2940 * with nothing on the reassembly queue and
2941 * we have enough buffer space to take it.
2942 */
2943
2944 /*
2945 * If this is a connection in steady state, start
2946 * coalescing packets belonging to this flow.
2947 */
2948 if (turnoff_lro) {
2949 tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
2950 tp->t_inpcb->inp_faddr,
2951 tp->t_inpcb->inp_lport,
2952 tp->t_inpcb->inp_fport);
2953 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2954 tp->t_idleat = tp->rcv_nxt;
2955 } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2956 (so->so_flags & SOF_USELRO) &&
2957 !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
2958 (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2959 ((th->th_seq - tp->irs) >
2960 (tp->t_maxseg << lro_start)) &&
2961 ((tp->t_idleat == 0) || ((th->th_seq -
2962 tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2963 tp->t_flagsext |= TF_LRO_OFFLOADED;
2964 tcp_start_coalescing(ip, th, tlen);
2965 tp->t_idleat = 0;
2966 }
2967
2968 /* Clean receiver SACK report if present */
2969 if (SACK_ENABLED(tp) && tp->rcv_numsacks)
2970 tcp_clean_sackreport(tp);
2971 ++tcpstat.tcps_preddat;
2972 tp->rcv_nxt += tlen;
2973 /*
2974 * Pull snd_wl1 up to prevent seq wrap relative to
2975 * th_seq.
2976 */
2977 tp->snd_wl1 = th->th_seq;
2978 /*
2979 * Pull rcv_up up to prevent seq wrap relative to
2980 * rcv_nxt.
2981 */
2982 tp->rcv_up = tp->rcv_nxt;
2983 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
2984 tcpstat.tcps_rcvbyte += tlen;
2985 if (nstat_collect) {
2986 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2987 INP_ADD_STAT(inp, cell, wifi, wired,
2988 rxpackets, m->m_pkthdr.lro_npkts);
2989 } else {
2990 INP_ADD_STAT(inp, cell, wifi, wired,
2991 rxpackets, 1);
2992 }
2993 INP_ADD_STAT(inp, cell, wifi, wired,rxbytes,
2994 tlen);
2995 }
2996
2997 /*
2998 * Calculate the RTT on the receiver only if the
2999 * connection is in streaming mode and the last
3000 * packet was not an end-of-write
3001 */
3002 if (tp->t_flags & TF_STREAMING_ON)
3003 tcp_compute_rtt(tp, &to, th);
3004
3005 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen,
3006 TCP_AUTORCVBUF_MAX(ifp));
3007
3008 /*
3009 * Add data to socket buffer.
3010 */
3011 so_recv_data_stat(so, m, 0);
3012 m_adj(m, drop_hdrlen); /* delayed header drop */
3013
3014 /*
3015 * If message delivery (SOF_ENABLE_MSGS) is enabled on
3016 * this socket, deliver the packet received as an
3017 * in-order message with sequence number attached to it.
3018 */
3019 if (sbappendstream_rcvdemux(so, m,
3020 th->th_seq - (tp->irs + 1), 0)) {
3021 sorwakeup(so);
3022 }
3023 #if INET6
3024 if (isipv6) {
3025 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3026 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
3027 th->th_seq, th->th_ack, th->th_win);
3028 }
3029 else
3030 #endif
3031 {
3032 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3033 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
3034 th->th_seq, th->th_ack, th->th_win);
3035 }
3036 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
3037 if (DELAY_ACK(tp, th)) {
3038 if ((tp->t_flags & TF_DELACK) == 0) {
3039 tp->t_flags |= TF_DELACK;
3040 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3041 }
3042 } else {
3043 tp->t_flags |= TF_ACKNOW;
3044 tcp_output(tp);
3045 }
3046
3047 tcp_adaptive_rwtimo_check(tp, tlen);
3048
3049 if (tlen > 0)
3050 tcp_tfo_rcv_data(tp);
3051
3052 tcp_check_timer_state(tp);
3053 tcp_unlock(so, 1, 0);
3054 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3055 return;
3056 }
3057 }
3058
3059 /*
3060 * Calculate amount of space in receive window,
3061 * and then do TCP input processing.
3062 * Receive window is amount of space in rcv queue,
3063 * but not less than advertised window.
3064 */
3065 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3066 LCK_MTX_ASSERT_OWNED);
3067 win = tcp_sbspace(tp);
3068 if (win < 0)
3069 win = 0;
3070 else { /* clip rcv window to 4K for modems */
3071 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
3072 win = min(win, slowlink_wsize);
3073 }
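/*
 * The imax() below keeps the offered window from shrinking: sequence space
 * up to rcv_adv has already been advertised to the peer and must remain
 * acceptable (RFC 1122, 4.2.2.16 recommends against shrinking the window).
 * For example, if rcv_adv - rcv_nxt is 8KB but only 4KB of socket-buffer
 * space is currently free, rcv_wnd stays at 8KB.
 */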
3074 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
3075 #if MPTCP
3076 /*
3077 * Ensure that the subflow receive window isn't greater
3078 * than the connection level receive window.
3079 */
3080 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
3081 (mp_tp = tptomptp(tp))) {
3082 MPT_LOCK(mp_tp);
3083 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
3084 tp->rcv_wnd = imax(mp_tp->mpt_rcvwnd, (int)(tp->rcv_adv - tp->rcv_nxt));
3085 tcpstat.tcps_mp_reducedwin++;
3086 }
3087 MPT_UNLOCK(mp_tp);
3088 }
3089 #endif /* MPTCP */
3090
3091 switch (tp->t_state) {
3092
3093 /*
3094 * Initialize tp->rcv_nxt, and tp->irs, select an initial
3095 * tp->iss, and send a segment:
3096 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3097 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
3098 * Fill in remote peer address fields if not previously specified.
3099 * Enter SYN_RECEIVED state, and process any other fields of this
3100 * segment in this state.
3101 */
3102 case TCPS_LISTEN: {
3103 struct sockaddr_in *sin;
3104 #if INET6
3105 struct sockaddr_in6 *sin6;
3106 #endif
3107
3108 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3109 LCK_MTX_ASSERT_OWNED);
3110 #if INET6
3111 if (isipv6) {
3112 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
3113 M_SONAME, M_NOWAIT);
3114 if (sin6 == NULL)
3115 goto drop;
3116 bzero(sin6, sizeof(*sin6));
3117 sin6->sin6_family = AF_INET6;
3118 sin6->sin6_len = sizeof(*sin6);
3119 sin6->sin6_addr = ip6->ip6_src;
3120 sin6->sin6_port = th->th_sport;
3121 laddr6 = inp->in6p_laddr;
3122 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
3123 inp->in6p_laddr = ip6->ip6_dst;
3124 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
3125 proc0)) {
3126 inp->in6p_laddr = laddr6;
3127 FREE(sin6, M_SONAME);
3128 goto drop;
3129 }
3130 FREE(sin6, M_SONAME);
3131 } else
3132 #endif
3133 {
3134 lck_mtx_assert(
3135 &((struct inpcb *)so->so_pcb)->inpcb_mtx,
3136 LCK_MTX_ASSERT_OWNED);
3137 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
3138 M_NOWAIT);
3139 if (sin == NULL)
3140 goto drop;
3141 sin->sin_family = AF_INET;
3142 sin->sin_len = sizeof(*sin);
3143 sin->sin_addr = ip->ip_src;
3144 sin->sin_port = th->th_sport;
3145 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
3146 laddr = inp->inp_laddr;
3147 if (inp->inp_laddr.s_addr == INADDR_ANY)
3148 inp->inp_laddr = ip->ip_dst;
3149 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
3150 IFSCOPE_NONE, NULL)) {
3151 inp->inp_laddr = laddr;
3152 FREE(sin, M_SONAME);
3153 goto drop;
3154 }
3155 FREE(sin, M_SONAME);
3156 }
3157
3158 tcp_dooptions(tp, optp, optlen, th, &to);
3159 tcp_finalize_options(tp, &to, ifscope);
3160
3161 if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to))
3162 isconnected = TRUE;
3163
3164 if (iss)
3165 tp->iss = iss;
3166 else {
3167 tp->iss = tcp_new_isn(tp);
3168 }
3169 tp->irs = th->th_seq;
3170 tcp_sendseqinit(tp);
3171 tcp_rcvseqinit(tp);
3172 tp->snd_recover = tp->snd_una;
3173 /*
3174 * Initialization of the tcpcb for transaction;
3175 * set SND.WND = SEG.WND,
3176 * initialize CCsend and CCrecv.
3177 */
3178 tp->snd_wnd = tiwin; /* initial send-window */
3179 tp->t_flags |= TF_ACKNOW;
3180 tp->t_unacksegs = 0;
3181 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3182 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3183 tp->t_state = TCPS_SYN_RECEIVED;
3184 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3185 TCP_CONN_KEEPINIT(tp));
3186 dropsocket = 0; /* committed to socket */
3187
3188 if (inp->inp_flowhash == 0)
3189 inp->inp_flowhash = inp_calc_flowhash(inp);
3190 #if INET6
3191 /* update flowinfo - RFC 6437 */
3192 if (inp->inp_flow == 0 &&
3193 inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
3194 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
3195 inp->inp_flow |=
3196 (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
3197 }
3198 #endif /* INET6 */
3199
3200 /* reset the incomp processing flag */
3201 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
3202 tcpstat.tcps_accepts++;
3203 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
3204 /* ECN-setup SYN */
3205 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
3206 }
3207
3208 goto trimthenstep6;
3209 }
3210
3211 /*
3212 * If the state is SYN_RECEIVED and the seg contains an ACK,
3213 * but not for our SYN/ACK, send a RST.
3214 */
3215 case TCPS_SYN_RECEIVED:
3216 if ((thflags & TH_ACK) &&
3217 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
3218 SEQ_GT(th->th_ack, tp->snd_max))) {
3219 rstreason = BANDLIM_RST_OPENPORT;
3220 IF_TCP_STATINC(ifp, ooopacket);
3221 goto dropwithreset;
3222 }
3223
3224 /*
3225 * In SYN_RECEIVED state, if we receive some SYNs with
3226 * window scale and others without, window scaling should
3227 * be disabled. Otherwise the window advertised will be
3228 * lower if we assume scaling and the other end does not.
3229 */
3230 if ((thflags & TH_SYN) &&
3231 (tp->irs == th->th_seq) &&
3232 !(to.to_flags & TOF_SCALE))
3233 tp->t_flags &= ~TF_RCVD_SCALE;
3234 break;
3235
3236 /*
3237 * If the state is SYN_SENT:
3238 * if seg contains an ACK, but not for our SYN, drop the input.
3239 * if seg contains a RST, then drop the connection.
3240 * if seg does not contain SYN, then drop it.
3241 * Otherwise this is an acceptable SYN segment
3242 * initialize tp->rcv_nxt and tp->irs
3243 * if seg contains ack then advance tp->snd_una
3244 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
3245 * arrange for segment to be acked (eventually)
3246 * continue processing rest of data/controls, beginning with URG
3247 */
3248 case TCPS_SYN_SENT:
3249 if ((thflags & TH_ACK) &&
3250 (SEQ_LEQ(th->th_ack, tp->iss) ||
3251 SEQ_GT(th->th_ack, tp->snd_max))) {
3252 rstreason = BANDLIM_UNLIMITED;
3253 IF_TCP_STATINC(ifp, ooopacket);
3254 goto dropwithreset;
3255 }
3256 if (thflags & TH_RST) {
3257 if ((thflags & TH_ACK) != 0) {
3258 #if MPTCP
3259 if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
3260 SEQ_GT(th->th_ack, tp->iss+1)) {
3261 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
3262 /* ignore the RST and retransmit SYN */
3263 goto drop;
3264 }
3265 #endif /* MPTCP */
3266 soevent(so,
3267 (SO_FILT_HINT_LOCKED |
3268 SO_FILT_HINT_CONNRESET));
3269 tp = tcp_drop(tp, ECONNREFUSED);
3270 postevent(so, 0, EV_RESET);
3271 }
3272 goto drop;
3273 }
3274 if ((thflags & TH_SYN) == 0)
3275 goto drop;
3276 tp->snd_wnd = th->th_win; /* initial send window */
3277
3278 tp->irs = th->th_seq;
3279 tcp_rcvseqinit(tp);
3280 if (thflags & TH_ACK) {
3281 tcpstat.tcps_connects++;
3282
3283 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
3284 /* ECN-setup SYN-ACK */
3285 tp->ecn_flags |= TE_SETUPRECEIVED;
3286 if (TCP_ECN_ENABLED(tp)) {
3287 tcp_heuristic_ecn_success(tp);
3288 tcpstat.tcps_ecn_client_success++;
3289 }
3290 } else {
3291 if (tp->ecn_flags & TE_SETUPSENT &&
3292 tp->t_rxtshift == 0) {
3293 tcp_heuristic_ecn_success(tp);
3294 tcpstat.tcps_ecn_not_supported++;
3295 }
3296 if (tp->ecn_flags & TE_SETUPSENT &&
3297 tp->t_rxtshift > 0)
3298 tcp_heuristic_ecn_loss(tp);
3299
3300 /* non-ECN-setup SYN-ACK */
3301 tp->ecn_flags &= ~TE_SENDIPECT;
3302 }
3303
3304 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
3305 /* XXXMAC: recursive lock: SOCK_LOCK(so); */
3306 mac_socketpeer_label_associate_mbuf(m, so);
3307 /* XXXMAC: SOCK_UNLOCK(so); */
3308 #endif
3309 /* Do window scaling on this connection? */
3310 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3311 tp->snd_scale = tp->requested_s_scale;
3312 tp->rcv_scale = tp->request_r_scale;
3313 }
3314
3315 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
3316 tp->snd_una++; /* SYN is acked */
3317 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3318 tp->snd_nxt = tp->snd_una;
3319
3320 /*
3321 * We have sent more in the SYN than what is being
3322 * acked (e.g., with TFO).
3323 * We should immediately restart sending from what the
3324 * receiver has acknowledged.
3325 */
3326 if (SEQ_GT(tp->snd_nxt, th->th_ack))
3327 tp->snd_max = tp->snd_nxt = th->th_ack;
3328
3329 /*
3330 * If there's data, delay ACK; if there's also a FIN
3331 * ACKNOW will be turned on later.
3332 */
3333 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
3334 if (DELAY_ACK(tp, th) && tlen != 0 ) {
3335 if ((tp->t_flags & TF_DELACK) == 0) {
3336 tp->t_flags |= TF_DELACK;
3337 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3338 }
3339 }
3340 else {
3341 tp->t_flags |= TF_ACKNOW;
3342 }
3343 /*
3344 * Received <SYN,ACK> in SYN_SENT[*] state.
3345 * Transitions:
3346 * SYN_SENT --> ESTABLISHED
3347 * SYN_SENT* --> FIN_WAIT_1
3348 */
3349 tp->t_starttime = tcp_now;
3350 tcp_sbrcv_tstmp_check(tp);
3351 if (tp->t_flags & TF_NEEDFIN) {
3352 DTRACE_TCP4(state__change, void, NULL,
3353 struct inpcb *, inp,
3354 struct tcpcb *, tp, int32_t,
3355 TCPS_FIN_WAIT_1);
3356 tp->t_state = TCPS_FIN_WAIT_1;
3357 tp->t_flags &= ~TF_NEEDFIN;
3358 thflags &= ~TH_SYN;
3359 } else {
3360 DTRACE_TCP4(state__change, void, NULL,
3361 struct inpcb *, inp, struct tcpcb *,
3362 tp, int32_t, TCPS_ESTABLISHED);
3363 tp->t_state = TCPS_ESTABLISHED;
3364 tp->t_timer[TCPT_KEEP] =
3365 OFFSET_FROM_START(tp,
3366 TCP_CONN_KEEPIDLE(tp));
3367 if (nstat_collect)
3368 nstat_route_connect_success(
3369 inp->inp_route.ro_rt);
3370 /*
3371 * The SYN is acknowledged but una is not
3372 * updated yet. So pass the value of
3373 * ack to compute sndbytes correctly
3374 */
3375 inp_count_sndbytes(inp, th->th_ack);
3376 }
3377 #if MPTCP
3378 /*
3379 * Do not send the connect notification for additional
3380 * subflows until ACK for 3-way handshake arrives.
3381 */
3382 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3383 (tp->t_mpflags & TMPF_SENT_JOIN)) {
3384 isconnected = FALSE;
3385 /* Start data xmit if fastjoin */
3386 if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) {
3387 soevent(so, (SO_FILT_HINT_LOCKED |
3388 SO_FILT_HINT_MPFASTJ));
3389 }
3390 } else
3391 #endif /* MPTCP */
3392 isconnected = TRUE;
3393
3394 if ((tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) ||
3395 (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT)) {
3396 tcp_tfo_synack(tp, &to);
3397
3398 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
3399 SEQ_LT(tp->snd_una, th->th_ack)) {
3400 tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED;
3401 tcpstat.tcps_tfo_syn_data_acked++;
3402 #if MPTCP
3403 if (so->so_flags & SOF_MP_SUBFLOW)
3404 so->so_flags1 |= SOF1_TFO_REWIND;
3405 #endif
3406 if (!(tp->t_tfo_flags & TFO_F_NO_RCVPROBING))
3407 tcp_tfo_rcv_probe(tp, tlen);
3408 }
3409 }
3410 } else {
3411 /*
3412 * Received initial SYN in SYN-SENT[*] state => simul-
3413 * taneous open. If segment contains CC option and there is
3414 * a cached CC, apply TAO test; if it succeeds, connection is
3415 * half-synchronized. Otherwise, do 3-way handshake:
3416 * SYN-SENT -> SYN-RECEIVED
3417 * SYN-SENT* -> SYN-RECEIVED*
3418 */
3419 tp->t_flags |= TF_ACKNOW;
3420 tp->t_timer[TCPT_REXMT] = 0;
3421 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3422 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3423 tp->t_state = TCPS_SYN_RECEIVED;
3424
3425 /*
3426 * During simultaneous open, TFO should not be used.
3427 * So, we disable it here, to prevent data from being
3428 * sent on the SYN/ACK.
3429 */
3430 tcp_disable_tfo(tp);
3431 }
3432
3433 trimthenstep6:
3434 /*
3435 * Advance th->th_seq to correspond to first data byte.
3436 * If data, trim to stay within window,
3437 * dropping FIN if necessary.
3438 */
3439 th->th_seq++;
3440 if (tlen > tp->rcv_wnd) {
3441 todrop = tlen - tp->rcv_wnd;
3442 m_adj(m, -todrop);
3443 tlen = tp->rcv_wnd;
3444 thflags &= ~TH_FIN;
3445 tcpstat.tcps_rcvpackafterwin++;
3446 tcpstat.tcps_rcvbyteafterwin += todrop;
3447 }
3448 tp->snd_wl1 = th->th_seq - 1;
3449 tp->rcv_up = th->th_seq;
3450 /*
3451 * Client side of transaction: already sent SYN and data.
3452 * If the remote host used T/TCP to validate the SYN,
3453 * our data will be ACK'd; if so, enter normal data segment
3454 * processing in the middle of step 5, ack processing.
3455 * Otherwise, goto step 6.
3456 */
3457 if (thflags & TH_ACK)
3458 goto process_ACK;
3459 goto step6;
3460 /*
3461 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3462 * do normal processing.
3463 *
3464 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
3465 */
3466 case TCPS_LAST_ACK:
3467 case TCPS_CLOSING:
3468 case TCPS_TIME_WAIT:
3469 break; /* continue normal processing */
3470
3471 /* Received a SYN while the connection is already established.
3472 * This is the "half open connection and other anomalies" case described
3473 * in RFC 793 page 34; send an ACK so the remote end resets the connection
3474 * or recovers by adjusting its sequence numbering.
3475 */
3476 case TCPS_ESTABLISHED:
3477 if (thflags & TH_SYN)
3478 goto dropafterack;
3479 break;
3480 }
3481
3482 /*
3483 * States other than LISTEN or SYN_SENT.
3484 * First check the RST flag and sequence number since reset segments
3485 * are exempt from the timestamp and connection count tests. This
3486 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3487 * below which allowed reset segments in half the sequence space
3488 * to fall through and be processed (which gives forged reset
3489 * segments with a random sequence number a 50 percent chance of
3490 * killing a connection).
3491 * Then check timestamp, if present.
3492 * Then check the connection count, if present.
3493 * Then check that at least some bytes of segment are within
3494 * receive window. If segment begins before rcv_nxt,
3495 * drop leading data (and SYN); if nothing left, just ack.
3496 *
3497 *
3498 * If the RST bit is set, check the sequence number to see
3499 * if this is a valid reset segment.
3500 * RFC 793 page 37:
3501 * In all states except SYN-SENT, all reset (RST) segments
3502 * are validated by checking their SEQ-fields. A reset is
3503 * valid if its sequence number is in the window.
3504 * Note: this does not take into account delayed ACKs, so
3505 * we should test against last_ack_sent instead of rcv_nxt.
3506 * The sequence number in the reset segment is normally an
3507 * echo of our outgoing acknowledgement numbers, but some hosts
3508 * send a reset with the sequence number at the rightmost edge
3509 * of our receive window, and we have to handle this case.
3510 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3511 * that brute force RST attacks are possible. To combat this,
3512 * we use a much stricter check while in the ESTABLISHED state,
3513 * only accepting RSTs where the sequence number is equal to
3514 * last_ack_sent. In all other states (the states in which a
3515 * RST is more likely), the more permissive check is used.
3516 * If we have multiple segments in flight, the initial reset
3517 * segment sequence numbers will be to the left of last_ack_sent,
3518 * but they will eventually catch up.
3519 * In any case, it never made sense to trim reset segments to
3520 * fit the receive window since RFC 1122 says:
3521 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
3522 *
3523 * A TCP SHOULD allow a received RST segment to include data.
3524 *
3525 * DISCUSSION
3526 * It has been suggested that a RST segment could contain
3527 * ASCII text that encoded and explained the cause of the
3528 * RST. No standard has yet been established for such
3529 * data.
3530 *
3531 * If the reset segment passes the sequence number test, examine
3532 * the state:
3533 * SYN_RECEIVED STATE:
3534 * If passive open, return to LISTEN state.
3535 * If active open, inform user that connection was refused.
3536 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
3537 * Inform user that connection was reset, and close tcb.
3538 * CLOSING, LAST_ACK STATES:
3539 * Close the tcb.
3540 * TIME_WAIT STATE:
3541 * Drop the segment - see Stevens, vol. 2, p. 964 and
3542 * RFC 1337.
3543 *
3544 * Radar 4803931: Allows for the case where we ACKed the FIN but
3545 * there is already a RST in flight from the peer.
3546 * In that case, accept the RST for non-established
3547 * state if it's one off from last_ack_sent.
3548 *
3549 */
3550 if (thflags & TH_RST) {
3551 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
3552 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
3553 (tp->rcv_wnd == 0 &&
3554 ((tp->last_ack_sent == th->th_seq) ||
3555 ((tp->last_ack_sent -1) == th->th_seq)))) {
3556 switch (tp->t_state) {
3557
3558 case TCPS_SYN_RECEIVED:
3559 IF_TCP_STATINC(ifp, rstinsynrcv);
3560 so->so_error = ECONNREFUSED;
3561 goto close;
3562
3563 case TCPS_ESTABLISHED:
3564 if (tp->last_ack_sent != th->th_seq) {
3565 tcpstat.tcps_badrst++;
3566 goto drop;
3567 }
3568 if (TCP_ECN_ENABLED(tp) &&
3569 tp->snd_una == tp->iss + 1 &&
3570 SEQ_GT(tp->snd_max, tp->snd_una)) {
3571 /*
3572 * If the first data packet on an
3573 * ECN connection receives a RST,
3574 * increment the heuristic
3575 */
3576 tcp_heuristic_ecn_droprst(tp);
3577 }
3578 case TCPS_FIN_WAIT_1:
3579 case TCPS_CLOSE_WAIT:
3580 /*
3581 * Drop through ...
3582 */
3583 case TCPS_FIN_WAIT_2:
3584 so->so_error = ECONNRESET;
3585 close:
3586 postevent(so, 0, EV_RESET);
3587 soevent(so,
3588 (SO_FILT_HINT_LOCKED |
3589 SO_FILT_HINT_CONNRESET));
3590
3591 tcpstat.tcps_drops++;
3592 tp = tcp_close(tp);
3593 break;
3594
3595 case TCPS_CLOSING:
3596 case TCPS_LAST_ACK:
3597 tp = tcp_close(tp);
3598 break;
3599
3600 case TCPS_TIME_WAIT:
3601 break;
3602 }
3603 }
3604 goto drop;
3605 }
3606
3607 /*
3608 * RFC 1323 PAWS: If we have a timestamp reply on this segment
3609 * and it's less than ts_recent, drop it.
3610 */
3611 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3612 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3613
3614 /* Check to see if ts_recent is over 24 days old. */
3615 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3616 /*
3617 * Invalidate ts_recent. If this segment updates
3618 * ts_recent, the age will be reset later and ts_recent
3619 * will get a valid value. If it does not, setting
3620 * ts_recent to zero will at least satisfy the
3621 * requirement that zero be placed in the timestamp
3622 * echo reply when ts_recent isn't valid. The
3623 * age isn't reset until we get a valid ts_recent
3624 * because we don't want out-of-order segments to be
3625 * dropped when ts_recent is old.
3626 */
3627 tp->ts_recent = 0;
3628 } else {
3629 tcpstat.tcps_rcvduppack++;
3630 tcpstat.tcps_rcvdupbyte += tlen;
3631 tp->t_pawsdrop++;
3632 tcpstat.tcps_pawsdrop++;
3633
3634 /*
3635 * PAWS-drop when ECN is being used? That indicates
3636 * that ECT-marked packets take a different path, with
3637 * different congestion characteristics.
3638 *
3639 * Only fall back if we have sent less than 2GB, as PAWS
3640 * really has no reason to kick in earlier.
3641 */
3642 if (TCP_ECN_ENABLED(tp) &&
3643 inp->inp_stat->rxbytes < 2147483648) {
3644 INP_INC_IFNET_STAT(inp, ecn_fallback_reorder);
3645 tcpstat.tcps_ecn_fallback_reorder++;
3646 tcp_heuristic_ecn_aggressive(tp);
3647 }
3648
3649 if (nstat_collect) {
3650 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3651 1, tlen, NSTAT_RX_FLAG_DUPLICATE);
3652 INP_ADD_STAT(inp, cell, wifi, wired,
3653 rxpackets, 1);
3654 INP_ADD_STAT(inp, cell, wifi, wired,
3655 rxbytes, tlen);
3656 tp->t_stat.rxduplicatebytes += tlen;
3657 }
3658 if (tlen > 0)
3659 goto dropafterack;
3660 goto drop;
3661 }
3662 }
3663
3664 /*
3665 * In the SYN-RECEIVED state, validate that the packet belongs to
3666 * this connection before trimming the data to fit the receive
3667 * window. Check the sequence number versus IRS since we know
3668 * the sequence numbers haven't wrapped. This is a partial fix
3669 * for the "LAND" DoS attack.
3670 */
3671 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3672 rstreason = BANDLIM_RST_OPENPORT;
3673 IF_TCP_STATINC(ifp, dospacket);
3674 goto dropwithreset;
3675 }
3676
3677 todrop = tp->rcv_nxt - th->th_seq;
3678 if (todrop > 0) {
3679 if (thflags & TH_SYN) {
3680 thflags &= ~TH_SYN;
3681 th->th_seq++;
3682 if (th->th_urp > 1)
3683 th->th_urp--;
3684 else
3685 thflags &= ~TH_URG;
3686 todrop--;
3687 }
3688 /*
3689 * Following if statement from Stevens, vol. 2, p. 960.
3690 */
3691 if (todrop > tlen
3692 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
3693 /*
3694 * Any valid FIN must be to the left of the window.
3695 * At this point the FIN must be a duplicate or out
3696 * of sequence; drop it.
3697 */
3698 thflags &= ~TH_FIN;
3699
3700 /*
3701 * Send an ACK to resynchronize and drop any data.
3702 * But keep on processing for RST or ACK.
3703 */
3704 tp->t_flags |= TF_ACKNOW;
3705 if (todrop == 1) {
3706 /* This could be a keepalive */
3707 soevent(so, SO_FILT_HINT_LOCKED |
3708 SO_FILT_HINT_KEEPALIVE);
3709 }
3710 todrop = tlen;
3711 tcpstat.tcps_rcvduppack++;
3712 tcpstat.tcps_rcvdupbyte += todrop;
3713 } else {
3714 tcpstat.tcps_rcvpartduppack++;
3715 tcpstat.tcps_rcvpartdupbyte += todrop;
3716 }
3717
3718 if (TCP_DSACK_ENABLED(tp) && todrop > 1) {
3719 /*
3720 * Note the duplicate data sequence space so that
3721 * it can be reported in DSACK option.
3722 */
3723 tp->t_dsack_lseq = th->th_seq;
3724 tp->t_dsack_rseq = th->th_seq + todrop;
3725 tp->t_flags |= TF_ACKNOW;
3726 }
3727 if (nstat_collect) {
3728 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3729 todrop, NSTAT_RX_FLAG_DUPLICATE);
3730 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
3731 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
3732 tp->t_stat.rxduplicatebytes += todrop;
3733 }
3734 drop_hdrlen += todrop; /* drop from the top afterwards */
3735 th->th_seq += todrop;
3736 tlen -= todrop;
3737 if (th->th_urp > todrop)
3738 th->th_urp -= todrop;
3739 else {
3740 thflags &= ~TH_URG;
3741 th->th_urp = 0;
3742 }
3743 }
3744
3745 /*
3746 * If new data are received on a connection after the user
3747 * processes are gone, then RST the other end.
3748 * Also send a RST when we receive a data segment after we've
3749 * sent our FIN and the socket is defunct.
3750 * Note that an MPTCP subflow socket would have SS_NOFDREF set
3751 * by default, so make sure we also test the SOF_MP_SUBFLOW
3752 * socket flag (which is cleared when the socket is closed).
3753 */
3754 if (!(so->so_flags & SOF_MP_SUBFLOW) && tlen &&
3755 (((so->so_state & SS_NOFDREF) &&
3756 tp->t_state > TCPS_CLOSE_WAIT) ||
3757 ((so->so_flags & SOF_DEFUNCT) &&
3758 tp->t_state > TCPS_FIN_WAIT_1))) {
3759 tp = tcp_close(tp);
3760 tcpstat.tcps_rcvafterclose++;
3761 rstreason = BANDLIM_UNLIMITED;
3762 IF_TCP_STATINC(ifp, cleanup);
3763 goto dropwithreset;
3764 }
3765
3766 /*
3767 * If segment ends after window, drop trailing data
3768 * (and PUSH and FIN); if nothing left, just ACK.
3769 */
3770 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
3771 if (todrop > 0) {
3772 tcpstat.tcps_rcvpackafterwin++;
3773 if (todrop >= tlen) {
3774 tcpstat.tcps_rcvbyteafterwin += tlen;
3775 /*
3776 * If a new connection request is received
3777 * while in TIME_WAIT, drop the old connection
3778 * and start over if the sequence numbers
3779 * are above the previous ones.
3780 */
3781 if (thflags & TH_SYN &&
3782 tp->t_state == TCPS_TIME_WAIT &&
3783 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
3784 iss = tcp_new_isn(tp);
3785 tp = tcp_close(tp);
3786 tcp_unlock(so, 1, 0);
3787 goto findpcb;
3788 }
3789 /*
3790 * If window is closed can only take segments at
3791 * window edge, and have to drop data and PUSH from
3792 * incoming segments. Continue processing, but
3793 * remember to ack. Otherwise, drop segment
3794 * and ack.
3795 */
3796 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3797 tp->t_flags |= TF_ACKNOW;
3798 tcpstat.tcps_rcvwinprobe++;
3799 } else
3800 goto dropafterack;
3801 } else
3802 tcpstat.tcps_rcvbyteafterwin += todrop;
3803 m_adj(m, -todrop);
3804 tlen -= todrop;
3805 thflags &= ~(TH_PUSH|TH_FIN);
3806 }
3807
3808 /*
3809 * If last ACK falls within this segment's sequence numbers,
3810 * record its timestamp.
3811 * NOTE:
3812 * 1) That the test incorporates suggestions from the latest
3813 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
3814 * 2) That updating only on newer timestamps interferes with
3815 * our earlier PAWS tests, so this check should be solely
3816 * predicated on the sequence space of this segment.
3817 * 3) That we modify the segment boundary check to be
3818 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
3819 * instead of RFC1323's
3820 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
3821 * This modified check allows us to overcome RFC1323's
3822 * limitations as described in Stevens TCP/IP Illustrated
3823 * Vol. 2 p.869. In such cases, we can still calculate the
3824 * RTT correctly when RCV.NXT == Last.ACK.Sent.
3825 */
3826 if ((to.to_flags & TOF_TS) != 0 &&
3827 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3828 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3829 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
3830 tp->ts_recent_age = tcp_now;
3831 tp->ts_recent = to.to_tsval;
3832 }
3833
3834 /*
3835 * If a SYN is in the window, then this is an
3836 * error and we send an RST and drop the connection.
3837 */
3838 if (thflags & TH_SYN) {
3839 tp = tcp_drop(tp, ECONNRESET);
3840 rstreason = BANDLIM_UNLIMITED;
3841 postevent(so, 0, EV_RESET);
3842 IF_TCP_STATINC(ifp, synwindow);
3843 goto dropwithreset;
3844 }
3845
3846 /*
3847 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
3848 * flag is on (half-synchronized state), then queue data for
3849 * later processing; else drop segment and return.
3850 */
3851 if ((thflags & TH_ACK) == 0) {
3852 if (tp->t_state == TCPS_SYN_RECEIVED ||
3853 (tp->t_flags & TF_NEEDSYN)) {
3854 if ((tfo_enabled(tp))) {
3855 /*
3856 * So, we received a valid segment while in
3857 * SYN-RECEIVED (TF_NEEDSYN is actually never
3858 * set, so this is dead code).
3859 * As this cannot be an RST (see the check a bit
3860 * higher up), and it does not have the ACK-flag
3861 * set, we want to retransmit the SYN/ACK.
3862 * Thus, we have to reset snd_nxt to snd_una to
3863 * trigger the going back to sending of the
3864 * SYN/ACK. This is more consistent with the
3865 * behavior of tcp_output(), which expects
3866 * to send the segment that is pointed to by
3867 * snd_nxt.
3868 */
3869 tp->snd_nxt = tp->snd_una;
3870
3871 /*
3872 * We need to make absolutely sure that we are
3873 * going to reply upon a duplicate SYN-segment.
3874 */
3875 if (th->th_flags & TH_SYN)
3876 needoutput = 1;
3877 }
3878
3879 goto step6;
3880 } else if (tp->t_flags & TF_ACKNOW)
3881 goto dropafterack;
3882 else
3883 goto drop;
3884 }
3885
3886 /*
3887 * Ack processing.
3888 */
3889
3890 switch (tp->t_state) {
3891
3892 /*
3893 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3894 * ESTABLISHED state and continue processing.
3895 * The ACK was checked above.
3896 */
3897 case TCPS_SYN_RECEIVED:
3898
3899 tcpstat.tcps_connects++;
3900
3901 /* Do window scaling? */
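/*
 * If so, the raw th_win carried by this handshake-completing ACK is
 * reinterpreted with the negotiated shift, refreshing snd_wnd and the
 * local tiwin accordingly.
 */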
3902 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3903 tp->snd_scale = tp->requested_s_scale;
3904 tp->rcv_scale = tp->request_r_scale;
3905 tp->snd_wnd = th->th_win << tp->snd_scale;
3906 tiwin = tp->snd_wnd;
3907 }
3908 /*
3909 * Make transitions:
3910 * SYN-RECEIVED -> ESTABLISHED
3911 * SYN-RECEIVED* -> FIN-WAIT-1
3912 */
3913 tp->t_starttime = tcp_now;
3914 tcp_sbrcv_tstmp_check(tp);
3915 if (tp->t_flags & TF_NEEDFIN) {
3916 DTRACE_TCP4(state__change, void, NULL,
3917 struct inpcb *, inp,
3918 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
3919 tp->t_state = TCPS_FIN_WAIT_1;
3920 tp->t_flags &= ~TF_NEEDFIN;
3921 } else {
3922 DTRACE_TCP4(state__change, void, NULL,
3923 struct inpcb *, inp,
3924 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
3925 tp->t_state = TCPS_ESTABLISHED;
3926 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3927 TCP_CONN_KEEPIDLE(tp));
3928 if (nstat_collect)
3929 nstat_route_connect_success(
3930 tp->t_inpcb->inp_route.ro_rt);
3931 /*
3932 * The SYN is acknowledged but una is not updated
3933 * yet. So pass the value of ack to compute
3934 * sndbytes correctly
3935 */
3936 inp_count_sndbytes(inp, th->th_ack);
3937 }
3938 /*
3939 * If segment contains data or ACK, will call tcp_reass()
3940 * later; if not, do so now to pass queued data to user.
3941 */
3942 if (tlen == 0 && (thflags & TH_FIN) == 0)
3943 (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
3944 NULL, ifp);
3945 tp->snd_wl1 = th->th_seq - 1;
3946
3947 #if MPTCP
3948 /*
3949 * Do not send the connect notification for additional subflows
3950 * until ACK for 3-way handshake arrives.
3951 */
3952 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3953 (tp->t_mpflags & TMPF_SENT_JOIN)) {
3954 isconnected = FALSE;
3955 } else
3956 #endif /* MPTCP */
3957 isconnected = TRUE;
3958 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
3959 /* We already did this when receiving the SYN */
3960 isconnected = FALSE;
3961
3962 OSDecrementAtomic(&tcp_tfo_halfcnt);
3963
3964 /* Panic if something has gone terribly wrong. */
3965 VERIFY(tcp_tfo_halfcnt >= 0);
3966
3967 tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
3968 }
3969
3970 /*
3971 * In case there is data in the send-queue (e.g., TFO is being
3972 * used, or connectx+data has been done), then if we would
3973 * "FALLTHROUGH", we would handle this ACK as if data has been
3974 * acknowledged. But, we have to prevent this. And this
3975 * can be prevented by increasing snd_una by 1, so that the
3976 * SYN is not considered as data (snd_una++ is actually also
3977 * done in SYN_SENT-state as part of the regular TCP stack).
3978 *
3979 * In case there is data on this ack as well, the data will be
3980 * handled by the label "dodata" right after step6.
3981 */
3982 if (so->so_snd.sb_cc) {
3983 tp->snd_una++; /* SYN is acked */
3984 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3985 tp->snd_nxt = tp->snd_una;
3986
3987 /*
3988 * No duplicate-ACK handling is needed. So, we
3989 * directly advance to processing the ACK (aka,
3990 * updating the RTT estimation,...)
3991 *
3992 * But, we first need to handle eventual SACKs,
3993 * because TFO will start sending data with the
3994 * SYN/ACK, so it might be that the client
3995 * includes a SACK with its ACK.
3996 */
3997 if (SACK_ENABLED(tp) &&
3998 (to.to_nsacks > 0 ||
3999 !TAILQ_EMPTY(&tp->snd_holes)))
4000 tcp_sack_doack(tp, &to, th,
4001 &sack_bytes_acked);
4002
4003 goto process_ACK;
4004 }
4005
4006 /* FALLTHROUGH */
4007
4008 /*
4009 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
4010 * ACKs. If the ack is in the range
4011 * tp->snd_una < th->th_ack <= tp->snd_max
4012 * then advance tp->snd_una to th->th_ack and drop
4013 * data from the retransmission queue. If this ACK reflects
4014 * more up-to-date window information, we update our window information.
4015 */
4016 case TCPS_ESTABLISHED:
4017 case TCPS_FIN_WAIT_1:
4018 case TCPS_FIN_WAIT_2:
4019 case TCPS_CLOSE_WAIT:
4020 case TCPS_CLOSING:
4021 case TCPS_LAST_ACK:
4022 case TCPS_TIME_WAIT:
4023 if (SEQ_GT(th->th_ack, tp->snd_max)) {
4024 tcpstat.tcps_rcvacktoomuch++;
4025 goto dropafterack;
4026 }
4027 if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
4028 recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
4029 /*
4030 * If DSACK is received and this packet has no
4031 * other SACK information, it can be dropped.
4032 * We do not want to treat it as a duplicate ack.
4033 */
4034 if (recvd_dsack &&
4035 SEQ_LEQ(th->th_ack, tp->snd_una) &&
4036 to.to_nsacks == 0) {
4037 tcp_bad_rexmt_check(tp, th, &to);
4038 goto drop;
4039 }
4040 }
4041
4042 if (SACK_ENABLED(tp) &&
4043 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
4044 tcp_sack_doack(tp, &to, th, &sack_bytes_acked);
4045
4046 #if MPTCP
4047 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
4048 if (tp->t_mpflags & TMPF_PREESTABLISHED) {
4049 /* MP TCP establishment succeeded */
4050 tp->t_mpuna = 0;
4051 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
4052 if (tp->t_mpflags & TMPF_SENT_JOIN) {
4053 tp->t_mpflags &=
4054 ~TMPF_PREESTABLISHED;
4055 tp->t_mpflags |=
4056 TMPF_MPTCP_TRUE;
4057 so->so_flags |= SOF_MPTCP_TRUE;
4058 mptcplog((LOG_DEBUG, "MPTCP "
4059 "Sockets: %s \n",__func__),
4060 MPTCP_SOCKET_DBG,
4061 MPTCP_LOGLVL_LOG);
4062
4063 tp->t_timer[TCPT_JACK_RXMT] = 0;
4064 tp->t_mprxtshift = 0;
4065 isconnected = TRUE;
4066 } else {
4067 isconnected = FALSE;
4068 }
4069 } else {
4070 isconnected = TRUE;
4071 tp->t_mpflags &= ~TMPF_SENT_KEYS;
4072 }
4073 }
4074 }
4075 #endif /* MPTCP */
4076
4077 tcp_tfo_rcv_ack(tp, th);
4078
4079 /*
4080 * If we have outstanding data (other than
4081 * a window probe), this is a completely
4082 * duplicate ack and the ack is the biggest we've seen.
4083 *
4084 * Need to accommodate a change in window on duplicate acks
4085 * to allow operating systems that update window during
4086 * recovery with SACK
4087 */
4088 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
4089 if (tlen == 0 && (tiwin == tp->snd_wnd ||
4090 (to.to_nsacks > 0 && sack_bytes_acked > 0))) {
4091 /*
4092 * If both ends send FIN at the same time,
4093 * then the ack will be a duplicate ack
4094 * but we have to process the FIN. Check
4095 * for this condition and process the FIN
4096 * instead of the dupack
4097 */
4098 if ((thflags & TH_FIN) &&
4099 !TCPS_HAVERCVDFIN(tp->t_state))
4100 break;
4101 process_dupack:
4102 #if MPTCP
4103 /*
4104 * MPTCP options that are ignored must
4105 * not be treated as duplicate ACKs.
4106 */
4107 if (to.to_flags & TOF_MPTCP) {
4108 goto drop;
4109 }
4110
4111 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
4112 mptcplog((LOG_DEBUG, "MPTCP "
4113 "Sockets: bypass ack recovery\n"),
4114 MPTCP_SOCKET_DBG,
4115 MPTCP_LOGLVL_VERBOSE);
4116 break;
4117 }
4118 #endif /* MPTCP */
4119 /*
4120 * If a duplicate acknowledgement was seen
4121 * after ECN, it indicates packet loss in
4122 * addition to ECN. Reset INRECOVERY flag
4123 * so that we can process partial acks
4124 * correctly
4125 */
4126 if (tp->ecn_flags & TE_INRECOVERY)
4127 tp->ecn_flags &= ~TE_INRECOVERY;
4128
4129 tcpstat.tcps_rcvdupack++;
4130 ++tp->t_dupacks;
4131
4132 /*
4133 * Check if we need to reset the limit on
4134 * early retransmit
4135 */
4136 if (tp->t_early_rexmt_count > 0 &&
4137 TSTMP_GEQ(tcp_now,
4138 (tp->t_early_rexmt_win +
4139 TCP_EARLY_REXMT_WIN)))
4140 tp->t_early_rexmt_count = 0;
4141
4142 /*
4143 * Is early retransmit needed? We check for
4144 * this when the connection is waiting for
4145 * duplicate acks to enter fast recovery.
4146 */
4147 if (!IN_FASTRECOVERY(tp))
4148 tcp_early_rexmt_check(tp, th);
4149
4150 /*
4151 * If we've seen exactly rexmt threshold
4152 * of duplicate acks, assume a packet
4153 * has been dropped and retransmit it.
4154 * Kludge snd_nxt & the congestion
4155 * window so we send only this one
4156 * packet.
4157 *
4158 * We know we're losing at the current
4159 * window size so do congestion avoidance
4160 * (set ssthresh to half the current window
4161 * and pull our congestion window back to
4162 * the new ssthresh).
4163 *
4164 * Dup acks mean that packets have left the
4165 * network (they're now cached at the receiver)
4166 * so bump cwnd by the amount in the receiver
4167 * to keep a constant cwnd packets in the
4168 * network.
4169 */
4170 if (tp->t_timer[TCPT_REXMT] == 0 ||
4171 (th->th_ack != tp->snd_una
4172 && sack_bytes_acked == 0)) {
4173 tp->t_dupacks = 0;
4174 tp->t_rexmtthresh = tcprexmtthresh;
4175 } else if (tp->t_dupacks > tp->t_rexmtthresh ||
4176 IN_FASTRECOVERY(tp)) {
4177
4178 /*
4179 * If this connection was seeing packet
4180 * reordering, then recovery might be
4181 * delayed to disambiguate between
4182 * reordering and loss
4183 */
4184 if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
4185 (tp->t_flagsext &
4186 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) ==
4187 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4188 /*
4189 * Since the SACK information is already
4190 * updated, this ACK will be dropped
4191 */
4192 break;
4193 }
4194
4195 if (SACK_ENABLED(tp)
4196 && IN_FASTRECOVERY(tp)) {
4197 int awnd;
4198
4199 /*
4200 * Compute the amount of data in flight first.
4201 * We can inject new data into the pipe iff
4202 * we have less than 1/2 the original window's
4203 * worth of data in flight.
4204 */
4205 awnd = (tp->snd_nxt - tp->snd_fack) +
4206 tp->sackhint.sack_bytes_rexmit;
4207 if (awnd < tp->snd_ssthresh) {
4208 tp->snd_cwnd += tp->t_maxseg;
4209 if (tp->snd_cwnd > tp->snd_ssthresh)
4210 tp->snd_cwnd = tp->snd_ssthresh;
4211 }
4212 } else {
4213 tp->snd_cwnd += tp->t_maxseg;
4214 }
4215
4216 /* Process any window updates */
4217 if (tiwin > tp->snd_wnd)
4218 tcp_update_window(tp, thflags,
4219 th, tiwin, tlen);
4220 tcp_ccdbg_trace(tp, th,
4221 TCP_CC_IN_FASTRECOVERY);
4222
4223 (void) tcp_output(tp);
4224
4225 goto drop;
4226 } else if (tp->t_dupacks == tp->t_rexmtthresh) {
4227 tcp_seq onxt = tp->snd_nxt;
4228
4229 /*
4230 * If we're doing sack, check to
4231 * see if we're already in sack
4232 * recovery. If we're not doing sack,
4233 * check to see if we're in newreno
4234 * recovery.
4235 */
4236 if (SACK_ENABLED(tp)) {
4237 if (IN_FASTRECOVERY(tp)) {
4238 tp->t_dupacks = 0;
4239 break;
4240 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
4241 break;
4242 }
4243 } else {
4244 if (SEQ_LEQ(th->th_ack,
4245 tp->snd_recover)) {
4246 tp->t_dupacks = 0;
4247 break;
4248 }
4249 }
4250 if (tp->t_flags & TF_SENTFIN)
4251 tp->snd_recover = tp->snd_max - 1;
4252 else
4253 tp->snd_recover = tp->snd_max;
4254 tp->t_timer[TCPT_PTO] = 0;
4255 tp->t_rtttime = 0;
4256
4257 /*
4258 * If the connection has seen pkt
4259 * reordering, delay recovery until
4260 * it is clear that the packet
4261 * was lost.
4262 */
4263 if (SACK_ENABLED(tp) &&
4264 (tp->t_flagsext &
4265 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4266 == TF_PKTS_REORDERED &&
4267 !IN_FASTRECOVERY(tp) &&
4268 tp->t_reorderwin > 0 &&
4269 (tp->t_state == TCPS_ESTABLISHED ||
4270 tp->t_state == TCPS_FIN_WAIT_1)) {
4271 tp->t_timer[TCPT_DELAYFR] =
4272 OFFSET_FROM_START(tp,
4273 tp->t_reorderwin);
4274 tp->t_flagsext |= TF_DELAY_RECOVERY;
4275 tcpstat.tcps_delay_recovery++;
4276 tcp_ccdbg_trace(tp, th,
4277 TCP_CC_DELAY_FASTRECOVERY);
4278 break;
4279 }
4280
4281 tcp_rexmt_save_state(tp);
4282 /*
4283 * If the current tcp cc module has
4284 * defined a hook for tasks to run
4285 * before entering FR, call it
4286 */
4287 if (CC_ALGO(tp)->pre_fr != NULL)
4288 CC_ALGO(tp)->pre_fr(tp);
4289 ENTER_FASTRECOVERY(tp);
4290 tp->t_timer[TCPT_REXMT] = 0;
4291 if (TCP_ECN_ENABLED(tp))
4292 tp->ecn_flags |= TE_SENDCWR;
4293
4294 if (SACK_ENABLED(tp)) {
4295 tcpstat.tcps_sack_recovery_episode++;
4296 tp->t_sack_recovery_episode++;
4297 tp->sack_newdata = tp->snd_nxt;
4298 tp->snd_cwnd = tp->t_maxseg;
4299 tp->t_flagsext &=
4300 ~TF_CWND_NONVALIDATED;
4301
4302 /* Process any window updates */
4303 if (tiwin > tp->snd_wnd)
4304 tcp_update_window(
4305 tp, thflags,
4306 th, tiwin, tlen);
4307
4308 tcp_ccdbg_trace(tp, th,
4309 TCP_CC_ENTER_FASTRECOVERY);
4310 (void) tcp_output(tp);
4311 goto drop;
4312 }
4313 tp->snd_nxt = th->th_ack;
4314 tp->snd_cwnd = tp->t_maxseg;
4315
4316 /* Process any window updates */
4317 if (tiwin > tp->snd_wnd)
4318 tcp_update_window(tp,
4319 thflags,
4320 th, tiwin, tlen);
4321
4322 (void) tcp_output(tp);
4323 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
4324 tcp_cc_adjust_nonvalidated_cwnd(tp);
4325 } else {
4326 tp->snd_cwnd = tp->snd_ssthresh +
4327 tp->t_maxseg * tp->t_dupacks;
4328 }
4329 if (SEQ_GT(onxt, tp->snd_nxt))
4330 tp->snd_nxt = onxt;
4331
4332 tcp_ccdbg_trace(tp, th,
4333 TCP_CC_ENTER_FASTRECOVERY);
4334 goto drop;
4335 } else if (limited_txmt &&
4336 ALLOW_LIMITED_TRANSMIT(tp) &&
4337 (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
4338 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
4339 u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
4340
4341 /* Use Limited Transmit algorithm on the first two
4342 * duplicate acks when there is new data to transmit
4343 */
4344 tp->snd_cwnd += incr;
4345 tcpstat.tcps_limited_txt++;
4346 (void) tcp_output(tp);
4347
4348 tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
4349
4350 /* Reset snd_cwnd back to normal */
4351 tp->snd_cwnd -= incr;
4352 }
4353 }
4354 break;
4355 }
4356 /*
4357 * If the congestion window was inflated to account
4358 * for the other side's cached packets, retract it.
4359 */
4360 if (IN_FASTRECOVERY(tp)) {
4361 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4362 /*
4363 * If we received an ECE and entered
4364 * recovery, the subsequent ACKs should
4365 * not be treated as partial acks.
4366 */
4367 if (tp->ecn_flags & TE_INRECOVERY)
4368 goto process_ACK;
4369
4370 if (SACK_ENABLED(tp))
4371 tcp_sack_partialack(tp, th);
4372 else
4373 tcp_newreno_partial_ack(tp, th);
4374 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
4375 } else {
4376 EXIT_FASTRECOVERY(tp);
4377 if (CC_ALGO(tp)->post_fr != NULL)
4378 CC_ALGO(tp)->post_fr(tp, th);
4379 tp->t_pipeack = 0;
4380 tcp_clear_pipeack_state(tp);
4381 tcp_ccdbg_trace(tp, th,
4382 TCP_CC_EXIT_FASTRECOVERY);
4383 }
4384 } else if ((tp->t_flagsext &
4385 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4386 == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4387 /*
4388 * If the ack acknowledges up to snd_recover or if
4389 * it acknowledges all the snd holes, exit
4390 * recovery and cancel the timer. Otherwise,
4391 * this is a partial ack. Wait for the recovery timer
4392 * before entering recovery. The snd_holes have already
4393 * been updated.
4394 */
4395 if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
4396 TAILQ_EMPTY(&tp->snd_holes)) {
4397 tp->t_timer[TCPT_DELAYFR] = 0;
4398 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
4399 EXIT_FASTRECOVERY(tp);
4400 tcp_ccdbg_trace(tp, th,
4401 TCP_CC_EXIT_FASTRECOVERY);
4402 }
4403 } else {
4404 /*
4405 * We were not in fast recovery. Reset the
4406 * duplicate ack counter.
4407 */
4408 tp->t_dupacks = 0;
4409 tp->t_rexmtthresh = tcprexmtthresh;
4410 }
4411
4412
4413 /*
4414 * If we reach this point, ACK is not a duplicate,
4415 * i.e., it ACKs something we sent.
4416 */
4417 if (tp->t_flags & TF_NEEDSYN) {
4418 /*
4419 * T/TCP: Connection was half-synchronized, and our
4420 * SYN has been ACK'd (so connection is now fully
4421 * synchronized). Go to non-starred state,
4422 * increment snd_una for ACK of SYN, and check if
4423 * we can do window scaling.
4424 */
4425 tp->t_flags &= ~TF_NEEDSYN;
4426 tp->snd_una++;
4427 /* Do window scaling? */
4428 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
4429 tp->snd_scale = tp->requested_s_scale;
4430 tp->rcv_scale = tp->request_r_scale;
4431 }
4432 }
4433
4434 process_ACK:
4435 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
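/*
 * Count the bytes newly acknowledged by this segment; BYTES_ACKED()
 * is effectively th_ack - snd_una, since snd_una has not been
 * advanced yet at this point.
 */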
4436 acked = BYTES_ACKED(th, tp);
4437 tcpstat.tcps_rcvackpack++;
4438 tcpstat.tcps_rcvackbyte += acked;
4439
4440 /*
4441 * If the last packet was a retransmit, make sure
4442 * it was not spurious.
4443 *
4444 * This will also take care of congestion window
4445 * adjustment if a last packet was recovered due to a
4446 * tail loss probe.
4447 */
4448 tcp_bad_rexmt_check(tp, th, &to);
4449
4450 /* Recalculate the RTT */
4451 tcp_compute_rtt(tp, &to, th);
4452
4453 /*
4454 * If all outstanding data is acked, stop retransmit
4455 * timer and remember to restart (more output or persist).
4456 * If there is more data to be acked, restart retransmit
4457 * timer, using current (possibly backed-off) value.
4458 */
4459 TCP_RESET_REXMT_STATE(tp);
4460 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
4461 tp->t_rttmin, TCPTV_REXMTMAX,
4462 TCP_ADD_REXMTSLOP(tp));
4463 if (th->th_ack == tp->snd_max) {
4464 tp->t_timer[TCPT_REXMT] = 0;
4465 tp->t_timer[TCPT_PTO] = 0;
4466 needoutput = 1;
4467 } else if (tp->t_timer[TCPT_PERSIST] == 0)
4468 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
4469 tp->t_rxtcur);
4470
4471 /*
4472 * If no data (only SYN) was ACK'd, skip rest of ACK
4473 * processing.
4474 */
4475 if (acked == 0)
4476 goto step6;
4477
4478 /*
4479 * When outgoing data has been acked (except the SYN+data), we
4480 * mark this connection as "sending good" for TFO.
4481 */
4482 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
4483 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
4484 !(th->th_flags & TH_SYN))
4485 tcp_heuristic_tfo_snd_good(tp);
4486
4487 /*
4488 * If TH_ECE is received, make sure that ECN is enabled
4489 * on that connection and we have sent ECT on data packets.
4490 */
4491 if ((thflags & TH_ECE) != 0 && TCP_ECN_ENABLED(tp) &&
4492 (tp->ecn_flags & TE_SENDIPECT)) {
4493 /*
4494 * Reduce the congestion window if we haven't
4495 * done so.
4496 */
4497 if (!IN_FASTRECOVERY(tp)) {
4498 tcp_reduce_congestion_window(tp);
4499 tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR);
4500 /*
4501 * Also note that the connection received
4502 * ECE at least once
4503 */
4504 tp->ecn_flags |= TE_RECV_ECN_ECE;
4505 INP_INC_IFNET_STAT(inp, ecn_recv_ece);
4506 tcpstat.tcps_ecn_recv_ece++;
4507 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
4508 }
4509 }
4510
4511 /*
4512 * When new data is acked, open the congestion window.
4513 * The specifics of how this is achieved are up to the
4514 * congestion control algorithm in use for this connection.
4515 *
4516 * The calculations in this function assume that snd_una is
4517 * not updated yet.
4518 */
4519 if (!IN_FASTRECOVERY(tp)) {
4520 if (CC_ALGO(tp)->ack_rcvd != NULL)
4521 CC_ALGO(tp)->ack_rcvd(tp, th);
4522 tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
4523 }
4524 if (acked > so->so_snd.sb_cc) {
4525 tp->snd_wnd -= so->so_snd.sb_cc;
4526 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
4527 if (so->so_flags & SOF_ENABLE_MSGS) {
4528 so->so_msg_state->msg_serial_bytes -=
4529 (int)so->so_snd.sb_cc;
4530 }
4531 ourfinisacked = 1;
4532 } else {
4533 sbdrop(&so->so_snd, acked);
4534 if (so->so_flags & SOF_ENABLE_MSGS) {
4535 so->so_msg_state->msg_serial_bytes -=
4536 acked;
4537 }
4538 tcp_sbsnd_trim(&so->so_snd);
4539 tp->snd_wnd -= acked;
4540 ourfinisacked = 0;
4541 }
4542 /* detect una wraparound */
4543 if ( !IN_FASTRECOVERY(tp) &&
4544 SEQ_GT(tp->snd_una, tp->snd_recover) &&
4545 SEQ_LEQ(th->th_ack, tp->snd_recover))
4546 tp->snd_recover = th->th_ack - 1;
4547
4548 if (IN_FASTRECOVERY(tp) &&
4549 SEQ_GEQ(th->th_ack, tp->snd_recover))
4550 EXIT_FASTRECOVERY(tp);
4551
4552 tp->snd_una = th->th_ack;
4553
4554 if (SACK_ENABLED(tp)) {
4555 if (SEQ_GT(tp->snd_una, tp->snd_recover))
4556 tp->snd_recover = tp->snd_una;
4557 }
4558 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
4559 tp->snd_nxt = tp->snd_una;
4560 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
4561 !TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna,
4562 tp->snd_una))
4563 tcp_rxtseg_clean(tp);
4564 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
4565 tp->t_bwmeas != NULL)
4566 tcp_bwmeas_check(tp);
4567
4568 /*
4569 * sowwakeup must happen after snd_una, et al. are
4570 * updated so that the sequence numbers are in sync with
4571 * so_snd
4572 */
4573 sowwakeup(so);
4574
4575 if (!SLIST_EMPTY(&tp->t_notify_ack))
4576 tcp_notify_acknowledgement(tp, so);
4577
4578 switch (tp->t_state) {
4579
4580 /*
4581 * In FIN_WAIT_1 STATE in addition to the processing
4582 * for the ESTABLISHED state if our FIN is now acknowledged
4583 * then enter FIN_WAIT_2.
4584 */
4585 case TCPS_FIN_WAIT_1:
4586 if (ourfinisacked) {
4587 /*
4588 * If we can't receive any more
4589 * data, then closing user can proceed.
4590 * Starting the TCPT_2MSL timer is contrary to the
4591 * specification, but if we don't get a FIN
4592 * we'll hang forever.
4593 */
4594 if (so->so_state & SS_CANTRCVMORE) {
4595 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
4596 TCP_CONN_MAXIDLE(tp));
4597 isconnected = FALSE;
4598 isdisconnected = TRUE;
4599 }
4600 DTRACE_TCP4(state__change, void, NULL,
4601 struct inpcb *, inp,
4602 struct tcpcb *, tp,
4603 int32_t, TCPS_FIN_WAIT_2);
4604 tp->t_state = TCPS_FIN_WAIT_2;
4605 /* fall through and make sure we also recognize
4606 * data ACKed with the FIN
4607 */
4608 }
4609 break;
4610
4611 /*
4612 * In CLOSING STATE in addition to the processing for
4613 * the ESTABLISHED state if the ACK acknowledges our FIN
4614 * then enter the TIME-WAIT state, otherwise ignore
4615 * the segment.
4616 */
4617 case TCPS_CLOSING:
4618 if (ourfinisacked) {
4619 DTRACE_TCP4(state__change, void, NULL,
4620 struct inpcb *, inp,
4621 struct tcpcb *, tp,
4622 int32_t, TCPS_TIME_WAIT);
4623 tp->t_state = TCPS_TIME_WAIT;
4624 tcp_canceltimers(tp);
4625 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4626 tp->t_flags |= TF_CLOSING;
4627 } else {
4628 add_to_time_wait(tp, 2 * tcp_msl);
4629 }
4630 isconnected = FALSE;
4631 isdisconnected = TRUE;
4632 }
4633 break;
4634
4635 /*
4636 * In LAST_ACK, we may still be waiting for data to drain
4637 * and/or to be acked, as well as for the ack of our FIN.
4638 * If our FIN is now acknowledged, delete the TCB,
4639 * enter the closed state and return.
4640 */
4641 case TCPS_LAST_ACK:
4642 if (ourfinisacked) {
4643 tp = tcp_close(tp);
4644 goto drop;
4645 }
4646 break;
4647
4648 /*
4649 * In TIME_WAIT state the only thing that should arrive
4650 * is a retransmission of the remote FIN. Acknowledge
4651 * it and restart the finack timer.
4652 */
4653 case TCPS_TIME_WAIT:
4654 add_to_time_wait(tp, 2 * tcp_msl);
4655 goto dropafterack;
4656 }
4657
4658 /*
4659 * If there is a SACK option on the ACK and we
4660 * haven't seen any duplicate acks before, count
4661 * it as a duplicate ack even if the cumulative
4662 * ack is advanced. If the receiver delayed an
4663 * ack and detected loss afterwards, then the ack
4664 * will advance cumulative ack and will also have
4665 * a SACK option. So counting it as one duplicate
4666 * ack is ok.
4667 */
4668 if (sack_ackadv == 1 &&
4669 tp->t_state == TCPS_ESTABLISHED &&
4670 SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
4671 to.to_nsacks > 0 && tp->t_dupacks == 0 &&
4672 SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
4673 !(tp->t_flagsext & TF_PKTS_REORDERED)) {
4674 tcpstat.tcps_sack_ackadv++;
4675 goto process_dupack;
4676 }
4677 }
4678
4679 step6:
4680 /*
4681 * Update window information.
4682 */
4683 if (tcp_update_window(tp, thflags, th, tiwin, tlen))
4684 needoutput = 1;
4685
4686 /*
4687 * Process segments with URG.
4688 */
4689 if ((thflags & TH_URG) && th->th_urp &&
4690 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4691 /*
4692 * This is a kludge, but if we receive and accept
4693 * random urgent pointers, we'll crash in
4694 * soreceive. It's hard to imagine someone
4695 * actually wanting to send this much urgent data.
4696 */
4697 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
4698 th->th_urp = 0; /* XXX */
4699 thflags &= ~TH_URG; /* XXX */
4700 goto dodata; /* XXX */
4701 }
4702 /*
4703 * If this segment advances the known urgent pointer,
4704 * then mark the data stream. This should not happen
4705 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4706 * a FIN has been received from the remote side.
4707 * In these states we ignore the URG.
4708 *
4709 * According to RFC961 (Assigned Protocols),
4710 * the urgent pointer points to the last octet
4711 * of urgent data. We continue, however,
4712 * to consider it to indicate the first octet
4713 * of data past the urgent section as the original
4714 * spec states (in one of two places).
4715 */
4716 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
4717 tp->rcv_up = th->th_seq + th->th_urp;
4718 so->so_oobmark = so->so_rcv.sb_cc +
4719 (tp->rcv_up - tp->rcv_nxt) - 1;
4720 if (so->so_oobmark == 0) {
4721 so->so_state |= SS_RCVATMARK;
4722 postevent(so, 0, EV_OOB);
4723 }
4724 sohasoutofband(so);
4725 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4726 }
4727 /*
4728 * Remove out-of-band data so it doesn't get presented to the user.
4729 * This can happen independent of advancing the URG pointer,
4730 * but if two URG's are pending at once, some out-of-band
4731 * data may creep in... ick.
4732 */
4733 if (th->th_urp <= (u_int32_t)tlen
4734 #if SO_OOBINLINE
4735 && (so->so_options & SO_OOBINLINE) == 0
4736 #endif
4737 )
4738 tcp_pulloutofband(so, th, m,
4739 drop_hdrlen); /* hdr drop is delayed */
4740 } else {
4741 /*
4742 * If no out of band data is expected,
4743 * pull receive urgent pointer along
4744 * with the receive window.
4745 */
4746 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4747 tp->rcv_up = tp->rcv_nxt;
4748 }
4749 dodata:
4750
4751 /* Set the socket's connect or disconnect state correctly before processing data.
4752 * The following might unlock the socket if there is an upcall or a socket
4753 * filter.
4754 */
4755 if (isconnected) {
4756 soisconnected(so);
4757 } else if (isdisconnected) {
4758 soisdisconnected(so);
4759 }
4760
4761 /* Check the state of the pcb just to make sure that it did not get closed
4762 * while we were unlocked above
4763 */
4764 if (inp->inp_state == INPCB_STATE_DEAD) {
4765 /* Just drop the packet that we are processing and return */
4766 goto drop;
4767 }
4768
4769 /*
4770 * Process the segment text, merging it into the TCP sequencing queue,
4771 * and arranging for acknowledgment of receipt if necessary.
4772 * This process logically involves adjusting tp->rcv_wnd as data
4773 * is presented to the user (this happens in tcp_usrreq.c,
4774 * case PRU_RCVD). If a FIN has already been received on this
4775 * connection then we just ignore the text.
4776 *
4777 * If we are in SYN-received state and got a valid TFO cookie, we want
4778 * to process the data.
4779 */
4780 if ((tlen || (thflags & TH_FIN)) &&
4781 TCPS_HAVERCVDFIN(tp->t_state) == 0 &&
4782 (TCPS_HAVEESTABLISHED(tp->t_state) ||
4783 (tp->t_state == TCPS_SYN_RECEIVED &&
4784 (tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) {
4785 tcp_seq save_start = th->th_seq;
4786 tcp_seq save_end = th->th_seq + tlen;
4787 m_adj(m, drop_hdrlen); /* delayed header drop */
4788 /*
4789 * Insert segment which includes th into TCP reassembly queue
4790 * with control block tp. Set thflags to whether reassembly now
4791 * includes a segment with FIN. This handles the common case
4792 * inline (segment is the next to be received on an established
4793 * connection, and the queue is empty), avoiding linkage into
4794 * and removal from the queue and repetition of various
4795 * conversions.
4796 * Set DELACK for segments received in order, but ack
4797 * immediately when segments are out of order (so
4798 * fast retransmit can work).
4799 */
4800 if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
4801 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4802 /*
4803 * Calculate the RTT on the receiver only if the
4804 * connection is in streaming mode and the last
4805 * packet was not an end-of-write
4806 */
4807 if (tp->t_flags & TF_STREAMING_ON)
4808 tcp_compute_rtt(tp, &to, th);
4809
4810 if (DELAY_ACK(tp, th) &&
4811 ((tp->t_flags & TF_ACKNOW) == 0) ) {
4812 if ((tp->t_flags & TF_DELACK) == 0) {
4813 tp->t_flags |= TF_DELACK;
4814 tp->t_timer[TCPT_DELACK] =
4815 OFFSET_FROM_START(tp, tcp_delack);
4816 }
4817 }
4818 else {
4819 tp->t_flags |= TF_ACKNOW;
4820 }
4821 tp->rcv_nxt += tlen;
4822 thflags = th->th_flags & TH_FIN;
4823 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
4824 tcpstat.tcps_rcvbyte += tlen;
4825 if (nstat_collect) {
4826 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
4827 INP_ADD_STAT(inp, cell, wifi, wired,
4828 rxpackets, m->m_pkthdr.lro_npkts);
4829 } else {
4830 INP_ADD_STAT(inp, cell, wifi, wired,
4831 rxpackets, 1);
4832 }
4833 INP_ADD_STAT(inp, cell, wifi, wired,
4834 rxbytes, tlen);
4835 }
4836 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen,
4837 TCP_AUTORCVBUF_MAX(ifp));
4838 so_recv_data_stat(so, m, drop_hdrlen);
4839
4840 if (sbappendstream_rcvdemux(so, m,
4841 th->th_seq - (tp->irs + 1), 0)) {
4842 sorwakeup(so);
4843 }
4844 } else {
4845 thflags = tcp_reass(tp, th, &tlen, m, ifp);
4846 tp->t_flags |= TF_ACKNOW;
4847 }
4848
4849 if ((tlen > 0 || (th->th_flags & TH_FIN)) && SACK_ENABLED(tp)) {
4850 if (th->th_flags & TH_FIN)
4851 save_end++;
4852 tcp_update_sack_list(tp, save_start, save_end);
4853 }
4854
4855 tcp_adaptive_rwtimo_check(tp, tlen);
4856
4857 if (tlen > 0)
4858 tcp_tfo_rcv_data(tp);
4859
4860 if (tp->t_flags & TF_DELACK)
4861 {
4862 #if INET6
4863 if (isipv6) {
4864 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4865 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4866 th->th_seq, th->th_ack, th->th_win);
4867 }
4868 else
4869 #endif
4870 {
4871 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4872 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4873 th->th_seq, th->th_ack, th->th_win);
4874 }
4875
4876 }
4877 } else {
4878 m_freem(m);
4879 thflags &= ~TH_FIN;
4880 }
4881
4882 /*
4883 * If FIN is received ACK the FIN and let the user know
4884 * that the connection is closing.
4885 */
4886 if (thflags & TH_FIN) {
4887 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4888 socantrcvmore(so);
4889 postevent(so, 0, EV_FIN);
4890 /*
4891 * If connection is half-synchronized
4892 * (ie NEEDSYN flag on) then delay ACK,
4893 * so it may be piggybacked when SYN is sent.
4894 * Otherwise, since we received a FIN then no
4895 * more input can be expected, send ACK now.
4896 */
4897 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4898 if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4899 if ((tp->t_flags & TF_DELACK) == 0) {
4900 tp->t_flags |= TF_DELACK;
4901 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4902 }
4903 } else {
4904 tp->t_flags |= TF_ACKNOW;
4905 }
4906 tp->rcv_nxt++;
4907 }
4908 switch (tp->t_state) {
4909
4910 /*
4911 * In SYN_RECEIVED and ESTABLISHED STATES
4912 * enter the CLOSE_WAIT state.
4913 */
4914 case TCPS_SYN_RECEIVED:
4915 tp->t_starttime = tcp_now;
4916 case TCPS_ESTABLISHED:
4917 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4918 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
4919 tp->t_state = TCPS_CLOSE_WAIT;
4920 break;
4921
4922 /*
4923 * If still in FIN_WAIT_1 STATE FIN has not been acked so
4924 * enter the CLOSING state.
4925 */
4926 case TCPS_FIN_WAIT_1:
4927 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4928 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
4929 tp->t_state = TCPS_CLOSING;
4930 break;
4931
4932 /*
4933 * In FIN_WAIT_2 state enter the TIME_WAIT state,
4934 * starting the time-wait timer, turning off the other
4935 * standard timers.
4936 */
4937 case TCPS_FIN_WAIT_2:
4938 DTRACE_TCP4(state__change, void, NULL,
4939 struct inpcb *, inp,
4940 struct tcpcb *, tp,
4941 int32_t, TCPS_TIME_WAIT);
4942 tp->t_state = TCPS_TIME_WAIT;
4943 tcp_canceltimers(tp);
4944 tp->t_flags |= TF_ACKNOW;
4945 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4946 tp->t_flags |= TF_CLOSING;
4947 } else {
4948 add_to_time_wait(tp, 2 * tcp_msl);
4949 }
4950 soisdisconnected(so);
4951 break;
4952
4953 /*
4954 * In TIME_WAIT state restart the 2 MSL time_wait timer.
4955 */
4956 case TCPS_TIME_WAIT:
4957 add_to_time_wait(tp, 2 * tcp_msl);
4958 break;
4959 }
4960 }
4961 #if TCPDEBUG
4962 if (so->so_options & SO_DEBUG)
4963 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
4964 &tcp_savetcp, 0);
4965 #endif
4966
4967 /*
4968 * Return any desired output.
4969 */
4970 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
4971 (void) tcp_output(tp);
4972 }
4973
4974 tcp_check_timer_state(tp);
4975
4976
4977 tcp_unlock(so, 1, 0);
4978 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4979 return;
4980
4981 dropafterack:
4982 /*
4983 * Generate an ACK dropping incoming segment if it occupies
4984 * sequence space, where the ACK reflects our state.
4985 *
4986 * We can now skip the test for the RST flag since all
4987 * paths to this code happen after packets containing
4988 * RST have been dropped.
4989 *
4990 * In the SYN-RECEIVED state, don't send an ACK unless the
4991 * segment we received passes the SYN-RECEIVED ACK test.
4992 * If it fails send a RST. This breaks the loop in the
4993 * "LAND" DoS attack, and also prevents an ACK storm
4994 * between two listening ports that have been sent forged
4995 * SYN segments, each with the source address of the other.
4996 */
4997 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4998 (SEQ_GT(tp->snd_una, th->th_ack) ||
4999 SEQ_GT(th->th_ack, tp->snd_max)) ) {
5000 rstreason = BANDLIM_RST_OPENPORT;
5001 IF_TCP_STATINC(ifp, dospacket);
5002 goto dropwithreset;
5003 }
5004 #if TCPDEBUG
5005 if (so->so_options & SO_DEBUG)
5006 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
5007 &tcp_savetcp, 0);
5008 #endif
5009 m_freem(m);
5010 tp->t_flags |= TF_ACKNOW;
5011 (void) tcp_output(tp);
5012
5013 /* Don't need to check timer state as we should have done it during tcp_output */
5014 tcp_unlock(so, 1, 0);
5015 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
5016 return;
5017 dropwithresetnosock:
5018 nosock = 1;
5019 dropwithreset:
5020 /*
5021 * Generate a RST, dropping incoming segment.
5022 * Make ACK acceptable to originator of segment.
5023 * Don't bother to respond if destination was broadcast/multicast.
5024 */
5025 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
5026 goto drop;
5027 #if INET6
5028 if (isipv6) {
5029 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
5030 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
5031 goto drop;
5032 } else
5033 #endif /* INET6 */
5034 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
5035 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
5036 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
5037 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
5038 goto drop;
5039 /* IPv6 anycast check is done at tcp6_input() */
5040
5041 /*
5042 * Perform bandwidth limiting.
5043 */
5044 #if ICMP_BANDLIM
5045 if (badport_bandlim(rstreason) < 0)
5046 goto drop;
5047 #endif
5048
5049 #if TCPDEBUG
5050 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
5051 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
5052 &tcp_savetcp, 0);
5053 #endif
5054 bzero(&tra, sizeof(tra));
5055 tra.ifscope = ifscope;
5056 tra.awdl_unrestricted = 1;
5057 tra.intcoproc_allowed = 1;
5058 if (thflags & TH_ACK)
5059 /* mtod() below is safe as long as hdr dropping is delayed */
5060 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
5061 TH_RST, &tra);
5062 else {
5063 if (thflags & TH_SYN)
5064 tlen++;
5065 /* mtod() below is safe as long as hdr dropping is delayed */
5066 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
5067 (tcp_seq)0, TH_RST|TH_ACK, &tra);
5068 }
5069 /* destroy temporarily created socket */
5070 if (dropsocket) {
5071 (void) soabort(so);
5072 tcp_unlock(so, 1, 0);
5073 } else if ((inp != NULL) && (nosock == 0)) {
5074 tcp_unlock(so, 1, 0);
5075 }
5076 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
5077 return;
5078 dropnosock:
5079 nosock = 1;
5080 drop:
5081 /*
5082 * Drop space held by incoming segment and return.
5083 */
5084 #if TCPDEBUG
5085 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
5086 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
5087 &tcp_savetcp, 0);
5088 #endif
5089 m_freem(m);
5090 /* destroy temporarily created socket */
5091 if (dropsocket) {
5092 (void) soabort(so);
5093 tcp_unlock(so, 1, 0);
5094 }
5095 else if (nosock == 0) {
5096 tcp_unlock(so, 1, 0);
5097 }
5098 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
5099 return;
5100 }
5101
5102 /*
5103 * Parse TCP options and place in tcpopt.
5104 */
5105 static void
5106 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
5107 struct tcpopt *to)
5108 {
5109 u_short mss = 0;
5110 int opt, optlen;
5111
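/*
 * Walk the option list: each option is either a single-byte EOL/NOP
 * or a (kind, length) pair.  Parsing stops at EOL or when a length
 * field is malformed (shorter than 2 or running past the end of the
 * options).
 */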
5112 for (; cnt > 0; cnt -= optlen, cp += optlen) {
5113 opt = cp[0];
5114 if (opt == TCPOPT_EOL)
5115 break;
5116 if (opt == TCPOPT_NOP)
5117 optlen = 1;
5118 else {
5119 if (cnt < 2)
5120 break;
5121 optlen = cp[1];
5122 if (optlen < 2 || optlen > cnt)
5123 break;
5124 }
5125 switch (opt) {
5126
5127 default:
5128 continue;
5129
5130 case TCPOPT_MAXSEG:
5131 if (optlen != TCPOLEN_MAXSEG)
5132 continue;
5133 if (!(th->th_flags & TH_SYN))
5134 continue;
5135 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
5136 NTOHS(mss);
5137 to->to_mss = mss;
5138 to->to_flags |= TOF_MSS;
5139 break;
5140
5141 case TCPOPT_WINDOW:
5142 if (optlen != TCPOLEN_WINDOW)
5143 continue;
5144 if (!(th->th_flags & TH_SYN))
5145 continue;
5146 to->to_flags |= TOF_SCALE;
5147 to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
5148 break;
5149
5150 case TCPOPT_TIMESTAMP:
5151 if (optlen != TCPOLEN_TIMESTAMP)
5152 continue;
5153 to->to_flags |= TOF_TS;
5154 bcopy((char *)cp + 2,
5155 (char *)&to->to_tsval, sizeof(to->to_tsval));
5156 NTOHL(to->to_tsval);
5157 bcopy((char *)cp + 6,
5158 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
5159 NTOHL(to->to_tsecr);
5160 /* Re-enable sending Timestamps if we received them */
5161 if (!(tp->t_flags & TF_REQ_TSTMP) &&
5162 tcp_do_rfc1323 == 1)
5163 tp->t_flags |= TF_REQ_TSTMP;
5164 break;
5165 case TCPOPT_SACK_PERMITTED:
5166 if (!tcp_do_sack ||
5167 optlen != TCPOLEN_SACK_PERMITTED)
5168 continue;
5169 if (th->th_flags & TH_SYN)
5170 to->to_flags |= TOF_SACK;
5171 break;
5172 case TCPOPT_SACK:
5173 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
5174 continue;
5175 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
5176 to->to_sacks = cp + 2;
5177 tcpstat.tcps_sack_rcv_blocks++;
5178
5179 break;
5180 case TCPOPT_FASTOPEN:
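/*
 * A bare TFO request (option with no cookie) is only meaningful on a
 * listening socket.  An option that carries a cookie must have a
 * length within the allowed cookie bounds and is only accepted while
 * we are a listener or an active opener (SYN_SENT).
 */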
5181 if (optlen == TCPOLEN_FASTOPEN_REQ) {
5182 if (tp->t_state != TCPS_LISTEN)
5183 continue;
5184
5185 to->to_flags |= TOF_TFOREQ;
5186 } else {
5187 if (optlen < TCPOLEN_FASTOPEN_REQ ||
5188 (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX ||
5189 (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN)
5190 continue;
5191 if (tp->t_state != TCPS_LISTEN &&
5192 tp->t_state != TCPS_SYN_SENT)
5193 continue;
5194
5195 to->to_flags |= TOF_TFO;
5196 to->to_tfo = cp + 1;
5197 }
5198
5199 break;
5200 #if MPTCP
5201 case TCPOPT_MULTIPATH:
5202 tcp_do_mptcp_options(tp, cp, th, to, optlen);
5203 break;
5204 #endif /* MPTCP */
5205 }
5206 }
5207 }
5208
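/*
 * Apply the parsed options to the connection: record the peer's
 * timestamp and MSS, keep or disable SACK depending on whether the
 * peer permitted it, and remember the peer's requested window-scale
 * shift.
 */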
5209 static void
5210 tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope)
5211 {
5212 if (to->to_flags & TOF_TS) {
5213 tp->t_flags |= TF_RCVD_TSTMP;
5214 tp->ts_recent = to->to_tsval;
5215 tp->ts_recent_age = tcp_now;
5216
5217 }
5218 if (to->to_flags & TOF_MSS)
5219 tcp_mss(tp, to->to_mss, ifscope);
5220 if (SACK_ENABLED(tp)) {
5221 if (!(to->to_flags & TOF_SACK))
5222 tp->t_flagsext &= ~(TF_SACK_ENABLE);
5223 else
5224 tp->t_flags |= TF_SACK_PERMIT;
5225 }
5226 if (to->to_flags & TOF_SCALE) {
5227 tp->t_flags |= TF_RCVD_SCALE;
5228 tp->requested_s_scale = to->to_requested_s_scale;
5229
5230 /* Re-enable window scaling, if the option is received */
5231 if (tp->request_r_scale > 0)
5232 tp->t_flags |= TF_REQ_SCALE;
5233 }
5234 }
5235
5236 /*
5237 * Pull out of band byte out of a segment so
5238 * it doesn't appear in the user's data queue.
5239 * It is still reflected in the segment length for
5240 * sequencing purposes.
5241 *
5242 * @param off the delayed-to-be-dropped header length (drop_hdrlen)
5243 */
5244 static void
5245 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off)
5246 {
5247 int cnt = off + th->th_urp - 1;
5248
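/* Walk the mbuf chain to the mbuf that holds the out-of-band byte. */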
5249 while (cnt >= 0) {
5250 if (m->m_len > cnt) {
5251 char *cp = mtod(m, caddr_t) + cnt;
5252 struct tcpcb *tp = sototcpcb(so);
5253
5254 tp->t_iobc = *cp;
5255 tp->t_oobflags |= TCPOOB_HAVEDATA;
5256 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
5257 m->m_len--;
5258 if (m->m_flags & M_PKTHDR)
5259 m->m_pkthdr.len--;
5260 return;
5261 }
5262 cnt -= m->m_len;
5263 m = m->m_next;
5264 if (m == 0)
5265 break;
5266 }
5267 panic("tcp_pulloutofband");
5268 }
5269
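/*
 * Return the minimum RTT recorded on the route cached for this
 * connection, or 0 if no route is cached.
 */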
5270 uint32_t
5271 get_base_rtt(struct tcpcb *tp)
5272 {
5273 struct rtentry *rt = tp->t_inpcb->inp_route.ro_rt;
5274 return ((rt == NULL) ? 0 : rt->rtt_min);
5275 }
5276
5277 /* Each value of RTT base represents the minimum RTT seen in a minute.
5278 * We keep up to NRTT_HIST minutes' worth of history.
5279 */
5280 void
5281 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
5282 {
5283 u_int32_t base_rtt, i;
5284 struct rtentry *rt;
5285
5286 if ((rt = tp->t_inpcb->inp_route.ro_rt) == NULL)
5287 return;
5288 if (rt->rtt_expire_ts == 0) {
5289 RT_LOCK_SPIN(rt);
5290 if (rt->rtt_expire_ts != 0) {
5291 RT_UNLOCK(rt);
5292 goto update;
5293 }
5294 rt->rtt_expire_ts = tcp_now;
5295 rt->rtt_index = 0;
5296 rt->rtt_hist[0] = rtt;
5297 rt->rtt_min = rtt;
5298 RT_UNLOCK(rt);
5299 return;
5300 }
5301 update:
5302 #if TRAFFIC_MGT
5303 /*
5304 * If the recv side is being throttled, check if the
5305 * current RTT is closer to the base RTT seen in
5306 * first (recent) two slots. If so, unthrottle the stream.
5307 */
5308 if ((tp->t_flagsext & TF_RECV_THROTTLE) &&
5309 (int)(tcp_now - tp->t_recv_throttle_ts) >= TCP_RECV_THROTTLE_WIN) {
5310 base_rtt = rt->rtt_min;
5311 if (tp->t_rttcur <= (base_rtt + target_qdelay)) {
5312 tp->t_flagsext &= ~TF_RECV_THROTTLE;
5313 tp->t_recv_throttle_ts = 0;
5314 }
5315 }
5316 #endif /* TRAFFIC_MGT */
5317 if ((int)(tcp_now - rt->rtt_expire_ts) >=
5318 TCP_RTT_HISTORY_EXPIRE_TIME) {
5319 RT_LOCK_SPIN(rt);
5320 /* check the condition again to avoid race */
5321 if ((int)(tcp_now - rt->rtt_expire_ts) >=
5322 TCP_RTT_HISTORY_EXPIRE_TIME) {
5323 rt->rtt_index++;
5324 if (rt->rtt_index >= NRTT_HIST)
5325 rt->rtt_index = 0;
5326 rt->rtt_hist[rt->rtt_index] = rtt;
5327 rt->rtt_expire_ts = tcp_now;
5328 } else {
5329 rt->rtt_hist[rt->rtt_index] =
5330 min(rt->rtt_hist[rt->rtt_index], rtt);
5331 }
5332 /* forget the old value and update minimum */
5333 rt->rtt_min = 0;
5334 for (i = 0; i < NRTT_HIST; ++i) {
5335 if (rt->rtt_hist[i] != 0 &&
5336 (rt->rtt_min == 0 ||
5337 rt->rtt_hist[i] < rt->rtt_min))
5338 rt->rtt_min = rt->rtt_hist[i];
5339 }
5340 RT_UNLOCK(rt);
5341 } else {
5342 rt->rtt_hist[rt->rtt_index] =
5343 min(rt->rtt_hist[rt->rtt_index], rtt);
5344 if (rt->rtt_min == 0)
5345 rt->rtt_min = rtt;
5346 else
5347 rt->rtt_min = min(rt->rtt_min, rtt);
5348 }
5349 }
5350
5351 /*
5352 * If we have a timestamp reply, update smoothed RTT. If no timestamp is
5353 * present but transmit timer is running and timed sequence number was
5354 * acked, update smoothed RTT.
5355 *
5356 * If timestamps are supported, a receiver can update RTT even if
5357 * there is no outstanding data.
5358 *
5359 * Some boxes send broken timestamp replies during the SYN+ACK phase,
5360 * ignore timestamps of 0 or we could calculate a huge RTT and blow up
5361 * the retransmit timer.
5362 */
5363 static void
5364 tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
5365 {
5366 int rtt = 0;
5367 VERIFY(to != NULL && th != NULL);
5368 if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
5369 u_int32_t pipe_ack_val;
5370 rtt = tcp_now - tp->t_rtttime;
5371 /*
5372 * Compute pipe ack -- the amount of data acknowledged
5373 * in the last RTT
5374 */
5375 if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
5376 pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
5377 /* Update the sample */
5378 tp->t_pipeack_sample[tp->t_pipeack_ind++] =
5379 pipe_ack_val;
5380 tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;
5381
5382 /* Compute the max of the pipeack samples */
5383 pipe_ack_val = tcp_get_max_pipeack(tp);
5384 tp->t_pipeack = (pipe_ack_val >
5385 TCP_CC_CWND_INIT_BYTES) ?
5386 pipe_ack_val : 0;
5387 }
5388 /* start another measurement */
5389 tp->t_rtttime = 0;
5390 }
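/*
 * Prefer a timestamp-based sample (the echoed tsecr) when one is
 * available; otherwise fall back to the timer-based measurement taken
 * above.
 */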
5391 if (((to->to_flags & TOF_TS) != 0) &&
5392 (to->to_tsecr != 0) &&
5393 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
5394 tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
5395 to->to_tsecr, th->th_ack);
5396 } else if (rtt > 0) {
5397 tcp_xmit_timer(tp, rtt, 0, th->th_ack);
5398 }
5399 }
5400
5401 /*
5402 * Collect new round-trip time estimate and update averages and
5403 * current timeout.
5404 */
5405 static void
5406 tcp_xmit_timer(struct tcpcb *tp, int rtt,
5407 u_int32_t tsecr, tcp_seq th_ack)
5408 {
5409 int delta;
5410
5411 /*
5412 * On AWDL interface, the initial RTT measurement on SYN
5413 * can be wrong due to peer caching. Avoid the first RTT
5414 * measurement as it might skew up the RTO.
5415 * <rdar://problem/28739046>
5416 */
5417 if (tp->t_inpcb->inp_last_outifp != NULL &&
5418 (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_AWDL) &&
5419 th_ack == tp->iss + 1)
5420 return;
5421
5422 if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
5423 if (SEQ_GT(th_ack, tp->snd_una) &&
5424 SEQ_LEQ(th_ack, tp->snd_max) &&
5425 (tsecr == 0 ||
5426 TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
5427 /*
5428 * We received a new ACK after a
5429 * spurious timeout. Adapt the retransmission
5430 * timer as described in RFC 4015.
5431 */
5432 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
5433 tp->t_badrexmt_time = 0;
5434 tp->t_srtt = max(tp->t_srtt_prev, rtt);
5435 tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
5436 tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
5437 tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
5438
5439 if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5440 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5441
5442 goto compute_rto;
5443 } else {
5444 return;
5445 }
5446 }
5447
5448 tcpstat.tcps_rttupdated++;
5449 tp->t_rttupdated++;
5450
5451 if (rtt > 0) {
5452 tp->t_rttcur = rtt;
5453 update_base_rtt(tp, rtt);
5454 }
5455
5456 if (tp->t_srtt != 0) {
5457 /*
5458 * srtt is stored as fixed point with 5 bits after the
5459 * binary point (i.e., scaled by 32). The following magic
5460 * is equivalent to the smoothing algorithm in rfc793 with
5461 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
5462 * point).
5463 *
5464 * FreeBSD adjusts rtt to origin 0 by subtracting 1
5465 * from the provided rtt value. This was required because
5466 * of the way t_rtttime was initialized to 1 before.
5467 * Since we changed t_rtttime to be based on
5468 * tcp_now, this extra adjustment is not needed.
5469 */
5470 delta = (rtt << TCP_DELTA_SHIFT)
5471 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
5472
5473 if ((tp->t_srtt += delta) <= 0)
5474 tp->t_srtt = 1;
5475
5476 /*
5477 * We accumulate a smoothed rtt variance (actually, a
5478 * smoothed mean difference), then set the retransmit
5479 * timer to smoothed rtt + 4 times the smoothed variance.
5480 * rttvar is stored as fixed point with 4 bits after the
5481 * binary point (scaled by 16). The following is
5482 * equivalent to rfc793 smoothing with an alpha of .75
5483 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
5484 * rfc793's wired-in beta.
5485 */
5486 if (delta < 0)
5487 delta = -delta;
5488 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
5489 if ((tp->t_rttvar += delta) <= 0)
5490 tp->t_rttvar = 1;
5491 if (tp->t_rttbest == 0 ||
5492 tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5493 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5494 } else {
5495 /*
5496 * No rtt measurement yet - use the unsmoothed rtt.
5497 * Set the variance to half the rtt (so our first
5498 * retransmit happens at 3*rtt).
5499 */
5500 tp->t_srtt = rtt << TCP_RTT_SHIFT;
5501 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
5502 }
5503
5504 compute_rto:
5505 nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
5506 tp->t_rttvar);
5507
5508 /*
5509 * the retransmit should happen at rtt + 4 * rttvar.
5510 * Because of the way we do the smoothing, srtt and rttvar
5511 * will each average +1/2 tick of bias. When we compute
5512 * the retransmit timer, we want 1/2 tick of rounding and
5513 * 1 extra tick because of +-1/2 tick uncertainty in the
5514 * firing of the timer. The bias will give us exactly the
5515 * 1.5 tick we need. But, because the bias is
5516 * statistical, we have to test that we don't drop below
5517 * the minimum feasible timer (which is 2 ticks).
5518 */
5519 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
5520 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
5521 TCP_ADD_REXMTSLOP(tp));
5522
5523 /*
5524 * We received an ack for a packet that wasn't retransmitted;
5525 * it is probably safe to discard any error indications we've
5526 * received recently. This isn't quite right, but close enough
5527 * for now (a route might have failed after we sent a segment,
5528 * and the return path might not be symmetrical).
5529 */
5530 tp->t_softerror = 0;
5531 }
5532
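/*
 * Illustrative sketch, not part of tcp_input.c: the fixed-point RTT
 * smoothing described above, with srtt scaled by 32 (5 fractional bits)
 * and rttvar scaled by 16 (4 fractional bits), so that
 *	srtt   = srtt * 7/8   + rtt / 8
 *	rttvar = rttvar * 3/4 + |rtt - srtt| / 4
 * and the retransmit timeout is srtt + 4 * rttvar. The shift counts
 * mirror TCP_RTT_SHIFT (5), TCP_RTTVAR_SHIFT (4) and TCP_DELTA_SHIFT (2);
 * the struct and helper names are assumptions made only for this example.
 */
#include <stdint.h>

struct rtt_estimator {
	int srtt;	/* smoothed RTT, scaled by 32 */
	int rttvar;	/* smoothed mean deviation, scaled by 16 */
};

static int
rtt_estimator_update(struct rtt_estimator *e, int rtt)
{
	int delta;

	if (e->srtt != 0) {
		/* delta is (rtt - srtt), kept scaled by 4 */
		delta = (rtt << 2) - (e->srtt >> 3);
		if ((e->srtt += delta) <= 0)		/* adds delta/8 in srtt units */
			e->srtt = 1;
		if (delta < 0)
			delta = -delta;
		delta -= e->rttvar >> 2;		/* (|delta| - rttvar), scaled by 4 */
		if ((e->rttvar += delta) <= 0)		/* adds delta/4 in rttvar units */
			e->rttvar = 1;
	} else {
		/* first sample: srtt = rtt, rttvar = rtt/2 */
		e->srtt = rtt << 5;
		e->rttvar = rtt << 3;
	}
	/* unscaled retransmit timeout: srtt + 4 * rttvar */
	return ((e->srtt >> 5) + (e->rttvar >> 2));
}
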
5533 static inline unsigned int
5534 tcp_maxmtu(struct rtentry *rt)
5535 {
5536 unsigned int maxmtu;
5537
5538 RT_LOCK_ASSERT_HELD(rt);
5539 if (rt->rt_rmx.rmx_mtu == 0)
5540 maxmtu = rt->rt_ifp->if_mtu;
5541 else
5542 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
5543
5544 return (maxmtu);
5545 }
5546
5547 #if INET6
5548 static inline unsigned int
5549 tcp_maxmtu6(struct rtentry *rt)
5550 {
5551 unsigned int maxmtu;
5552 struct nd_ifinfo *ndi = NULL;
5553
5554 RT_LOCK_ASSERT_HELD(rt);
5555 if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
5556 ndi = NULL;
5557 if (ndi != NULL)
5558 lck_mtx_lock(&ndi->lock);
5559 if (rt->rt_rmx.rmx_mtu == 0)
5560 maxmtu = IN6_LINKMTU(rt->rt_ifp);
5561 else
5562 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
5563 if (ndi != NULL)
5564 lck_mtx_unlock(&ndi->lock);
5565
5566 return (maxmtu);
5567 }
5568 #endif
5569
5570 /*
5571 * Determine a reasonable value for maxseg size.
5572 * If the route is known, check route for mtu.
5573 * If none, use an mss that can be handled on the outgoing
5574 * interface without forcing IP to fragment; if bigger than
5575 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
5576 * to utilize large mbufs. If no route is found, route has no mtu,
5577 * or the destination isn't local, use a default, hopefully conservative
5578 * size (usually 512 or the default IP max size, but no more than the mtu
5579 * of the interface), as we can't discover anything about intervening
5580 * gateways or networks. We also initialize the congestion/slow start
5581 * window. While looking at the routing entry, we also initialize
5582 * other path-dependent parameters from pre-set or cached values
5583 * in the routing entry.
5584 *
5585 * Also take into account the space needed for options that we
5586 * send regularly. Make maxseg shorter by that amount to ensure
5587 * that we can send maxseg amount of data even when the options
5588 * are present. Store the upper limit of the length of options plus
5589 * data in maxopd.
5590 *
5591 * NOTE that this routine is only called when we process an incoming
5592 * segment; for outgoing segments only tcp_mssopt is called.
5593 *
5594 */
5595 void
5596 tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope)
5597 {
5598 struct rtentry *rt;
5599 struct ifnet *ifp;
5600 int rtt, mss;
5601 u_int32_t bufsize;
5602 struct inpcb *inp;
5603 struct socket *so;
5604 struct rmxp_tao *taop;
5605 int origoffer = offer;
5606 u_int32_t sb_max_corrected;
5607 int isnetlocal = 0;
5608 #if INET6
5609 int isipv6;
5610 int min_protoh;
5611 #endif
5612
5613 inp = tp->t_inpcb;
5614 #if INET6
5615 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5616 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5617 : sizeof (struct tcpiphdr);
5618 #else
5619 #define min_protoh (sizeof (struct tcpiphdr))
5620 #endif
5621
5622 #if INET6
5623 if (isipv6) {
5624 rt = tcp_rtlookup6(inp, input_ifscope);
5625 }
5626 else
5627 #endif /* INET6 */
5628 {
5629 rt = tcp_rtlookup(inp, input_ifscope);
5630 }
5631 isnetlocal = (tp->t_flags & TF_LOCAL);
5632
5633 if (rt == NULL) {
5634 tp->t_maxopd = tp->t_maxseg =
5635 #if INET6
5636 isipv6 ? tcp_v6mssdflt :
5637 #endif /* INET6 */
5638 tcp_mssdflt;
5639 return;
5640 }
5641 ifp = rt->rt_ifp;
5642 /*
5643 * Slower link window correction:
5644 * If a value is specified for slowlink_wsize, use it for
5645 * PPP links believed to be on a serial modem (speed <128Kbps).
5646 * Excludes 9600bps as it is the default value advertised
5647 * by pseudo-devices over PPP.
5648 */
5649 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5650 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
5651 tp->t_flags |= TF_SLOWLINK;
5652 }
5653 so = inp->inp_socket;
5654
5655 taop = rmx_taop(rt->rt_rmx);
5656 /*
5657 * Offer == -1 means that we didn't receive a SYN yet,
5658 * use the cached value in that case.
5659 */
5660 if (offer == -1)
5661 offer = taop->tao_mssopt;
5662 /*
5663 * Offer == 0 means that there was no MSS on the SYN segment,
5664 * in this case we use tcp_mssdflt.
5665 */
5666 if (offer == 0)
5667 offer =
5668 #if INET6
5669 isipv6 ? tcp_v6mssdflt :
5670 #endif /* INET6 */
5671 tcp_mssdflt;
5672 else {
5673 /*
5674 * Prevent DoS attack with too small MSS. Round up
5675 * to at least minmss.
5676 */
5677 offer = max(offer, tcp_minmss);
5678 /*
5679 * Sanity check: make sure that maxopd will be large
5680 * enough to allow some data on segments even if all
5681 * the option space is used (40 bytes). Otherwise
5682 * funny things may happen in tcp_output.
5683 */
5684 offer = max(offer, 64);
5685 }
5686 taop->tao_mssopt = offer;
5687
5688 /*
5689 * While we're here, check if there's an initial rtt
5690 * or rttvar. Convert from the route-table units
5691 * to scaled multiples of the slow timeout timer.
5692 */
5693 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
5694 tcp_getrt_rtt(tp, rt);
5695 } else {
5696 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
5697 }
5698
5699 #if INET6
5700 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5701 #else
5702 mss = tcp_maxmtu(rt);
5703 #endif
5704
5705 #if NECP
5706 // At this point, the mss is just the MTU. Adjust if necessary.
5707 mss = necp_socket_get_effective_mtu(inp, mss);
5708 #endif /* NECP */
5709
5710 mss -= min_protoh;
5711
5712 if (rt->rt_rmx.rmx_mtu == 0) {
5713 #if INET6
5714 if (isipv6) {
5715 if (!isnetlocal)
5716 mss = min(mss, tcp_v6mssdflt);
5717 } else
5718 #endif /* INET6 */
5719 if (!isnetlocal)
5720 mss = min(mss, tcp_mssdflt);
5721 }
5722
5723 mss = min(mss, offer);
5724 /*
5725 * maxopd stores the maximum length of data AND options
5726 * in a segment; maxseg is the amount of data in a normal
5727 * segment. We need to store this value (maxopd) apart
5728 * from maxseg, because now every segment carries options
5729 * and thus we normally have somewhat less data in segments.
5730 */
5731 tp->t_maxopd = mss;
5732
5733 /*
5734 * origoffer == -1 indicates that no segments were received yet.
5735 * In this case we just guess.
5736 */
5737 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
5738 (origoffer == -1 ||
5739 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
5740 mss -= TCPOLEN_TSTAMP_APPA;
5741
5742 #if MPTCP
5743 mss -= mptcp_adj_mss(tp, FALSE);
5744 #endif /* MPTCP */
5745 tp->t_maxseg = mss;
5746
5747 /*
5748 * Calculate corrected value for sb_max; promote the numerator
5749 * to 64 bits for large sb_max values, else it will overflow.
5750 */
5751 sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
5752
5753 /*
5754 * If there's a pipesize (i.e., loopback), change the socket
5755 * buffer to that size only if it's bigger than the current
5756 * sockbuf size. Make the socket buffers an integral
5757 * number of mss units; if the mss is larger than
5758 * the socket buffer, decrease the mss.
5759 */
5760 #if RTV_SPIPE
5761 bufsize = rt->rt_rmx.rmx_sendpipe;
5762 if (bufsize < so->so_snd.sb_hiwat)
5763 #endif
5764 bufsize = so->so_snd.sb_hiwat;
5765 if (bufsize < mss)
5766 mss = bufsize;
5767 else {
5768 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5769 if (bufsize > sb_max_corrected)
5770 bufsize = sb_max_corrected;
5771 (void)sbreserve(&so->so_snd, bufsize);
5772 }
5773 tp->t_maxseg = mss;
5774
5775 /*
5776 * Update MSS using recommendation from link status report. This is
5777 * temporary.
5778 */
5779 tcp_update_mss_locked(so, ifp);
5780
5781 #if RTV_RPIPE
5782 bufsize = rt->rt_rmx.rmx_recvpipe;
5783 if (bufsize < so->so_rcv.sb_hiwat)
5784 #endif
5785 bufsize = so->so_rcv.sb_hiwat;
5786 if (bufsize > mss) {
5787 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5788 if (bufsize > sb_max_corrected)
5789 bufsize = sb_max_corrected;
5790 (void)sbreserve(&so->so_rcv, bufsize);
5791 }
5792
5793 set_tcp_stream_priority(so);
5794
5795 if (rt->rt_rmx.rmx_ssthresh) {
5796 /*
5797 * There's some sort of gateway or interface
5798 * buffer limit on the path. Use this to set
5799 * slow-start threshold, but set the threshold to
5800 * no less than 2*mss.
5801 */
5802 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
5803 tcpstat.tcps_usedssthresh++;
5804 } else {
5805 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
5806 }
5807
5808 /*
5809 * Set the slow-start flight size depending on whether this
5810 * is a local network or not.
5811 */
5812 if (CC_ALGO(tp)->cwnd_init != NULL)
5813 CC_ALGO(tp)->cwnd_init(tp);
5814
5815 tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);
5816
5817 /* Route locked during lookup above */
5818 RT_UNLOCK(rt);
5819 }
5820
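/*
 * Illustrative sketch, not part of tcp_input.c: the core arithmetic
 * tcp_mss() performs once the path MTU is known. For a 1500-byte
 * Ethernet MTU over IPv4, min_protoh is 40 (20 IP + 20 TCP), so mss
 * starts at 1460; it is then clamped to the peer's offer, and if both
 * sides negotiated timestamps, TCPOLEN_TSTAMP_APPA (12) more bytes are
 * reserved, leaving 1448. The helper name and parameters are
 * assumptions made only for this example.
 */
static unsigned int
effective_mss(unsigned int mtu, unsigned int min_protoh,
    unsigned int peer_offer, int using_timestamps)
{
	unsigned int mss = mtu - min_protoh;	/* payload + option space per segment */

	if (mss > peer_offer)			/* never exceed what the peer offered */
		mss = peer_offer;
	if (using_timestamps)
		mss -= 12;			/* TCPOLEN_TSTAMP_APPA */
	return (mss);
}
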
5821 /*
5822 * Determine the MSS option to send on an outgoing SYN.
5823 */
5824 int
5825 tcp_mssopt(struct tcpcb *tp)
5826 {
5827 struct rtentry *rt;
5828 int mss;
5829 #if INET6
5830 int isipv6;
5831 int min_protoh;
5832 #endif
5833
5834 #if INET6
5835 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5836 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5837 : sizeof (struct tcpiphdr);
5838 #else
5839 #define min_protoh (sizeof (struct tcpiphdr))
5840 #endif
5841
5842 #if INET6
5843 if (isipv6)
5844 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
5845 else
5846 #endif /* INET6 */
5847 rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
5848 if (rt == NULL) {
5849 return (
5850 #if INET6
5851 isipv6 ? tcp_v6mssdflt :
5852 #endif /* INET6 */
5853 tcp_mssdflt);
5854 }
5855 /*
5856 * Slower link window correction:
5857 * If a value is specified for slowlink_wsize, use it for PPP links
5858 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
5859 * it is the default value advertised by pseudo-devices over PPP.
5860 */
5861 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5862 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
5863 tp->t_flags |= TF_SLOWLINK;
5864 }
5865
5866 #if INET6
5867 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5868 #else
5869 mss = tcp_maxmtu(rt);
5870 #endif
5871 /* Route locked during lookup above */
5872 RT_UNLOCK(rt);
5873
5874 #if NECP
5875 // At this point, the mss is just the MTU. Adjust if necessary.
5876 mss = necp_socket_get_effective_mtu(tp->t_inpcb, mss);
5877 #endif /* NECP */
5878
5879 return (mss - min_protoh);
5880 }
5881
5882 /*
5883 * When a partial ack arrives, force the retransmission of the
5884 * next unacknowledged segment. Do not clear tp->t_dupacks.
5885 * By setting snd_nxt to th_ack, this forces the retransmission timer
5886 * to be started again.
5887 */
5888 static void
5889 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
5890 {
5891 tcp_seq onxt = tp->snd_nxt;
5892 u_int32_t ocwnd = tp->snd_cwnd;
5893 tp->t_timer[TCPT_REXMT] = 0;
5894 tp->t_timer[TCPT_PTO] = 0;
5895 tp->t_rtttime = 0;
5896 tp->snd_nxt = th->th_ack;
5897 /*
5898 * Set snd_cwnd to one segment beyond acknowledged offset
5899 * (tp->snd_una has not yet been updated when this function
5900 * is called)
5901 */
5902 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
5903 tp->t_flags |= TF_ACKNOW;
5904 (void) tcp_output(tp);
5905 tp->snd_cwnd = ocwnd;
5906 if (SEQ_GT(onxt, tp->snd_nxt))
5907 tp->snd_nxt = onxt;
5908 /*
5909 * Partial window deflation. Relies on the fact that tp->snd_una
5910 * is not updated yet.
5911 */
5912 if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5913 tp->snd_cwnd -= BYTES_ACKED(th, tp);
5914 else
5915 tp->snd_cwnd = 0;
5916 tp->snd_cwnd += tp->t_maxseg;
5917 }
5918
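/*
 * Illustrative sketch, not part of tcp_input.c: the partial window
 * deflation performed at the end of tcp_newreno_partial_ack() above.
 * The congestion window is reduced by the amount of newly acknowledged
 * data and then inflated by one segment, in the spirit of NewReno's
 * partial-ACK handling. The helper name is an assumption made only for
 * this example.
 */
#include <stdint.h>

static uint32_t
deflate_cwnd(uint32_t cwnd, uint32_t bytes_acked, uint32_t maxseg)
{
	if (cwnd > bytes_acked)
		cwnd -= bytes_acked;	/* remove what the partial ACK covered */
	else
		cwnd = 0;
	return (cwnd + maxseg);		/* allow one new segment to be sent */
}
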
5919 /*
5920 * Drop a random TCP connection that hasn't been serviced yet and
5921 * is eligible for discard. There is a one in qlen chance that
5922 * we will return a null, saying that there are no droppable
5923 * requests. In this case, the protocol-specific code should drop
5924 * the new request. This ensures fairness.
5925 *
5926 * The listening TCP socket "head" must be locked
5927 */
5928 static int
5929 tcp_dropdropablreq(struct socket *head)
5930 {
5931 struct socket *so, *sonext;
5932 unsigned int i, j, qlen;
5933 static u_int32_t rnd = 0;
5934 static u_int64_t old_runtime;
5935 static unsigned int cur_cnt, old_cnt;
5936 u_int64_t now_sec;
5937 struct inpcb *inp = NULL;
5938 struct tcpcb *tp;
5939
5940 if ((head->so_options & SO_ACCEPTCONN) == 0)
5941 return (0);
5942
5943 if (TAILQ_EMPTY(&head->so_incomp))
5944 return (0);
5945
5946 so_acquire_accept_list(head, NULL);
5947 socket_unlock(head, NULL);
5948
5949 /*
5950 * Check if there is any socket in the incomp queue
5951 * that is closed because of a reset from the peer and is
5952 * waiting to be garbage collected. If so, pick that as
5953 * the victim.
5954 */
5955 TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
5956 inp = sotoinpcb(so);
5957 tp = intotcpcb(inp);
5958 if (tp != NULL && tp->t_state == TCPS_CLOSED &&
5959 so->so_head != NULL &&
5960 (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
5961 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
5962 /*
5963 * The listen socket is already locked but we
5964 * can lock this socket here without lock ordering
5965 * issues because it is in the incomp queue and
5966 * is not visible to others.
5967 */
5968 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5969 so->so_usecount++;
5970 goto found_victim;
5971 } else {
5972 continue;
5973 }
5974 }
5975 }
5976
5977 so = TAILQ_FIRST(&head->so_incomp);
5978
5979 now_sec = net_uptime();
5980 if ((i = (now_sec - old_runtime)) != 0) {
5981 old_runtime = now_sec;
5982 old_cnt = cur_cnt / i;
5983 cur_cnt = 0;
5984 }
5985
5986 qlen = head->so_incqlen;
5987 if (rnd == 0)
5988 rnd = RandomULong();
5989
5990 if (++cur_cnt > qlen || old_cnt > qlen) {
5991 rnd = (314159 * rnd + 66329) & 0xffff;
5992 j = ((qlen + 1) * rnd) >> 16;
5993
5994 while (j-- && so)
5995 so = TAILQ_NEXT(so, so_list);
5996 }
5997 /* Find a connection that is not already closing (or being served) */
5998 while (so) {
5999 inp = (struct inpcb *)so->so_pcb;
6000
6001 sonext = TAILQ_NEXT(so, so_list);
6002
6003 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
6004 != WNT_STOPUSING) {
6005 /*
6006 * Avoid the issue of a socket being accepted
6007 * by one input thread and being dropped by
6008 * another input thread. If we can't get a hold
6009 * on this mutex, then grab the next socket in
6010 * line.
6011 */
6012 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
6013 so->so_usecount++;
6014 if ((so->so_usecount == 2) &&
6015 (so->so_state & SS_INCOMP) &&
6016 !(so->so_flags & SOF_INCOMP_INPROGRESS)) {
6017 break;
6018 } else {
6019 /*
6020 * don't use if being accepted or
6021 * used in any other way
6022 */
6023 in_pcb_checkstate(inp, WNT_RELEASE, 1);
6024 tcp_unlock(so, 1, 0);
6025 }
6026 } else {
6027 /*
6028 * do not try to lock the inp in
6029 * in_pcb_checkstate because the lock
6030 * is already held in some other thread.
6031 * Only drop the inp_wantcnt reference.
6032 */
6033 in_pcb_checkstate(inp, WNT_RELEASE, 1);
6034 }
6035 }
6036 so = sonext;
6037 }
6038 if (so == NULL) {
6039 socket_lock(head, 0);
6040 so_release_accept_list(head);
6041 return (0);
6042 }
6043
6044 /* Make sure the socket is still in the right state to be discarded */
6045
6046 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
6047 tcp_unlock(so, 1, 0);
6048 socket_lock(head, 0);
6049 so_release_accept_list(head);
6050 return (0);
6051 }
6052
6053 found_victim:
6054 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
6055 /* do not discard: that socket is being accepted */
6056 tcp_unlock(so, 1, 0);
6057 socket_lock(head, 0);
6058 so_release_accept_list(head);
6059 return (0);
6060 }
6061
6062 socket_lock(head, 0);
6063 TAILQ_REMOVE(&head->so_incomp, so, so_list);
6064 head->so_incqlen--;
6065 head->so_qlen--;
6066 so->so_state &= ~SS_INCOMP;
6067 so->so_flags |= SOF_OVERFLOW;
6068 so->so_head = NULL;
6069 so_release_accept_list(head);
6070 tcp_unlock(head, 0, 0);
6071
6072 lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
6073 tp = sototcpcb(so);
6074
6075 tcp_close(tp);
6076 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
6077 /*
6078 * Someone has a wantcnt on this pcb. Since WNT_ACQUIRE
6079 * doesn't require a lock, it could have happened while
6080 * we are holding the lock. This pcb will have to
6081 * be garbage collected later.
6082 * Release the reference held for the so_incomp queue.
6083 */
6084 VERIFY(so->so_usecount > 0);
6085 so->so_usecount--;
6086 tcp_unlock(so, 1, 0);
6087 } else {
6088 /*
6089 * Unlock this socket and leave the reference on.
6090 * We need to acquire the pcbinfo lock in order to
6091 * fully dispose of it.
6092 */
6093 tcp_unlock(so, 0, 0);
6094
6095 lck_rw_lock_exclusive(tcbinfo.ipi_lock);
6096
6097 tcp_lock(so, 0, 0);
6098 /* Release the reference held for so_incomp queue */
6099 VERIFY(so->so_usecount > 0);
6100 so->so_usecount--;
6101
6102 if (so->so_usecount != 1 ||
6103 (inp->inp_wantcnt > 0 &&
6104 inp->inp_wantcnt != WNT_STOPUSING)) {
6105 /*
6106 * There is an extra wantcount or usecount
6107 * that must have been added when the socket
6108 * was unlocked. This socket will have to be
6109 * garbage collected later
6110 */
6111 tcp_unlock(so, 1, 0);
6112 } else {
6113 /* Drop the reference held for this function */
6114 VERIFY(so->so_usecount > 0);
6115 so->so_usecount--;
6116
6117 in_pcbdispose(inp);
6118 }
6119 lck_rw_done(tcbinfo.ipi_lock);
6120 }
6121 tcpstat.tcps_drops++;
6122
6123 tcp_lock(head, 0, 0);
6124 return (1);
6125 }
6126
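/*
 * Illustrative sketch, not part of tcp_input.c: the cheap pseudo-random
 * victim selection used by tcp_dropdropablreq() above. A 16-bit linear
 * congruential step refreshes rnd, and ((qlen + 1) * rnd) >> 16 maps it
 * to an index in [0, qlen]; an index of qlen walks off the end of the
 * queue, which corresponds to dropping the incoming request instead.
 * The helper name is an assumption made only for this example.
 */
#include <stdint.h>

static unsigned int
pick_victim_index(uint32_t *rnd, unsigned int qlen)
{
	*rnd = (314159 * (*rnd) + 66329) & 0xffff;
	return (((qlen + 1) * (*rnd)) >> 16);	/* 0..qlen inclusive */
}
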
6127 /* Set background congestion control on a socket */
6128 void
6129 tcp_set_background_cc(struct socket *so)
6130 {
6131 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
6132 }
6133
6134 /* Set foreground congestion control on a socket */
6135 void
6136 tcp_set_foreground_cc(struct socket *so)
6137 {
6138 if (tcp_use_newreno)
6139 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
6140 else
6141 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
6142 }
6143
6144 static void
6145 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
6146 {
6147 struct inpcb *inp = sotoinpcb(so);
6148 struct tcpcb *tp = intotcpcb(inp);
6149 u_char old_cc_index = 0;
6150 if (tp->tcp_cc_index != cc_index) {
6151
6152 old_cc_index = tp->tcp_cc_index;
6153
6154 if (CC_ALGO(tp)->cleanup != NULL)
6155 CC_ALGO(tp)->cleanup(tp);
6156 tp->tcp_cc_index = cc_index;
6157
6158 tcp_cc_allocate_state(tp);
6159
6160 if (CC_ALGO(tp)->switch_to != NULL)
6161 CC_ALGO(tp)->switch_to(tp, old_cc_index);
6162
6163 tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
6164 }
6165 }
6166
6167 void
6168 tcp_set_recv_bg(struct socket *so)
6169 {
6170 if (!IS_TCP_RECV_BG(so))
6171 so->so_flags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG;
6172
6173 /* Unset Large Receive Offload on background sockets */
6174 so_set_lro(so, SO_TC_BK);
6175 }
6176
6177 void
6178 tcp_clear_recv_bg(struct socket *so)
6179 {
6180 if (IS_TCP_RECV_BG(so))
6181 so->so_flags1 &= ~(SOF1_TRAFFIC_MGT_TCP_RECVBG);
6182
6183 /*
6184 * Set/unset use of Large Receive Offload depending on
6185 * the traffic class
6186 */
6187 so_set_lro(so, so->so_traffic_class);
6188 }
6189
6190 void
6191 inp_fc_unthrottle_tcp(struct inpcb *inp)
6192 {
6193 struct tcpcb *tp = inp->inp_ppcb;
6194 /*
6195 * Back off the slow-start threshold and enter
6196 * congestion avoidance phase
6197 */
6198 if (CC_ALGO(tp)->pre_fr != NULL)
6199 CC_ALGO(tp)->pre_fr(tp);
6200
6201 tp->snd_cwnd = tp->snd_ssthresh;
6202 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
6203 /*
6204 * Restart counting for ABC as we changed the
6205 * congestion window just now.
6206 */
6207 tp->t_bytes_acked = 0;
6208
6209 /* Reset retransmit shift as we know that the reason
6210 * for delay in sending a packet is due to flow
6211 * control on the outgoing interface. There is no need
6212 * to back off the retransmit timer.
6213 */
6214 TCP_RESET_REXMT_STATE(tp);
6215
6216 /*
6217 * Start the output stream again. Since we are
6218 * not retransmitting data, do not reset the
6219 * retransmit timer or rtt calculation.
6220 */
6221 tcp_output(tp);
6222 }
6223
6224 static int
6225 tcp_getstat SYSCTL_HANDLER_ARGS
6226 {
6227 #pragma unused(oidp, arg1, arg2)
6228
6229 int error;
6230
6231 proc_t caller = PROC_NULL;
6232 proc_t caller_parent = PROC_NULL;
6233 char command_name[MAXCOMLEN + 1] = "";
6234 char parent_name[MAXCOMLEN + 1] = "";
6235
6236 if ((caller = proc_self()) != PROC_NULL) {
6237 /* get process name */
6238 strlcpy(command_name, caller->p_comm, sizeof(command_name));
6239
6240 /* get parent process name if possible */
6241 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
6242 strlcpy(parent_name, caller_parent->p_comm,
6243 sizeof(parent_name));
6244 proc_rele(caller_parent);
6245 }
6246
6247 if ((escape_str(command_name, strlen(command_name),
6248 sizeof(command_name)) == 0) &&
6249 (escape_str(parent_name, strlen(parent_name),
6250 sizeof(parent_name)) == 0)) {
6251 kern_asl_msg(LOG_DEBUG, "messagetracer",
6252 5,
6253 "com.apple.message.domain",
6254 "com.apple.kernel.tcpstat", /* 1 */
6255 "com.apple.message.signature",
6256 "tcpstat", /* 2 */
6257 "com.apple.message.signature2", command_name, /* 3 */
6258 "com.apple.message.signature3", parent_name, /* 4 */
6259 "com.apple.message.summarize", "YES", /* 5 */
6260 NULL);
6261 }
6262 }
6263 if (caller != PROC_NULL)
6264 proc_rele(caller);
6265
6266 if (req->oldptr == 0) {
6267 req->oldlen = (size_t)sizeof(struct tcpstat);
6268 }
6269
6270 error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
6271
6272 return (error);
6273
6274 }
6275
6276 /*
6277 * Checksum extended TCP header and data.
6278 */
6279 int
6280 tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
6281 {
6282 struct ifnet *ifp = m->m_pkthdr.rcvif;
6283
6284 switch (af) {
6285 case AF_INET: {
6286 struct ip *ip = mtod(m, struct ip *);
6287 struct ipovly *ipov = (struct ipovly *)ip;
6288
6289 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6290 return (0);
6291
6292 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6293 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6294 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6295 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6296 th->th_sum = m->m_pkthdr.csum_rx_val;
6297 } else {
6298 uint16_t sum = m->m_pkthdr.csum_rx_val;
6299 uint16_t start = m->m_pkthdr.csum_rx_start;
6300
6301 /*
6302 * Perform 1's complement adjustment of octets
6303 * that got included/excluded in the hardware-
6304 * calculated checksum value. Ignore cases
6305 * where the value includes or excludes the IP
6306 * header span, as the sum for those octets
6307 * would already be 0xffff and thus no-op.
6308 */
6309 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6310 start != 0 && (off - start) != off) {
6311 #if BYTE_ORDER != BIG_ENDIAN
6312 if (start < off) {
6313 HTONS(ip->ip_len);
6314 HTONS(ip->ip_off);
6315 }
6316 #endif
6317 /* callee folds in sum */
6318 sum = m_adj_sum16(m, start, off, sum);
6319 #if BYTE_ORDER != BIG_ENDIAN
6320 if (start < off) {
6321 NTOHS(ip->ip_off);
6322 NTOHS(ip->ip_len);
6323 }
6324 #endif
6325 }
6326
6327 /* callee folds in sum */
6328 th->th_sum = in_pseudo(ip->ip_src.s_addr,
6329 ip->ip_dst.s_addr,
6330 sum + htonl(tlen + IPPROTO_TCP));
6331 }
6332 th->th_sum ^= 0xffff;
6333 } else {
6334 uint16_t ip_sum;
6335 int len;
6336 char b[9];
6337
6338 bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
6339 bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
6340 ip_sum = ipov->ih_len;
6341 ipov->ih_len = (u_short)tlen;
6342 #if BYTE_ORDER != BIG_ENDIAN
6343 HTONS(ipov->ih_len);
6344 #endif
6345 len = sizeof (struct ip) + tlen;
6346 th->th_sum = in_cksum(m, len);
6347 bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
6348 ipov->ih_len = ip_sum;
6349
6350 tcp_in_cksum_stats(len);
6351 }
6352 break;
6353 }
6354 #if INET6
6355 case AF_INET6: {
6356 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6357
6358 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6359 return (0);
6360
6361 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6362 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6363 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6364 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6365 th->th_sum = m->m_pkthdr.csum_rx_val;
6366 } else {
6367 uint16_t sum = m->m_pkthdr.csum_rx_val;
6368 uint16_t start = m->m_pkthdr.csum_rx_start;
6369
6370 /*
6371 * Perform 1's complement adjustment of octets
6372 * that got included/excluded in the hardware-
6373 * calculated checksum value.
6374 */
6375 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6376 start != off) {
6377 uint16_t s, d;
6378
6379 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
6380 s = ip6->ip6_src.s6_addr16[1];
6381 ip6->ip6_src.s6_addr16[1] = 0;
6382 }
6383 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
6384 d = ip6->ip6_dst.s6_addr16[1];
6385 ip6->ip6_dst.s6_addr16[1] = 0;
6386 }
6387
6388 /* callee folds in sum */
6389 sum = m_adj_sum16(m, start, off, sum);
6390
6391 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
6392 ip6->ip6_src.s6_addr16[1] = s;
6393 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
6394 ip6->ip6_dst.s6_addr16[1] = d;
6395 }
6396
6397 th->th_sum = in6_pseudo(
6398 &ip6->ip6_src, &ip6->ip6_dst,
6399 sum + htonl(tlen + IPPROTO_TCP));
6400 }
6401 th->th_sum ^= 0xffff;
6402 } else {
6403 tcp_in6_cksum_stats(tlen);
6404 th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
6405 }
6406 break;
6407 }
6408 #endif /* INET6 */
6409 default:
6410 VERIFY(0);
6411 /* NOTREACHED */
6412 }
6413
6414 if (th->th_sum != 0) {
6415 tcpstat.tcps_rcvbadsum++;
6416 IF_TCP_STATINC(ifp, badformat);
6417 return (-1);
6418 }
6419
6420 return (0);
6421 }
6422
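/*
 * Illustrative sketch, not part of tcp_input.c: folding a 32-bit one's
 * complement accumulator down to 16 bits, which is what the pseudo-header
 * additions above ultimately rely on (in_pseudo()/in6_pseudo() do the
 * kernel's version of this). The helper name is an assumption made only
 * for this example.
 */
#include <stdint.h>

static uint16_t
fold_cksum32(uint32_t sum)
{
	/* add the carries back in until the value fits in 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}
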
6423 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
6424 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
6425 "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
6426
6427 static int
6428 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
6429 {
6430 #pragma unused(arg1, arg2)
6431
6432 int error, val = tcprexmtthresh;
6433
6434 error = sysctl_handle_int(oidp, &val, 0, req);
6435 if (error || !req->newptr)
6436 return (error);
6437
6438 /*
6439 * Constrain the number of duplicate ACKs
6440 * to consider for TCP fast retransmit
6441 * to either 2 or 3
6442 */
6443
6444 if (val < 2 || val > 3)
6445 return (EINVAL);
6446
6447 tcprexmtthresh = val;
6448
6449 return (0);
6450 }
6451
6452 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW |
6453 CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
6454 "Duplicate ACK Threshold for Fast Retransmit");