apple/xnu xnu-3247.1.106: bsd/netinet/tcp_input.c
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/sysctl.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/proc.h> /* for proc0 declaration */
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/syslog.h>
81 #include <sys/mcache.h>
82 #include <sys/kasl.h>
83 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
84
85 #include <machine/endian.h>
86
87 #include <net/if.h>
88 #include <net/if_types.h>
89 #include <net/route.h>
90 #include <net/ntstat.h>
91 #include <net/dlil.h>
92
93 #include <netinet/in.h>
94 #include <netinet/in_systm.h>
95 #include <netinet/ip.h>
96 #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
97 #include <netinet/in_var.h>
98 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
99 #include <netinet/in_pcb.h>
100 #include <netinet/ip_var.h>
101 #include <mach/sdt.h>
102 #if INET6
103 #include <netinet/ip6.h>
104 #include <netinet/icmp6.h>
105 #include <netinet6/nd6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet6/in6_pcb.h>
108 #endif
109 #include <netinet/tcp.h>
110 #include <netinet/tcp_cache.h>
111 #include <netinet/tcp_fsm.h>
112 #include <netinet/tcp_seq.h>
113 #include <netinet/tcp_timer.h>
114 #include <netinet/tcp_var.h>
115 #include <netinet/tcp_cc.h>
116 #include <dev/random/randomdev.h>
117 #include <kern/zalloc.h>
118 #if INET6
119 #include <netinet6/tcp6_var.h>
120 #endif
121 #include <netinet/tcpip.h>
122 #if TCPDEBUG
123 #include <netinet/tcp_debug.h>
124 u_char tcp_saveipgen[40]; /* the size must be that of the max IP header, currently IPv6 */
125 struct tcphdr tcp_savetcp;
126 #endif /* TCPDEBUG */
127
128 #if IPSEC
129 #include <netinet6/ipsec.h>
130 #if INET6
131 #include <netinet6/ipsec6.h>
132 #endif
133 #include <netkey/key.h>
134 #endif /*IPSEC*/
135
136 #if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
137 #include <security/mac_framework.h>
138 #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
139
140 #include <sys/kdebug.h>
141 #include <netinet/lro_ext.h>
142 #if MPTCP
143 #include <netinet/mptcp_var.h>
144 #include <netinet/mptcp.h>
145 #include <netinet/mptcp_opt.h>
146 #endif /* MPTCP */
147
148 #include <corecrypto/ccaes.h>
149
150 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
151 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
152 #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
153 #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
154
155 tcp_cc tcp_ccgen;
156
157 struct tcpstat tcpstat;
158
159 static int log_in_vain = 0;
160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0,
162 "Log all incoming TCP connections");
163
164 static int blackhole = 0;
165 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
166 CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0,
167 "Do not send RST when dropping refused connections");
168
169 int tcp_delack_enabled = 3;
170 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack,
171 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_delack_enabled, 0,
172 "Delay ACK to try and piggyback it onto a data packet");
173
174 int tcp_lq_overflow = 1;
175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow,
176 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_lq_overflow, 0,
177 "Listen Queue Overflow");
178
179 int tcp_recv_bg = 0;
180 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
181 &tcp_recv_bg, 0, "Receive background");
182
183 #if TCP_DROP_SYNFIN
184 static int drop_synfin = 1;
185 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin,
186 CTLFLAG_RW | CTLFLAG_LOCKED, &drop_synfin, 0,
187 "Drop TCP packets with SYN+FIN set");
188 #endif
189
190 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
191 "TCP Segment Reassembly Queue");
192
193 static int tcp_reass_overflows = 0;
194 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
195 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
196 "Global number of TCP Segment Reassembly Queue Overflows");
197
198
199 __private_extern__ int slowlink_wsize = 8192;
200 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize,
201 CTLFLAG_RW | CTLFLAG_LOCKED,
202 &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
203
204 int maxseg_unacked = 8;
205 SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked,
206 CTLFLAG_RW | CTLFLAG_LOCKED, &maxseg_unacked, 0,
207 "Maximum number of outstanding segments left unacked");
208
209 int tcp_do_rfc3465 = 1;
210 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &tcp_do_rfc3465, 0, "");
212
213 int tcp_do_rfc3465_lim2 = 1;
214 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2,
215 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465_lim2, 0,
216 "Appropriate bytes counting w/ L=2*SMSS");
217
218 int rtt_samples_per_slot = 20;
219
220 int tcp_allowed_iaj = ALLOWED_IAJ;
221 int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
222 u_int32_t tcp_autorcvbuf_inc_shift = 3;
223 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj,
224 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_allowed_iaj, 0,
225 "Allowed inter-packet arrival jiter");
226 #if (DEVELOPMENT || DEBUG)
227 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh,
228 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_high_thresh, 0,
229 "Used in calculating maximum accumulated IAJ");
230
231 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift,
232 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_inc_shift, 0,
233 "Shift for increment in receive socket buffer size");
234 #endif /* (DEVELOPMENT || DEBUG) */
235
236 u_int32_t tcp_do_autorcvbuf = 1;
237 SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf,
238 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autorcvbuf, 0,
239 "Enable automatic socket buffer tuning");
240
241 u_int32_t tcp_autorcvbuf_max = 512 * 1024;
242 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax,
243 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max, 0,
244 "Maximum receive socket buffer size");
245
246 int sw_lro = 0;
247 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
248 &sw_lro, 0, "Used to coalesce TCP packets");
249
250 int lrodebug = 0;
251 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &lrodebug, 0,
253 "Used to debug SW LRO");
254
255 int lro_start = 4;
256 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt,
257 CTLFLAG_RW | CTLFLAG_LOCKED, &lro_start, 0,
258 "Segments for starting LRO computed as power of 2");
259
260 extern int tcp_do_autosendbuf;
261
262 int limited_txmt = 1;
263 int early_rexmt = 1;
264 int sack_ackadv = 1;
265 int tcp_dsack_enable = 1;
266
267 #if (DEVELOPMENT || DEBUG)
268 SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &limited_txmt, 0,
270 "Enable limited transmit");
271
272 SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt,
273 CTLFLAG_RW | CTLFLAG_LOCKED, &early_rexmt, 0,
274 "Enable Early Retransmit");
275
276 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv,
277 CTLFLAG_RW | CTLFLAG_LOCKED, &sack_ackadv, 0,
278 "Use SACK with cumulative ack advancement as a dupack");
279
280 SYSCTL_INT(_net_inet_tcp, OID_AUTO, dsack_enable,
281 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_dsack_enable, 0,
282 "use DSACK TCP option to report duplicate segments");
283 #endif /* (DEVELOPMENT || DEBUG) */
284
285 #if CONFIG_IFEF_NOWINDOWSCALE
286 int tcp_obey_ifef_nowindowscale = 0;
287 SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale,
288 CTLFLAG_RW | CTLFLAG_LOCKED,
289 &tcp_obey_ifef_nowindowscale, 0, "");
290 #endif
291
292 extern int tcp_TCPTV_MIN;
293 extern int tcp_acc_iaj_high;
294 extern int tcp_acc_iaj_react_limit;
295
296 int tcprexmtthresh = 3;
297
298 u_int32_t tcp_now;
299 struct timeval tcp_uptime; /* uptime when tcp_now was last updated */
300 lck_spin_t *tcp_uptime_lock; /* Used to synchronize updates to tcp_now */
301
302 struct inpcbhead tcb;
303 #define tcb6 tcb /* for KAME src sync over BSD*'s */
304 struct inpcbinfo tcbinfo;
305
306 static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
307 struct tcpopt *);
308 static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
309 static void tcp_pulloutofband(struct socket *,
310 struct tcphdr *, struct mbuf *, int);
311 static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
312 struct ifnet *);
313 static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
314 static inline unsigned int tcp_maxmtu(struct rtentry *);
315 static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
316 static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
317
318 #if TRAFFIC_MGT
319 static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
320 int reset_size);
321 void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
322 static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
323 #endif /* TRAFFIC_MGT */
324
325 #if INET6
326 static inline unsigned int tcp_maxmtu6(struct rtentry *);
327 #endif
328
329 static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
330 struct tcpopt *to, u_int32_t tlen);
331
332 void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
333 static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
334 static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
335 static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
336 u_int32_t newsize, u_int32_t idealsize);
337 static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
338 static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
339 struct tcphdr *th);
340 static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
341 static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
342 struct tcpopt *to);
343 /*
344 * Constants used for resizing receive socket buffer
345 * when timestamps are not supported
346 */
347 #define TCPTV_RCVNOTS_QUANTUM 100
348 #define TCP_RCVNOTS_BYTELEVEL 204800
349
350 /*
351 * Constants used for limiting early retransmits
352 * to 10 per minute.
353 */
354 #define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
355 #define TCP_EARLY_REXMT_LIMIT 10
356
357 extern void ipfwsyslog( int level, const char *format,...);
358 extern int fw_verbose;
359
360 #if IPFIREWALL
361 extern void ipfw_stealth_stats_incr_tcp(void);
362
363 #define log_in_vain_log( a ) { \
364 if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
365 ipfwsyslog a ; \
366 } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) { \
367 ipfw_stealth_stats_incr_tcp(); \
368 } \
369 else log a ; \
370 }
371 #else
372 #define log_in_vain_log( a ) { log a; }
373 #endif
374
375 int tcp_rcvunackwin = TCPTV_UNACKWIN;
376 int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
377 int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
378 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
379 &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");
380
381 #define DELAY_ACK(tp, th) \
382 (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
383
384 static int tcp_dropdropablreq(struct socket *head);
385 static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
386 static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
387 void tcp_set_background_cc(struct socket *so);
388 void tcp_set_foreground_cc(struct socket *so);
389 static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
390 static void tcp_bwmeas_check(struct tcpcb *tp);
391
392 #if TRAFFIC_MGT
393 void
394 reset_acc_iaj(struct tcpcb *tp)
395 {
396 tp->acc_iaj = 0;
397 tp->iaj_rwintop = 0;
398 CLEAR_IAJ_STATE(tp);
399 }
400
401 static inline void
402 update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
403 {
404 if (rst_size > 0)
405 tp->iaj_size = 0;
406 if (tp->iaj_size == 0 || size >= tp->iaj_size) {
407 tp->iaj_size = size;
408 tp->iaj_rcv_ts = tcp_now;
409 tp->iaj_small_pkt = 0;
410 }
411 }
412
413 /* For every 32-bit unsigned integer (v), this function will find the
414 * largest integer n such that (n*n <= v). This takes at most 16 iterations
415 * irrespective of the value of v and does not involve any multiplications.
416 */
417 static inline int
418 isqrt(unsigned int val) {
419 unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
420 unsigned int temp, g=0, b=0x8000, bshft=15;
421 if ( val <= 100) {
422 for (g = 0; g <= 10; ++g) {
423 if (sqrt_cache[g] > val) {
424 g--;
425 break;
426 } else if (sqrt_cache[g] == val) {
427 break;
428 }
429 }
430 } else {
431 do {
432 temp = (((g << 1) + b) << (bshft--));
433 if (val >= temp) {
434 g += b;
435 val -= temp;
436 }
437 b >>= 1;
438 } while ( b > 0 && val > 0);
439 }
440 return(g);
441 }
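/*
 * Illustrative note (not part of the original source): isqrt() returns
 * floor(sqrt(val)). The small-value path uses the lookup table, e.g.
 * isqrt(99) == 9 and isqrt(100) == 10; larger inputs fall through to the
 * shift-and-subtract loop, e.g. isqrt(1000) == 31 since 31*31 = 961 <= 1000
 * while 32*32 = 1024 > 1000.
 */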
442
443 /*
444 * With LRO, roughly estimate the inter-arrival time between
445 * each sub-coalesced packet as an average. Count the delay
446 * cur_iaj as the delay between the last packet received
447 * and the first packet of the LRO stream. Due to round-off errors,
448 * cur_iaj may be the same as lro_delay_factor. Averaging has
449 * round-off errors too. lro_delay_factor may be close to 0
450 * in steady state, leading to lower values fed to compute_iaj_meat.
451 */
452 void
453 compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
454 {
455 uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
456 uint32_t timediff = 0;
457
458 if (cur_iaj >= lro_delay_factor) {
459 cur_iaj = cur_iaj - lro_delay_factor;
460 }
461
462 compute_iaj_meat(tp, cur_iaj);
463
464 if (nlropkts <= 1)
465 return;
466
467 nlropkts--;
468
469 timediff = lro_delay_factor/nlropkts;
470
471 while (nlropkts > 0)
472 {
473 compute_iaj_meat(tp, timediff);
474 nlropkts--;
475 }
476 }
477
478 static
479 void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
480 {
481 /* When the accumulated IAJ reaches MAX_ACC_IAJ milliseconds,
482 * throttle the receive window to a minimum of MIN_IAJ_WIN packets
483 */
484 #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
485 #define IAJ_DIV_SHIFT 4
486 #define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
487
488 uint32_t allowed_iaj, acc_iaj = 0;
489
490 uint32_t mean, temp;
491 int32_t cur_iaj_dev;
492
493 cur_iaj_dev = (cur_iaj - tp->avg_iaj);
494
495 /* Allow a jitter of "allowed_iaj" milliseconds. Some connections
496 * may have a constant jitter more than that. We detect this by
497 * using standard deviation.
498 */
499 allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
500 if (allowed_iaj < tcp_allowed_iaj)
501 allowed_iaj = tcp_allowed_iaj;
502
503 /* Initially when the connection starts, the sender's congestion
504 * window is small. During this period we avoid throttling a
505 * connection because we do not have a good starting point for
506 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
507 * the first few packets.
508 */
509 if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
510 if ( cur_iaj <= allowed_iaj ) {
511 if (tp->acc_iaj >= 2)
512 acc_iaj = tp->acc_iaj - 2;
513 else
514 acc_iaj = 0;
515
516 } else {
517 acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
518 }
519
520 if (acc_iaj > MAX_ACC_IAJ)
521 acc_iaj = MAX_ACC_IAJ;
522 tp->acc_iaj = acc_iaj;
523 }
524
525 /* Compute a weighted average where the history has a weight of
526 * 15 out of 16 and the current value has a weight of 1 out of 16.
527 * This gives the long-term history more weight than any single
528 * measurement.
529 * The addition of 8 (IAJ_ROUNDUP_CONST) helps to round the result
530 * up instead of down.
531 */
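/*
 * Worked example (not in the original source): with IAJ_DIV_SHIFT == 4,
 * the update below is equivalent to
 *     avg_iaj = (15 * avg_iaj + cur_iaj + 8) >> 4;
 * e.g. avg_iaj = 20 and cur_iaj = 36 gives (300 + 36 + 8) >> 4 = 21.
 */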
532 tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
533 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
534
535 /* Compute the root-mean-square of the deviation, where the mean is a
536 * weighted average as described above.
537 */
538 temp = tp->std_dev_iaj * tp->std_dev_iaj;
539 mean = (((temp << IAJ_DIV_SHIFT) - temp)
540 + (cur_iaj_dev * cur_iaj_dev)
541 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
542
543 tp->std_dev_iaj = isqrt(mean);
544
545 DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
546 uint32_t, allowed_iaj);
547
548 return;
549 }
550 #endif /* TRAFFIC_MGT */
551
552 /* Check whether a sufficient amount of data has been acknowledged since
553 * the bandwidth measurement was started
554 */
555 static void
556 tcp_bwmeas_check(struct tcpcb *tp)
557 {
558 int32_t bw_meas_bytes;
559 uint32_t bw, bytes, elapsed_time;
560 bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
561 if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
562 bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
563 bytes = bw_meas_bytes;
564 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
565 if (elapsed_time > 0) {
566 bw = bytes / elapsed_time;
567 if ( bw > 0) {
568 if (tp->t_bwmeas->bw_sndbw > 0) {
569 tp->t_bwmeas->bw_sndbw =
570 (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
571 } else {
572 tp->t_bwmeas->bw_sndbw = bw;
573 }
574 }
575 }
576 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
577 }
578 }
579
580 static int
581 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
582 struct ifnet *ifp)
583 {
584 struct tseg_qent *q;
585 struct tseg_qent *p = NULL;
586 struct tseg_qent *nq;
587 struct tseg_qent *te = NULL;
588 struct inpcb *inp = tp->t_inpcb;
589 struct socket *so = inp->inp_socket;
590 int flags = 0;
591 int dowakeup = 0;
592 struct mbuf *oodata = NULL;
593 int copy_oodata = 0;
594 u_int16_t qlimit;
595 boolean_t cell = IFNET_IS_CELLULAR(ifp);
596 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
597 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
598 boolean_t dsack_set = FALSE;
599
600 /*
601 * Call with th==NULL after becoming established to
602 * force any pre-ESTABLISHED data up to the user socket.
603 */
604 if (th == NULL)
605 goto present;
606
607 /*
608 * If the reassembly queue already has entries or if we are going
609 * to add a new one, then the connection has reached a loss state.
610 * Reset the stretch-ack algorithm at this point.
611 */
612 tcp_reset_stretch_ack(tp);
613
614 #if TRAFFIC_MGT
615 if (tp->acc_iaj > 0)
616 reset_acc_iaj(tp);
617 #endif /* TRAFFIC_MGT */
618
619 /*
620 * Limit the number of segments in the reassembly queue to prevent
621 * holding on to too many segments (and thus running out of mbufs).
622 * Make sure to let through the missing segment that caused this
623 * queue to build up. Always keep one global queue entry spare to be
624 * able to process the missing segment.
625 */
626 qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
627 tcp_autorcvbuf_max >> 10);
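/*
 * Illustration (hypothetical values): with so_rcv.sb_hiwat = 256 KB and
 * the default tcp_autorcvbuf_max of 512 KB, the computation above yields
 * qlimit = min(max(100, 256), 512) = 256 queue entries.
 */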
628 if (th->th_seq != tp->rcv_nxt &&
629 (tp->t_reassqlen + 1) >= qlimit) {
630 tcp_reass_overflows++;
631 tcpstat.tcps_rcvmemdrop++;
632 m_freem(m);
633 *tlenp = 0;
634 return (0);
635 }
636
637 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
638 te = (struct tseg_qent *) zalloc(tcp_reass_zone);
639 if (te == NULL) {
640 tcpstat.tcps_rcvmemdrop++;
641 m_freem(m);
642 return (0);
643 }
644 tp->t_reassqlen++;
645
646 /*
647 * Find a segment which begins after this one does.
648 */
649 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
650 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
651 break;
652 p = q;
653 }
654
655 /*
656 * If there is a preceding segment, it may provide some of
657 * our data already. If so, drop the data from the incoming
658 * segment. If it provides all of our data, drop us.
659 */
660 if (p != NULL) {
661 int i;
662 /* conversion to int (in i) handles seq wraparound */
663 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
664 if (i > 0) {
665 if (TCP_DSACK_ENABLED(tp) && i > 1) {
666 /*
667 * Note duplicate data sequence numbers
668 * to report in DSACK option
669 */
670 tp->t_dsack_lseq = th->th_seq;
671 tp->t_dsack_rseq = th->th_seq +
672 min(i, *tlenp);
673
674 /*
675 * Report only the first part of partial/
676 * non-contiguous duplicate sequence space
677 */
678 dsack_set = TRUE;
679 }
680 if (i >= *tlenp) {
681 tcpstat.tcps_rcvduppack++;
682 tcpstat.tcps_rcvdupbyte += *tlenp;
683 if (nstat_collect) {
684 nstat_route_rx(inp->inp_route.ro_rt,
685 1, *tlenp,
686 NSTAT_RX_FLAG_DUPLICATE);
687 INP_ADD_STAT(inp, cell, wifi, wired,
688 rxpackets, 1);
689 INP_ADD_STAT(inp, cell, wifi, wired,
690 rxbytes, *tlenp);
691 tp->t_stat.rxduplicatebytes += *tlenp;
692 }
693 m_freem(m);
694 zfree(tcp_reass_zone, te);
695 te = NULL;
696 tp->t_reassqlen--;
697 /*
698 * Try to present any queued data
699 * at the left window edge to the user.
700 * This is needed after the 3-WHS
701 * completes.
702 */
703 goto present;
704 }
705 m_adj(m, i);
706 *tlenp -= i;
707 th->th_seq += i;
708 }
709 }
710 tcpstat.tcps_rcvoopack++;
711 tcpstat.tcps_rcvoobyte += *tlenp;
712 if (nstat_collect) {
713 nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
714 NSTAT_RX_FLAG_OUT_OF_ORDER);
715 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
716 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
717 tp->t_stat.rxoutoforderbytes += *tlenp;
718 }
719
720 /*
721 * While we overlap succeeding segments trim them or,
722 * if they are completely covered, dequeue them.
723 */
724 while (q) {
725 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
726 if (i <= 0)
727 break;
728
729 /*
730 * Report only the first part of partial/non-contiguous
731 * duplicate segment in dsack option. The variable
732 * dsack_set will be true if a previous entry has some of
733 * the duplicate sequence space.
734 */
735 if (TCP_DSACK_ENABLED(tp) && i > 1 && !dsack_set) {
736 if (tp->t_dsack_lseq == 0) {
737 tp->t_dsack_lseq = q->tqe_th->th_seq;
738 tp->t_dsack_rseq =
739 tp->t_dsack_lseq + min(i, q->tqe_len);
740 } else {
741 /*
742 * This segment overlaps data in multiple
743 * entries in the reassembly queue, so move
744 * the right sequence number further.
745 */
746 tp->t_dsack_rseq =
747 tp->t_dsack_rseq + min(i, q->tqe_len);
748 }
749 }
750 if (i < q->tqe_len) {
751 q->tqe_th->th_seq += i;
752 q->tqe_len -= i;
753 m_adj(q->tqe_m, i);
754 break;
755 }
756
757 nq = LIST_NEXT(q, tqe_q);
758 LIST_REMOVE(q, tqe_q);
759 m_freem(q->tqe_m);
760 zfree(tcp_reass_zone, q);
761 tp->t_reassqlen--;
762 q = nq;
763 }
764
765 /* Insert the new segment queue entry into place. */
766 te->tqe_m = m;
767 te->tqe_th = th;
768 te->tqe_len = *tlenp;
769
770 if (p == NULL) {
771 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
772 } else {
773 LIST_INSERT_AFTER(p, te, tqe_q);
774 }
775
776 /*
777 * New out-of-order data exists, and is pointed to by
778 * queue entry te. Set copy_oodata to 1 so out-of-order data
779 * can be copied off to sockbuf after in-order data
780 * is copied off.
781 */
782 if (!(so->so_state & SS_CANTRCVMORE))
783 copy_oodata = 1;
784
785 present:
786 /*
787 * Present data to user, advancing rcv_nxt through
788 * completed sequence space.
789 */
790 if (!TCPS_HAVEESTABLISHED(tp->t_state))
791 return (0);
792 q = LIST_FIRST(&tp->t_segq);
793 if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
794 /* Stop using LRO once out of order packets arrive */
795 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
796 tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
797 th->th_dport, th->th_sport);
798 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
799 }
800
801 /*
802 * continue processing if out-of-order data
803 * can be delivered
804 */
805 if (q && (so->so_flags & SOF_ENABLE_MSGS))
806 goto msg_unordered_delivery;
807
808 return (0);
809 }
810
811 /* The lost packet was recovered, so out-of-order data can be returned */
812 tcpstat.tcps_recovered_pkts++;
813
814 do {
815 tp->rcv_nxt += q->tqe_len;
816 flags = q->tqe_th->th_flags & TH_FIN;
817 nq = LIST_NEXT(q, tqe_q);
818 LIST_REMOVE(q, tqe_q);
819 if (so->so_state & SS_CANTRCVMORE) {
820 m_freem(q->tqe_m);
821 } else {
822 so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
823 if (so->so_flags & SOF_ENABLE_MSGS) {
824 /*
825 * Append the inorder data as a message to the
826 * receive socket buffer. Also check to see if
827 * the data we are about to deliver is the same
828 * data that we wanted to pass up to the user
829 * out of order. If so, reset copy_oodata --
830 * the received data filled a gap, and
831 * is now in order!
832 */
833 if (q == te)
834 copy_oodata = 0;
835 }
836 if (sbappendstream_rcvdemux(so, q->tqe_m,
837 q->tqe_th->th_seq - (tp->irs + 1), 0))
838 dowakeup = 1;
839 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
840 tcp_update_lro_seq(tp->rcv_nxt,
841 inp->inp_laddr, inp->inp_faddr,
842 th->th_dport, th->th_sport);
843 }
844 }
845 zfree(tcp_reass_zone, q);
846 tp->t_reassqlen--;
847 q = nq;
848 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
849
850 #if INET6
851 if ((inp->inp_vflag & INP_IPV6) != 0) {
852
853 KERNEL_DEBUG(DBG_LAYER_BEG,
854 ((inp->inp_fport << 16) | inp->inp_lport),
855 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
856 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
857 0,0,0);
858 }
859 else
860 #endif
861 {
862 KERNEL_DEBUG(DBG_LAYER_BEG,
863 ((inp->inp_fport << 16) | inp->inp_lport),
864 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
865 (inp->inp_faddr.s_addr & 0xffff)),
866 0,0,0);
867 }
868
869 msg_unordered_delivery:
870 /* Deliver out-of-order data as a message */
871 if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
872 /*
873 * make a copy of the mbuf to be delivered up to
874 * the user, and add it to the sockbuf
875 */
876 oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
877 if (oodata != NULL) {
878 if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
879 te->tqe_th->th_seq - (tp->irs + 1), 1)) {
880 dowakeup = 1;
881 tcpstat.tcps_msg_unopkts++;
882 } else {
883 tcpstat.tcps_msg_unoappendfail++;
884 }
885 }
886 }
887
888 if (dowakeup)
889 sorwakeup(so); /* done with socket lock held */
890 return (flags);
891 }
892
893 /*
894 * Reduce congestion window -- used when ECN is seen or when a tail loss
895 * probe recovers the last packet.
896 */
897 static void
898 tcp_reduce_congestion_window(
899 struct tcpcb *tp)
900 {
901 /*
902 * If the current TCP congestion control module has
903 * defined a hook for tasks to run
904 * before entering fast recovery, call it
905 */
906 if (CC_ALGO(tp)->pre_fr != NULL)
907 CC_ALGO(tp)->pre_fr(tp);
908 ENTER_FASTRECOVERY(tp);
909 if (tp->t_flags & TF_SENTFIN)
910 tp->snd_recover = tp->snd_max - 1;
911 else
912 tp->snd_recover = tp->snd_max;
913 tp->t_timer[TCPT_REXMT] = 0;
914 tp->t_timer[TCPT_PTO] = 0;
915 tp->t_rtttime = 0;
916 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
917 tcp_cc_adjust_nonvalidated_cwnd(tp);
918 } else {
919 tp->snd_cwnd = tp->snd_ssthresh +
920 tp->t_maxseg * tcprexmtthresh;
921 }
922 }
923
924 /*
925 * This function is called upon reception of data on a socket. Its purpose is
926 * to handle the adaptive keepalive timers that monitor whether the connection
927 * is making progress. First the adaptive read-timer, second the TFO probe-timer.
928 *
929 * The application wants to get an event if there is a stall during read.
930 * Set the initial keepalive timeout to be equal to twice RTO.
931 *
932 * If the outgoing interface is in marginal conditions, we need to
933 * enable read probes for that too.
934 */
935 static inline void
936 tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
937 {
938 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
939
940 if ((tp->t_adaptive_rtimo > 0 ||
941 (outifp != NULL &&
942 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
943 && tlen > 0 &&
944 tp->t_state == TCPS_ESTABLISHED) {
945 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
946 (TCP_REXMTVAL(tp) << 1));
947 tp->t_flagsext |= TF_DETECT_READSTALL;
948 tp->t_rtimo_probes = 0;
949 }
950 }
951
952 inline void
953 tcp_keepalive_reset(struct tcpcb *tp)
954 {
955 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
956 TCP_CONN_KEEPIDLE(tp));
957 tp->t_flagsext &= ~(TF_DETECT_READSTALL);
958 tp->t_rtimo_probes = 0;
959 }
960
961 /*
962 * TCP input routine, follows pages 65-76 of the
963 * protocol specification dated September, 1981 very closely.
964 */
965 #if INET6
966 int
967 tcp6_input(struct mbuf **mp, int *offp, int proto)
968 {
969 #pragma unused(proto)
970 register struct mbuf *m = *mp;
971 uint32_t ia6_flags;
972 struct ifnet *ifp = m->m_pkthdr.rcvif;
973
974 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
975
976 /* Expect 32-bit aligned data pointer on strict-align platforms */
977 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
978
979 /*
980 * draft-itojun-ipv6-tcp-to-anycast
981 * better place to put this in?
982 */
983 if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
984 if (ia6_flags & IN6_IFF_ANYCAST) {
985 struct ip6_hdr *ip6;
986
987 ip6 = mtod(m, struct ip6_hdr *);
988 icmp6_error(m, ICMP6_DST_UNREACH,
989 ICMP6_DST_UNREACH_ADDR,
990 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
991
992 IF_TCP_STATINC(ifp, icmp6unreach);
993
994 return (IPPROTO_DONE);
995 }
996 }
997
998 tcp_input(m, *offp);
999 return (IPPROTO_DONE);
1000 }
1001 #endif
1002
1003 /* Depending on the usage of mbuf space in the system, this function
1004 * will return true or false. This is used to determine if a socket
1005 * buffer can take more memory from the system for auto-tuning or not.
1006 */
1007 u_int8_t
1008 tcp_cansbgrow(struct sockbuf *sb)
1009 {
1010 /* Calculate the host level space limit in terms of MSIZE buffers.
1011 * We can use a maximum of half of the available mbuf space for
1012 * socket buffers.
1013 */
1014 u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
1015
1016 /* Calculate per sb limit in terms of bytes. We optimize this limit
1017 * for up to 16 socket buffers.
1018 */
1019
1020 u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
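/*
 * Illustration (all values hypothetical): if nmbclusters were 65536 with
 * 2 KB clusters (MCLSHIFT == 11) and 256-byte MSIZE buffers
 * (MSIZESHIFT == 8), then mblim = (65536 >> 1) << 3 = 262144 MSIZE
 * buffers and sbspacelim = (65536 >> 4) << 11 = 8 MB per socket buffer.
 */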
1021
1022 if ((total_sbmb_cnt < mblim) &&
1023 (sb->sb_hiwat < sbspacelim)) {
1024 return(1);
1025 } else {
1026 OSIncrementAtomic64(&sbmb_limreached);
1027 }
1028 return(0);
1029 }
1030
1031 static void
1032 tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
1033 u_int32_t newsize, u_int32_t idealsize)
1034 {
1035
1036 /* newsize should not exceed max */
1037 newsize = min(newsize, tcp_autorcvbuf_max);
1038
1039 /* The receive window scale negotiated at the
1040 * beginning of the connection will also set a
1041 * limit on the socket buffer size
1042 */
1043 newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
1044
1045 /* Set new socket buffer size */
1046 if (newsize > sbrcv->sb_hiwat &&
1047 (sbreserve(sbrcv, newsize) == 1)) {
1048 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
1049 (idealsize != 0) ? idealsize : newsize),
1050 tcp_autorcvbuf_max);
1051
1052 /* Again check the limit set by the advertised
1053 * window scale
1054 */
1055 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
1056 TCP_MAXWIN << tp->rcv_scale);
1057 }
1058 }
1059
1060 /*
1061 * This function is used to grow a receive socket buffer. It
1062 * will take into account system-level memory usage and the
1063 * bandwidth available on the link to make a decision.
1064 */
1065 static void
1066 tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
1067 struct tcpopt *to, u_int32_t pktlen)
1068 {
1069 struct socket *so = sbrcv->sb_so;
1070
1071 /*
1072 * Do not grow the receive socket buffer if
1073 * - auto resizing is disabled, globally or on this socket
1074 * - the high water mark already reached the maximum
1075 * - the stream is in background and receive side is being
1076 * throttled
1077 * - if there are segments in the reassembly queue indicating loss,
1078 * there is no need to increase the receive window during recovery
1079 * as no additional data is going to be sent. A duplicate ack sent
1080 * during recovery should not change the receive window
1081 */
1082 if (tcp_do_autorcvbuf == 0 ||
1083 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1084 tcp_cansbgrow(sbrcv) == 0 ||
1085 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1086 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1087 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
1088 !LIST_EMPTY(&tp->t_segq)) {
1089 /* Can not resize the socket buffer, just return */
1090 goto out;
1091 }
1092
1093 if (TSTMP_GT(tcp_now,
1094 tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1095 /* If there has been an idle period in the
1096 * connection, just restart the measurement
1097 */
1098 goto out;
1099 }
1100
1101 if (!TSTMP_SUPPORTED(tp)) {
1102 /*
1103 * Timestamp option is not supported on this connection.
1104 * If the connection reached a state to indicate that
1105 * the receive socket buffer needs to grow, increase
1106 * the high water mark.
1107 */
1108 if (TSTMP_GEQ(tcp_now,
1109 tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1110 if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1111 tcp_sbrcv_reserve(tp, sbrcv,
1112 tcp_autorcvbuf_max, 0);
1113 }
1114 goto out;
1115 } else {
1116 tp->rfbuf_cnt += pktlen;
1117 return;
1118 }
1119 } else if (to->to_tsecr != 0) {
1120 /*
1121 * If the timestamp shows that one RTT has
1122 * completed, we can stop counting the
1123 * bytes. Here we consider increasing
1124 * the socket buffer if the bandwidth measured in
1125 * the last RTT is more than half of sb_hiwat; this will
1126 * help to scale the buffer according to the bandwidth
1127 * on the link.
1128 */
1129 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1130 if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1131 (sbrcv->sb_hiwat >> 1))) {
1132 int32_t rcvbuf_inc, min_incr;
1133 /*
1134 * Increment the receive window by a
1135 * multiple of maximum sized segments.
1136 * This will prevent a connection from
1137 * sending smaller segments on wire if it
1138 * is limited by the receive window.
1139 *
1140 * Set the ideal size based on current
1141 * bandwidth measurements. We set the
1142 * ideal size on receive socket buffer to
1143 * be twice the bandwidth delay product.
1144 */
1145 rcvbuf_inc = (tp->rfbuf_cnt << 1)
1146 - sbrcv->sb_hiwat;
1147
1148 /*
1149 * Make the increment equal to 8 segments
1150 * at least
1151 */
1152 min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1153 if (rcvbuf_inc < min_incr)
1154 rcvbuf_inc = min_incr;
1155
1156 rcvbuf_inc =
1157 (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1158 tcp_sbrcv_reserve(tp, sbrcv,
1159 sbrcv->sb_hiwat + rcvbuf_inc,
1160 (tp->rfbuf_cnt * 2));
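/*
 * Worked example (hypothetical numbers): if rfbuf_cnt, the bytes received
 * in the last RTT, is 96 KB and sb_hiwat is 128 KB, the condition above
 * holds (96 KB > 64 KB), rcvbuf_inc = 2*96 KB - 128 KB = 64 KB (rounded
 * down to a multiple of t_maxseg), and the ideal size becomes
 * 2 * 96 KB = 192 KB.
 */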
1161 }
1162 goto out;
1163 } else {
1164 tp->rfbuf_cnt += pktlen;
1165 return;
1166 }
1167 }
1168 out:
1169 /* Restart the measurement */
1170 tp->rfbuf_ts = 0;
1171 tp->rfbuf_cnt = 0;
1172 return;
1173 }
1174
1175 /* This function will trim the excess space added to the socket buffer
1176 * to help a slow-reading app. The ideal size of a socket buffer depends
1177 * on the link bandwidth or is set by an application, and we aim to
1178 * reach that size.
1179 */
1180 void
1181 tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
1182 if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1183 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1184 int32_t trim;
1185 /* compute the difference between ideal and current sizes */
1186 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1187
1188 /* Compute the maximum advertised window for
1189 * this connection.
1190 */
1191 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1192
1193 /* How much can we trim the receive socket buffer?
1194 * 1. it can not be trimmed beyond the max rcv win advertised
1195 * 2. if possible, leave 1/16 of bandwidth*delay to
1196 * avoid closing the win completely
1197 */
1198 u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1199
1200 /* Sometimes leave can be zero, in that case leave at least
1201 * a few segments worth of space.
1202 */
1203 if (leave == 0)
1204 leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1205
1206 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1207 trim = imin(trim, (int32_t)diff);
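/*
 * Worked example (hypothetical numbers): with sb_hiwat = 512 KB,
 * sb_idealsize = 256 KB, sb_cc = 64 KB and an advertised window of 32 KB:
 * leave = max(32 KB, 256 KB >> 4) = 32 KB, trim = 512 - (64 + 32) = 416 KB,
 * capped at diff = 256 KB, so the buffer shrinks to 256 KB.
 */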
1208
1209 if (trim > 0)
1210 sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1211 }
1212 }
1213
1214 /* We may need to trim the send socket buffer size for two reasons:
1215 * 1. if the rtt seen on the connection is climbing up, we do not
1216 * want to fill the buffers any more.
1217 * 2. if the congestion win on the socket backed off, there is no need
1218 * to hold more mbufs for that connection than what the cwnd will allow.
1219 */
1220 void
1221 tcp_sbsnd_trim(struct sockbuf *sbsnd) {
1222 if (tcp_do_autosendbuf == 1 &&
1223 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1224 (SB_AUTOSIZE | SB_TRIM)) &&
1225 (sbsnd->sb_idealsize > 0) &&
1226 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1227 u_int32_t trim = 0;
1228 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1229 trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1230 } else {
1231 trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1232 }
1233 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1234 }
1235 if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1236 sbsnd->sb_flags &= ~(SB_TRIM);
1237 }
1238
1239 /*
1240 * If timestamp option was not negotiated on this connection
1241 * and this connection is on the receiving side of a stream
1242 * then we can not measure the delay on the link accurately.
1243 * Instead of enabling automatic receive socket buffer
1244 * resizing, just give more space to the receive socket buffer.
1245 */
1246 static inline void
1247 tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
1248 struct socket *so = tp->t_inpcb->inp_socket;
1249 u_int32_t newsize = 2 * tcp_recvspace;
1250 struct sockbuf *sbrcv = &so->so_rcv;
1251
1252 if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1253 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1254 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1255 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
1256 }
1257 }
1258
1259 /* A receiver will evaluate the flow of packets on a connection
1260 * to see if it can reduce ack traffic. The receiver will start
1261 * stretching acks if all of the following conditions are met:
1262 * 1. tcp_delack_enabled is set to 3
1263 * 2. If the bytes received in the last 100ms are greater than a threshold
1264 * defined by maxseg_unacked
1265 * 3. If the connection has not been idle for tcp_maxrcvidle period.
1266 * 4. If the connection has seen enough packets to let the slow-start
1267 * finish after connection establishment or after some packet loss.
1268 *
1269 * The receiver will stop stretching acks if there is congestion/reordering
1270 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1271 * timer fires while stretching acks, it means that the packet flow has gone
1272 * below the threshold defined by maxseg_unacked and the receiver will stop
1273 * stretching acks. The receiver gets no indication when slow-start is completed
1274 * or when the connection reaches an idle state. That is why we use
1275 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1276 * state.
1277 */
1278 static inline int
1279 tcp_stretch_ack_enable(struct tcpcb *tp)
1280 {
1281 if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
1282 tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1283 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
1284 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1285 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1286 return(1);
1287 }
1288
1289 return(0);
1290 }
1291
1292 /*
1293 * Reset the state related to stretch-ack algorithm. This will make
1294 * the receiver generate an ack every other packet. The receiver
1295 * will start re-evaluating the rate at which packets come to decide
1296 * if it can benefit by lowering the ack traffic.
1297 */
1298 void
1299 tcp_reset_stretch_ack(struct tcpcb *tp)
1300 {
1301 tp->t_flags &= ~(TF_STRETCHACK);
1302 tp->rcv_by_unackwin = 0;
1303 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1304
1305 /*
1306 * When there is packet loss or packet re-ordering or CWR due to
1307 * ECN, the sender's congestion window is reduced. In these states,
1308 * generate an ack for every other packet for some time to allow
1309 * the sender's congestion window to grow.
1310 */
1311 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1312 tp->rcv_waitforss = 0;
1313 }
1314
1315 /*
1316 * The last packet was a retransmission, check if this ack
1317 * indicates that the retransmission was spurious.
1318 *
1319 * If the connection supports timestamps, we can use them to
1320 * detect whether the last retransmit was needed. Otherwise,
1321 * if the ACK arrived within an RTT/2 window, then it
1322 * was a mistake to do the retransmit in the first place.
1323 *
1324 * This function will return 1 if it is a spurious retransmit,
1325 * 0 otherwise.
1326 */
1327 int
1328 tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1329 struct tcpopt *to, u_int32_t rxtime)
1330 {
1331 int32_t tdiff, bad_rexmt_win;
1332 bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
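/*
 * Note (not in the original source): t_srtt is kept scaled by
 * 2^TCP_RTT_SHIFT (nominally 5), so shifting right by TCP_RTT_SHIFT + 1
 * yields approximately srtt/2 in timer ticks, i.e. the RTT/2 window
 * mentioned in the comment above.
 */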
1333
1334 /* If the ack has ECN CE bit, then cwnd has to be adjusted */
1335 if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))
1336 return (0);
1337 if (TSTMP_SUPPORTED(tp)) {
1338 if (rxtime > 0 && (to->to_flags & TOF_TS)
1339 && to->to_tsecr != 0
1340 && TSTMP_LT(to->to_tsecr, rxtime))
1341 return (1);
1342 } else {
1343 if ((tp->t_rxtshift == 1
1344 || (tp->t_flagsext & TF_SENT_TLPROBE))
1345 && rxtime > 0) {
1346 tdiff = (int32_t)(tcp_now - rxtime);
1347 if (tdiff < bad_rexmt_win)
1348 return(1);
1349 }
1350 }
1351 return(0);
1352 }
1353
1354
1355 /*
1356 * Restore congestion window state if a spurious timeout
1357 * was detected.
1358 */
1359 static void
1360 tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1361 {
1362 if (TSTMP_SUPPORTED(tp)) {
1363 u_int32_t fsize, acked;
1364 fsize = tp->snd_max - th->th_ack;
1365 acked = BYTES_ACKED(th, tp);
1366
1367 /*
1368 * Implement bad retransmit recovery as
1369 * described in RFC 4015.
1370 */
1371 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1372
1373 /* Initialize cwnd to the initial window */
1374 if (CC_ALGO(tp)->cwnd_init != NULL)
1375 CC_ALGO(tp)->cwnd_init(tp);
1376
1377 tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1378
1379 } else {
1380 tp->snd_cwnd = tp->snd_cwnd_prev;
1381 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1382 if (tp->t_flags & TF_WASFRECOVERY)
1383 ENTER_FASTRECOVERY(tp);
1384
1385 /* Do not use the loss flight size in this case */
1386 tp->t_lossflightsize = 0;
1387 }
1388 tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
1389 tp->snd_recover = tp->snd_recover_prev;
1390 tp->snd_nxt = tp->snd_max;
1391 tp->t_rxtshift = 0;
1392 tp->t_rxtstart = 0;
1393
1394 /* Fix send socket buffer to reflect the change in cwnd */
1395 tcp_bad_rexmt_fix_sndbuf(tp);
1396
1397 /*
1398 * This RTT might reflect the extra delay induced
1399 * by the network. Skip using this sample for RTO
1400 * calculation and mark the connection so we can
1401 * recompute RTT when the next eligible sample is
1402 * found.
1403 */
1404 tp->t_flagsext |= TF_RECOMPUTE_RTT;
1405 tp->t_badrexmt_time = tcp_now;
1406 tp->t_rtttime = 0;
1407 }
1408
1409 /*
1410 * If the previous packet was sent in retransmission timer, and it was
1411 * not needed, then restore the congestion window to the state before that
1412 * transmission.
1413 *
1414 * If the last packet was sent in tail loss probe timeout, check if that
1415 * recovered the last packet. If so, that will indicate a real loss and
1416 * the congestion window needs to be lowered.
1417 */
1418 static void
1419 tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1420 {
1421 if (tp->t_rxtshift > 0 &&
1422 tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1423 ++tcpstat.tcps_sndrexmitbad;
1424 tcp_bad_rexmt_restore_state(tp, th);
1425 tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1426 } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
1427 && tp->t_tlphighrxt > 0
1428 && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
1429 && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1430 /*
1431 * check DSACK information also to make sure that
1432 * the TLP was indeed needed
1433 */
1434 if (tcp_rxtseg_dsack_for_tlp(tp)) {
1435 /*
1436 * received a DSACK to indicate that TLP was
1437 * not needed
1438 */
1439 tcp_rxtseg_clean(tp);
1440 goto out;
1441 }
1442
1443 /*
1444 * The tail loss probe recovered the last packet and
1445 * we need to adjust the congestion window to take
1446 * this loss into account.
1447 */
1448 ++tcpstat.tcps_tlp_recoverlastpkt;
1449 if (!IN_FASTRECOVERY(tp)) {
1450 tcp_reduce_congestion_window(tp);
1451 EXIT_FASTRECOVERY(tp);
1452 }
1453 tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1454 } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
1455 /*
1456 * All of the retransmitted segments were duplicated, this
1457 * can be an indication of bad fast retransmit.
1458 */
1459 tcpstat.tcps_dsack_badrexmt++;
1460 tcp_bad_rexmt_restore_state(tp, th);
1461 tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
1462 tcp_rxtseg_clean(tp);
1463 }
1464 out:
1465 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1466 tp->t_tlphighrxt = 0;
1467 tp->t_tlpstart = 0;
1468
1469 /*
1470 * check if the latest ack was for a segment sent during PMTU
1471 * blackhole detection. If the timestamp on the ack is before
1472 * PMTU blackhole detection, then revert the size of the max
1473 * segment to previous size.
1474 */
1475 if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1476 tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1477 if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1478 && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1479 tcp_pmtud_revert_segment_size(tp);
1480 }
1481 }
1482 if (tp->t_pmtud_start_ts > 0)
1483 tp->t_pmtud_start_ts = 0;
1484 }
1485
1486 /*
1487 * Check if early retransmit can be attempted according to RFC 5827.
1488 *
1489 * If packet reordering is detected on a connection, fast recovery will
1490 * be delayed until it is clear that the packet was lost and not reordered.
1491 * But reordering detection is done only when SACK is enabled.
1492 *
1493 * On connections that do not support SACK, there is a limit on the number
1494 * of early retransmits that can be done per minute. This limit is needed
1495 * to make sure that too many packets are not retransmitted when there is
1496 * packet reordering.
1497 */
1498 static void
1499 tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
1500 {
1501 u_int32_t obytes, snd_off;
1502 int32_t snd_len;
1503 struct socket *so = tp->t_inpcb->inp_socket;
1504
1505 if (early_rexmt && (SACK_ENABLED(tp) ||
1506 tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1507 SEQ_GT(tp->snd_max, tp->snd_una) &&
1508 (tp->t_dupacks == 1 ||
1509 (SACK_ENABLED(tp) &&
1510 !TAILQ_EMPTY(&tp->snd_holes)))) {
1511 /*
1512 * If there are only a few outstanding
1513 * segments on the connection, we might need
1514 * to lower the retransmit threshold. This
1515 * will allow us to do Early Retransmit as
1516 * described in RFC 5827.
1517 */
1518 if (SACK_ENABLED(tp) &&
1519 !TAILQ_EMPTY(&tp->snd_holes)) {
1520 obytes = (tp->snd_max - tp->snd_fack) +
1521 tp->sackhint.sack_bytes_rexmit;
1522 } else {
1523 obytes = (tp->snd_max - tp->snd_una);
1524 }
1525
1526 /*
1527 * In order to lower retransmit threshold the
1528 * following two conditions must be met.
1529 * 1. the amount of outstanding data is less
1530 * than 4*SMSS bytes
1531 * 2. there is no unsent data ready for
1532 * transmission or the advertised window
1533 * will limit sending new segments.
1534 */
1535 snd_off = tp->snd_max - tp->snd_una;
1536 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1537 if (obytes < (tp->t_maxseg << 2) &&
1538 snd_len <= 0) {
1539 u_int32_t osegs;
1540
1541 osegs = obytes / tp->t_maxseg;
1542 if ((osegs * tp->t_maxseg) < obytes)
1543 osegs++;
1544
1545 /*
1546 * Since the connection might have already
1547 * received some dupacks, we add them to
1548 * the outstanding segments count to get
1549 * the correct retransmit threshold.
1550 *
1551 * By checking for early retransmit after
1552 * receiving some duplicate acks when SACK
1553 * is supported, the connection will
1554 * enter fast recovery even if multiple
1555 * segments are lost in the same window.
1556 */
1557 osegs += tp->t_dupacks;
1558 if (osegs < 4) {
1559 tp->t_rexmtthresh =
1560 ((osegs - 1) > 1) ? (osegs - 1) : 1;
1561 tp->t_rexmtthresh =
1562 min(tp->t_rexmtthresh, tcprexmtthresh);
1563 tp->t_rexmtthresh =
1564 max(tp->t_rexmtthresh, tp->t_dupacks);
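/*
 * Worked example (hypothetical numbers): with t_maxseg = 1448 and
 * 1500 bytes outstanding, osegs = 2; adding t_dupacks = 1 gives 3 < 4,
 * so t_rexmtthresh becomes max(min(3 - 1, tcprexmtthresh), 1) = 2 and
 * fast recovery can start after only two duplicate acks.
 */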
1565
1566 if (tp->t_early_rexmt_count == 0)
1567 tp->t_early_rexmt_win = tcp_now;
1568
1569 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1570 tcpstat.tcps_tlp_recovery++;
1571 tcp_ccdbg_trace(tp, th,
1572 TCP_CC_TLP_RECOVERY);
1573 } else {
1574 tcpstat.tcps_early_rexmt++;
1575 tp->t_early_rexmt_count++;
1576 tcp_ccdbg_trace(tp, th,
1577 TCP_CC_EARLY_RETRANSMIT);
1578 }
1579 }
1580 }
1581 }
1582
1583 /*
1584 * If we ever sent a TLP probe, the acknowledgement will trigger
1585 * early retransmit because the value of snd_fack will be close
1586 * to snd_max. This will take care of adjustments to the
1587 * congestion window. So we can reset the TF_SENT_TLPROBE flag.
1588 */
1589 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1590 tp->t_tlphighrxt = 0;
1591 tp->t_tlpstart = 0;
1592 }
1593
1594 static boolean_t
1595 tcp_tfo_syn(tp, to)
1596 struct tcpcb *tp;
1597 struct tcpopt *to;
1598 {
1599 u_char out[CCAES_BLOCK_SIZE];
1600 unsigned char len;
1601
1602 if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
1603 !(tcp_fastopen & TCP_FASTOPEN_SERVER))
1604 return (FALSE);
1605
1606 if ((to->to_flags & TOF_TFOREQ)) {
1607 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1608
1609 tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
1610 tcpstat.tcps_tfo_cookie_req_rcv++;
1611 return (FALSE);
1612 }
1613
1614 /* Ok, then it must be an offered cookie. We need to check that ... */
1615 tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
1616
1617 len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1618 to->to_tfo++;
1619 if (memcmp(out, to->to_tfo, len)) {
1620 /* Cookies are different! Let's return and offer a new cookie */
1621 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1622
1623 tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
1624 tcpstat.tcps_tfo_cookie_invalid++;
1625 return (FALSE);
1626 }
1627
1628 if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
1629 /* Need to decrement again as we just increased it... */
1630 OSDecrementAtomic(&tcp_tfo_halfcnt);
1631 return (FALSE);
1632 }
1633
1634 tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
1635
1636 tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
1637 tcpstat.tcps_tfo_syn_data_rcv++;
1638
1639 return (TRUE);
1640 }
1641
1642 static void
1643 tcp_tfo_synack(tp, to)
1644 struct tcpcb *tp;
1645 struct tcpopt *to;
1646 {
1647 if (to->to_flags & TOF_TFO) {
1648 unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1649
1650 /*
1651 * If this happens, things have gone terribly wrong. len should
1652 * have been checked in tcp_dooptions.
1653 */
1654 VERIFY(len <= TFO_COOKIE_LEN_MAX);
1655
1656 to->to_tfo++;
1657
1658 tcp_cache_set_cookie(tp, to->to_tfo, len);
1659 tcp_heuristic_tfo_success(tp);
1660
1661 tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
1662 tcpstat.tcps_tfo_cookie_rcv++;
1663 } else {
1664 /*
1665 * Thus, no cookie in the response, but we either asked for one
1666 * or sent SYN+DATA. Now, we need to check whether we had to
1667 * rexmit the SYN. If that's the case, it's better to start
1668 * backing off TFO-cookie requests.
1669 */
1670 if (tp->t_tfo_flags & TFO_F_SYN_LOSS)
1671 tcp_heuristic_tfo_inc_loss(tp);
1672 else
1673 tcp_heuristic_tfo_reset_loss(tp);
1674 }
1675 }
1676
1677 static void
1678 tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
1679 {
1680 if (tlen == 0) {
1681 tp->t_tfo_probe_state = TFO_PROBE_PROBING;
1682
1683 /*
1684 * We send the probe out rather quickly (after one RTO). It does not
1685 * really hurt that much; it's only one additional segment on the wire.
1686 */
1687 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
1688 } else {
1689 /* If SYN/ACK+data, don't probe. We got the data! */
1690 tcp_heuristic_tfo_rcv_good(tp);
1691 }
1692 }
1693
1694 static void
1695 tcp_tfo_rcv_data(struct tcpcb *tp)
1696 {
1697 /* Transition from PROBING to NONE as data has been received */
1698 if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
1699 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1700
1701 /* Data has been received - we are good to go! */
1702 tcp_heuristic_tfo_rcv_good(tp);
1703 }
1704 }
1705
1706 static void
1707 tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
1708 {
1709 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
1710 tp->t_tfo_probes > 0) {
1711 if (th->th_seq == tp->rcv_nxt) {
1712 /* No hole, so stop probing */
1713 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1714 } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1715 /* There is a hole! Wait a bit for data... */
1716 tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
1717 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1718 TCP_REXMTVAL(tp));
1719 }
1720 }
1721 }
1722
1723 void
1724 tcp_input(m, off0)
1725 struct mbuf *m;
1726 int off0;
1727 {
1728 register struct tcphdr *th;
1729 register struct ip *ip = NULL;
1730 register struct inpcb *inp;
1731 u_char *optp = NULL;
1732 int optlen = 0;
1733 int tlen, off;
1734 int drop_hdrlen;
1735 register struct tcpcb *tp = 0;
1736 register int thflags;
1737 struct socket *so = 0;
1738 int todrop, acked, ourfinisacked, needoutput = 0;
1739 struct in_addr laddr;
1740 #if INET6
1741 struct in6_addr laddr6;
1742 #endif
1743 int dropsocket = 0;
1744 int iss = 0, nosock = 0;
1745 u_int32_t tiwin, sack_bytes_acked = 0;
1746 struct tcpopt to; /* options in this segment */
1747 #if TCPDEBUG
1748 short ostate = 0;
1749 #endif
1750 #if IPFIREWALL
1751 struct sockaddr_in *next_hop = NULL;
1752 struct m_tag *fwd_tag;
1753 #endif /* IPFIREWALL */
1754 u_char ip_ecn = IPTOS_ECN_NOTECT;
1755 unsigned int ifscope;
1756 uint8_t isconnected, isdisconnected;
1757 struct ifnet *ifp = m->m_pkthdr.rcvif;
1758 int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1759 int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1760 int turnoff_lro = 0, win;
1761 #if MPTCP
1762 struct mptcb *mp_tp = NULL;
1763 #endif /* MPTCP */
1764 boolean_t cell = IFNET_IS_CELLULAR(ifp);
1765 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1766 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1767 boolean_t recvd_dsack = FALSE;
1768 struct tcp_respond_args tra;
1769
1770 #define TCP_INC_VAR(stat, npkts) do { \
1771 stat += npkts; \
1772 } while (0)
1773
1774 TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
1775 #if IPFIREWALL
1776 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
1777 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1778 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1779 KERNEL_TAG_TYPE_IPFORWARD, NULL);
1780 } else {
1781 fwd_tag = NULL;
1782 }
1783 if (fwd_tag != NULL) {
1784 struct ip_fwd_tag *ipfwd_tag =
1785 (struct ip_fwd_tag *)(fwd_tag+1);
1786
1787 next_hop = ipfwd_tag->next_hop;
1788 m_tag_delete(m, fwd_tag);
1789 }
1790 #endif /* IPFIREWALL */
1791
1792 #if INET6
1793 struct ip6_hdr *ip6 = NULL;
1794 int isipv6;
1795 #endif /* INET6 */
1796 int rstreason; /* For badport_bandlim accounting purposes */
1797 struct proc *proc0=current_proc();
1798
1799 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1800
1801 #if INET6
1802 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1803 #endif
1804 bzero((char *)&to, sizeof(to));
1805
1806 #if INET6
1807 if (isipv6) {
1808 /*
1809 * Expect 32-bit aligned data pointer on
1810 * strict-align platforms
1811 */
1812 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1813
1814 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1815 ip6 = mtod(m, struct ip6_hdr *);
1816 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1817 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1818
1819 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1820 goto dropnosock;
1821
1822 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1823 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1824 th->th_seq, th->th_ack, th->th_win);
1825 /*
1826 * Be proactive about unspecified IPv6 address in source.
1827 * As we use all-zero to indicate an unbound/unconnected pcb,
1828 * an unspecified IPv6 source address can be used to confuse us.
1829 *
1830 * Note that packets with an unspecified IPv6 destination are
1831 * already dropped in ip6_input.
1832 */
1833 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1834 /* XXX stat */
1835 IF_TCP_STATINC(ifp, unspecv6);
1836 goto dropnosock;
1837 }
1838 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1839 struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1840 struct tcphdr *, th);
1841
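/*
 * Illustrative note on the extraction below: ip6_flow packs version
 * (4 bits), traffic class (8 bits) and flow label (20 bits); after
 * ntohl(), a right shift by 20 leaves the traffic class in the low
 * byte, and IPTOS_ECN_MASK (0x03) keeps its two ECN bits.
 */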
1842 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
1843 } else
1844 #endif /* INET6 */
1845 {
1846 /*
1847 * Get IP and TCP header together in first mbuf.
1848 * Note: IP leaves IP header in first mbuf.
1849 */
1850 if (off0 > sizeof (struct ip)) {
1851 ip_stripoptions(m, (struct mbuf *)0);
1852 off0 = sizeof(struct ip);
1853 }
1854 if (m->m_len < sizeof (struct tcpiphdr)) {
1855 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1856 tcpstat.tcps_rcvshort++;
1857 return;
1858 }
1859 }
1860
1861 /* Expect 32-bit aligned data pointer on strict-align platforms */
1862 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1863
1864 ip = mtod(m, struct ip *);
1865 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1866 tlen = ip->ip_len;
1867
1868 if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
1869 goto dropnosock;
1870
1871 #if INET6
1872 /* Re-initialization for later version check */
1873 ip->ip_v = IPVERSION;
1874 #endif
1875 ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
1876
1877 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1878 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
1879
1880 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1881 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1882 th->th_seq, th->th_ack, th->th_win);
1883
1884 }
1885
1886 /*
1887 * Check that TCP offset makes sense,
1888 * pull out TCP options and adjust length. XXX
1889 */
1890 off = th->th_off << 2;
1891 if (off < sizeof (struct tcphdr) || off > tlen) {
1892 tcpstat.tcps_rcvbadoff++;
1893 IF_TCP_STATINC(ifp, badformat);
1894 goto dropnosock;
1895 }
1896 tlen -= off; /* tlen is used instead of ti->ti_len */
1897 if (off > sizeof (struct tcphdr)) {
1898 #if INET6
1899 if (isipv6) {
1900 IP6_EXTHDR_CHECK(m, off0, off, return);
1901 ip6 = mtod(m, struct ip6_hdr *);
1902 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1903 } else
1904 #endif /* INET6 */
1905 {
1906 if (m->m_len < sizeof(struct ip) + off) {
1907 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1908 tcpstat.tcps_rcvshort++;
1909 return;
1910 }
1911 ip = mtod(m, struct ip *);
1912 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1913 }
1914 }
1915 optlen = off - sizeof (struct tcphdr);
1916 optp = (u_char *)(th + 1);
1917 /*
1918 * Do quick retrieval of timestamp options ("options
1919 * prediction?"). If timestamp is the only option and it's
1920 * formatted as recommended in RFC 1323 appendix A, we
1921 * quickly get the values now and not bother calling
1922 * tcp_dooptions(), etc.
1923 */
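/*
 * Illustrative sketch of the 12-byte RFC 1323 Appendix A layout this
 * fast path expects (TCPOLEN_TSTAMP_APPA bytes in total):
 *
 *	NOP NOP TIMESTAMP(kind 8) LEN(10) TSval(4 bytes) TSecr(4 bytes)
 *
 * so the first 32-bit word of optp equals htonl(TCPOPT_TSTAMP_HDR)
 * (0x0101080a) and TSval/TSecr sit at optp + 4 and optp + 8, exactly
 * as read below.
 */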
1924 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1925 (optlen > TCPOLEN_TSTAMP_APPA &&
1926 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1927 *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1928 (th->th_flags & TH_SYN) == 0) {
1929 to.to_flags |= TOF_TS;
1930 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
1931 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
1932 optp = NULL; /* we've parsed the options */
1933 }
1934 }
1935 thflags = th->th_flags;
1936
1937 #if TCP_DROP_SYNFIN
1938 /*
1939 * If the drop_synfin option is enabled, drop all packets with
1940 * both the SYN and FIN bits set. This prevents e.g. nmap from
1941 * identifying the TCP/IP stack.
1942 *
1943 * This is a violation of the TCP specification.
1944 */
1945 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
1946 IF_TCP_STATINC(ifp, synfin);
1947 goto dropnosock;
1948 }
1949 #endif
1950
1951 /*
1952 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
1953 * until after ip6_savecontrol() is called and before other functions
1954 * which don't want those proto headers.
1955 * Because ip6_savecontrol() is going to parse the mbuf to
1956 * search for data to be passed up to user-land, it wants mbuf
1957 * parameters to be unchanged.
1958 */
1959 drop_hdrlen = off0 + off;
1960
1961 /* Since this is an entry point for input processing of tcp packets, we
1962 * can update the tcp clock here.
1963 */
1964 calculate_tcp_clock();
1965
1966 /*
1967 * Record the interface this segment arrived on; it does not
1968 * affect normal data output (for non-detached TCP), but provides a
1969 * hint about which route and interface to use for sending in the
1970 * absence of a PCB, when scoped routing (and thus source interface
1971 * selection) is enabled.
1972 */
1973 if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
1974 ifscope = IFSCOPE_NONE;
1975 else
1976 ifscope = m->m_pkthdr.rcvif->if_index;
1977
1978 /*
1979 * Convert TCP protocol specific fields to host format.
1980 */
1981
1982 #if BYTE_ORDER != BIG_ENDIAN
1983 NTOHL(th->th_seq);
1984 NTOHL(th->th_ack);
1985 NTOHS(th->th_win);
1986 NTOHS(th->th_urp);
1987 #endif
1988
1989 /*
1990 * Locate pcb for segment.
1991 */
1992 findpcb:
1993
1994 isconnected = FALSE;
1995 isdisconnected = FALSE;
1996
1997 #if IPFIREWALL_FORWARD
1998 if (next_hop != NULL
1999 #if INET6
2000 && isipv6 == 0 /* IPv6 support is not yet */
2001 #endif /* INET6 */
2002 ) {
2003 /*
2004 * Diverted. Pretend to be the destination.
2005 * already got one like this?
2006 */
2007 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2008 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
2009 if (!inp) {
2010 /*
2011 * No, then it's new. Try to find the ambushing socket.
2012 */
2013 if (!next_hop->sin_port) {
2014 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
2015 th->th_sport, next_hop->sin_addr,
2016 th->th_dport, 1, m->m_pkthdr.rcvif);
2017 } else {
2018 inp = in_pcblookup_hash(&tcbinfo,
2019 ip->ip_src, th->th_sport,
2020 next_hop->sin_addr,
2021 ntohs(next_hop->sin_port), 1,
2022 m->m_pkthdr.rcvif);
2023 }
2024 }
2025 } else
2026 #endif /* IPFIREWALL_FORWARD */
2027 {
2028 #if INET6
2029 if (isipv6)
2030 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
2031 &ip6->ip6_dst, th->th_dport, 1,
2032 m->m_pkthdr.rcvif);
2033 else
2034 #endif /* INET6 */
2035 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2036 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
2037 }
2038
2039 /*
2040 * Use the interface scope information from the PCB for outbound
2041 * segments. If the PCB isn't present and if scoped routing is
2042 * enabled, tcp_respond will use the scope of the interface
2043 * the segment arrived on.
2044 */
2045 if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
2046 ifscope = inp->inp_boundifp->if_index;
2047
2048 /*
2049 * If the state is CLOSED (i.e., TCB does not exist) then
2050 * all data in the incoming segment is discarded.
2051 * If the TCB exists but is in CLOSED state, it is embryonic,
2052 * but should either do a listen or a connect soon.
2053 */
2054 if (inp == NULL) {
2055 if (log_in_vain) {
2056 #if INET6
2057 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
2058 #else /* INET6 */
2059 char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
2060 #endif /* INET6 */
2061
2062 #if INET6
2063 if (isipv6) {
2064 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
2065 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
2066 } else
2067 #endif
2068 {
2069 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
2070 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
2071 }
2072 switch (log_in_vain) {
2073 case 1:
2074 if(thflags & TH_SYN)
2075 log(LOG_INFO,
2076 "Connection attempt to TCP %s:%d from %s:%d\n",
2077 dbuf, ntohs(th->th_dport),
2078 sbuf,
2079 ntohs(th->th_sport));
2080 break;
2081 case 2:
2082 log(LOG_INFO,
2083 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
2084 dbuf, ntohs(th->th_dport), sbuf,
2085 ntohs(th->th_sport), thflags);
2086 break;
2087 case 3:
2088 case 4:
2089 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
2090 !(m->m_flags & (M_BCAST | M_MCAST)) &&
2091 #if INET6
2092 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
2093 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
2094 #else
2095 ip->ip_dst.s_addr != ip->ip_src.s_addr
2096 #endif
2097 )
2098 log_in_vain_log((LOG_INFO,
2099 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
2100 dbuf, ntohs(th->th_dport),
2101 sbuf,
2102 ntohs(th->th_sport)));
2103 break;
2104 default:
2105 break;
2106 }
2107 }
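/*
 * As implemented by the switch below (and only for segments that did
 * not arrive on a loopback interface): blackhole level 1 silently drops
 * SYN segments to closed ports, while level 2 or higher silently drops
 * every segment to a closed port.  Anything not dropped here is
 * answered with a RST via dropwithresetnosock.
 */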
2108 if (blackhole) {
2109 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
2110
2111 switch (blackhole) {
2112 case 1:
2113 if (thflags & TH_SYN)
2114 goto dropnosock;
2115 break;
2116 case 2:
2117 goto dropnosock;
2118 default:
2119 goto dropnosock;
2120 }
2121 }
2122 rstreason = BANDLIM_RST_CLOSEDPORT;
2123 IF_TCP_STATINC(ifp, noconnnolist);
2124 goto dropwithresetnosock;
2125 }
2126 so = inp->inp_socket;
2127 if (so == NULL) {
2128 /* This case shouldn't happen, as the socket shouldn't be NULL
2129 * if inp_state isn't set to INPCB_STATE_DEAD.
2130 * But just in case, we pretend we didn't find the socket if we hit this case,
2131 * as this isn't cause for a panic (the socket might be leaked, however)...
2132 */
2133 inp = NULL;
2134 #if TEMPDEBUG
2135 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
2136 #endif
2137 goto dropnosock;
2138 }
2139
2140 tcp_lock(so, 1, 0);
2141 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2142 tcp_unlock(so, 1, (void *)2);
2143 inp = NULL; // pretend we didn't find it
2144 goto dropnosock;
2145 }
2146
2147 #if NECP
2148 #if INET6
2149 if (isipv6) {
2150 if (!necp_socket_is_allowed_to_send_recv_v6(inp, th->th_dport,
2151 th->th_sport,
2152 &ip6->ip6_dst,
2153 &ip6->ip6_src,
2154 ifp, NULL, NULL)) {
2155 IF_TCP_STATINC(ifp, badformatipsec);
2156 goto drop;
2157 }
2158 } else
2159 #endif
2160 {
2161 if (!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport,
2162 th->th_sport,
2163 &ip->ip_dst,
2164 &ip->ip_src,
2165 ifp, NULL, NULL)) {
2166 IF_TCP_STATINC(ifp, badformatipsec);
2167 goto drop;
2168 }
2169 }
2170 #endif /* NECP */
2171
2172 tp = intotcpcb(inp);
2173 if (tp == 0) {
2174 rstreason = BANDLIM_RST_CLOSEDPORT;
2175 IF_TCP_STATINC(ifp, noconnlist);
2176 goto dropwithreset;
2177 }
2178 if (tp->t_state == TCPS_CLOSED)
2179 goto drop;
2180
2181 /* Unscale the window into a 32-bit value. */
2182 if ((thflags & TH_SYN) == 0)
2183 tiwin = th->th_win << tp->snd_scale;
2184 else
2185 tiwin = th->th_win;
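/*
 * Illustrative arithmetic: once window scaling is in effect, a raw
 * th_win of 1024 with snd_scale == 7 stands for 1024 << 7 == 131072
 * bytes.  A SYN carries its window unscaled, hence the special case
 * above.
 */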
2186
2187 #if CONFIG_MACF_NET
2188 if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
2189 goto drop;
2190 #endif
2191
2192 /* Avoid processing packets while closing a listen socket */
2193 if (tp->t_state == TCPS_LISTEN &&
2194 (so->so_options & SO_ACCEPTCONN) == 0)
2195 goto drop;
2196
2197 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
2198 #if TCPDEBUG
2199 if (so->so_options & SO_DEBUG) {
2200 ostate = tp->t_state;
2201 #if INET6
2202 if (isipv6)
2203 bcopy((char *)ip6, (char *)tcp_saveipgen,
2204 sizeof(*ip6));
2205 else
2206 #endif /* INET6 */
2207 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
2208 tcp_savetcp = *th;
2209 }
2210 #endif
2211 if (so->so_options & SO_ACCEPTCONN) {
2212 register struct tcpcb *tp0 = tp;
2213 struct socket *so2;
2214 struct socket *oso;
2215 struct sockaddr_storage from;
2216 #if INET6
2217 struct inpcb *oinp = sotoinpcb(so);
2218 #endif /* INET6 */
2219 struct ifnet *head_ifscope;
2220 unsigned int head_nocell, head_recvanyif,
2221 head_noexpensive, head_awdl_unrestricted;
2222
2223 /* Get listener's bound-to-interface, if any */
2224 head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2225 inp->inp_boundifp : NULL;
2226 /* Get listener's no-cellular information, if any */
2227 head_nocell = INP_NO_CELLULAR(inp);
2228 /* Get listener's recv-any-interface, if any */
2229 head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
2230 /* Get listener's no-expensive information, if any */
2231 head_noexpensive = INP_NO_EXPENSIVE(inp);
2232 head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
2233
2234 /*
2235 * If the state is LISTEN then ignore segment if it contains an RST.
2236 * If the segment contains an ACK then it is bad and send a RST.
2237 * If it does not contain a SYN then it is not interesting; drop it.
2238 * If it is from this socket, drop it, it must be forged.
2239 */
2240 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
2241 IF_TCP_STATINC(ifp, listbadsyn);
2242
2243 if (thflags & TH_RST) {
2244 goto drop;
2245 }
2246 if (thflags & TH_ACK) {
2247 tp = NULL;
2248 tcpstat.tcps_badsyn++;
2249 rstreason = BANDLIM_RST_OPENPORT;
2250 goto dropwithreset;
2251 }
2252
2253 /* We come here if there is no SYN set */
2254 tcpstat.tcps_badsyn++;
2255 goto drop;
2256 }
2257 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
2258 if (th->th_dport == th->th_sport) {
2259 #if INET6
2260 if (isipv6) {
2261 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
2262 &ip6->ip6_src))
2263 goto drop;
2264 } else
2265 #endif /* INET6 */
2266 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
2267 goto drop;
2268 }
2269 /*
2270 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN;
2271 * in_broadcast() should never return true on a received
2272 * packet with M_BCAST not set.
2273 *
2274 * Packets with a multicast source address should also
2275 * be discarded.
2276 */
2277 if (m->m_flags & (M_BCAST|M_MCAST))
2278 goto drop;
2279 #if INET6
2280 if (isipv6) {
2281 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2282 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2283 goto drop;
2284 } else
2285 #endif
2286 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2287 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2288 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2289 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2290 goto drop;
2291
2292
2293 #if INET6
2294 /*
2295 * If deprecated address is forbidden,
2296 * we do not accept SYN to deprecated interface
2297 * address to prevent any new inbound connection from
2298 * getting established.
2299 * When we do not accept the SYN, we send a TCP RST
2300 * with the deprecated source address (instead of
2301 * dropping it). This is a compromise: it is much better
2302 * to send the peer a RST, as the RST will be the final
2303 * packet for the exchange.
2304 *
2305 * If we do not forbid deprecated addresses, we accept
2306 * the SYN packet. RFC 4862 forbids dropping SYN in
2307 * this case.
2308 */
2309 if (isipv6 && !ip6_use_deprecated) {
2310 uint32_t ia6_flags;
2311
2312 if (ip6_getdstifaddr_info(m, NULL,
2313 &ia6_flags) == 0) {
2314 if (ia6_flags & IN6_IFF_DEPRECATED) {
2315 tp = NULL;
2316 rstreason = BANDLIM_RST_OPENPORT;
2317 IF_TCP_STATINC(ifp, deprecate6);
2318 goto dropwithreset;
2319 }
2320 }
2321 }
2322 #endif
2323 if (so->so_filt) {
2324 #if INET6
2325 if (isipv6) {
2326 struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;
2327
2328 sin6->sin6_len = sizeof(*sin6);
2329 sin6->sin6_family = AF_INET6;
2330 sin6->sin6_port = th->th_sport;
2331 sin6->sin6_flowinfo = 0;
2332 sin6->sin6_addr = ip6->ip6_src;
2333 sin6->sin6_scope_id = 0;
2334 }
2335 else
2336 #endif
2337 {
2338 struct sockaddr_in *sin = (struct sockaddr_in*)&from;
2339
2340 sin->sin_len = sizeof(*sin);
2341 sin->sin_family = AF_INET;
2342 sin->sin_port = th->th_sport;
2343 sin->sin_addr = ip->ip_src;
2344 }
2345 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2346 } else {
2347 so2 = sonewconn(so, 0, NULL);
2348 }
2349 if (so2 == 0) {
2350 tcpstat.tcps_listendrop++;
2351 if (tcp_dropdropablreq(so)) {
2352 if (so->so_filt)
2353 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2354 else
2355 so2 = sonewconn(so, 0, NULL);
2356 }
2357 if (!so2)
2358 goto drop;
2359 }
2360
2361 /* Point "inp" and "tp" in tandem to new socket */
2362 inp = (struct inpcb *)so2->so_pcb;
2363 tp = intotcpcb(inp);
2364
2365 oso = so;
2366 tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
2367
2368 so = so2;
2369 tcp_lock(so, 1, 0);
2370 /*
2371 * Mark socket as temporary until we're
2372 * committed to keeping it. The code at
2373 * ``drop'' and ``dropwithreset'' check the
2374 * flag dropsocket to see if the temporary
2375 * socket created here should be discarded.
2376 * We mark the socket as discardable until
2377 * we're committed to it below in TCPS_LISTEN.
2378 * There are some error conditions in which we
2379 * have to drop the temporary socket.
2380 */
2381 dropsocket++;
2382 /*
2383 * Inherit INP_BOUND_IF from listener; testing if
2384 * head_ifscope is non-NULL is sufficient, since it
2385 * can only be set to a non-zero value earlier if
2386 * the listener has such a flag set.
2387 */
2388 if (head_ifscope != NULL) {
2389 inp->inp_flags |= INP_BOUND_IF;
2390 inp->inp_boundifp = head_ifscope;
2391 } else {
2392 inp->inp_flags &= ~INP_BOUND_IF;
2393 }
2394 /*
2395 * Inherit restrictions from listener.
2396 */
2397 if (head_nocell)
2398 inp_set_nocellular(inp);
2399 if (head_noexpensive)
2400 inp_set_noexpensive(inp);
2401 if (head_awdl_unrestricted)
2402 inp_set_awdl_unrestricted(inp);
2403 /*
2404 * Inherit {IN,IN6}_RECV_ANYIF from listener.
2405 */
2406 if (head_recvanyif)
2407 inp->inp_flags |= INP_RECV_ANYIF;
2408 else
2409 inp->inp_flags &= ~INP_RECV_ANYIF;
2410 #if INET6
2411 if (isipv6)
2412 inp->in6p_laddr = ip6->ip6_dst;
2413 else {
2414 inp->inp_vflag &= ~INP_IPV6;
2415 inp->inp_vflag |= INP_IPV4;
2416 #endif /* INET6 */
2417 inp->inp_laddr = ip->ip_dst;
2418 #if INET6
2419 }
2420 #endif /* INET6 */
2421 inp->inp_lport = th->th_dport;
2422 if (in_pcbinshash(inp, 0) != 0) {
2423 /*
2424 * Undo the assignments above if we failed to
2425 * put the PCB on the hash lists.
2426 */
2427 #if INET6
2428 if (isipv6)
2429 inp->in6p_laddr = in6addr_any;
2430 else
2431 #endif /* INET6 */
2432 inp->inp_laddr.s_addr = INADDR_ANY;
2433 inp->inp_lport = 0;
2434 tcp_lock(oso, 0, 0); /* release ref on parent */
2435 tcp_unlock(oso, 1, 0);
2436 goto drop;
2437 }
2438 #if INET6
2439 if (isipv6) {
2440 /*
2441 * Inherit socket options from the listening
2442 * socket.
2443 * Note that in6p_inputopts are not (even
2444 * should not be) copied, since it stores
2445 * previously received options and is used to
2446 * detect if each new option is different than
2447 * the previous one and hence should be passed
2448 * to a user.
2449 * If we copied in6p_inputopts, a user would
2450 * not be able to receive options just after
2451 * calling the accept system call.
2452 */
2453 inp->inp_flags |=
2454 oinp->inp_flags & INP_CONTROLOPTS;
2455 if (oinp->in6p_outputopts)
2456 inp->in6p_outputopts =
2457 ip6_copypktopts(oinp->in6p_outputopts,
2458 M_NOWAIT);
2459 } else
2460 #endif /* INET6 */
2461 {
2462 inp->inp_options = ip_srcroute();
2463 inp->inp_ip_tos = oinp->inp_ip_tos;
2464 }
2465 tcp_lock(oso, 0, 0);
2466 #if IPSEC
2467 /* copy old policy into new socket's */
2468 if (sotoinpcb(oso)->inp_sp)
2469 {
2470 int error = 0;
2471 /* Is it a security hole here to silently fail to copy the policy? */
2472 if (inp->inp_sp != NULL)
2473 error = ipsec_init_policy(so, &inp->inp_sp);
2474 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2475 printf("tcp_input: could not copy policy\n");
2476 }
2477 #endif
2478 /* inherit states from the listener */
2479 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2480 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2481 tp->t_state = TCPS_LISTEN;
2482 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
2483 tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT|TF_FASTOPEN));
2484 tp->t_keepinit = tp0->t_keepinit;
2485 tp->t_keepcnt = tp0->t_keepcnt;
2486 tp->t_keepintvl = tp0->t_keepintvl;
2487 tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2488 tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2489 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2490 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2491 tp->t_notsent_lowat = tp0->t_notsent_lowat;
2492
2493 /* now drop the reference on the listener */
2494 tcp_unlock(oso, 1, 0);
2495
2496 tcp_set_max_rwinscale(tp, so);
2497
2498 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2499 }
2500 }
2501 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2502 LCK_MTX_ASSERT_OWNED);
2503
2504 if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2505 /*
2506 * Evaluate the rate of arrival of packets to see if the
2507 * receiver can reduce the ack traffic. The algorithm to
2508 * stretch acks will be enabled if the connection meets
2509 * certain criteria defined in the tcp_stretch_ack_enable() function.
2510 */
2511 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2512 TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
2513 }
2514 if (tcp_stretch_ack_enable(tp)) {
2515 tp->t_flags |= TF_STRETCHACK;
2516 tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2517 tp->rcv_waitforss = 0;
2518 } else {
2519 tp->t_flags &= ~(TF_STRETCHACK);
2520 }
2521 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2522 tp->rcv_by_unackwin += (tlen + off);
2523 } else {
2524 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2525 tp->rcv_by_unackwin = tlen + off;
2526 }
2527 }
2528
2529 /*
2530 * Keep track of how many bytes were received in the LRO packet
2531 */
2532 if ((pktf_sw_lro_pkt) && (nlropkts > 2)) {
2533 tp->t_lropktlen += tlen;
2534 }
2535 /*
2536 * Explicit Congestion Notification - Flag that we need to send ECE if:
2537 * + The IP Congestion experienced flag was set.
2538 * + Socket is in established state
2539 * + We negotiated ECN in the TCP setup
2540 * + This isn't a pure ack (tlen > 0)
2541 * + The data is in the valid window
2542 *
2543 * TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2544 */
2545 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2546 TCP_ECN_ENABLED(tp) && tlen > 0 &&
2547 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2548 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2549 tcpstat.tcps_ecn_recv_ce++;
2550 /* Mark this connection as it received CE from network */
2551 tp->ecn_flags |= TE_RECV_ECN_CE;
2552 tp->ecn_flags |= TE_SENDECE;
2553 }
2554
2555 /*
2556 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2557 * bother doing extensive checks for state and whatnot.
2558 */
2559 if (thflags & TH_CWR) {
2560 tp->ecn_flags &= ~TE_SENDECE;
2561 }
2562
2563 /*
2564 * If we received an explicit notification of congestion in
2565 * ip tos ecn bits or by the CWR bit in TCP header flags, reset
2566 * the ack-stretching state. We need to handle ECN notification if
2567 * an ECN-setup SYN was sent even once.
2568 */
2569 if (tp->t_state == TCPS_ESTABLISHED
2570 && (tp->ecn_flags & TE_SETUPSENT)
2571 && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) {
2572 tcp_reset_stretch_ack(tp);
2573 CLEAR_IAJ_STATE(tp);
2574 }
2575
2576 /*
2577 * Try to determine if we are receiving a packet after a long time.
2578 * Use our own approximation of idletime to roughly measure remote
2579 * end's idle time. Since slowstart is used after an idle period
2580 * we want to avoid doing LRO if the remote end is not up to date
2581 * on initial window support and starts with 1 or 2 packets as its IW.
2582 */
2583 if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2584 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2585 turnoff_lro = 1;
2586 }
2587
2588 /* Update rcvtime as a new segment was received on the connection */
2589 tp->t_rcvtime = tcp_now;
2590
2591 /*
2592 * Segment received on connection.
2593 * Reset idle time and keep-alive timer.
2594 */
2595 if (TCPS_HAVEESTABLISHED(tp->t_state))
2596 tcp_keepalive_reset(tp);
2597
2598 /*
2599 * Process options if not in LISTEN state,
2600 * else do it below (after getting remote address).
2601 */
2602 if (tp->t_state != TCPS_LISTEN && optp) {
2603 tcp_dooptions(tp, optp, optlen, th, &to);
2604 #if MPTCP
2605 if (mptcp_input_preproc(tp, m, drop_hdrlen) != 0) {
2606 tp->t_flags |= TF_ACKNOW;
2607 (void) tcp_output(tp);
2608 tcp_check_timer_state(tp);
2609 tcp_unlock(so, 1, 0);
2610 KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2611 DBG_FUNC_END,0,0,0,0,0);
2612 return;
2613 }
2614 #endif /* MPTCP */
2615 }
2616 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2617 if (!(thflags & TH_ACK) ||
2618 (SEQ_GT(th->th_ack, tp->iss) &&
2619 SEQ_LEQ(th->th_ack, tp->snd_max)))
2620 tcp_finalize_options(tp, &to, ifscope);
2621 }
2622
2623 #if TRAFFIC_MGT
2624 /*
2625 * Compute inter-packet arrival jitter. According to RFC 3550,
2626 * inter-packet arrival jitter is defined as the difference in
2627 * packet spacing at the receiver compared to the sender for a
2628 * pair of packets. When two packets of maximum segment size come
2629 * one after the other with consecutive sequence numbers, we
2630 * consider them as packets sent together at the sender and use
2631 * them as a pair to compute inter-packet arrival jitter. This
2632 * metric indicates the delay induced by the network components due
2633 * to queuing in edge/access routers.
2634 */
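/*
 * For reference, RFC 3550 computes the smoothed interarrival jitter as
 *
 *	J(i) = J(i-1) + (|D(i-1,i)| - J(i-1)) / 16
 *
 * where D is the difference in packet spacing at the receiver compared
 * to the sender.  compute_iaj() below applies the same idea to
 * back-to-back full-sized TCP segments (a sketch of intent only; see
 * that function for the exact calculation).
 */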
2635 if (tp->t_state == TCPS_ESTABLISHED &&
2636 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2637 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2638 ((to.to_flags & TOF_TS) == 0 ||
2639 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2640 th->th_seq == tp->rcv_nxt &&
2641 LIST_EMPTY(&tp->t_segq)) {
2642 int seg_size = tlen;
2643 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2644 TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
2645 }
2646
2647 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2648 seg_size = m->m_pkthdr.lro_pktlen;
2649 }
2650 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2651 (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2652 /*
2653 * State related to inter-arrival jitter is
2654 * uninitialized or we are trying to find a good
2655 * first packet to start computing the metric
2656 */
2657 update_iaj_state(tp, seg_size, 0);
2658 } else {
2659 if (seg_size == tp->iaj_size) {
2660 /*
2661 * Compute inter-arrival jitter taking
2662 * this packet as the second packet
2663 */
2664 if (pktf_sw_lro_pkt)
2665 compute_iaj(tp, nlropkts,
2666 m->m_pkthdr.lro_elapsed);
2667 else
2668 compute_iaj(tp, 1, 0);
2669 }
2670 if (seg_size < tp->iaj_size) {
2671 /*
2672 * There is a smaller packet in the stream.
2673 * Sometimes the maximum size supported
2674 * on a path can change if there is a new
2675 * link with smaller MTU. The receiver will
2676 * not know about this change. If there
2677 * are too many packets smaller than
2678 * iaj_size, we try to learn the iaj_size
2679 * again.
2680 */
2681 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
2682 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
2683 update_iaj_state(tp, seg_size, 1);
2684 } else {
2685 CLEAR_IAJ_STATE(tp);
2686 }
2687 } else {
2688 update_iaj_state(tp, seg_size, 0);
2689 }
2690 }
2691 } else {
2692 CLEAR_IAJ_STATE(tp);
2693 }
2694 #endif /* TRAFFIC_MGT */
2695
2696 /*
2697 * Header prediction: check for the two common cases
2698 * of a uni-directional data xfer. If the packet has
2699 * no control flags, is in-sequence, the window didn't
2700 * change and we're not retransmitting, it's a
2701 * candidate. If the length is zero and the ack moved
2702 * forward, we're the sender side of the xfer. Just
2703 * free the data acked & wake any higher level process
2704 * that was blocked waiting for space. If the length
2705 * is non-zero and the ack didn't move, we're the
2706 * receiver side. If we're getting packets in-order
2707 * (the reassembly queue is empty), add the data to
2708 * the socket buffer and note that we need a delayed ack.
2709 * Make sure that the hidden state-flags are also off.
2710 * Since we check for TCPS_ESTABLISHED above, it can only
2711 * be TH_NEEDSYN.
2712 */
2713 if (tp->t_state == TCPS_ESTABLISHED &&
2714 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK &&
2715 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2716 ((to.to_flags & TOF_TS) == 0 ||
2717 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2718 th->th_seq == tp->rcv_nxt &&
2719 tiwin && tiwin == tp->snd_wnd &&
2720 tp->snd_nxt == tp->snd_max) {
2721
2722 /*
2723 * If last ACK falls within this segment's sequence numbers,
2724 * record the timestamp.
2725 * NOTE that the test is modified according to the latest
2726 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2727 */
2728 if ((to.to_flags & TOF_TS) != 0 &&
2729 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2730 tp->ts_recent_age = tcp_now;
2731 tp->ts_recent = to.to_tsval;
2732 }
2733
2734 /* Force acknowledgment if we received a FIN */
2735
2736 if (thflags & TH_FIN)
2737 tp->t_flags |= TF_ACKNOW;
2738
2739 if (tlen == 0) {
2740 if (SEQ_GT(th->th_ack, tp->snd_una) &&
2741 SEQ_LEQ(th->th_ack, tp->snd_max) &&
2742 tp->snd_cwnd >= tp->snd_ssthresh &&
2743 (!IN_FASTRECOVERY(tp) &&
2744 ((!(SACK_ENABLED(tp)) &&
2745 tp->t_dupacks < tp->t_rexmtthresh) ||
2746 (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2747 TAILQ_EMPTY(&tp->snd_holes))))) {
2748 /*
2749 * this is a pure ack for outstanding data.
2750 */
2751 ++tcpstat.tcps_predack;
2752
2753 tcp_bad_rexmt_check(tp, th, &to);
2754
2755 /* Recalculate the RTT */
2756 tcp_compute_rtt(tp, &to, th);
2757
2758 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
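/*
 * BYTES_ACKED() presumably reduces to (th->th_ack - tp->snd_una),
 * i.e. how much previously unacknowledged data this ACK covers; the
 * VERIFY above guarantees the subtraction cannot go negative.
 */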
2759 acked = BYTES_ACKED(th, tp);
2760 tcpstat.tcps_rcvackpack++;
2761 tcpstat.tcps_rcvackbyte += acked;
2762
2763 /*
2764 * Handle an ack that is in sequence during
2765 * congestion avoidance phase. The
2766 * calculations in this function
2767 * assume that snd_una is not updated yet.
2768 */
2769 if (CC_ALGO(tp)->congestion_avd != NULL)
2770 CC_ALGO(tp)->congestion_avd(tp, th);
2771 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
2772 sbdrop(&so->so_snd, acked);
2773 if (so->so_flags & SOF_ENABLE_MSGS) {
2774 VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2775 so->so_msg_state->msg_serial_bytes -= acked;
2776 }
2777 tcp_sbsnd_trim(&so->so_snd);
2778
2779 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2780 SEQ_LEQ(th->th_ack, tp->snd_recover))
2781 tp->snd_recover = th->th_ack - 1;
2782 tp->snd_una = th->th_ack;
2783
2784 /*
2785 * pull snd_wl2 up to prevent seq wrap relative
2786 * to th_ack.
2787 */
2788 tp->snd_wl2 = th->th_ack;
2789
2790 if (tp->t_dupacks > 0) {
2791 tp->t_dupacks = 0;
2792 tp->t_rexmtthresh = tcprexmtthresh;
2793 }
2794
2795 m_freem(m);
2796
2797 /*
2798 * If all outstanding data are acked, stop
2799 * retransmit timer, otherwise restart timer
2800 * using current (possibly backed-off) value.
2801 * If process is waiting for space,
2802 * wakeup/selwakeup/signal. If data
2803 * are ready to send, let tcp_output
2804 * decide between more output or persist.
2805 */
2806 if (tp->snd_una == tp->snd_max) {
2807 tp->t_timer[TCPT_REXMT] = 0;
2808 tp->t_timer[TCPT_PTO] = 0;
2809 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
2810 tp->t_timer[TCPT_REXMT] =
2811 OFFSET_FROM_START(tp,
2812 tp->t_rxtcur);
2813 }
2814 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
2815 !TCP_DSACK_SEQ_IN_WINDOW(tp,
2816 tp->t_dsack_lastuna, tp->snd_una))
2817 tcp_rxtseg_clean(tp);
2818
2819 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2820 tp->t_bwmeas != NULL)
2821 tcp_bwmeas_check(tp);
2822 sowwakeup(so); /* has to be done with socket lock held */
2823 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
2824 (void) tcp_output(tp);
2825 }
2826
2827 tcp_tfo_rcv_ack(tp, th);
2828
2829 tcp_check_timer_state(tp);
2830 tcp_unlock(so, 1, 0);
2831 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2832 return;
2833 }
2834 } else if (th->th_ack == tp->snd_una &&
2835 LIST_EMPTY(&tp->t_segq) &&
2836 tlen <= tcp_sbspace(tp)) {
2837 /*
2838 * this is a pure, in-sequence data packet
2839 * with nothing on the reassembly queue and
2840 * we have enough buffer space to take it.
2841 */
2842
2843 /*
2844 * If this is a connection in steady state, start
2845 * coalescing packets belonging to this flow.
2846 */
2847 if (turnoff_lro) {
2848 tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
2849 tp->t_inpcb->inp_faddr,
2850 tp->t_inpcb->inp_lport,
2851 tp->t_inpcb->inp_fport);
2852 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2853 tp->t_idleat = tp->rcv_nxt;
2854 } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2855 (so->so_flags & SOF_USELRO) &&
2856 !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
2857 (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2858 ((th->th_seq - tp->irs) >
2859 (tp->t_maxseg << lro_start)) &&
2860 ((tp->t_idleat == 0) || ((th->th_seq -
2861 tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2862 tp->t_flagsext |= TF_LRO_OFFLOADED;
2863 tcp_start_coalescing(ip, th, tlen);
2864 tp->t_idleat = 0;
2865 }
2866
2867 /* Clean receiver SACK report if present */
2868 if (SACK_ENABLED(tp) && tp->rcv_numsacks)
2869 tcp_clean_sackreport(tp);
2870 ++tcpstat.tcps_preddat;
2871 tp->rcv_nxt += tlen;
2872 /*
2873 * Pull snd_wl1 up to prevent seq wrap relative to
2874 * th_seq.
2875 */
2876 tp->snd_wl1 = th->th_seq;
2877 /*
2878 * Pull rcv_up up to prevent seq wrap relative to
2879 * rcv_nxt.
2880 */
2881 tp->rcv_up = tp->rcv_nxt;
2882 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
2883 tcpstat.tcps_rcvbyte += tlen;
2884 if (nstat_collect) {
2885 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2886 INP_ADD_STAT(inp, cell, wifi, wired,
2887 rxpackets, m->m_pkthdr.lro_npkts);
2888 } else {
2889 INP_ADD_STAT(inp, cell, wifi, wired,
2890 rxpackets, 1);
2891 }
2892 INP_ADD_STAT(inp, cell, wifi, wired,rxbytes,
2893 tlen);
2894 }
2895
2896 /*
2897 * Calculate the RTT on the receiver only if the
2898 * connection is in streaming mode and the last
2899 * packet was not an end-of-write
2900 */
2901 if ((tp->t_flags & TF_STRETCHACK) &&
2902 !(tp->t_flagsext & TF_STREAMEOW))
2903 tcp_compute_rtt(tp, &to, th);
2904
2905 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
2906
2907 /*
2908 * Add data to socket buffer.
2909 */
2910 so_recv_data_stat(so, m, 0);
2911 m_adj(m, drop_hdrlen); /* delayed header drop */
2912
2913 /*
2914 * If message delivery (SOF_ENABLE_MSGS) is enabled on
2915 * this socket, deliver the packet received as an
2916 * in-order message with sequence number attached to it.
2917 */
2918 if (sbappendstream_rcvdemux(so, m,
2919 th->th_seq - (tp->irs + 1), 0)) {
2920 sorwakeup(so);
2921 }
2922 #if INET6
2923 if (isipv6) {
2924 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2925 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2926 th->th_seq, th->th_ack, th->th_win);
2927 }
2928 else
2929 #endif
2930 {
2931 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2932 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2933 th->th_seq, th->th_ack, th->th_win);
2934 }
2935 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2936 if (DELAY_ACK(tp, th)) {
2937 if ((tp->t_flags & TF_DELACK) == 0) {
2938 tp->t_flags |= TF_DELACK;
2939 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2940 }
2941 } else {
2942 tp->t_flags |= TF_ACKNOW;
2943 tcp_output(tp);
2944 }
2945
2946 tcp_adaptive_rwtimo_check(tp, tlen);
2947
2948 if (tlen > 0)
2949 tcp_tfo_rcv_data(tp);
2950
2951 tcp_check_timer_state(tp);
2952 tcp_unlock(so, 1, 0);
2953 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2954 return;
2955 }
2956 }
2957
2958 /*
2959 * Calculate amount of space in receive window,
2960 * and then do TCP input processing.
2961 * Receive window is amount of space in rcv queue,
2962 * but not less than advertised window.
2963 */
2964 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2965 LCK_MTX_ASSERT_OWNED);
2966 win = tcp_sbspace(tp);
2967 if (win < 0)
2968 win = 0;
2969 else { /* clip rcv window to 4K for modems */
2970 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2971 win = min(win, slowlink_wsize);
2972 }
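/*
 * Never shrink an offered window: rcv_adv - rcv_nxt is how much the
 * peer may still legitimately send under our last advertisement, so
 * the imax() below keeps rcv_wnd at least that large even if the
 * socket-buffer space has since shrunk (a sketch of the rationale;
 * RFC 1122 says the window should not be shrunk).
 */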
2973 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2974 #if MPTCP
2975 /*
2976 * Ensure that the subflow receive window isn't greater
2977 * than the connection level receive window.
2978 */
2979 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
2980 (mp_tp = tptomptp(tp))) {
2981 MPT_LOCK(mp_tp);
2982 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
2983 tp->rcv_wnd = mp_tp->mpt_rcvwnd;
2984 tcpstat.tcps_mp_reducedwin++;
2985 }
2986 MPT_UNLOCK(mp_tp);
2987 }
2988 #endif /* MPTCP */
2989
2990 switch (tp->t_state) {
2991
2992 /*
2993 * Initialize tp->rcv_nxt, and tp->irs, select an initial
2994 * tp->iss, and send a segment:
2995 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2996 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
2997 * Fill in remote peer address fields if not previously specified.
2998 * Enter SYN_RECEIVED state, and process any other fields of this
2999 * segment in this state.
3000 */
3001 case TCPS_LISTEN: {
3002 register struct sockaddr_in *sin;
3003 #if INET6
3004 register struct sockaddr_in6 *sin6;
3005 #endif
3006
3007 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3008 LCK_MTX_ASSERT_OWNED);
3009 #if INET6
3010 if (isipv6) {
3011 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
3012 M_SONAME, M_NOWAIT);
3013 if (sin6 == NULL)
3014 goto drop;
3015 bzero(sin6, sizeof(*sin6));
3016 sin6->sin6_family = AF_INET6;
3017 sin6->sin6_len = sizeof(*sin6);
3018 sin6->sin6_addr = ip6->ip6_src;
3019 sin6->sin6_port = th->th_sport;
3020 laddr6 = inp->in6p_laddr;
3021 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
3022 inp->in6p_laddr = ip6->ip6_dst;
3023 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
3024 proc0)) {
3025 inp->in6p_laddr = laddr6;
3026 FREE(sin6, M_SONAME);
3027 goto drop;
3028 }
3029 FREE(sin6, M_SONAME);
3030 } else
3031 #endif
3032 {
3033 lck_mtx_assert(
3034 &((struct inpcb *)so->so_pcb)->inpcb_mtx,
3035 LCK_MTX_ASSERT_OWNED);
3036 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
3037 M_NOWAIT);
3038 if (sin == NULL)
3039 goto drop;
3040 sin->sin_family = AF_INET;
3041 sin->sin_len = sizeof(*sin);
3042 sin->sin_addr = ip->ip_src;
3043 sin->sin_port = th->th_sport;
3044 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
3045 laddr = inp->inp_laddr;
3046 if (inp->inp_laddr.s_addr == INADDR_ANY)
3047 inp->inp_laddr = ip->ip_dst;
3048 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
3049 IFSCOPE_NONE, NULL)) {
3050 inp->inp_laddr = laddr;
3051 FREE(sin, M_SONAME);
3052 goto drop;
3053 }
3054 FREE(sin, M_SONAME);
3055 }
3056
3057 tcp_dooptions(tp, optp, optlen, th, &to);
3058 tcp_finalize_options(tp, &to, ifscope);
3059
3060 if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to))
3061 isconnected = TRUE;
3062
3063 if (iss)
3064 tp->iss = iss;
3065 else {
3066 tp->iss = tcp_new_isn(tp);
3067 }
3068 tp->irs = th->th_seq;
3069 tcp_sendseqinit(tp);
3070 tcp_rcvseqinit(tp);
3071 tp->snd_recover = tp->snd_una;
3072 /*
3073 * Initialization of the tcpcb for transaction;
3074 * set SND.WND = SEG.WND,
3075 * initialize CCsend and CCrecv.
3076 */
3077 tp->snd_wnd = tiwin; /* initial send-window */
3078 tp->t_flags |= TF_ACKNOW;
3079 tp->t_unacksegs = 0;
3080 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3081 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3082 tp->t_state = TCPS_SYN_RECEIVED;
3083 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3084 TCP_CONN_KEEPINIT(tp));
3085 dropsocket = 0; /* committed to socket */
3086
3087 if (inp->inp_flowhash == 0)
3088 inp->inp_flowhash = inp_calc_flowhash(inp);
3089 #if INET6
3090 /* update flowinfo - RFC 6437 */
3091 if (inp->inp_flow == 0 &&
3092 inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
3093 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
3094 inp->inp_flow |=
3095 (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
3096 }
3097 #endif /* INET6 */
3098
3099 /* reset the incomp processing flag */
3100 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
3101 tcpstat.tcps_accepts++;
3102 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
3103 /* ECN-setup SYN */
3104 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
3105 }
3106
3107 #if CONFIG_IFEF_NOWINDOWSCALE
3108 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
3109 (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
3110 /* Window scaling is not enabled on this interface */
3111 tp->t_flags &= ~TF_REQ_SCALE;
3112 }
3113 #endif
3114 goto trimthenstep6;
3115 }
3116
3117 /*
3118 * If the state is SYN_RECEIVED and the seg contains an ACK,
3119 * but not for our SYN/ACK, send a RST.
3120 */
3121 case TCPS_SYN_RECEIVED:
3122 if ((thflags & TH_ACK) &&
3123 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
3124 SEQ_GT(th->th_ack, tp->snd_max))) {
3125 rstreason = BANDLIM_RST_OPENPORT;
3126 IF_TCP_STATINC(ifp, ooopacket);
3127 goto dropwithreset;
3128 }
3129
3130 /*
3131 * In SYN_RECEIVED state, if we recv some SYNs with
3132 * window scale and others without, window scaling should
3133 * be disabled. Otherwise the window advertised will be
3134 * lower if we assume scaling and the other end does not.
3135 */
3136 if ((thflags & TH_SYN) &&
3137 (tp->irs == th->th_seq) &&
3138 !(to.to_flags & TOF_SCALE))
3139 tp->t_flags &= ~TF_RCVD_SCALE;
3140 break;
3141
3142 /*
3143 * If the state is SYN_SENT:
3144 * if seg contains an ACK, but not for our SYN, drop the input.
3145 * if seg contains a RST, then drop the connection.
3146 * if seg does not contain SYN, then drop it.
3147 * Otherwise this is an acceptable SYN segment
3148 * initialize tp->rcv_nxt and tp->irs
3149 * if seg contains ack then advance tp->snd_una
3150 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
3151 * arrange for segment to be acked (eventually)
3152 * continue processing rest of data/controls, beginning with URG
3153 */
3154 case TCPS_SYN_SENT:
3155 if ((thflags & TH_ACK) &&
3156 (SEQ_LEQ(th->th_ack, tp->iss) ||
3157 SEQ_GT(th->th_ack, tp->snd_max))) {
3158 rstreason = BANDLIM_UNLIMITED;
3159 IF_TCP_STATINC(ifp, ooopacket);
3160 goto dropwithreset;
3161 }
3162 if (thflags & TH_RST) {
3163 if ((thflags & TH_ACK) != 0) {
3164 #if MPTCP
3165 if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
3166 SEQ_GT(th->th_ack, tp->iss+1)) {
3167 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
3168 /* ignore the RST and retransmit SYN */
3169 goto drop;
3170 }
3171 #endif /* MPTCP */
3172 soevent(so,
3173 (SO_FILT_HINT_LOCKED |
3174 SO_FILT_HINT_CONNRESET));
3175 tp = tcp_drop(tp, ECONNREFUSED);
3176 postevent(so, 0, EV_RESET);
3177 }
3178 goto drop;
3179 }
3180 if ((thflags & TH_SYN) == 0)
3181 goto drop;
3182 tp->snd_wnd = th->th_win; /* initial send window */
3183
3184 tp->irs = th->th_seq;
3185 tcp_rcvseqinit(tp);
3186 if (thflags & TH_ACK) {
3187 tcpstat.tcps_connects++;
3188
3189 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
3190 /* ECN-setup SYN-ACK */
3191 tp->ecn_flags |= TE_SETUPRECEIVED;
3192 if (TCP_ECN_ENABLED(tp))
3193 tcpstat.tcps_ecn_client_success++;
3194 } else {
3195 if (tp->ecn_flags & TE_SETUPSENT &&
3196 tp->t_rxtshift == 0)
3197 tcpstat.tcps_ecn_not_supported++;
3198 /* non-ECN-setup SYN-ACK */
3199 tp->ecn_flags &= ~TE_SENDIPECT;
3200 }
3201
3202 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
3203 /* XXXMAC: recursive lock: SOCK_LOCK(so); */
3204 mac_socketpeer_label_associate_mbuf(m, so);
3205 /* XXXMAC: SOCK_UNLOCK(so); */
3206 #endif
3207 /* Do window scaling on this connection? */
3208 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3209 tp->snd_scale = tp->requested_s_scale;
3210 tp->rcv_scale = tp->request_r_scale;
3211 }
3212
3213 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
3214 tp->snd_una++; /* SYN is acked */
3215 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3216 tp->snd_nxt = tp->snd_una;
3217
3218 /*
3219 * We have sent more in the SYN than what is being
3220 * acked (e.g., TFO).
3221 * We should immediately restart sending from what the
3222 * receiver has acknowledged.
3223 */
3224 if (SEQ_GT(tp->snd_nxt, th->th_ack))
3225 tp->snd_nxt = th->th_ack;
3226
3227 /*
3228 * If there's data, delay ACK; if there's also a FIN
3229 * ACKNOW will be turned on later.
3230 */
3231 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
3232 if (DELAY_ACK(tp, th) && tlen != 0 ) {
3233 if ((tp->t_flags & TF_DELACK) == 0) {
3234 tp->t_flags |= TF_DELACK;
3235 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3236 }
3237 }
3238 else {
3239 tp->t_flags |= TF_ACKNOW;
3240 }
3241 /*
3242 * Received <SYN,ACK> in SYN_SENT[*] state.
3243 * Transitions:
3244 * SYN_SENT --> ESTABLISHED
3245 * SYN_SENT* --> FIN_WAIT_1
3246 */
3247 tp->t_starttime = tcp_now;
3248 tcp_sbrcv_tstmp_check(tp);
3249 if (tp->t_flags & TF_NEEDFIN) {
3250 DTRACE_TCP4(state__change, void, NULL,
3251 struct inpcb *, inp,
3252 struct tcpcb *, tp, int32_t,
3253 TCPS_FIN_WAIT_1);
3254 tp->t_state = TCPS_FIN_WAIT_1;
3255 tp->t_flags &= ~TF_NEEDFIN;
3256 thflags &= ~TH_SYN;
3257 } else {
3258 DTRACE_TCP4(state__change, void, NULL,
3259 struct inpcb *, inp, struct tcpcb *,
3260 tp, int32_t, TCPS_ESTABLISHED);
3261 tp->t_state = TCPS_ESTABLISHED;
3262 tp->t_timer[TCPT_KEEP] =
3263 OFFSET_FROM_START(tp,
3264 TCP_CONN_KEEPIDLE(tp));
3265 if (nstat_collect)
3266 nstat_route_connect_success(
3267 tp->t_inpcb->inp_route.ro_rt);
3268 }
3269 #if MPTCP
3270 /*
3271 * Do not send the connect notification for additional
3272 * subflows until ACK for 3-way handshake arrives.
3273 */
3274 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3275 (tp->t_mpflags & TMPF_SENT_JOIN)) {
3276 isconnected = FALSE;
3277 /* Start data xmit if fastjoin */
3278 if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) {
3279 soevent(so, (SO_FILT_HINT_LOCKED |
3280 SO_FILT_HINT_MPFASTJ));
3281 }
3282 } else
3283 #endif /* MPTCP */
3284 isconnected = TRUE;
3285
3286 if (tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) {
3287 tcp_tfo_synack(tp, &to);
3288
3289 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
3290 SEQ_LT(tp->snd_una, th->th_ack)) {
3291 tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED;
3292 tcpstat.tcps_tfo_syn_data_acked++;
3293
3294 if (!(tp->t_tfo_flags & TFO_F_NO_RCVPROBING))
3295 tcp_tfo_rcv_probe(tp, tlen);
3296 }
3297 }
3298 } else {
3299 /*
3300 * Received initial SYN in SYN-SENT[*] state => simul-
3301 * taneous open. If segment contains CC option and there is
3302 * a cached CC, apply TAO test; if it succeeds, connection is
3303 * half-synchronized. Otherwise, do 3-way handshake:
3304 * SYN-SENT -> SYN-RECEIVED
3305 * SYN-SENT* -> SYN-RECEIVED*
3306 */
3307 tp->t_flags |= TF_ACKNOW;
3308 tp->t_timer[TCPT_REXMT] = 0;
3309 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3310 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3311 tp->t_state = TCPS_SYN_RECEIVED;
3312
3313 /*
3314 * During simultaneous open, TFO should not be used.
3315 * So, we disable it here, to prevent data from being
3316 * sent on the SYN/ACK.
3317 */
3318 tcp_disable_tfo(tp);
3319 }
3320
3321 trimthenstep6:
3322 /*
3323 * Advance th->th_seq to correspond to first data byte.
3324 * If data, trim to stay within window,
3325 * dropping FIN if necessary.
3326 */
3327 th->th_seq++;
3328 if (tlen > tp->rcv_wnd) {
3329 todrop = tlen - tp->rcv_wnd;
3330 m_adj(m, -todrop);
3331 tlen = tp->rcv_wnd;
3332 thflags &= ~TH_FIN;
3333 tcpstat.tcps_rcvpackafterwin++;
3334 tcpstat.tcps_rcvbyteafterwin += todrop;
3335 }
3336 tp->snd_wl1 = th->th_seq - 1;
3337 tp->rcv_up = th->th_seq;
3338 /*
3339 * Client side of transaction: already sent SYN and data.
3340 * If the remote host used T/TCP to validate the SYN,
3341 * our data will be ACK'd; if so, enter normal data segment
3342 * processing in the middle of step 5, ack processing.
3343 * Otherwise, goto step 6.
3344 */
3345 if (thflags & TH_ACK)
3346 goto process_ACK;
3347 goto step6;
3348 /*
3349 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3350 * do normal processing.
3351 *
3352 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
3353 */
3354 case TCPS_LAST_ACK:
3355 case TCPS_CLOSING:
3356 case TCPS_TIME_WAIT:
3357 break; /* continue normal processing */
3358
3359 /* Received a SYN while connection is already established.
3360 * This is a "half open connection and other anomalies" described
3361 * in RFC793 page 34; send an ACK so the remote resets the connection
3362 * or recovers by adjusting its sequence numbering.
3363 */
3364 case TCPS_ESTABLISHED:
3365 if (thflags & TH_SYN)
3366 goto dropafterack;
3367 break;
3368 }
3369
3370 /*
3371 * States other than LISTEN or SYN_SENT.
3372 * First check the RST flag and sequence number since reset segments
3373 * are exempt from the timestamp and connection count tests. This
3374 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3375 * below which allowed reset segments in half the sequence space
3376 * to fall through and be processed (which gives forged reset
3377 * segments with a random sequence number a 50 percent chance of
3378 * killing a connection).
3379 * Then check timestamp, if present.
3380 * Then check the connection count, if present.
3381 * Then check that at least some bytes of segment are within
3382 * receive window. If segment begins before rcv_nxt,
3383 * drop leading data (and SYN); if nothing left, just ack.
3384 *
3385 *
3386 * If the RST bit is set, check the sequence number to see
3387 * if this is a valid reset segment.
3388 * RFC 793 page 37:
3389 * In all states except SYN-SENT, all reset (RST) segments
3390 * are validated by checking their SEQ-fields. A reset is
3391 * valid if its sequence number is in the window.
3392 * Note: this does not take into account delayed ACKs, so
3393 * we should test against last_ack_sent instead of rcv_nxt.
3394 * The sequence number in the reset segment is normally an
3395 * echo of our outgoing acknowledgement numbers, but some hosts
3396 * send a reset with the sequence number at the rightmost edge
3397 * of our receive window, and we have to handle this case.
3398 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3399 * that brute force RST attacks are possible. To combat this,
3400 * we use a much stricter check while in the ESTABLISHED state,
3401 * only accepting RSTs where the sequence number is equal to
3402 * last_ack_sent. In all other states (the states in which a
3403 * RST is more likely), the more permissive check is used.
3404 * If we have multiple segments in flight, the initial reset
3405 * segment sequence numbers will be to the left of last_ack_sent,
3406 * but they will eventually catch up.
3407 * In any case, it never made sense to trim reset segments to
3408 * fit the receive window since RFC 1122 says:
3409 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
3410 *
3411 * A TCP SHOULD allow a received RST segment to include data.
3412 *
3413 * DISCUSSION
3414 * It has been suggested that a RST segment could contain
3415 * ASCII text that encoded and explained the cause of the
3416 * RST. No standard has yet been established for such
3417 * data.
3418 *
3419 * If the reset segment passes the sequence number test examine
3420 * the state:
3421 * SYN_RECEIVED STATE:
3422 * If passive open, return to LISTEN state.
3423 * If active open, inform user that connection was refused.
3424 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
3425 * Inform user that connection was reset, and close tcb.
3426 * CLOSING, LAST_ACK STATES:
3427 * Close the tcb.
3428 * TIME_WAIT STATE:
3429 * Drop the segment - see Stevens, vol. 2, p. 964 and
3430 * RFC 1337.
3431 *
3432 * Radar 4803931: Allows for the case where we ACKed the FIN but
3433 * there is already a RST in flight from the peer.
3434 * In that case, accept the RST for non-established
3435 * state if it's one off from last_ack_sent.
3436 *
3437 */
3438 if (thflags & TH_RST) {
3439 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
3440 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
3441 (tp->rcv_wnd == 0 &&
3442 ((tp->last_ack_sent == th->th_seq) ||
3443 ((tp->last_ack_sent -1) == th->th_seq)))) {
3444 switch (tp->t_state) {
3445
3446 case TCPS_SYN_RECEIVED:
3447 IF_TCP_STATINC(ifp, rstinsynrcv);
3448 so->so_error = ECONNREFUSED;
3449 goto close;
3450
3451 case TCPS_ESTABLISHED:
3452 if (tp->last_ack_sent != th->th_seq) {
3453 tcpstat.tcps_badrst++;
3454 goto drop;
3455 }
3456 case TCPS_FIN_WAIT_1:
3457 case TCPS_CLOSE_WAIT:
3458 /*
3459 * Fall through ...
3460 */
3461 case TCPS_FIN_WAIT_2:
3462 so->so_error = ECONNRESET;
3463 close:
3464 postevent(so, 0, EV_RESET);
3465 soevent(so,
3466 (SO_FILT_HINT_LOCKED |
3467 SO_FILT_HINT_CONNRESET));
3468
3469 tcpstat.tcps_drops++;
3470 tp = tcp_close(tp);
3471 break;
3472
3473 case TCPS_CLOSING:
3474 case TCPS_LAST_ACK:
3475 tp = tcp_close(tp);
3476 break;
3477
3478 case TCPS_TIME_WAIT:
3479 break;
3480 }
3481 }
3482 goto drop;
3483 }
3484
3485 /*
3486 * RFC 1323 PAWS: If we have a timestamp reply on this segment
3487 * and it's less than ts_recent, drop it.
3488 */
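/*
 * On the 24-day cutoff used below: RFC 1323 bounds the timestamp clock
 * at one tick per millisecond at the fastest, and 2^31 ms is roughly
 * 24.8 days, so once ts_recent is more than 24 days old the signed
 * TSTMP_LT() comparison can no longer be trusted; ts_recent is
 * invalidated instead of dropping the segment.
 */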
3489 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3490 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3491
3492 /* Check to see if ts_recent is over 24 days old. */
3493 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3494 /*
3495 * Invalidate ts_recent. If this segment updates
3496 * ts_recent, the age will be reset later and ts_recent
3497 * will get a valid value. If it does not, setting
3498 * ts_recent to zero will at least satisfy the
3499 * requirement that zero be placed in the timestamp
3500 * echo reply when ts_recent isn't valid. The
3501 * age isn't reset until we get a valid ts_recent
3502 * because we don't want out-of-order segments to be
3503 * dropped when ts_recent is old.
3504 */
3505 tp->ts_recent = 0;
3506 } else {
3507 tcpstat.tcps_rcvduppack++;
3508 tcpstat.tcps_rcvdupbyte += tlen;
3509 tcpstat.tcps_pawsdrop++;
3510 if (nstat_collect) {
3511 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3512 1, tlen, NSTAT_RX_FLAG_DUPLICATE);
3513 INP_ADD_STAT(inp, cell, wifi, wired,
3514 rxpackets, 1);
3515 INP_ADD_STAT(inp, cell, wifi, wired,
3516 rxbytes, tlen);
3517 tp->t_stat.rxduplicatebytes += tlen;
3518 }
3519 if (tlen > 0)
3520 goto dropafterack;
3521 goto drop;
3522 }
3523 }
3524
3525 /*
3526 * In the SYN-RECEIVED state, validate that the packet belongs to
3527 * this connection before trimming the data to fit the receive
3528 * window. Check the sequence number versus IRS since we know
3529 * the sequence numbers haven't wrapped. This is a partial fix
3530 * for the "LAND" DoS attack.
3531 */
3532 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3533 rstreason = BANDLIM_RST_OPENPORT;
3534 IF_TCP_STATINC(ifp, dospacket);
3535 goto dropwithreset;
3536 }
3537
3538 todrop = tp->rcv_nxt - th->th_seq;
3539 if (todrop > 0) {
3540 if (thflags & TH_SYN) {
3541 thflags &= ~TH_SYN;
3542 th->th_seq++;
3543 if (th->th_urp > 1)
3544 th->th_urp--;
3545 else
3546 thflags &= ~TH_URG;
3547 todrop--;
3548 }
3549 /*
3550 * Following if statement from Stevens, vol. 2, p. 960.
3551 */
3552 if (todrop > tlen
3553 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
3554 /*
3555 * Any valid FIN must be to the left of the window.
3556 * At this point the FIN must be a duplicate or out
3557 * of sequence; drop it.
3558 */
3559 thflags &= ~TH_FIN;
3560
3561 /*
3562 * Send an ACK to resynchronize and drop any data.
3563 * But keep on processing for RST or ACK.
3564 */
3565 tp->t_flags |= TF_ACKNOW;
3566 if (todrop == 1) {
3567 /* This could be a keepalive */
3568 soevent(so, SO_FILT_HINT_LOCKED |
3569 SO_FILT_HINT_KEEPALIVE);
3570 }
3571 todrop = tlen;
3572 tcpstat.tcps_rcvduppack++;
3573 tcpstat.tcps_rcvdupbyte += todrop;
3574 } else {
3575 tcpstat.tcps_rcvpartduppack++;
3576 tcpstat.tcps_rcvpartdupbyte += todrop;
3577 }
3578
3579 if (TCP_DSACK_ENABLED(tp) && todrop > 1) {
3580 /*
3581 * Note the duplicate data sequence space so that
3582 * it can be reported in DSACK option.
3583 */
3584 tp->t_dsack_lseq = th->th_seq;
3585 tp->t_dsack_rseq = th->th_seq + todrop;
3586 tp->t_flags |= TF_ACKNOW;
3587 }
3588 if (nstat_collect) {
3589 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3590 todrop, NSTAT_RX_FLAG_DUPLICATE);
3591 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
3592 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
3593 tp->t_stat.rxduplicatebytes += todrop;
3594 }
3595 drop_hdrlen += todrop; /* drop from the top afterwards */
3596 th->th_seq += todrop;
3597 tlen -= todrop;
3598 if (th->th_urp > todrop)
3599 th->th_urp -= todrop;
3600 else {
3601 thflags &= ~TH_URG;
3602 th->th_urp = 0;
3603 }
3604 }
3605
3606 /*
3607 * If new data are received on a connection after the user
3608 * processes are gone, then RST the other end.
3609 * Also send a RST when we receive a data segment after we've
3610 * sent our FIN while the socket is defunct.
3611 * Note that an MPTCP subflow socket would have SS_NOFDREF set
3612 * by default, so make sure we also test the SOF_MP_SUBFLOW
3613 * socket flag (which is cleared when the socket is closed).
3614 */
3615 if (!(so->so_flags & SOF_MP_SUBFLOW) && tlen &&
3616 (((so->so_state & SS_NOFDREF) &&
3617 tp->t_state > TCPS_CLOSE_WAIT) ||
3618 ((so->so_flags & SOF_DEFUNCT) &&
3619 tp->t_state > TCPS_FIN_WAIT_1))) {
3620 tp = tcp_close(tp);
3621 tcpstat.tcps_rcvafterclose++;
3622 rstreason = BANDLIM_UNLIMITED;
3623 IF_TCP_STATINC(ifp, cleanup);
3624 goto dropwithreset;
3625 }
3626
3627 /*
3628 * If segment ends after window, drop trailing data
3629 * (and PUSH and FIN); if nothing left, just ACK.
3630 */
3631 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
3632 if (todrop > 0) {
3633 tcpstat.tcps_rcvpackafterwin++;
3634 if (todrop >= tlen) {
3635 tcpstat.tcps_rcvbyteafterwin += tlen;
3636 /*
3637 * If a new connection request is received
3638 * while in TIME_WAIT, drop the old connection
3639 * and start over if the sequence numbers
3640 * are above the previous ones.
3641 */
3642 if (thflags & TH_SYN &&
3643 tp->t_state == TCPS_TIME_WAIT &&
3644 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
3645 iss = tcp_new_isn(tp);
3646 tp = tcp_close(tp);
3647 tcp_unlock(so, 1, 0);
3648 goto findpcb;
3649 }
3650 /*
3651 * If the window is closed we can only take segments at the
3652 * window edge, and have to drop data and PUSH from
3653 * incoming segments. Continue processing, but
3654 * remember to ack. Otherwise, drop segment
3655 * and ack.
3656 */
3657 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3658 tp->t_flags |= TF_ACKNOW;
3659 tcpstat.tcps_rcvwinprobe++;
3660 } else
3661 goto dropafterack;
3662 } else
3663 tcpstat.tcps_rcvbyteafterwin += todrop;
3664 m_adj(m, -todrop);
3665 tlen -= todrop;
3666 thflags &= ~(TH_PUSH|TH_FIN);
3667 }
3668
3669 /*
3670 * If last ACK falls within this segment's sequence numbers,
3671 * record its timestamp.
3672 * NOTE:
3673 * 1) That the test incorporates suggestions from the latest
3674 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
3675 * 2) That updating only on newer timestamps interferes with
3676 * our earlier PAWS tests, so this check should be solely
3677 * predicated on the sequence space of this segment.
3678 * 3) That we modify the segment boundary check to be
3679 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
3680 * instead of RFC1323's
3681 * Last.ACK.Sent < SEG.SEQ + SEG.Len.
3682 * This modified check allows us to overcome RFC1323's
3683 * limitations as described in Stevens TCP/IP Illustrated
3684 * Vol. 2 p.869. In such cases, we can still calculate the
3685 * RTT correctly when RCV.NXT == Last.ACK.Sent.
3686 */
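/*
 * Illustration of point 3 above: for a segment with SEG.Len == 0 and
 * SEG.SEQ == Last.ACK.Sent, RFC 1323's strict "<" test would skip the
 * ts_recent update; the "<=" form below still records it, so the RTT
 * can be computed when RCV.NXT == Last.ACK.Sent.
 */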
3687 if ((to.to_flags & TOF_TS) != 0 &&
3688 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3689 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3690 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
3691 tp->ts_recent_age = tcp_now;
3692 tp->ts_recent = to.to_tsval;
3693 }
3694
3695 /*
3696 * If a SYN is in the window, then this is an
3697 * error and we send an RST and drop the connection.
3698 */
3699 if (thflags & TH_SYN) {
3700 tp = tcp_drop(tp, ECONNRESET);
3701 rstreason = BANDLIM_UNLIMITED;
3702 postevent(so, 0, EV_RESET);
3703 IF_TCP_STATINC(ifp, synwindow);
3704 goto dropwithreset;
3705 }
3706
3707 /*
3708 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
3709 * flag is on (half-synchronized state), then queue data for
3710 * later processing; else drop segment and return.
3711 */
3712 if ((thflags & TH_ACK) == 0) {
3713 if (tp->t_state == TCPS_SYN_RECEIVED ||
3714 (tp->t_flags & TF_NEEDSYN)) {
3715 if ((tfo_enabled(tp))) {
3716 /*
3717 * So, we received a valid segment while in
3718 * SYN-RECEIVED (TF_NEEDSYN is actually never
3719 * set, so this is dead code).
3720 * As this cannot be an RST (see the check a bit
3721 * higher), and it does not have the ACK flag
3722 * set, we want to retransmit the SYN/ACK.
3723 * Thus, we have to reset snd_nxt to snd_una to
3724 * trigger retransmission of the
3725 * SYN/ACK. This is more consistent with the
3726 * behavior of tcp_output(), which expects
3727 * to send the segment that is pointed to by
3728 * snd_nxt.
3729 */
3730 tp->snd_nxt = tp->snd_una;
3731
3732 /*
3733 * We need to make absolutely sure that we are
3734 * going to reply to a duplicate SYN segment.
3735 */
3736 if (th->th_flags & TH_SYN)
3737 needoutput = 1;
3738 }
3739
3740 goto step6;
3741 } else if (tp->t_flags & TF_ACKNOW)
3742 goto dropafterack;
3743 else
3744 goto drop;
3745 }
3746
3747 /*
3748 * Ack processing.
3749 */
3750
3751 switch (tp->t_state) {
3752
3753 /*
3754 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3755 * ESTABLISHED state and continue processing.
3756 * The ACK was checked above.
3757 */
3758 case TCPS_SYN_RECEIVED:
3759
3760 tcpstat.tcps_connects++;
3761
3762 /* Do window scaling? */
3763 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3764 tp->snd_scale = tp->requested_s_scale;
3765 tp->rcv_scale = tp->request_r_scale;
3766 tp->snd_wnd = th->th_win << tp->snd_scale;
3767 tiwin = tp->snd_wnd;
3768 }
3769 /*
3770 * Make transitions:
3771 * SYN-RECEIVED -> ESTABLISHED
3772 * SYN-RECEIVED* -> FIN-WAIT-1
3773 */
3774 tp->t_starttime = tcp_now;
3775 tcp_sbrcv_tstmp_check(tp);
3776 if (tp->t_flags & TF_NEEDFIN) {
3777 DTRACE_TCP4(state__change, void, NULL,
3778 struct inpcb *, inp,
3779 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
3780 tp->t_state = TCPS_FIN_WAIT_1;
3781 tp->t_flags &= ~TF_NEEDFIN;
3782 } else {
3783 DTRACE_TCP4(state__change, void, NULL,
3784 struct inpcb *, inp,
3785 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
3786 tp->t_state = TCPS_ESTABLISHED;
3787 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3788 TCP_CONN_KEEPIDLE(tp));
3789 if (nstat_collect)
3790 nstat_route_connect_success(
3791 tp->t_inpcb->inp_route.ro_rt);
3792 }
3793 /*
3794 * If segment contains data or FIN, will call tcp_reass()
3795 * later; if not, do so now to pass queued data to user.
3796 */
3797 if (tlen == 0 && (thflags & TH_FIN) == 0)
3798 (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
3799 NULL, ifp);
3800 tp->snd_wl1 = th->th_seq - 1;
3801
3802 #if MPTCP
3803 /*
3804 * Do not send the connect notification for additional subflows
3805 * until ACK for 3-way handshake arrives.
3806 */
3807 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3808 (tp->t_mpflags & TMPF_SENT_JOIN)) {
3809 isconnected = FALSE;
3810 } else
3811 #endif /* MPTCP */
3812 isconnected = TRUE;
3813 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
3814 /* This was already done when receiving the SYN */
3815 isconnected = FALSE;
3816
3817 OSDecrementAtomic(&tcp_tfo_halfcnt);
3818
3819 /* Panic if something has gone terribly wrong. */
3820 VERIFY(tcp_tfo_halfcnt >= 0);
3821
3822 tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
3823 }
3824
3825 /*
3826 * In case there is data in the send-queue (e.g., TFO is being
3827 * used, or connectx+data has been done), then if we simply
3828 * fell through, we would handle this ACK as if data had
3829 * been acknowledged. But we have to prevent this, and we
3830 * do so by increasing snd_una by 1, so that the SYN is not
3831 * counted as data (snd_una++ is actually also done in
3832 * SYN_SENT state as part of the regular TCP stack).
3833 *
3834 * In case there is data on this ack as well, the data will be
3835 * handled by the label "dodata" right after step6.
3836 */
3837 if (so->so_snd.sb_cc) {
3838 tp->snd_una++; /* SYN is acked */
3839 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3840 tp->snd_nxt = tp->snd_una;
3841
3842 /*
3843 * No duplicate-ACK handling is needed. So, we
3844 * directly advance to processing the ACK (i.e.,
3845 * updating the RTT estimation, etc.).
3846 *
3847 * But we first need to handle any SACKs,
3848 * because TFO will start sending data with the
3849 * SYN/ACK, so it might be that the client
3850 * includes a SACK with its ACK.
3851 */
3852 if (SACK_ENABLED(tp) &&
3853 (to.to_nsacks > 0 ||
3854 !TAILQ_EMPTY(&tp->snd_holes)))
3855 tcp_sack_doack(tp, &to, th,
3856 &sack_bytes_acked);
3857
3858 goto process_ACK;
3859 }
3860
3861 /* FALLTHROUGH */
3862
3863 /*
3864 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
3865 * ACKs. If the ack is in the range
3866 * tp->snd_una < th->th_ack <= tp->snd_max
3867 * then advance tp->snd_una to th->th_ack and drop
3868 * data from the retransmission queue. If this ACK reflects
3869 * more up-to-date window information, we update our window.
3870 */
3871 case TCPS_ESTABLISHED:
3872 case TCPS_FIN_WAIT_1:
3873 case TCPS_FIN_WAIT_2:
3874 case TCPS_CLOSE_WAIT:
3875 case TCPS_CLOSING:
3876 case TCPS_LAST_ACK:
3877 case TCPS_TIME_WAIT:
3878 if (SEQ_GT(th->th_ack, tp->snd_max)) {
3879 tcpstat.tcps_rcvacktoomuch++;
3880 goto dropafterack;
3881 }
3882 if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
3883 recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
3884 /*
3885 * If DSACK is received and this packet has no
3886 * other SACK information, it can be dropped.
3887 * We do not want to treat it as a duplicate ack.
3888 */
3889 if (recvd_dsack &&
3890 SEQ_LEQ(th->th_ack, tp->snd_una) &&
3891 to.to_nsacks == 0) {
3892 tcp_bad_rexmt_check(tp, th, &to);
3893 goto drop;
3894 }
3895 }
3896
3897 if (SACK_ENABLED(tp) &&
3898 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
3899 tcp_sack_doack(tp, &to, th, &sack_bytes_acked);
3900
3901 #if MPTCP
3902 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
3903 if (tp->t_mpflags & TMPF_PREESTABLISHED) {
3904 /* MP TCP establishment succeeded */
3905 tp->t_mpuna = 0;
3906 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
3907 if (tp->t_mpflags & TMPF_SENT_JOIN) {
3908 tp->t_mpflags &=
3909 ~TMPF_PREESTABLISHED;
3910 tp->t_mpflags |=
3911 TMPF_MPTCP_TRUE;
3912 so->so_flags |= SOF_MPTCP_TRUE;
3913 mptcplog((LOG_DEBUG, "MPTCP "
3914 "Sockets: %s \n",__func__),
3915 MPTCP_SOCKET_DBG,
3916 MPTCP_LOGLVL_LOG);
3917
3918 tp->t_timer[TCPT_JACK_RXMT] = 0;
3919 tp->t_mprxtshift = 0;
3920 isconnected = TRUE;
3921 } else {
3922 isconnected = FALSE;
3923 }
3924 } else {
3925 isconnected = TRUE;
3926 tp->t_mpflags &= ~TMPF_SENT_KEYS;
3927 }
3928 }
3929 }
3930 #endif /* MPTCP */
3931
3932 tcp_tfo_rcv_ack(tp, th);
3933
3934 /*
3935 * If we have outstanding data (other than
3936 * a window probe), this is a completely
3937 * duplicate ack (ie, window info didn't
3938 * change) and the ack is the biggest we've seen.
3939 */
3940 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
3941 if (tlen == 0 && tiwin == tp->snd_wnd) {
3942 /*
3943 * If both ends send FIN at the same time,
3944 * then the ack will be a duplicate ack
3945 * but we have to process the FIN. Check
3946 * for this condition and process the FIN
3947 * instead of the dupack
3948 */
3949 if ((thflags & TH_FIN) &&
3950 (tp->t_flags & TF_SENTFIN) &&
3951 !TCPS_HAVERCVDFIN(tp->t_state) &&
3952 (th->th_ack + 1) == tp->snd_max)
3953 break;
3954 process_dupack:
3955 #if MPTCP
3956 /*
3957 * MPTCP options that are ignored must
3958 * not be treated as duplicate ACKs.
3959 */
3960 if (to.to_flags & TOF_MPTCP) {
3961 goto drop;
3962 }
3963
3964 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
3965 mptcplog((LOG_DEBUG, "MPTCP "
3966 "Sockets: bypass ack recovery\n"),
3967 MPTCP_SOCKET_DBG,
3968 MPTCP_LOGLVL_VERBOSE);
3969 break;
3970 }
3971 #endif /* MPTCP */
3972 /*
3973 * If a duplicate acknowledgement was seen
3974 * after ECN, it indicates packet loss in
3975 * addition to ECN. Reset INRECOVERY flag
3976 * so that we can process partial acks
3977 * correctly
3978 */
3979 if (tp->ecn_flags & TE_INRECOVERY)
3980 tp->ecn_flags &= ~TE_INRECOVERY;
3981
3982 tcpstat.tcps_rcvdupack++;
3983 ++tp->t_dupacks;
3984
3985 /*
3986 * Check if we need to reset the limit on
3987 * early retransmit
3988 */
3989 if (tp->t_early_rexmt_count > 0 &&
3990 TSTMP_GEQ(tcp_now,
3991 (tp->t_early_rexmt_win +
3992 TCP_EARLY_REXMT_WIN)))
3993 tp->t_early_rexmt_count = 0;
3994
3995 /*
3996 * Is early retransmit needed? We check for
3997 * this when the connection is waiting for
3998 * duplicate acks to enter fast recovery.
3999 */
4000 if (!IN_FASTRECOVERY(tp))
4001 tcp_early_rexmt_check(tp, th);
4002
4003 /*
4004 * If we've seen exactly rexmt threshold
4005 * of duplicate acks, assume a packet
4006 * has been dropped and retransmit it.
4007 * Kludge snd_nxt & the congestion
4008 * window so we send only this one
4009 * packet.
4010 *
4011 * We know we're losing at the current
4012 * window size so do congestion avoidance
4013 * (set ssthresh to half the current window
4014 * and pull our congestion window back to
4015 * the new ssthresh).
4016 *
4017 * Dup acks mean that packets have left the
4018 * network (they're now cached at the receiver)
4019 * so bump cwnd by the amount in the receiver
4020 * to keep a constant cwnd packets in the
4021 * network.
4022 */
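/*
 * Sketch of the decision below, assuming the conventional default of
 * tcprexmtthresh == 3: the first two duplicate ACKs can at most
 * trigger Limited Transmit further down, the third one enters fast
 * retransmit/recovery, and subsequent ones inflate cwnd while in
 * recovery.
 */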
4023 if (tp->t_timer[TCPT_REXMT] == 0 ||
4024 (th->th_ack != tp->snd_una
4025 && sack_bytes_acked == 0)) {
4026 tp->t_dupacks = 0;
4027 tp->t_rexmtthresh = tcprexmtthresh;
4028 } else if (tp->t_dupacks > tp->t_rexmtthresh ||
4029 IN_FASTRECOVERY(tp)) {
4030
4031 /*
4032 * If this connection was seeing packet
4033 * reordering, then recovery might be
4034 * delayed to disambiguate between
4035 * reordering and loss
4036 */
4037 if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
4038 (tp->t_flagsext &
4039 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) ==
4040 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4041 /*
4042 * Since the SACK information is already
4043 * updated, this ACK will be dropped
4044 */
4045 break;
4046 }
4047
4048 if (SACK_ENABLED(tp)
4049 && IN_FASTRECOVERY(tp)) {
4050 int awnd;
4051
4052 /*
4053 * Compute the amount of data in flight first.
4054 * We can inject new data into the pipe iff
4055 * we have less than 1/2 the original window's
4056 * worth of data in flight.
4057 */
4058 awnd = (tp->snd_nxt - tp->snd_fack) +
4059 tp->sackhint.sack_bytes_rexmit;
4060 if (awnd < tp->snd_ssthresh) {
4061 tp->snd_cwnd += tp->t_maxseg;
4062 if (tp->snd_cwnd > tp->snd_ssthresh)
4063 tp->snd_cwnd = tp->snd_ssthresh;
4064 }
4065 } else
4066 tp->snd_cwnd += tp->t_maxseg;
4067
4068 tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY);
4069
4070 (void) tcp_output(tp);
4071 goto drop;
4072 } else if (tp->t_dupacks == tp->t_rexmtthresh) {
4073 tcp_seq onxt = tp->snd_nxt;
4074
4075 /*
4076 * If we're doing sack, check to
4077 * see if we're already in sack
4078 * recovery. If we're not doing sack,
4079 * check to see if we're in newreno
4080 * recovery.
4081 */
4082 if (SACK_ENABLED(tp)) {
4083 if (IN_FASTRECOVERY(tp)) {
4084 tp->t_dupacks = 0;
4085 break;
4086 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
4087 break;
4088 }
4089 } else {
4090 if (SEQ_LEQ(th->th_ack,
4091 tp->snd_recover)) {
4092 tp->t_dupacks = 0;
4093 break;
4094 }
4095 }
4096 if (tp->t_flags & TF_SENTFIN)
4097 tp->snd_recover = tp->snd_max - 1;
4098 else
4099 tp->snd_recover = tp->snd_max;
4100 tp->t_timer[TCPT_PTO] = 0;
4101 tp->t_rtttime = 0;
4102
4103 /*
4104 * If the connection has seen pkt
4105 * reordering, delay recovery until
4106 * it is clear that the packet
4107 * was lost.
4108 */
4109 if (SACK_ENABLED(tp) &&
4110 (tp->t_flagsext &
4111 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4112 == TF_PKTS_REORDERED &&
4113 !IN_FASTRECOVERY(tp) &&
4114 tp->t_reorderwin > 0 &&
4115 (tp->t_state == TCPS_ESTABLISHED ||
4116 tp->t_state == TCPS_FIN_WAIT_1)) {
4117 tp->t_timer[TCPT_DELAYFR] =
4118 OFFSET_FROM_START(tp,
4119 tp->t_reorderwin);
4120 tp->t_flagsext |= TF_DELAY_RECOVERY;
4121 tcpstat.tcps_delay_recovery++;
4122 tcp_ccdbg_trace(tp, th,
4123 TCP_CC_DELAY_FASTRECOVERY);
4124 break;
4125 }
4126
4127 tcp_rexmt_save_state(tp);
4128 /*
4129 * If the current tcp cc module has
4130 * defined a hook for tasks to run
4131 * before entering FR, call it
4132 */
4133 if (CC_ALGO(tp)->pre_fr != NULL)
4134 CC_ALGO(tp)->pre_fr(tp);
4135 ENTER_FASTRECOVERY(tp);
4136 tp->t_timer[TCPT_REXMT] = 0;
4137 if (TCP_ECN_ENABLED(tp))
4138 tp->ecn_flags |= TE_SENDCWR;
4139
4140 if (SACK_ENABLED(tp)) {
4141 tcpstat.tcps_sack_recovery_episode++;
4142 tp->sack_newdata = tp->snd_nxt;
4143 tp->snd_cwnd = tp->t_maxseg;
4144 tp->t_flagsext &=
4145 ~TF_CWND_NONVALIDATED;
4146 tcp_ccdbg_trace(tp, th,
4147 TCP_CC_ENTER_FASTRECOVERY);
4148 (void) tcp_output(tp);
4149 goto drop;
4150 }
4151 tp->snd_nxt = th->th_ack;
4152 tp->snd_cwnd = tp->t_maxseg;
4153 (void) tcp_output(tp);
4154 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
4155 tcp_cc_adjust_nonvalidated_cwnd(tp);
4156 } else {
4157 tp->snd_cwnd = tp->snd_ssthresh +
4158 tp->t_maxseg * tp->t_dupacks;
4159 }
4160 if (SEQ_GT(onxt, tp->snd_nxt))
4161 tp->snd_nxt = onxt;
4162 tcp_ccdbg_trace(tp, th,
4163 TCP_CC_ENTER_FASTRECOVERY);
4164 goto drop;
4165 } else if (limited_txmt &&
4166 ALLOW_LIMITED_TRANSMIT(tp) &&
4167 (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
4168 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
4169 u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
4170
4171 /* Use the Limited Transmit algorithm (RFC 3042) on the first two
4172 * duplicate acks when there is new data to transmit
4173 */
4174 tp->snd_cwnd += incr;
4175 tcpstat.tcps_limited_txt++;
4176 (void) tcp_output(tp);
4177
4178 tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
4179
4180 /* Reset snd_cwnd back to normal */
4181 tp->snd_cwnd -= incr;
4182 }
4183 } else {
4184 tp->t_dupacks = 0;
4185 tp->t_rexmtthresh = tcprexmtthresh;
4186 }
4187 break;
4188 }
4189 /*
4190 * If the congestion window was inflated to account
4191 * for the other side's cached packets, retract it.
4192 */
4193 if (IN_FASTRECOVERY(tp)) {
4194 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4195 /*
4196 * If we received an ECE and entered
4197 * recovery, the subsequent ACKs should
4198 * not be treated as partial acks.
4199 */
4200 if (tp->ecn_flags & TE_INRECOVERY)
4201 goto process_ACK;
4202
4203 if (SACK_ENABLED(tp))
4204 tcp_sack_partialack(tp, th);
4205 else
4206 tcp_newreno_partial_ack(tp, th);
4207 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
4208 } else {
4209 EXIT_FASTRECOVERY(tp);
4210 if (CC_ALGO(tp)->post_fr != NULL)
4211 CC_ALGO(tp)->post_fr(tp, th);
4212 tp->t_pipeack = 0;
4213 tcp_clear_pipeack_state(tp);
4214 tcp_ccdbg_trace(tp, th,
4215 TCP_CC_EXIT_FASTRECOVERY);
4216 }
4217 } else if ((tp->t_flagsext &
4218 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4219 == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4220 /*
4221 * If the ack acknowledges up to snd_recover, or if
4222 * it acknowledges all the snd holes, exit
4223 * recovery and cancel the timer. Otherwise,
4224 * this is a partial ack; wait for the recovery timer
4225 * to fire before entering recovery. The snd_holes have
4226 * already been updated.
4227 */
4228 if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
4229 TAILQ_EMPTY(&tp->snd_holes)) {
4230 tp->t_timer[TCPT_DELAYFR] = 0;
4231 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
4232 EXIT_FASTRECOVERY(tp);
4233 tcp_ccdbg_trace(tp, th,
4234 TCP_CC_EXIT_FASTRECOVERY);
4235 }
4236 } else {
4237 /*
4238 * We were not in fast recovery. Reset the
4239 * duplicate ack counter.
4240 */
4241 tp->t_dupacks = 0;
4242 tp->t_rexmtthresh = tcprexmtthresh;
4243 }
4244
4245
4246 /*
4247 * If we reach this point, ACK is not a duplicate,
4248 * i.e., it ACKs something we sent.
4249 */
4250 if (tp->t_flags & TF_NEEDSYN) {
4251 /*
4252 * T/TCP: Connection was half-synchronized, and our
4253 * SYN has been ACK'd (so connection is now fully
4254 * synchronized). Go to non-starred state,
4255 * increment snd_una for ACK of SYN, and check if
4256 * we can do window scaling.
4257 */
4258 tp->t_flags &= ~TF_NEEDSYN;
4259 tp->snd_una++;
4260 /* Do window scaling? */
4261 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
4262 tp->snd_scale = tp->requested_s_scale;
4263 tp->rcv_scale = tp->request_r_scale;
4264 }
4265 }
4266
4267 process_ACK:
4268 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
4269 acked = BYTES_ACKED(th, tp);
4270 tcpstat.tcps_rcvackpack++;
4271 tcpstat.tcps_rcvackbyte += acked;
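/*
 * Note: "acked" is the number of bytes newly acknowledged by this
 * segment; BYTES_ACKED() is expected to reduce to
 * (th->th_ack - tp->snd_una), which the VERIFY above guarantees is
 * non-negative.
 */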
4272
4273 /*
4274 * If the last packet was a retransmit, make sure
4275 * it was not spurious.
4276 *
4277 * This will also take care of congestion window
4278 * adjustment if the last packet was recovered due to a
4279 * tail loss probe.
4280 */
4281 tcp_bad_rexmt_check(tp, th, &to);
4282
4283 /* Recalculate the RTT */
4284 tcp_compute_rtt(tp, &to, th);
4285
4286 /*
4287 * If all outstanding data is acked, stop retransmit
4288 * timer and remember to restart (more output or persist).
4289 * If there is more data to be acked, restart retransmit
4290 * timer, using current (possibly backed-off) value.
4291 */
4292 if (th->th_ack == tp->snd_max) {
4293 tp->t_timer[TCPT_REXMT] = 0;
4294 tp->t_timer[TCPT_PTO] = 0;
4295 needoutput = 1;
4296 } else if (tp->t_timer[TCPT_PERSIST] == 0)
4297 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
4298 tp->t_rxtcur);
4299
4300 /*
4301 * If no data (only SYN) was ACK'd, skip rest of ACK
4302 * processing.
4303 */
4304 if (acked == 0)
4305 goto step6;
4306
4307 /*
4308 * When outgoing data (other than the SYN+data) has been acked,
4309 * we mark this connection as "sending good" for TFO.
4310 */
4311 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
4312 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
4313 !(th->th_flags & TH_SYN))
4314 tcp_heuristic_tfo_snd_good(tp);
4315
4316 /*
4317 * If TH_ECE is received, make sure that ECN is enabled
4318 * on that connection and we have sent ECT on data packets.
4319 */
4320 if ((thflags & TH_ECE) != 0 && TCP_ECN_ENABLED(tp) &&
4321 (tp->ecn_flags & TE_SENDIPECT)) {
4322 /*
4323 * Reduce the congestion window if we haven't
4324 * done so.
4325 */
4326 if (!IN_FASTRECOVERY(tp)) {
4327 tcp_reduce_congestion_window(tp);
4328 tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR);
4329 /*
4330 * Also note that the connection received
4331 * ECE at least once.
4332 */
4333 tp->ecn_flags |= TE_RECV_ECN_ECE;
4334 tcpstat.tcps_ecn_recv_ece++;
4335 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
4336 }
4337 }
4338
4339 /*
4340 * When new data is acked, open the congestion window.
4341 * The specifics of how this is achieved are up to the
4342 * congestion control algorithm in use for this connection.
4343 *
4344 * The calculations in this function assume that snd_una is
4345 * not updated yet.
4346 */
4347 if (!IN_FASTRECOVERY(tp)) {
4348 if (CC_ALGO(tp)->ack_rcvd != NULL)
4349 CC_ALGO(tp)->ack_rcvd(tp, th);
4350 tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
4351 }
4352 if (acked > so->so_snd.sb_cc) {
4353 tp->snd_wnd -= so->so_snd.sb_cc;
4354 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
4355 if (so->so_flags & SOF_ENABLE_MSGS) {
4356 so->so_msg_state->msg_serial_bytes -=
4357 (int)so->so_snd.sb_cc;
4358 }
4359 ourfinisacked = 1;
4360 } else {
4361 sbdrop(&so->so_snd, acked);
4362 if (so->so_flags & SOF_ENABLE_MSGS) {
4363 so->so_msg_state->msg_serial_bytes -=
4364 acked;
4365 }
4366 tcp_sbsnd_trim(&so->so_snd);
4367 tp->snd_wnd -= acked;
4368 ourfinisacked = 0;
4369 }
4370 /* detect una wraparound */
4371 if ( !IN_FASTRECOVERY(tp) &&
4372 SEQ_GT(tp->snd_una, tp->snd_recover) &&
4373 SEQ_LEQ(th->th_ack, tp->snd_recover))
4374 tp->snd_recover = th->th_ack - 1;
4375
4376 if (IN_FASTRECOVERY(tp) &&
4377 SEQ_GEQ(th->th_ack, tp->snd_recover))
4378 EXIT_FASTRECOVERY(tp);
4379
4380 tp->snd_una = th->th_ack;
4381 if (SACK_ENABLED(tp)) {
4382 if (SEQ_GT(tp->snd_una, tp->snd_recover))
4383 tp->snd_recover = tp->snd_una;
4384 }
4385 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
4386 tp->snd_nxt = tp->snd_una;
4387 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
4388 !TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna,
4389 tp->snd_una))
4390 tcp_rxtseg_clean(tp);
4391 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
4392 tp->t_bwmeas != NULL)
4393 tcp_bwmeas_check(tp);
4394
4395 /*
4396 * sowwakeup must happen after snd_una, et al. are updated so that
4397 * the sequence numbers are in sync with so_snd
4398 */
4399 sowwakeup(so);
4400
4401 switch (tp->t_state) {
4402
4403 /*
4404 * In FIN_WAIT_1 STATE in addition to the processing
4405 * for the ESTABLISHED state if our FIN is now acknowledged
4406 * then enter FIN_WAIT_2.
4407 */
4408 case TCPS_FIN_WAIT_1:
4409 if (ourfinisacked) {
4410 /*
4411 * If we can't receive any more
4412 * data, then closing user can proceed.
4413 * Starting the TCPT_2MSL timer is contrary to the
4414 * specification, but if we don't get a FIN
4415 * we'll hang forever.
4416 */
4417 if (so->so_state & SS_CANTRCVMORE) {
4418 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
4419 TCP_CONN_MAXIDLE(tp));
4420 isconnected = FALSE;
4421 isdisconnected = TRUE;
4422 }
4423 DTRACE_TCP4(state__change, void, NULL,
4424 struct inpcb *, inp,
4425 struct tcpcb *, tp,
4426 int32_t, TCPS_FIN_WAIT_2);
4427 tp->t_state = TCPS_FIN_WAIT_2;
4428 /* fall through and make sure we also recognize
4429 * data ACKed with the FIN
4430 */
4431 }
4432 tp->t_flags |= TF_ACKNOW;
4433 break;
4434
4435 /*
4436 * In CLOSING STATE in addition to the processing for
4437 * the ESTABLISHED state if the ACK acknowledges our FIN
4438 * then enter the TIME-WAIT state, otherwise ignore
4439 * the segment.
4440 */
4441 case TCPS_CLOSING:
4442 if (ourfinisacked) {
4443 DTRACE_TCP4(state__change, void, NULL,
4444 struct inpcb *, inp,
4445 struct tcpcb *, tp,
4446 int32_t, TCPS_TIME_WAIT);
4447 tp->t_state = TCPS_TIME_WAIT;
4448 tcp_canceltimers(tp);
4449 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4450 tp->t_flags |= TF_CLOSING;
4451 } else {
4452 add_to_time_wait(tp, 2 * tcp_msl);
4453 }
4454 isconnected = FALSE;
4455 isdisconnected = TRUE;
4456 }
4457 tp->t_flags |= TF_ACKNOW;
4458 break;
4459
4460 /*
4461 * In LAST_ACK, we may still be waiting for data to drain
4462 * and/or to be acked, as well as for the ack of our FIN.
4463 * If our FIN is now acknowledged, delete the TCB,
4464 * enter the closed state and return.
4465 */
4466 case TCPS_LAST_ACK:
4467 if (ourfinisacked) {
4468 tp = tcp_close(tp);
4469 goto drop;
4470 }
4471 break;
4472
4473 /*
4474 * In TIME_WAIT state the only thing that should arrive
4475 * is a retransmission of the remote FIN. Acknowledge
4476 * it and restart the finack timer.
4477 */
4478 case TCPS_TIME_WAIT:
4479 add_to_time_wait(tp, 2 * tcp_msl);
4480 goto dropafterack;
4481 }
4482
4483 /*
4484 * If there is a SACK option on the ACK and we
4485 * haven't seen any duplicate acks before, count
4486 * it as a duplicate ack even if the cumulative
4487 * ack is advanced. If the receiver delayed an
4488 * ack and detected loss afterwards, then the ack
4489 * will advance cumulative ack and will also have
4490 * a SACK option. So counting it as one duplicate
4491 * ack is ok.
4492 */
4493 if (sack_ackadv == 1 &&
4494 tp->t_state == TCPS_ESTABLISHED &&
4495 SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
4496 to.to_nsacks > 0 && tp->t_dupacks == 0 &&
4497 SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
4498 !(tp->t_flagsext & TF_PKTS_REORDERED)) {
4499 tcpstat.tcps_sack_ackadv++;
4500 goto process_dupack;
4501 }
4502 }
4503
4504 step6:
4505 /*
4506 * Update window information.
4507 * Don't look at window if no ACK: TAC's send garbage on first SYN.
4508 */
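/*
 * The condition below is the RFC 793 SND.WL1/SND.WL2 test: only take
 * the advertised window from a segment that is at least as recent
 * (first by sequence number, then by ACK number) as the one that last
 * updated it, so old or reordered segments cannot overwrite newer
 * window information.
 */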
4509 if ((thflags & TH_ACK) &&
4510 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4511 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4512 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4513 /* keep track of pure window updates */
4514 if (tlen == 0 &&
4515 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4516 tcpstat.tcps_rcvwinupd++;
4517 tp->snd_wnd = tiwin;
4518 tp->snd_wl1 = th->th_seq;
4519 tp->snd_wl2 = th->th_ack;
4520 if (tp->snd_wnd > tp->max_sndwnd)
4521 tp->max_sndwnd = tp->snd_wnd;
4522 needoutput = 1;
4523 }
4524
4525 /*
4526 * Process segments with URG.
4527 */
4528 if ((thflags & TH_URG) && th->th_urp &&
4529 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4530 /*
4531 * This is a kludge, but if we receive and accept
4532 * random urgent pointers, we'll crash in
4533 * soreceive. It's hard to imagine someone
4534 * actually wanting to send this much urgent data.
4535 */
4536 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
4537 th->th_urp = 0; /* XXX */
4538 thflags &= ~TH_URG; /* XXX */
4539 goto dodata; /* XXX */
4540 }
4541 /*
4542 * If this segment advances the known urgent pointer,
4543 * then mark the data stream. This should not happen
4544 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4545 * a FIN has been received from the remote side.
4546 * In these states we ignore the URG.
4547 *
4548 * According to RFC961 (Assigned Protocols),
4549 * the urgent pointer points to the last octet
4550 * of urgent data. We continue, however,
4551 * to consider it to indicate the first octet
4552 * of data past the urgent section as the original
4553 * spec states (in one of two places).
4554 */
4555 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
4556 tp->rcv_up = th->th_seq + th->th_urp;
4557 so->so_oobmark = so->so_rcv.sb_cc +
4558 (tp->rcv_up - tp->rcv_nxt) - 1;
4559 if (so->so_oobmark == 0) {
4560 so->so_state |= SS_RCVATMARK;
4561 postevent(so, 0, EV_OOB);
4562 }
4563 sohasoutofband(so);
4564 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4565 }
4566 /*
4567 * Remove out of band data so doesn't get presented to user.
4568 * This can happen independent of advancing the URG pointer,
4569 * but if two URG's are pending at once, some out-of-band
4570 * data may creep in... ick.
4571 */
4572 if (th->th_urp <= (u_int32_t)tlen
4573 #if SO_OOBINLINE
4574 && (so->so_options & SO_OOBINLINE) == 0
4575 #endif
4576 )
4577 tcp_pulloutofband(so, th, m,
4578 drop_hdrlen); /* hdr drop is delayed */
4579 } else {
4580 /*
4581 * If no out of band data is expected,
4582 * pull receive urgent pointer along
4583 * with the receive window.
4584 */
4585 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4586 tp->rcv_up = tp->rcv_nxt;
4587 }
4588 dodata:
4589
4590 /* Set the socket's connect or disconnect state correctly before doing data.
4591 * The following might unlock the socket if there is an upcall or a socket
4592 * filter.
4593 */
4594 if (isconnected) {
4595 soisconnected(so);
4596 } else if (isdisconnected) {
4597 soisdisconnected(so);
4598 }
4599
4600 /* Check the state of the pcb just to make sure that it did not get closed
4601 * while the socket was unlocked above
4602 */
4603 if (inp->inp_state == INPCB_STATE_DEAD) {
4604 /* Just drop the packet that we are processing and return */
4605 goto drop;
4606 }
4607
4608 /*
4609 * Process the segment text, merging it into the TCP sequencing queue,
4610 * and arranging for acknowledgment of receipt if necessary.
4611 * This process logically involves adjusting tp->rcv_wnd as data
4612 * is presented to the user (this happens in tcp_usrreq.c,
4613 * case PRU_RCVD). If a FIN has already been received on this
4614 * connection then we just ignore the text.
4615 *
4616 * If we are in SYN-received state and got a valid TFO cookie, we want
4617 * to process the data.
4618 */
4619 if ((tlen || (thflags & TH_FIN)) &&
4620 TCPS_HAVERCVDFIN(tp->t_state) == 0 &&
4621 (TCPS_HAVEESTABLISHED(tp->t_state) ||
4622 (tp->t_state == TCPS_SYN_RECEIVED &&
4623 (tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) {
4624 tcp_seq save_start = th->th_seq;
4625 tcp_seq save_end = th->th_seq + tlen;
4626 m_adj(m, drop_hdrlen); /* delayed header drop */
4627 /*
4628 * Insert segment which includes th into TCP reassembly queue
4629 * with control block tp. Set thflags to whether reassembly now
4630 * includes a segment with FIN. This handles the common case
4631 * inline (segment is the next to be received on an established
4632 * connection, and the queue is empty), avoiding linkage into
4633 * and removal from the queue and repetition of various
4634 * conversions.
4635 * Set DELACK for segments received in order, but ack
4636 * immediately when segments are out of order (so
4637 * fast retransmit can work).
4638 */
4639 if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
4640 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4641 /*
4642 * Calculate the RTT on the receiver only if the
4643 * connection is in streaming mode and the last
4644 * packet was not an end-of-write
4645 */
4646 if ((tp->t_flags & TF_STRETCHACK) &&
4647 !(tp->t_flagsext & TF_STREAMEOW))
4648 tcp_compute_rtt(tp, &to, th);
4649
4650 if (DELAY_ACK(tp, th) &&
4651 ((tp->t_flags & TF_ACKNOW) == 0) ) {
4652 if ((tp->t_flags & TF_DELACK) == 0) {
4653 tp->t_flags |= TF_DELACK;
4654 tp->t_timer[TCPT_DELACK] =
4655 OFFSET_FROM_START(tp, tcp_delack);
4656 }
4657 }
4658 else {
4659 tp->t_flags |= TF_ACKNOW;
4660 }
4661 tp->rcv_nxt += tlen;
4662 thflags = th->th_flags & TH_FIN;
4663 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
4664 tcpstat.tcps_rcvbyte += tlen;
4665 if (nstat_collect) {
4666 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
4667 INP_ADD_STAT(inp, cell, wifi, wired,
4668 rxpackets, m->m_pkthdr.lro_npkts);
4669 } else {
4670 INP_ADD_STAT(inp, cell, wifi, wired,
4671 rxpackets, 1);
4672 }
4673 INP_ADD_STAT(inp, cell, wifi, wired,
4674 rxbytes, tlen);
4675 }
4676 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
4677 so_recv_data_stat(so, m, drop_hdrlen);
4678
4679 if (sbappendstream_rcvdemux(so, m,
4680 th->th_seq - (tp->irs + 1), 0)) {
4681 sorwakeup(so);
4682 }
4683 } else {
4684 thflags = tcp_reass(tp, th, &tlen, m, ifp);
4685 tp->t_flags |= TF_ACKNOW;
4686 }
4687
4688 if (tlen > 0 && SACK_ENABLED(tp))
4689 tcp_update_sack_list(tp, save_start, save_end);
4690
4691 tcp_adaptive_rwtimo_check(tp, tlen);
4692
4693 if (tlen > 0)
4694 tcp_tfo_rcv_data(tp);
4695
4696 if (tp->t_flags & TF_DELACK)
4697 {
4698 #if INET6
4699 if (isipv6) {
4700 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4701 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4702 th->th_seq, th->th_ack, th->th_win);
4703 }
4704 else
4705 #endif
4706 {
4707 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4708 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4709 th->th_seq, th->th_ack, th->th_win);
4710 }
4711
4712 }
4713 } else {
4714 m_freem(m);
4715 thflags &= ~TH_FIN;
4716 }
4717
4718 /*
4719 * If a FIN is received, ACK the FIN and let the user know
4720 * that the connection is closing.
4721 */
4722 if (thflags & TH_FIN) {
4723 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4724 socantrcvmore(so);
4725 postevent(so, 0, EV_FIN);
4726 /*
4727 * If connection is half-synchronized
4728 * (ie NEEDSYN flag on) then delay ACK,
4729 * so it may be piggybacked when SYN is sent.
4730 * Otherwise, since we received a FIN then no
4731 * more input can be expected, send ACK now.
4732 */
4733 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4734 if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4735 if ((tp->t_flags & TF_DELACK) == 0) {
4736 tp->t_flags |= TF_DELACK;
4737 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4738 }
4739 }
4740 else {
4741 tp->t_flags |= TF_ACKNOW;
4742 }
4743 tp->rcv_nxt++;
4744 }
4745 switch (tp->t_state) {
4746
4747 /*
4748 * In SYN_RECEIVED and ESTABLISHED STATES
4749 * enter the CLOSE_WAIT state.
4750 */
4751 case TCPS_SYN_RECEIVED:
4752 tp->t_starttime = tcp_now;
4753 case TCPS_ESTABLISHED:
4754 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4755 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
4756 tp->t_state = TCPS_CLOSE_WAIT;
4757 break;
4758
4759 /*
4760 * If still in FIN_WAIT_1 STATE, our FIN has not been acked, so
4761 * enter the CLOSING state.
4762 */
4763 case TCPS_FIN_WAIT_1:
4764 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4765 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
4766 tp->t_state = TCPS_CLOSING;
4767 break;
4768
4769 /*
4770 * In FIN_WAIT_2 state enter the TIME_WAIT state,
4771 * starting the time-wait timer, turning off the other
4772 * standard timers.
4773 */
4774 case TCPS_FIN_WAIT_2:
4775 DTRACE_TCP4(state__change, void, NULL,
4776 struct inpcb *, inp,
4777 struct tcpcb *, tp,
4778 int32_t, TCPS_TIME_WAIT);
4779 tp->t_state = TCPS_TIME_WAIT;
4780 tcp_canceltimers(tp);
4781 tp->t_flags |= TF_ACKNOW;
4782 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4783 tp->t_flags |= TF_CLOSING;
4784 } else {
4785 add_to_time_wait(tp, 2 * tcp_msl);
4786 }
4787 soisdisconnected(so);
4788 break;
4789
4790 /*
4791 * In TIME_WAIT state restart the 2 MSL time_wait timer.
4792 */
4793 case TCPS_TIME_WAIT:
4794 add_to_time_wait(tp, 2 * tcp_msl);
4795 break;
4796 }
4797 }
4798 #if TCPDEBUG
4799 if (so->so_options & SO_DEBUG)
4800 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
4801 &tcp_savetcp, 0);
4802 #endif
4803
4804 /*
4805 * Return any desired output.
4806 */
4807 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
4808 (void) tcp_output(tp);
4809 }
4810
4811 tcp_check_timer_state(tp);
4812
4813
4814 tcp_unlock(so, 1, 0);
4815 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4816 return;
4817
4818 dropafterack:
4819 /*
4820 * Generate an ACK dropping incoming segment if it occupies
4821 * sequence space, where the ACK reflects our state.
4822 *
4823 * We can now skip the test for the RST flag since all
4824 * paths to this code happen after packets containing
4825 * RST have been dropped.
4826 *
4827 * In the SYN-RECEIVED state, don't send an ACK unless the
4828 * segment we received passes the SYN-RECEIVED ACK test.
4829 * If it fails send a RST. This breaks the loop in the
4830 * "LAND" DoS attack, and also prevents an ACK storm
4831 * between two listening ports that have been sent forged
4832 * SYN segments, each with the source address of the other.
4833 */
4834 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4835 (SEQ_GT(tp->snd_una, th->th_ack) ||
4836 SEQ_GT(th->th_ack, tp->snd_max)) ) {
4837 rstreason = BANDLIM_RST_OPENPORT;
4838 IF_TCP_STATINC(ifp, dospacket);
4839 goto dropwithreset;
4840 }
4841 #if TCPDEBUG
4842 if (so->so_options & SO_DEBUG)
4843 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4844 &tcp_savetcp, 0);
4845 #endif
4846 m_freem(m);
4847 tp->t_flags |= TF_ACKNOW;
4848 (void) tcp_output(tp);
4849
4850 /* Don't need to check timer state as we should have done it during tcp_output */
4851 tcp_unlock(so, 1, 0);
4852 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4853 return;
4854 dropwithresetnosock:
4855 nosock = 1;
4856 dropwithreset:
4857 /*
4858 * Generate a RST, dropping incoming segment.
4859 * Make ACK acceptable to originator of segment.
4860 * Don't bother to respond if destination was broadcast/multicast.
4861 */
4862 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
4863 goto drop;
4864 #if INET6
4865 if (isipv6) {
4866 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
4867 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
4868 goto drop;
4869 } else
4870 #endif /* INET6 */
4871 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
4872 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
4873 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
4874 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
4875 goto drop;
4876 /* IPv6 anycast check is done at tcp6_input() */
4877
4878 /*
4879 * Perform bandwidth limiting.
4880 */
4881 #if ICMP_BANDLIM
4882 if (badport_bandlim(rstreason) < 0)
4883 goto drop;
4884 #endif
4885
4886 #if TCPDEBUG
4887 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4888 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4889 &tcp_savetcp, 0);
4890 #endif
4891 bzero(&tra, sizeof(tra));
4892 tra.ifscope = ifscope;
4893 tra.awdl_unrestricted = 1;
4894 if (thflags & TH_ACK)
4895 /* mtod() below is safe as long as hdr dropping is delayed */
4896 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
4897 TH_RST, &tra);
4898 else {
4899 if (thflags & TH_SYN)
4900 tlen++;
4901 /* mtod() below is safe as long as hdr dropping is delayed */
4902 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
4903 (tcp_seq)0, TH_RST|TH_ACK, &tra);
4904 }
4905 /* destroy temporarily created socket */
4906 if (dropsocket) {
4907 (void) soabort(so);
4908 tcp_unlock(so, 1, 0);
4909 } else if ((inp != NULL) && (nosock == 0)) {
4910 tcp_unlock(so, 1, 0);
4911 }
4912 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4913 return;
4914 dropnosock:
4915 nosock = 1;
4916 drop:
4917 /*
4918 * Drop space held by incoming segment and return.
4919 */
4920 #if TCPDEBUG
4921 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4922 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4923 &tcp_savetcp, 0);
4924 #endif
4925 m_freem(m);
4926 /* destroy temporarily created socket */
4927 if (dropsocket) {
4928 (void) soabort(so);
4929 tcp_unlock(so, 1, 0);
4930 }
4931 else if (nosock == 0) {
4932 tcp_unlock(so, 1, 0);
4933 }
4934 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4935 return;
4936 }
4937
4938 /*
4939 * Parse TCP options and place in tcpopt.
4940 */
4941 static void
4942 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
4943 struct tcpopt *to)
4944 {
4945 u_short mss = 0;
4946 int opt, optlen;
4947
4948 for (; cnt > 0; cnt -= optlen, cp += optlen) {
4949 opt = cp[0];
4950 if (opt == TCPOPT_EOL)
4951 break;
4952 if (opt == TCPOPT_NOP)
4953 optlen = 1;
4954 else {
4955 if (cnt < 2)
4956 break;
4957 optlen = cp[1];
4958 if (optlen < 2 || optlen > cnt)
4959 break;
4960 }
4961 switch (opt) {
4962
4963 default:
4964 continue;
4965
4966 case TCPOPT_MAXSEG:
4967 if (optlen != TCPOLEN_MAXSEG)
4968 continue;
4969 if (!(th->th_flags & TH_SYN))
4970 continue;
4971 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
4972 NTOHS(mss);
4973 to->to_mss = mss;
4974 to->to_flags |= TOF_MSS;
4975 break;
4976
4977 case TCPOPT_WINDOW:
4978 if (optlen != TCPOLEN_WINDOW)
4979 continue;
4980 if (!(th->th_flags & TH_SYN))
4981 continue;
4982 to->to_flags |= TOF_SCALE;
4983 to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
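/*
 * TCP_MAX_WINSHIFT caps the peer's shift count; RFC 1323
 * limits it to 14, i.e. windows of up to about 1 GB.
 */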
4984 break;
4985
4986 case TCPOPT_TIMESTAMP:
4987 if (optlen != TCPOLEN_TIMESTAMP)
4988 continue;
4989 to->to_flags |= TOF_TS;
4990 bcopy((char *)cp + 2,
4991 (char *)&to->to_tsval, sizeof(to->to_tsval));
4992 NTOHL(to->to_tsval);
4993 bcopy((char *)cp + 6,
4994 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
4995 NTOHL(to->to_tsecr);
4996 /* Re-enable sending Timestamps if we received them */
4997 if (!(tp->t_flags & TF_REQ_TSTMP) &&
4998 tcp_do_rfc1323 == 1)
4999 tp->t_flags |= TF_REQ_TSTMP;
5000 break;
5001 case TCPOPT_SACK_PERMITTED:
5002 if (!tcp_do_sack ||
5003 optlen != TCPOLEN_SACK_PERMITTED)
5004 continue;
5005 if (th->th_flags & TH_SYN)
5006 to->to_flags |= TOF_SACK;
5007 break;
5008 case TCPOPT_SACK:
5009 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
5010 continue;
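/*
 * Each SACK block is a pair of 32-bit sequence-space edges, so
 * TCPOLEN_SACK is expected to be 8 and a valid option carries
 * (optlen - 2) / 8 whole blocks.
 */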
5011 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
5012 to->to_sacks = cp + 2;
5013 tcpstat.tcps_sack_rcv_blocks++;
5014
5015 break;
5016 case TCPOPT_FASTOPEN:
5017 if (optlen == TCPOLEN_FASTOPEN_REQ) {
5018 if (tp->t_state != TCPS_LISTEN)
5019 continue;
5020
5021 to->to_flags |= TOF_TFOREQ;
5022 } else {
5023 if (optlen < TCPOLEN_FASTOPEN_REQ ||
5024 (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX ||
5025 (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN)
5026 continue;
5027 if (tp->t_state != TCPS_LISTEN &&
5028 tp->t_state != TCPS_SYN_SENT)
5029 continue;
5030
5031 to->to_flags |= TOF_TFO;
5032 to->to_tfo = cp + 1;
5033 }
5034
5035 break;
5036 #if MPTCP
5037 case TCPOPT_MULTIPATH:
5038 tcp_do_mptcp_options(tp, cp, th, to, optlen);
5039 break;
5040 #endif /* MPTCP */
5041 }
5042 }
5043 }
5044
5045 static void
5046 tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope)
5047 {
5048 if (to->to_flags & TOF_TS) {
5049 tp->t_flags |= TF_RCVD_TSTMP;
5050 tp->ts_recent = to->to_tsval;
5051 tp->ts_recent_age = tcp_now;
5052
5053 }
5054 if (to->to_flags & TOF_MSS)
5055 tcp_mss(tp, to->to_mss, ifscope);
5056 if (SACK_ENABLED(tp)) {
5057 if (!(to->to_flags & TOF_SACK))
5058 tp->t_flagsext &= ~(TF_SACK_ENABLE);
5059 else
5060 tp->t_flags |= TF_SACK_PERMIT;
5061 }
5062 if (to->to_flags & TOF_SCALE) {
5063 tp->t_flags |= TF_RCVD_SCALE;
5064 tp->requested_s_scale = to->to_requested_s_scale;
5065
5066 /* Re-enable window scaling, if the option is received */
5067 if (tp->request_r_scale > 0)
5068 tp->t_flags |= TF_REQ_SCALE;
5069 }
5070 }
5071
5072 /*
5073 * Pull the out-of-band byte out of a segment so
5074 * it doesn't appear in the user's data queue.
5075 * It is still reflected in the segment length for
5076 * sequencing purposes.
5077 */
5078 static void
5079 tcp_pulloutofband(so, th, m, off)
5080 struct socket *so;
5081 struct tcphdr *th;
5082 register struct mbuf *m;
5083 int off; /* delayed to be dropped hdrlen */
5084 {
5085 int cnt = off + th->th_urp - 1;
5086
5087 while (cnt >= 0) {
5088 if (m->m_len > cnt) {
5089 char *cp = mtod(m, caddr_t) + cnt;
5090 struct tcpcb *tp = sototcpcb(so);
5091
5092 tp->t_iobc = *cp;
5093 tp->t_oobflags |= TCPOOB_HAVEDATA;
5094 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
5095 m->m_len--;
5096 if (m->m_flags & M_PKTHDR)
5097 m->m_pkthdr.len--;
5098 return;
5099 }
5100 cnt -= m->m_len;
5101 m = m->m_next;
5102 if (m == 0)
5103 break;
5104 }
5105 panic("tcp_pulloutofband");
5106 }
5107
5108 uint32_t
5109 get_base_rtt(struct tcpcb *tp)
5110 {
5111 uint32_t base_rtt = 0, i;
5112 for (i = 0; i < N_RTT_BASE; ++i) {
5113 if (tp->rtt_hist[i] != 0 &&
5114 (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
5115 base_rtt = tp->rtt_hist[i];
5116 }
5117 return base_rtt;
5118 }
5119
5120 /* Each value of RTT base represents the minimum RTT seen in a minute.
5121 * We keep up to N_RTT_BASE minutes' worth of history.
5122 */
5123 void
5124 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
5125 {
5126 int32_t i, qdelay;
5127 u_int32_t base_rtt;
5128
5129 if (++tp->rtt_count >= rtt_samples_per_slot) {
5130 #if TRAFFIC_MGT
5131 /*
5132 * If the recv side is being throttled, check if the
5133 * current RTT is closer to the base RTT seen in the
5134 * first (most recent) two slots. If so, unthrottle the stream.
5135 */
5136 if (tp->t_flagsext & TF_RECV_THROTTLE) {
5137 base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
5138 qdelay = tp->t_rttcur - base_rtt;
5139 if (qdelay < target_qdelay)
5140 tp->t_flagsext &= ~(TF_RECV_THROTTLE);
5141 }
5142 #endif /* TRAFFIC_MGT */
5143
5144 for (i = (N_RTT_BASE-1); i > 0; --i) {
5145 tp->rtt_hist[i] = tp->rtt_hist[i-1];
5146 }
5147 tp->rtt_hist[0] = rtt;
5148 tp->rtt_count = 0;
5149 } else {
5150 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
5151 }
5152 }
5153
5154 /*
5155 * If we have a timestamp reply, update smoothed RTT. If no timestamp is
5156 * present but transmit timer is running and timed sequence number was
5157 * acked, update smoothed RTT.
5158 *
5159 * If timestamps are supported, a receiver can update RTT even if
5160 * there is no outstanding data.
5161 *
5162 * Some boxes send broken timestamp replies during the SYN+ACK phase;
5163 * ignore timestamps of 0 or we could calculate a huge RTT and blow up
5164 * the retransmit timer.
5165 */
5166 static void
5167 tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
5168 {
5169 int rtt = 0;
5170 VERIFY(to != NULL && th != NULL);
5171 if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
5172 u_int32_t pipe_ack_val;
5173 rtt = tcp_now - tp->t_rtttime;
5174 /*
5175 * Compute pipe ack -- the amount of data acknowledged
5176 * in the last RTT
5177 */
5178 if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
5179 pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
5180 /* Update the sample */
5181 tp->t_pipeack_sample[tp->t_pipeack_ind++] =
5182 pipe_ack_val;
5183 tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;
5184
5185 /* Compute the max of the pipeack samples */
5186 pipe_ack_val = tcp_get_max_pipeack(tp);
5187 tp->t_pipeack = (pipe_ack_val >
5188 TCP_CC_CWND_INIT_BYTES) ?
5189 pipe_ack_val : 0;
5190 }
5191 /* start another measurement */
5192 tp->t_rtttime = 0;
5193 }
5194 if (((to->to_flags & TOF_TS) != 0) &&
5195 (to->to_tsecr != 0) &&
5196 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
5197 tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
5198 to->to_tsecr, th->th_ack);
5199 } else if (rtt > 0) {
5200 tcp_xmit_timer(tp, rtt, 0, th->th_ack);
5201 }
5202 }
5203
5204 /*
5205 * Collect new round-trip time estimate
5206 * and update averages and current timeout.
5207 */
5208 static void
5209 tcp_xmit_timer(register struct tcpcb *tp, int rtt,
5210 u_int32_t tsecr, tcp_seq th_ack)
5211 {
5212 register int delta;
5213
5214 if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
5215 if (SEQ_GT(th_ack, tp->snd_una) &&
5216 SEQ_LEQ(th_ack, tp->snd_max) &&
5217 (tsecr == 0 ||
5218 TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
5219 /*
5220 * We received a new ACK after a
5221 * spurious timeout. Adapt the retransmission
5222 * timer as described in RFC 4015.
5223 */
5224 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
5225 tp->t_badrexmt_time = 0;
5226 tp->t_srtt = max(tp->t_srtt_prev, rtt);
5227 tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
5228 tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
5229 tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
5230
5231 if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5232 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5233
5234 goto compute_rto;
5235 } else {
5236 return;
5237 }
5238 }
5239
5240 tcpstat.tcps_rttupdated++;
5241 tp->t_rttupdated++;
5242
5243 if (rtt > 0) {
5244 tp->t_rttcur = rtt;
5245 update_base_rtt(tp, rtt);
5246 }
5247
5248 if (tp->t_srtt != 0) {
5249 /*
5250 * srtt is stored as fixed point with 5 bits after the
5251 * binary point (i.e., scaled by 32). The following magic
5252 * is equivalent to the smoothing algorithm in rfc793 with
5253 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
5254 * point).
5255 *
5256 * FreeBSD adjusts rtt to origin 0 by subtracting 1
5257 * from the provided rtt value. This was required because
5258 * of the way t_rtttime was initialized to 1 before.
5259 * Since we changed t_rtttime to be based on
5260 * tcp_now, this extra adjustment is not needed.
5261 */
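/*
 * Worked example (TCP_RTT_SHIFT == 5 as noted above, and assuming
 * TCP_DELTA_SHIFT == 2): with t_srtt == 3200 (100 ticks scaled by 32)
 * and a new rtt sample of 132 ticks,
 * delta = (132 << 2) - (3200 >> 3) = 528 - 400 = 128, so t_srtt
 * becomes 3328, i.e. a smoothed RTT of 104 ticks -- the new sample is
 * weighted by 1/8, as described above.
 */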
5262 delta = (rtt << TCP_DELTA_SHIFT)
5263 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
5264
5265 if ((tp->t_srtt += delta) <= 0)
5266 tp->t_srtt = 1;
5267
5268 /*
5269 * We accumulate a smoothed rtt variance (actually, a
5270 * smoothed mean difference), then set the retransmit
5271 * timer to smoothed rtt + 4 times the smoothed variance.
5272 * rttvar is stored as fixed point with 4 bits after the
5273 * binary point (scaled by 16). The following is
5274 * equivalent to rfc793 smoothing with an alpha of .75
5275 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
5276 * rfc793's wired-in beta.
5277 */
5278 if (delta < 0)
5279 delta = -delta;
5280 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
5281 if ((tp->t_rttvar += delta) <= 0)
5282 tp->t_rttvar = 1;
5283 if (tp->t_rttbest == 0 ||
5284 tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5285 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5286 } else {
5287 /*
5288 * No rtt measurement yet - use the unsmoothed rtt.
5289 * Set the variance to half the rtt (so our first
5290 * retransmit happens at 3*rtt).
5291 */
5292 tp->t_srtt = rtt << TCP_RTT_SHIFT;
5293 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
5294 }
5295
5296 compute_rto:
5297 nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
5298 tp->t_rttvar);
5299 tp->t_rxtshift = 0;
5300 tp->t_rxtstart = 0;
5301
5302 /*
5303 * the retransmit should happen at rtt + 4 * rttvar.
5304 * Because of the way we do the smoothing, srtt and rttvar
5305 * will each average +1/2 tick of bias. When we compute
5306 * the retransmit timer, we want 1/2 tick of rounding and
5307 * 1 extra tick because of +-1/2 tick uncertainty in the
5308 * firing of the timer. The bias will give us exactly the
5309 * 1.5 tick we need. But, because the bias is
5310 * statistical, we have to test that we don't drop below
5311 * the minimum feasible timer (which is 2 ticks).
5312 */
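/*
 * With srtt scaled by 32 and rttvar scaled by 16, TCP_REXMTVAL() is
 * expected to evaluate to ((t_srtt >> 3) + t_rttvar) >> 2, which works
 * out to srtt + 4 * rttvar in unscaled ticks; TCPT_RANGESET() then
 * clamps the result between the minimum computed below and
 * TCPTV_REXMTMAX, with TCP_ADD_REXMTSLOP() providing extra
 * per-connection slop.
 */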
5313 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
5314 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
5315 TCP_ADD_REXMTSLOP(tp));
5316
5317 /*
5318 * We received an ack for a packet that wasn't retransmitted;
5319 * it is probably safe to discard any error indications we've
5320 * received recently. This isn't quite right, but close enough
5321 * for now (a route might have failed after we sent a segment,
5322 * and the return path might not be symmetrical).
5323 */
5324 tp->t_softerror = 0;
5325 }
5326
5327 static inline unsigned int
5328 tcp_maxmtu(struct rtentry *rt)
5329 {
5330 unsigned int maxmtu;
5331
5332 RT_LOCK_ASSERT_HELD(rt);
5333 if (rt->rt_rmx.rmx_mtu == 0)
5334 maxmtu = rt->rt_ifp->if_mtu;
5335 else
5336 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
5337
5338 return (maxmtu);
5339 }
5340
5341 #if INET6
5342 static inline unsigned int
5343 tcp_maxmtu6(struct rtentry *rt)
5344 {
5345 unsigned int maxmtu;
5346 struct nd_ifinfo *ndi = NULL;
5347
5348 RT_LOCK_ASSERT_HELD(rt);
5349 if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
5350 ndi = NULL;
5351 if (ndi != NULL)
5352 lck_mtx_lock(&ndi->lock);
5353 if (rt->rt_rmx.rmx_mtu == 0)
5354 maxmtu = IN6_LINKMTU(rt->rt_ifp);
5355 else
5356 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
5357 if (ndi != NULL)
5358 lck_mtx_unlock(&ndi->lock);
5359
5360 return (maxmtu);
5361 }
5362 #endif
5363
5364 /*
5365 * Determine a reasonable value for maxseg size.
5366 * If the route is known, check route for mtu.
5367 * If none, use an mss that can be handled on the outgoing
5368 * interface without forcing IP to fragment; if bigger than
5369 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
5370 * to utilize large mbufs. If no route is found, route has no mtu,
5371 * or the destination isn't local, use a default, hopefully conservative
5372 * size (usually 512 or the default IP max size, but no more than the mtu
5373 * of the interface), as we can't discover anything about intervening
5374 * gateways or networks. We also initialize the congestion/slow start
5375 * window. While looking at the routing entry, we also initialize
5376 * other path-dependent parameters from pre-set or cached values
5377 * in the routing entry.
5378 *
5379 * Also take into account the space needed for options that we
5380 * send regularly. Make maxseg shorter by that amount to assure
5381 * that we can send maxseg amount of data even when the options
5382 * are present. Store the upper limit of the length of options plus
5383 * data in maxopd.
5384 *
5385 * NOTE that this routine is only called when we process an incoming
5386 * segment, for outgoing segments only tcp_mssopt is called.
5387 *
5388 */
5389 void
5390 tcp_mss(tp, offer, input_ifscope)
5391 struct tcpcb *tp;
5392 int offer;
5393 unsigned int input_ifscope;
5394 {
5395 register struct rtentry *rt;
5396 struct ifnet *ifp;
5397 register int rtt, mss;
5398 u_int32_t bufsize;
5399 struct inpcb *inp;
5400 struct socket *so;
5401 struct rmxp_tao *taop;
5402 int origoffer = offer;
5403 u_int32_t sb_max_corrected;
5404 int isnetlocal = 0;
5405 #if INET6
5406 int isipv6;
5407 int min_protoh;
5408 #endif
5409
5410 inp = tp->t_inpcb;
5411 #if INET6
5412 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5413 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5414 : sizeof (struct tcpiphdr);
5415 #else
5416 #define min_protoh (sizeof (struct tcpiphdr))
5417 #endif
5418
5419 #if INET6
5420 if (isipv6) {
5421 rt = tcp_rtlookup6(inp, input_ifscope);
5422 }
5423 else
5424 #endif /* INET6 */
5425 {
5426 rt = tcp_rtlookup(inp, input_ifscope);
5427 }
5428 isnetlocal = (tp->t_flags & TF_LOCAL);
5429
5430 if (rt == NULL) {
5431 tp->t_maxopd = tp->t_maxseg =
5432 #if INET6
5433 isipv6 ? tcp_v6mssdflt :
5434 #endif /* INET6 */
5435 tcp_mssdflt;
5436 return;
5437 }
5438 ifp = rt->rt_ifp;
5439 /*
5440 * Slower link window correction:
5441 * If a value is specified for slowlink_wsize, use it for
5442 * PPP links believed to be on a serial modem (speed <128Kbps).
5443 * Excludes 9600bps as it is the default value advertised
5444 * by pseudo-devices over ppp.
5445 */
5446 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5447 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
5448 tp->t_flags |= TF_SLOWLINK;
5449 }
5450 so = inp->inp_socket;
5451
5452 taop = rmx_taop(rt->rt_rmx);
5453 /*
5454 * Offer == -1 means that we haven't received a SYN yet;
5455 * use the cached value in that case.
5456 */
5457 if (offer == -1)
5458 offer = taop->tao_mssopt;
5459 /*
5460 * Offer == 0 means that there was no MSS on the SYN segment,
5461 * in this case we use tcp_mssdflt.
5462 */
5463 if (offer == 0)
5464 offer =
5465 #if INET6
5466 isipv6 ? tcp_v6mssdflt :
5467 #endif /* INET6 */
5468 tcp_mssdflt;
5469 else {
5470 /*
5471 * Prevent DoS attack with too small MSS. Round up
5472 * to at least minmss.
5473 */
5474 offer = max(offer, tcp_minmss);
5475 /*
5476 * Sanity check: make sure that maxopd will be large
5477 * enough to allow some data on segments even if all
5478 * the option space is used (40 bytes). Otherwise
5479 * funny things may happen in tcp_output.
5480 */
5481 offer = max(offer, 64);
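/*
 * Illustrative arithmetic: with this 64-byte floor, a segment can
 * still carry at least 24 bytes of data even when the full 40 bytes
 * of TCP option space is in use.
 */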
5482 }
5483 taop->tao_mssopt = offer;
5484
5485 /*
5486 * While we're here, check if there's an initial rtt
5487 * or rttvar. Convert from the route-table units
5488 * to scaled multiples of the slow timeout timer.
5489 */
5490 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
5491 tcp_getrt_rtt(tp, rt);
5492 } else {
5493 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
5494 }
5495
5496 #if INET6
5497 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5498 #else
5499 mss = tcp_maxmtu(rt);
5500 #endif
5501
5502 #if NECP
5503 // At this point, the mss is just the MTU. Adjust if necessary.
5504 mss = necp_socket_get_effective_mtu(inp, mss);
5505 #endif /* NECP */
5506
5507 mss -= min_protoh;
5508
5509 if (rt->rt_rmx.rmx_mtu == 0) {
5510 #if INET6
5511 if (isipv6) {
5512 if (!isnetlocal)
5513 mss = min(mss, tcp_v6mssdflt);
5514 } else
5515 #endif /* INET6 */
5516 if (!isnetlocal)
5517 mss = min(mss, tcp_mssdflt);
5518 }
5519
5520 mss = min(mss, offer);
5521 /*
5522 * maxopd stores the maximum length of data AND options
5523 * in a segment; maxseg is the amount of data in a normal
5524 * segment. We need to store this value (maxopd) apart
5525 * from maxseg, because now every segment carries options
5526 * and thus we normally have somewhat less data in segments.
5527 */
5528 tp->t_maxopd = mss;
5529
5530 /*
5531 * origoffer == -1 indicates that no segments have been received yet.
5532 * In this case we just guess.
5533 */
5534 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
5535 (origoffer == -1 ||
5536 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
5537 mss -= TCPOLEN_TSTAMP_APPA;
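/*
 * For example (hypothetical path): with a 1500-byte Ethernet MTU and
 * 40 bytes of IPv4+TCP headers (min_protoh), mss starts at 1460; when
 * timestamps are in use, the 12-byte TCPOLEN_TSTAMP_APPA adjustment
 * above leaves 1448 bytes of data per segment.
 */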
5538
5539 #if MPTCP
5540 mss -= mptcp_adj_mss(tp, FALSE);
5541 #endif /* MPTCP */
5542 tp->t_maxseg = mss;
5543
5544 /*
5545 * Calculate the corrected value for sb_max; be sure to promote the
5546 * numerator to 64 bits for large sb_max values, else it will overflow.
5547 */
5548 sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
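/*
 * Illustrative values (build dependent): with MCLBYTES = 2048 and
 * MSIZE = 256, sb_max_corrected works out to 8/9 of sb_max.  The
 * u_int64_t cast keeps the intermediate product sb_max * MCLBYTES
 * from overflowing 32 bits.
 */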
5549
5550 /*
5551 * If there's a pipesize (i.e., loopback), change the socket
5552 * buffer to that size only if it's bigger than the current
5553 * sockbuf size. Make the socket buffers an integral
5554 * number of mss units; if the mss is larger than
5555 * the socket buffer, decrease the mss.
5556 */
5557 #if RTV_SPIPE
5558 bufsize = rt->rt_rmx.rmx_sendpipe;
5559 if (bufsize < so->so_snd.sb_hiwat)
5560 #endif
5561 bufsize = so->so_snd.sb_hiwat;
5562 if (bufsize < mss)
5563 mss = bufsize;
5564 else {
5565 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5566 if (bufsize > sb_max_corrected)
5567 bufsize = sb_max_corrected;
5568 (void)sbreserve(&so->so_snd, bufsize);
5569 }
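/*
 * Example of the rounding above (hypothetical numbers): with
 * mss = 1448 and a 65536-byte send buffer, bufsize is rounded up to
 * 66608 (46 * 1448) and then capped at sb_max_corrected.
 */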
5570 tp->t_maxseg = mss;
5571
5572 #if RTV_RPIPE
5573 bufsize = rt->rt_rmx.rmx_recvpipe;
5574 if (bufsize < so->so_rcv.sb_hiwat)
5575 #endif
5576 bufsize = so->so_rcv.sb_hiwat;
5577 if (bufsize > mss) {
5578 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5579 if (bufsize > sb_max_corrected)
5580 bufsize = sb_max_corrected;
5581 (void)sbreserve(&so->so_rcv, bufsize);
5582 }
5583
5584 set_tcp_stream_priority(so);
5585
5586 if (rt->rt_rmx.rmx_ssthresh) {
5587 /*
5588 * There's some sort of gateway or interface
5589 * buffer limit on the path. Use this to set
5590 * slow-start threshold, but set the threshold to
5591 * no less than 2*mss.
5592 */
5593 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
5594 tcpstat.tcps_usedssthresh++;
5595 } else {
5596 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
5597 }
5598
5599 /*
5600 * Set the slow-start flight size depending on whether this
5601 * is a local network or not.
5602 */
5603 if (CC_ALGO(tp)->cwnd_init != NULL)
5604 CC_ALGO(tp)->cwnd_init(tp);
5605
5606 tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);
5607
5608 /* Route locked during lookup above */
5609 RT_UNLOCK(rt);
5610 }
5611
5612 /*
5613 * Determine the MSS option to send on an outgoing SYN.
5614 */
5615 int
5616 tcp_mssopt(tp)
5617 struct tcpcb *tp;
5618 {
5619 struct rtentry *rt;
5620 int mss;
5621 #if INET6
5622 int isipv6;
5623 int min_protoh;
5624 #endif
5625
5626 #if INET6
5627 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5628 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5629 : sizeof (struct tcpiphdr);
5630 #else
5631 #define min_protoh (sizeof (struct tcpiphdr))
5632 #endif
5633
5634 #if INET6
5635 if (isipv6)
5636 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
5637 else
5638 #endif /* INET6 */
5639 rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
5640 if (rt == NULL) {
5641 return (
5642 #if INET6
5643 isipv6 ? tcp_v6mssdflt :
5644 #endif /* INET6 */
5645 tcp_mssdflt);
5646 }
5647 /*
5648 * Slower link window correction:
5649 * If a value is specified for slowlink_wsize, use it for PPP links
5650 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
5651 * it is the default value advertised by pseudo-devices over ppp.
5652 */
5653 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5654 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
5655 tp->t_flags |= TF_SLOWLINK;
5656 }
5657
5658 #if INET6
5659 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5660 #else
5661 mss = tcp_maxmtu(rt);
5662 #endif
5663 /* Route locked during lookup above */
5664 RT_UNLOCK(rt);
5665
5666 #if NECP
5667 // At this point, the mss is just the MTU. Adjust if necessary.
5668 mss = necp_socket_get_effective_mtu(tp->t_inpcb, mss);
5669 #endif /* NECP */
5670
5671 return (mss - min_protoh);
5672 }
5673
5674 /*
5675 * When a partial ack arrives, force the retransmission of the
5676 * next unacknowledged segment. Do not clear tp->t_dupacks.
5677 * By setting snd_nxt to th_ack, this forces the retransmission timer to
5678 * be started again.
5679 */
5680 static void
5681 tcp_newreno_partial_ack(tp, th)
5682 struct tcpcb *tp;
5683 struct tcphdr *th;
5684 {
5685 tcp_seq onxt = tp->snd_nxt;
5686 u_int32_t ocwnd = tp->snd_cwnd;
5687 tp->t_timer[TCPT_REXMT] = 0;
5688 tp->t_timer[TCPT_PTO] = 0;
5689 tp->t_rtttime = 0;
5690 tp->snd_nxt = th->th_ack;
5691 /*
5692 * Set snd_cwnd to one segment beyond acknowledged offset
5693 * (tp->snd_una has not yet been updated when this function
5694 * is called)
5695 */
5696 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
5697 tp->t_flags |= TF_ACKNOW;
5698 (void) tcp_output(tp);
5699 tp->snd_cwnd = ocwnd;
5700 if (SEQ_GT(onxt, tp->snd_nxt))
5701 tp->snd_nxt = onxt;
5702 /*
5703 * Partial window deflation. Relies on fact that tp->snd_una
5704 * not updated yet.
5705 */
5706 if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5707 tp->snd_cwnd -= BYTES_ACKED(th, tp);
5708 else
5709 tp->snd_cwnd = 0;
5710 tp->snd_cwnd += tp->t_maxseg;
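/*
 * Net effect of the deflation above (hypothetical numbers): if the
 * partial ACK covers 2896 bytes while snd_cwnd is 11584 and t_maxseg
 * is 1448, the window becomes 11584 - 2896 + 1448 = 10136, i.e. the
 * congestion window shrinks by the amount acknowledged less one
 * segment, as in the NewReno partial-ACK procedure.
 */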
5711
5712 }
5713
5714 /*
5715 * Drop a random TCP connection that hasn't been serviced yet and
5716 * is eligible for discard. There is a one in qlen chance that
5717 * we will return a null, saying that there are no droppable
5718 * requests. In this case, the protocol-specific code should drop
5719 * the new request. This ensures fairness.
5720 *
5721 * The listening TCP socket "head" must be locked
5722 */
5723 static int
5724 tcp_dropdropablreq(struct socket *head)
5725 {
5726 struct socket *so, *sonext;
5727 unsigned int i, j, qlen;
5728 static u_int32_t rnd = 0;
5729 static u_int64_t old_runtime;
5730 static unsigned int cur_cnt, old_cnt;
5731 u_int64_t now_sec;
5732 struct inpcb *inp = NULL;
5733 struct tcpcb *tp;
5734
5735 if ((head->so_options & SO_ACCEPTCONN) == 0)
5736 return (0);
5737
5738 if (TAILQ_EMPTY(&head->so_incomp))
5739 return (0);
5740
5741 /*
5742 * Check if there is any socket in the incomp queue
5743 * that is closed because of a reset from the peer and is
5744 * waiting to be garbage collected. If so, pick that as
5745 * the victim.
5746 */
5747 TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
5748 inp = sotoinpcb(so);
5749 tp = intotcpcb(inp);
5750 if (tp != NULL && tp->t_state == TCPS_CLOSED &&
5751 so->so_head != NULL &&
5752 (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
5753 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
5754 /*
5755 * The listen socket is already locked but we
5756 * can lock this socket here without lock ordering
5757 * issues because it is in the incomp queue and
5758 * is not visible to others.
5759 */
5760 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5761 so->so_usecount++;
5762 goto found_victim;
5763 } else {
5764 continue;
5765 }
5766 }
5767 }
5768
5769 so = TAILQ_FIRST(&head->so_incomp);
5770
5771 now_sec = net_uptime();
5772 if ((i = (now_sec - old_runtime)) != 0) {
5773 old_runtime = now_sec;
5774 old_cnt = cur_cnt / i;
5775 cur_cnt = 0;
5776 }
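/*
 * cur_cnt counts drop attempts in the current interval; old_cnt is
 * the per-second attempt rate from the previous interval.  The check
 * below compares these against the queue length to decide whether to
 * skip ahead to a randomly chosen victim instead of always
 * sacrificing the head of the incomplete-connection queue.
 */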
5777
5778
5779 qlen = head->so_incqlen;
5780 if (rnd == 0)
5781 rnd = RandomULong();
5782
5783 if (++cur_cnt > qlen || old_cnt > qlen) {
5784 rnd = (314159 * rnd + 66329) & 0xffff;
5785 j = ((qlen + 1) * rnd) >> 16;
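/*
 * The linear congruential step keeps rnd in [0, 0xffff]; multiplying
 * by (qlen + 1) and shifting right 16 bits then yields an
 * approximately uniform index j in [0, qlen], and the loop below
 * walks that many entries into the queue.
 */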
5786
5787 while (j-- && so)
5788 so = TAILQ_NEXT(so, so_list);
5789 }
5790 /* Find a connection that is not already closing (or being served) */
5791 while (so) {
5792 inp = (struct inpcb *)so->so_pcb;
5793
5794 sonext = TAILQ_NEXT(so, so_list);
5795
5796 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
5797 != WNT_STOPUSING) {
5798 /*
5799 * Avoid the issue of a socket being accepted
5800 * by one input thread and being dropped by
5801 * another input thread. If we can't get a hold
5802 * on this mutex, then grab the next socket in
5803 * line.
5804 */
5805 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5806 so->so_usecount++;
5807 if ((so->so_usecount == 2) &&
5808 (so->so_state & SS_INCOMP) &&
5809 !(so->so_flags & SOF_INCOMP_INPROGRESS)) {
5810 break;
5811 } else {
5812 /*
5813 * don't use if being accepted or
5814 * used in any other way
5815 */
5816 in_pcb_checkstate(inp, WNT_RELEASE, 1);
5817 tcp_unlock(so, 1, 0);
5818 }
5819 } else {
5820 /*
5821 * do not try to lock the inp in
5822 * in_pcb_checkstate because the lock
5823 * is already held in some other thread.
5824 * Only drop the inp_wntcnt reference.
5825 */
5826 in_pcb_checkstate(inp, WNT_RELEASE, 1);
5827 }
5828 }
5829 so = sonext;
5830
5831 }
5832 if (so == NULL) {
5833 return (0);
5834 }
5835
5836 /* Make sure the socket is still in the right state to be discarded */
5837
5838 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
5839 tcp_unlock(so, 1, 0);
5840 return (0);
5841 }
5842
5843 found_victim:
5844 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
5845 /* do not discard: that socket is being accepted */
5846 tcp_unlock(so, 1, 0);
5847 return (0);
5848 }
5849
5850 TAILQ_REMOVE(&head->so_incomp, so, so_list);
5851 tcp_unlock(head, 0, 0);
5852
5853 lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
5854 tp = sototcpcb(so);
5855 so->so_flags |= SOF_OVERFLOW;
5856 so->so_head = NULL;
5857
5858 tcp_close(tp);
5859 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
5860 /*
5861 * Someone has a wantcnt on this pcb. Since WNT_ACQUIRE
5862 * doesn't require a lock, it could have happened while
5863 * we are holding the lock. This pcb will have to
5864 * be garbage collected later.
5865 * Release the reference held for so_incomp queue
5866 */
5867 so->so_usecount--;
5868 tcp_unlock(so, 1, 0);
5869 } else {
5870 /*
5871 * Unlock this socket and leave the reference on.
5872 * We need to acquire the pcbinfo lock in order to
5873 * fully dispose of it.
5874 */
5875 tcp_unlock(so, 0, 0);
5876
5877 lck_rw_lock_exclusive(tcbinfo.ipi_lock);
5878
5879 tcp_lock(so, 0, 0);
5880 /* Release the reference held for so_incomp queue */
5881 so->so_usecount--;
5882
5883 if (so->so_usecount != 1 ||
5884 (inp->inp_wantcnt > 0 &&
5885 inp->inp_wantcnt != WNT_STOPUSING)) {
5886 /*
5887 * There is an extra wantcount or usecount
5888 * that must have been added when the socket
5889 * was unlocked. This socket will have to be
5890 * garbage collected later
5891 */
5892 tcp_unlock(so, 1, 0);
5893 } else {
5894
5895 /* Drop the reference held for this function */
5896 so->so_usecount--;
5897
5898 in_pcbdispose(inp);
5899 }
5900 lck_rw_done(tcbinfo.ipi_lock);
5901 }
5902 tcpstat.tcps_drops++;
5903
5904 tcp_lock(head, 0, 0);
5905 head->so_incqlen--;
5906 head->so_qlen--;
5907 return(1);
5908 }
5909
5910 /* Set background congestion control on a socket */
5911 void
5912 tcp_set_background_cc(struct socket *so)
5913 {
5914 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
5915 }
5916
5917 /* Set foreground congestion control on a socket */
5918 void
5919 tcp_set_foreground_cc(struct socket *so)
5920 {
5921 if (tcp_use_newreno)
5922 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
5923 else
5924 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
5925 }
5926
5927 static void
5928 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
5929 {
5930 struct inpcb *inp = sotoinpcb(so);
5931 struct tcpcb *tp = intotcpcb(inp);
5932 u_char old_cc_index = 0;
5933 if (tp->tcp_cc_index != cc_index) {
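/*
 * Switching algorithms: let the outgoing algorithm release its
 * per-connection state, record the new index, allocate state for
 * the incoming algorithm, and give it a chance to take over via
 * its switch_to callback, which receives the old algorithm's index.
 */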
5934
5935 old_cc_index = tp->tcp_cc_index;
5936
5937 if (CC_ALGO(tp)->cleanup != NULL)
5938 CC_ALGO(tp)->cleanup(tp);
5939 tp->tcp_cc_index = cc_index;
5940
5941 tcp_cc_allocate_state(tp);
5942
5943 if (CC_ALGO(tp)->switch_to != NULL)
5944 CC_ALGO(tp)->switch_to(tp, old_cc_index);
5945
5946 tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
5947 }
5948 }
5949
5950 void
5951 tcp_set_recv_bg(struct socket *so)
5952 {
5953 if (!IS_TCP_RECV_BG(so))
5954 so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
5955
5956 /* Unset Large Receive Offload on background sockets */
5957 so_set_lro(so, SO_TC_BK);
5958 }
5959
5960 void
5961 tcp_clear_recv_bg(struct socket *so)
5962 {
5963 if (IS_TCP_RECV_BG(so))
5964 so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
5965
5966 /*
5967 * Set/unset use of Large Receive Offload depending on
5968 * the traffic class
5969 */
5970 so_set_lro(so, so->so_traffic_class);
5971 }
5972
5973 void
5974 inp_fc_unthrottle_tcp(struct inpcb *inp)
5975 {
5976 struct tcpcb *tp = inp->inp_ppcb;
5977 /*
5978 * Back off the slow-start threshold and enter
5979 * congestion avoidance phase
5980 */
5981 if (CC_ALGO(tp)->pre_fr != NULL)
5982 CC_ALGO(tp)->pre_fr(tp);
5983
5984 tp->snd_cwnd = tp->snd_ssthresh;
5985 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
5986 /*
5987 * Restart counting for ABC as we changed the
5988 * congestion window just now.
5989 */
5990 tp->t_bytes_acked = 0;
5991
5992 /* Reset the retransmit shift as we know that the reason
5993 * for the delay in sending a packet is flow control on
5994 * the outgoing interface. There is no need to back off
5995 * the retransmit timer.
5996 */
5997 tp->t_rxtshift = 0;
5998 tp->t_rtttime = 0;
5999
6000 /*
6001 * Start the output stream again. Since we are
6002 * not retransmitting data, do not reset the
6003 * retransmit timer or rtt calculation.
6004 */
6005 tcp_output(tp);
6006 }
6007
6008 static int
6009 tcp_getstat SYSCTL_HANDLER_ARGS
6010 {
6011 #pragma unused(oidp, arg1, arg2)
6012
6013 int error;
6014
6015 proc_t caller = PROC_NULL;
6016 proc_t caller_parent = PROC_NULL;
6017 char command_name[MAXCOMLEN + 1] = "";
6018 char parent_name[MAXCOMLEN + 1] = "";
6019
6020 if ((caller = proc_self()) != PROC_NULL) {
6021 /* get process name */
6022 strlcpy(command_name, caller->p_comm, sizeof(command_name));
6023
6024 /* get parent process name if possible */
6025 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
6026 strlcpy(parent_name, caller_parent->p_comm,
6027 sizeof(parent_name));
6028 proc_rele(caller_parent);
6029 }
6030
6031 if ((escape_str(command_name, strlen(command_name),
6032 sizeof(command_name)) == 0) &&
6033 (escape_str(parent_name, strlen(parent_name),
6034 sizeof(parent_name)) == 0)) {
6035 kern_asl_msg(LOG_DEBUG, "messagetracer",
6036 5,
6037 "com.apple.message.domain",
6038 "com.apple.kernel.tcpstat", /* 1 */
6039 "com.apple.message.signature",
6040 "tcpstat", /* 2 */
6041 "com.apple.message.signature2", command_name, /* 3 */
6042 "com.apple.message.signature3", parent_name, /* 4 */
6043 "com.apple.message.summarize", "YES", /* 5 */
6044 NULL);
6045 }
6046 }
6047 if (caller != PROC_NULL)
6048 proc_rele(caller);
6049
6050 if (req->oldptr == 0) {
6051 req->oldlen = (size_t)sizeof(struct tcpstat);
6052 }
6053
6054 error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
6055
6056 return (error);
6057
6058 }
6059
6060 /*
6061 * Checksum extended TCP header and data.
6062 */
6063 int
6064 tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
6065 {
6066 struct ifnet *ifp = m->m_pkthdr.rcvif;
6067
6068 switch (af) {
6069 case AF_INET: {
6070 struct ip *ip = mtod(m, struct ip *);
6071 struct ipovly *ipov = (struct ipovly *)ip;
6072
6073 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6074 return (0);
6075
6076 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6077 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6078 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6079 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6080 th->th_sum = m->m_pkthdr.csum_rx_val;
6081 } else {
6082 uint16_t sum = m->m_pkthdr.csum_rx_val;
6083 uint16_t start = m->m_pkthdr.csum_rx_start;
6084
6085 /*
6086 * Perform 1's complement adjustment of octets
6087 * that got included/excluded in the hardware-
6088 * calculated checksum value. Ignore cases
6089 * where the value includes or excludes the IP
6090 * header span, as the sum for those octets
6091 * would already be 0xffff and thus no-op.
6092 */
6093 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6094 start != 0 && (off - start) != off) {
6095 #if BYTE_ORDER != BIG_ENDIAN
6096 if (start < off) {
6097 HTONS(ip->ip_len);
6098 HTONS(ip->ip_off);
6099 }
6100 #endif
6101 /* callee folds in sum */
6102 sum = m_adj_sum16(m, start, off, sum);
6103 #if BYTE_ORDER != BIG_ENDIAN
6104 if (start < off) {
6105 NTOHS(ip->ip_off);
6106 NTOHS(ip->ip_len);
6107 }
6108 #endif
6109 }
6110
6111 /* callee folds in sum */
6112 th->th_sum = in_pseudo(ip->ip_src.s_addr,
6113 ip->ip_dst.s_addr,
6114 sum + htonl(tlen + IPPROTO_TCP));
6115 }
6116 th->th_sum ^= 0xffff;
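/*
 * A valid segment makes the full 1's complement sum (pseudo-header
 * included) equal 0xffff, so after the XOR above th_sum is 0 exactly
 * when the checksum verifies; the common check at the bottom of this
 * function relies on that.
 */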
6117 } else {
6118 uint16_t ip_sum;
6119 int len;
6120 char b[9];
6121
6122 bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
6123 bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
6124 ip_sum = ipov->ih_len;
6125 ipov->ih_len = (u_short)tlen;
6126 #if BYTE_ORDER != BIG_ENDIAN
6127 HTONS(ipov->ih_len);
6128 #endif
6129 len = sizeof (struct ip) + tlen;
6130 th->th_sum = in_cksum(m, len);
6131 bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
6132 ipov->ih_len = ip_sum;
6133
6134 tcp_in_cksum_stats(len);
6135 }
6136 break;
6137 }
6138 #if INET6
6139 case AF_INET6: {
6140 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6141
6142 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6143 return (0);
6144
6145 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6146 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6147 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6148 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6149 th->th_sum = m->m_pkthdr.csum_rx_val;
6150 } else {
6151 uint16_t sum = m->m_pkthdr.csum_rx_val;
6152 uint16_t start = m->m_pkthdr.csum_rx_start;
6153
6154 /*
6155 * Perform 1's complement adjustment of octets
6156 * that got included/excluded in the hardware-
6157 * calculated checksum value.
6158 */
6159 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6160 start != off) {
6161 uint16_t s, d;
6162
6163 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
6164 s = ip6->ip6_src.s6_addr16[1];
6165 ip6->ip6_src.s6_addr16[1] = 0;
6166 }
6167 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
6168 d = ip6->ip6_dst.s6_addr16[1];
6169 ip6->ip6_dst.s6_addr16[1] = 0;
6170 }
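/*
 * The scope/zone identifier stored in s6_addr16[1] above is an
 * in-kernel embedding that was not part of the address as it
 * appeared on the wire, so it is cleared (and restored below) while
 * the partial checksum is adjusted.
 */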
6171
6172 /* callee folds in sum */
6173 sum = m_adj_sum16(m, start, off, sum);
6174
6175 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
6176 ip6->ip6_src.s6_addr16[1] = s;
6177 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
6178 ip6->ip6_dst.s6_addr16[1] = d;
6179 }
6180
6181 th->th_sum = in6_pseudo(
6182 &ip6->ip6_src, &ip6->ip6_dst,
6183 sum + htonl(tlen + IPPROTO_TCP));
6184 }
6185 th->th_sum ^= 0xffff;
6186 } else {
6187 tcp_in6_cksum_stats(tlen);
6188 th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
6189 }
6190 break;
6191 }
6192 #endif /* INET6 */
6193 default:
6194 VERIFY(0);
6195 /* NOTREACHED */
6196 }
6197
6198 if (th->th_sum != 0) {
6199 tcpstat.tcps_rcvbadsum++;
6200 IF_TCP_STATINC(ifp, badformat);
6201 return (-1);
6202 }
6203
6204 return (0);
6205 }
6206
6207 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
6208 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
6209 "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
6210
6211 static int
6212 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
6213 {
6214 #pragma unused(arg1, arg2)
6215
6216 int error, val = tcprexmtthresh;
6217
6218 error = sysctl_handle_int(oidp, &val, 0, req);
6219 if (error || !req->newptr)
6220 return (error);
6221
6222 /*
6223 * Constrain the number of duplicate ACKs
6224 * to consider for TCP fast retransmit
6225 * to either 2 or 3
6226 */
6227
6228 if (val < 2 || val > 3)
6229 return (EINVAL);
6230
6231 tcprexmtthresh = val;
6232
6233 return (0);
6234 }
6235
6236 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW |
6237 CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
6238 "Duplicate ACK Threshold for Fast Retransmit");
6239