/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>           /* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/mcache.h>
#endif /* XNU_TARGET_OS_OSX */
#include <sys/kauth.h>
#include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */

#include <machine/endian.h>

#include <net/if_types.h>
#include <net/route.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <net/multi_layer_pkt_log.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM */
#include <netinet/in_var.h>
#include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <mach/sdt.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_cache.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#include <dev/random/randomdev.h>
#include <kern/zalloc.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>

#if TCPDEBUG
#include <netinet/tcp_debug.h>
u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tcp_savetcp;
#endif /* TCPDEBUG */
#include <netinet/tcp_log.h>

#include <netinet6/ipsec.h>
#include <netinet6/ipsec6.h>
#include <netkey/key.h>

#include <sys/kdebug.h>

#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>

#include <corecrypto/ccaes.h>
#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
#define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
#define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))

#define TCP_RTT_HISTORY_EXPIRE_TIME     (60 * TCP_RETRANSHZ)
#define TCP_RECV_THROTTLE_WIN           (5 * TCP_RETRANSHZ)
#define TCP_STRETCHACK_ENABLE_PKTCNT    2000

struct tcpstat tcpstat;
SYSCTL_SKMEM_TCP_INT(OID_AUTO, flow_control_response,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_flow_control_response, 1,
    "Improved response to Flow-control events");

static int log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0,
    "Log all incoming TCP connections");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_strategy,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_strategy, TCP_ACK_STRATEGY_MODERN,
    "Revised TCP ACK-strategy, avoiding stretch-ACK implementation");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
    CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0,
    "Do not send RST when dropping refused connections");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, aggressive_rcvwnd_inc,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_aggressive_rcvwnd_inc, 1,
    "Be more aggressive about increasing the receive-window.");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, delayed_ack,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_delack_enabled, 3,
    "Delay ACK to try and piggyback it onto a data packet");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_recv_bg, 0, "Receive background");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, drop_synfin,
    CTLFLAG_RW | CTLFLAG_LOCKED, static int, drop_synfin, 1,
    "Drop TCP packets with SYN+FIN set");

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "TCP Segment Reassembly Queue");

static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
    "Global number of TCP Segment Reassembly Queue Overflows");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
    __private_extern__ int, slowlink_wsize, 8192,
    "Maximum advertised window size for slowlink");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, maxseg_unacked,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, maxseg_unacked, 8,
    "Maximum number of outstanding segments left unacked");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_do_rfc3465, 1, "");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465_lim2,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_rfc3465_lim2, 1,
    "Appropriate bytes counting w/ L=2*SMSS");

int rtt_samples_per_slot = 20;

int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
u_int32_t tcp_autorcvbuf_inc_shift = 3;
SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_allowed_iaj,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_allowed_iaj, ALLOWED_IAJ,
    "Allowed inter-packet arrival jitter");
SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautorcvbuf,
    CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_do_autorcvbuf, 1,
    "Enable automatic socket buffer tuning");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, autotunereorder,
    CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autotune_reorder, 1,
    "Enable automatic socket buffer tuning even when reordering is present");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax,
    CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autorcvbuf_max, 2 * 1024 * 1024,
    "Maximum receive socket buffer size");

int tcp_disable_access_to_stats = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_access_to_stats,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_disable_access_to_stats, 0,
    "Disable access to tcpstat");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10,
    "Maximum number of challenge ACKs per connection per second");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961,
    CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1,
    "Enable/Disable full RFC 5961 compliance");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_better_lr,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_better_lr, 1,
    "Improved TCP Loss Recovery");

extern int tcp_acc_iaj_high;
extern int tcp_acc_iaj_react_limit;

int tcprexmtthresh = 3;
struct timeval tcp_uptime;      /* uptime when tcp_now was last updated */
lck_spin_t *tcp_uptime_lock;    /* Used to synchronize updates to tcp_now */
struct inpcbhead tcb;
#define tcb6    tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
static void tcp_pulloutofband(struct socket *,
    struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
static inline unsigned int tcp_maxmtu(struct rtentry *);
static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags);
static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);

static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
static inline void compute_iaj(struct tcpcb *tp);
static inline void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
#endif /* TRAFFIC_MGT */

static inline unsigned int tcp_maxmtu6(struct rtentry *);
unsigned int get_maxmtu(struct rtentry *);

static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
    struct tcpopt *to, uint32_t tlen);
void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
    u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max);
static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
/*
 * Constants used for resizing receive socket buffer
 * when timestamps are not supported
 */
#define TCPTV_RCVNOTS_QUANTUM   100
#define TCP_RCVNOTS_BYTELEVEL   204800

/*
 * Constants used for limiting early retransmits
 */
#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
#define TCP_EARLY_REXMT_LIMIT 10

#define log_in_vain_log( a ) { log a; }

int tcp_rcvunackwin = TCPTV_UNACKWIN;
int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
SYSCTL_SKMEM_TCP_INT(OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_rcvsspktcnt, TCP_RCV_SS_PKTCOUNT,
    "packets to be seen before receiver stretches acks");

#define DELAY_ACK(tp, th) \
	(CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
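/*
 * Illustrative note (not part of the original file): DELAY_ACK defers the
 * delayed-ACK decision to the congestion-control module attached to the
 * tcpcb, via a delay_ack(tp, th) function pointer.  A hypothetical module,
 * sketched only as a comment, could implement the hook along these lines:
 *
 *	static int
 *	example_delay_ack(struct tcpcb *tp, struct tcphdr *th)
 *	{
 *		// delay only pure in-order data; never SYN/FIN/URG segments
 *		return (th->th_flags & (TH_SYN | TH_FIN | TH_URG)) == 0;
 *	}
 *
 * The name example_delay_ack is illustrative, not a real xnu symbol.
 */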
static int tcp_dropdropablreq(struct socket *head);
static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
void tcp_set_background_cc(struct socket *so);
void tcp_set_foreground_cc(struct socket *so);
static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
static void tcp_bwmeas_check(struct tcpcb *tp);
reset_acc_iaj(struct tcpcb *tp)

update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)

	if (tp->iaj_size == 0 || size >= tp->iaj_size) {
		tp->iaj_rcv_ts = tcp_now;
		tp->iaj_small_pkt = 0;
/* For every 32 bit unsigned integer(v), this function will find the
 * largest integer n such that (n*n <= v). This takes at most 16 iterations
 * irrespective of the value of v and does not involve multiplications.
 */
isqrt(unsigned int val)

	unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
	unsigned int temp, g = 0, b = 0x8000, bshft = 15;

	for (g = 0; g <= 10; ++g) {
		if (sqrt_cache[g] > val) {
		} else if (sqrt_cache[g] == val) {

		temp = (((g << 1) + b) << (bshft--));

	} while (b > 0 && val > 0);
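/*
 * Illustrative sketch (not part of the original file): the function body
 * above survives only in fragments, but the header comment describes a
 * digit-by-digit integer square root.  A minimal user-space version of that
 * idea, shown purely for illustration (it uses a multiply that the kernel
 * variant avoids via shifts), is:
 *
 *	static unsigned int
 *	isqrt_sketch(unsigned int val)
 *	{
 *		unsigned int g = 0, b = 0x8000;   // candidate root, test bit
 *		while (b > 0) {
 *			unsigned int t = g | b;
 *			if (t * t <= val) {
 *				g = t;            // keep this bit of the root
 *			}
 *			b >>= 1;
 *		}
 *		return g;                         // largest g with g*g <= val
 *	}
 */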
compute_iaj(struct tcpcb *tp)

	compute_iaj_meat(tp, (tcp_now - tp->iaj_rcv_ts));

compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)

	/* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
	 * throttle the receive window to a minimum of MIN_IAJ_WIN packets
	 */
#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
#define IAJ_DIV_SHIFT 4
#define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))

	uint32_t allowed_iaj, acc_iaj = 0;

	cur_iaj_dev = (cur_iaj - tp->avg_iaj);

	/* Allow a jitter of "allowed_iaj" milliseconds. Some connections
	 * may have a constant jitter more than that. We detect this by
	 * using standard deviation.
	 */
	allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
	if (allowed_iaj < tcp_allowed_iaj) {
		allowed_iaj = tcp_allowed_iaj;

	/* Initially when the connection starts, the senders congestion
	 * window is small. During this period we avoid throttling a
	 * connection because we do not have a good starting point for
	 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
	 * the first few packets.
	 */
	if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
		if (cur_iaj <= allowed_iaj) {
			if (tp->acc_iaj >= 2) {
				acc_iaj = tp->acc_iaj - 2;

			acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);

		if (acc_iaj > MAX_ACC_IAJ) {
			acc_iaj = MAX_ACC_IAJ;
		tp->acc_iaj = acc_iaj;

	/* Compute weighted average where the history has a weight of
	 * 15 out of 16 and the current value has a weight of 1 out of 16.
	 * This will make the short-term measurements have more weight.
	 *
	 * The addition of 8 will help to round-up the value
	 * instead of round-down
	 */
	tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
	    + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
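	/*
	 * Illustrative note (not part of the original file): with
	 * IAJ_DIV_SHIFT == 4 the update above is the integer EWMA
	 * avg = (15*avg + cur + 8) / 16.  For example, avg_iaj = 16ms and
	 * cur_iaj = 48ms gives (240 + 48 + 8) >> 4 = 18ms, i.e. the average
	 * moves only 1/16th of the way toward the new sample.
	 */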
	/* Compute Root-mean-square of deviation where mean is a weighted
	 * average as described above.
	 */
	temp = tp->std_dev_iaj * tp->std_dev_iaj;
	mean = (((temp << IAJ_DIV_SHIFT) - temp)
	    + (cur_iaj_dev * cur_iaj_dev)
	    + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;

	tp->std_dev_iaj = isqrt(mean);

	DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
	    uint32_t, allowed_iaj);

#endif /* TRAFFIC_MGT */
/*
 * Perform rate limit check per connection per second
 * tp->t_challengeack_last is the last_time diff was greater than 1sec
 * tp->t_challengeack_count is the number of ACKs sent (within 1sec)
 * Return TRUE if we shouldn't send the ACK due to rate limitation
 * Return FALSE if it is still ok to send challenge ACK
 */
tcp_is_ack_ratelimited(struct tcpcb *tp)

	boolean_t ret = TRUE;
	uint32_t now = tcp_now;

	diff = timer_diff(now, 0, tp->t_challengeack_last, 0);
	/* If it is first time or diff > 1000ms,
	 * update the challengeack_last and reset the
	 * current count of ACKs
	 */
	if (tp->t_challengeack_last == 0 || diff >= 1000) {
		tp->t_challengeack_last = now;
		tp->t_challengeack_count = 0;
	} else if (tp->t_challengeack_count < tcp_challengeack_limit) {

	/* Careful about wrap-around */
	if (ret == FALSE && (tp->t_challengeack_count + 1 > 0)) {
		tp->t_challengeack_count++;
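/*
 * Illustrative note (not part of the original file): per the header comment,
 * a caller would typically consult this helper before emitting an RFC 5961
 * challenge ACK, e.g.
 *
 *	if (!tcp_is_ack_ratelimited(tp)) {
 *		// send the challenge ACK
 *	}
 *
 * so that at most tcp_challengeack_limit challenge ACKs go out per
 * connection per second.
 */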
/* Check if enough amount of data has been acknowledged since
 * bw measurement was started
 */
tcp_bwmeas_check(struct tcpcb *tp)

	int32_t bw_meas_bytes;
	uint32_t bw, bytes, elapsed_time;

	if (SEQ_LEQ(tp->snd_una, tp->t_bwmeas->bw_start)) {

	bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
	if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) &&
	    bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
		bytes = bw_meas_bytes;
		elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
		if (elapsed_time > 0) {
			bw = bytes / elapsed_time;
			if (tp->t_bwmeas->bw_sndbw > 0) {
				tp->t_bwmeas->bw_sndbw =
				    (((tp->t_bwmeas->bw_sndbw << 3)
				    - tp->t_bwmeas->bw_sndbw)
				tp->t_bwmeas->bw_sndbw = bw;

			/* Store the maximum value */
			if (tp->t_bwmeas->bw_sndbw_max == 0) {
				tp->t_bwmeas->bw_sndbw_max =
				    tp->t_bwmeas->bw_sndbw;
				tp->t_bwmeas->bw_sndbw_max =
				    max(tp->t_bwmeas->bw_sndbw,
				    tp->t_bwmeas->bw_sndbw_max);

		tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
    struct ifnet *ifp, int *dowakeup)

	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;

	boolean_t cell = IFNET_IS_CELLULAR(ifp);
	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
	boolean_t dsack_set = FALSE;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */

	/*
	 * If the reassembly queue already has entries or if we are going
	 * to add a new one, then the connection has reached a loss state.
	 * Reset the stretch-ack algorithm at this point.
	 */
	tcp_reset_stretch_ack(tp);
	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;

	if (tp->acc_iaj > 0) {
#endif /* TRAFFIC_MGT */

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let the missing segment through which caused this
	 * queue.  Always keep one global queue entry spare to be able to
	 * process the missing segment.
	 */
	qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
	if (th->th_seq != tp->rcv_nxt &&
	    (tp->t_reassqlen + 1) >= qlimit) {
		tcp_reass_overflows++;
		tcpstat.tcps_rcvmemdrop++;

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	te = (struct tseg_qent *) zalloc(tcp_reass_zone);
		tcpstat.tcps_rcvmemdrop++;

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) {

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
			/*
			 * Note duplicate data sequence numbers
			 * to report in DSACK option
			 */
			tp->t_dsack_lseq = th->th_seq;
			tp->t_dsack_rseq = th->th_seq +

			/*
			 * Report only the first part of partial/
			 * non-contiguous duplicate sequence space
			 */

		tcpstat.tcps_rcvduppack++;
		tcpstat.tcps_rcvdupbyte += *tlenp;
		nstat_route_rx(inp->inp_route.ro_rt,
		    NSTAT_RX_FLAG_DUPLICATE);
		INP_ADD_STAT(inp, cell, wifi, wired,
		INP_ADD_STAT(inp, cell, wifi, wired,
		tp->t_stat.rxduplicatebytes += *tlenp;
		inp_set_activity_bitmap(inp);

		zfree(tcp_reass_zone, te);

		/*
		 * Try to present any queued data
		 * at the left window edge to the user.
		 * This is needed after the 3-WHS
		 */

	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;
	nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
	    NSTAT_RX_FLAG_OUT_OF_ORDER);
	INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
	INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
	tp->t_stat.rxoutoforderbytes += *tlenp;
	inp_set_activity_bitmap(inp);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
		int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;

		/*
		 * Report only the first part of partial/non-contiguous
		 * duplicate segment in dsack option. The variable
		 * dsack_set will be true if a previous entry has some of
		 * the duplicate sequence space.
		 */
		if (i > 1 && !dsack_set) {
			if (tp->t_dsack_lseq == 0) {
				tp->t_dsack_lseq = q->tqe_th->th_seq;
				    tp->t_dsack_lseq + min(i, q->tqe_len);
				/*
				 * this segment overlaps data in multiple
				 * entries in the reassembly queue, move
				 * the right sequence number further.
				 */
				    tp->t_dsack_rseq + min(i, q->tqe_len);

		if (i < q->tqe_len) {
			q->tqe_th->th_seq += i;

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		zfree(tcp_reass_zone, q);
	/* Insert the new segment queue entry into place. */
	te->tqe_len = *tlenp;
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
		LIST_INSERT_AFTER(p, te, tqe_q);

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {

	/*
	 * If there is already another thread doing reassembly for this
	 * connection, it is better to let it finish the job --
	 */
	if (tp->t_flagsext & TF_REASS_INPROG) {

	tp->t_flagsext |= TF_REASS_INPROG;
	/* lost packet was recovered, so ooo data can be returned */
	tcpstat.tcps_recovered_pkts++;

		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		LIST_REMOVE(q, tqe_q);
		if (so->so_state & SS_CANTRCVMORE) {
			/*
			 * The mbuf may be freed after it has been added to the
			 * receive socket buffer so we reinitialize th to point
			 * to a safe copy of the TCP header
			 */
			struct tcphdr saved_tcphdr = {};

			so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
			memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));

			if (q->tqe_th->th_flags & TH_PUSH) {
				tp->t_flagsext |= TF_LAST_IS_PSH;
				tp->t_flagsext &= ~TF_LAST_IS_PSH;

			if (sbappendstream_rcvdemux(so, q->tqe_m)) {
		zfree(tcp_reass_zone, q);
		q = LIST_FIRST(&tp->t_segq);
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	tp->t_flagsext &= ~TF_REASS_INPROG;

	if ((inp->inp_vflag & INP_IPV6) != 0) {
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((inp->inp_fport << 16) | inp->inp_lport),
		    (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
		    (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((inp->inp_fport << 16) | inp->inp_lport),
		    (((inp->inp_laddr.s_addr & 0xffff) << 16) |
		    (inp->inp_faddr.s_addr & 0xffff)),
/*
 * Reduce congestion window -- used when ECN is seen or when a tail loss
 * probe recovers the last packet.
 */
tcp_reduce_congestion_window(struct tcpcb *tp)

	/*
	 * If the current tcp cc module has
	 * defined a hook for tasks to run
	 * before entering FR, call it
	 */
	if (CC_ALGO(tp)->pre_fr != NULL) {
		CC_ALGO(tp)->pre_fr(tp);
	ENTER_FASTRECOVERY(tp);
	if (tp->t_flags & TF_SENTFIN) {
		tp->snd_recover = tp->snd_max - 1;
		tp->snd_recover = tp->snd_max;
	tp->t_timer[TCPT_REXMT] = 0;
	tp->t_timer[TCPT_PTO] = 0;

	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
		tcp_cc_adjust_nonvalidated_cwnd(tp);
		tp->snd_cwnd = tp->snd_ssthresh +
		    tp->t_maxseg * tcprexmtthresh;
/*
 * This function is called upon reception of data on a socket. Its purpose is
 * to handle the adaptive keepalive timers that monitor whether the connection
 * is making progress. First the adaptive read-timer, second the TFO probe-timer.
 *
 * The application wants to get an event if there is a stall during read.
 * Set the initial keepalive timeout to be equal to twice RTO.
 *
 * If the outgoing interface is in marginal conditions, we need to
 * enable read probes for that too.
 */
tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)

	struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;

	if ((tp->t_adaptive_rtimo > 0 ||
	    (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
	    tp->t_state == TCPS_ESTABLISHED) {
		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
		    (TCP_REXMTVAL(tp) << 1));
		tp->t_flagsext |= TF_DETECT_READSTALL;
		tp->t_rtimo_probes = 0;

tcp_keepalive_reset(struct tcpcb *tp)

	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	    TCP_CONN_KEEPIDLE(tp));
	tp->t_flagsext &= ~(TF_DETECT_READSTALL);
	tp->t_rtimo_probes = 0;
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
tcp6_input(struct mbuf **mp, int *offp, int proto)

#pragma unused(proto)
	struct mbuf *m = *mp;
	struct ifnet *ifp = m->m_pkthdr.rcvif;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
		if (ia6_flags & IN6_IFF_ANYCAST) {
			ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);

			IF_TCP_STATINC(ifp, icmp6unreach);
/* Depending on the usage of mbuf space in the system, this function
 * will return true or false. This is used to determine if a socket
 * buffer can take more memory from the system for auto-tuning or not.
 */
tcp_cansbgrow(struct sockbuf *sb)

	/* Calculate the host level space limit in terms of MSIZE buffers.
	 * We can use a maximum of half of the available mbuf space for
	 */
	u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
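	/*
	 * Illustrative note (not part of the original file): nmbclusters
	 * counts MCLBYTES-sized clusters, so (nmbclusters >> 1) is half the
	 * cluster pool and the shift by (MCLSHIFT - MSIZESHIFT) re-expresses
	 * that half as a count of MSIZE buffers, matching the comment above.
	 */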
	/* Calculate per sb limit in terms of bytes. We optimize this limit
	 * for up to 16 socket buffers.
	 */
	u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);

	if ((total_sbmb_cnt < mblim) &&
	    (sb->sb_hiwat < sbspacelim)) {
		OSIncrementAtomic64(&sbmb_limreached);
tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max)

	/* newsize should not exceed max */
	newsize = min(newsize, rcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), rcvbuf_max);

		/* Again check the limit set by the advertised
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << tp->rcv_scale);
/*
 * This function is used to grow a receive socket buffer. It
 * will take into account system-level memory usage and the
 * bandwidth available on the link to make a decision.
 */
tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
    struct tcpopt *to, uint32_t pktlen)

	struct socket *so = sbrcv->sb_so;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (tp->t_flagsext & TF_RECV_THROTTLE) ||
	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    (!tcp_autotune_reorder && !LIST_EMPTY(&tp->t_segq))) {
		/* Can not resize the socket buffer, just return */

	if (!TSTMP_SUPPORTED(tp)) {
		/*
		 * Timestamp option is not supported on this connection.
		 * If the connection reached a state to indicate that
		 * the receive socket buffer needs to grow, increase
		 * the high water mark.
		 */
		if (TSTMP_GEQ(tcp_now,
		    tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
			if (tp->rfbuf_cnt + pktlen >= TCP_RCVNOTS_BYTELEVEL) {
				tcp_sbrcv_reserve(tp, sbrcv,
				    tcp_autorcvbuf_max, 0,
				    tcp_autorcvbuf_max);
			tp->rfbuf_cnt += pktlen;
	} else if (to->to_tsecr != 0) {
		/*
		 * If the timestamp shows that one RTT has
		 * completed, we can stop counting the
		 * bytes. Here we consider increasing
		 * the socket buffer if the bandwidth measured in
		 * last rtt, is more than half of sb_hiwat, this will
		 * help to scale the buffer according to the bandwidth
		 */
		if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
			if (tcp_aggressive_rcvwnd_inc) {
				tp->rfbuf_cnt += pktlen;

			if ((tcp_aggressive_rcvwnd_inc == 0 &&
			    tp->rfbuf_cnt + pktlen > (sbrcv->sb_hiwat -
			    (sbrcv->sb_hiwat >> 1))) ||
			    (tcp_aggressive_rcvwnd_inc &&
			    tp->rfbuf_cnt > tp->rfbuf_space)) {

				if (tcp_aggressive_rcvwnd_inc == 0) {
					tp->rfbuf_cnt += pktlen;
					/*
					 * Increment the receive window by a
					 * multiple of maximum sized segments.
					 * This will prevent a connection from
					 * sending smaller segments on wire if it
					 * is limited by the receive window.
					 *
					 * Set the ideal size based on current
					 * bandwidth measurements. We set the
					 * ideal size on receive socket buffer to
					 * be twice the bandwidth delay product.
					 */
					rcvbuf_inc = (tp->rfbuf_cnt << 1)

					/*
					 * Make the increment equal to 8 segments
					 */
					min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
					if (rcvbuf_inc < min_incr) {
						rcvbuf_inc = min_incr;

					idealsize = (tp->rfbuf_cnt << 1);
					if (tp->rfbuf_cnt > tp->rfbuf_space + (tp->rfbuf_space >> 1)) {
						rcvbuf_inc = (tp->rfbuf_cnt << 2) - sbrcv->sb_hiwat;
						idealsize = (tp->rfbuf_cnt << 2);
						rcvbuf_inc = (tp->rfbuf_cnt << 1) - sbrcv->sb_hiwat;
						idealsize = (tp->rfbuf_cnt << 1);

				tp->rfbuf_space = tp->rfbuf_cnt;

				if (rcvbuf_inc > 0) {
					    (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;

					tcp_sbrcv_reserve(tp, sbrcv,
					    sbrcv->sb_hiwat + rcvbuf_inc,
					    idealsize, tcp_autorcvbuf_max);

			/* Measure instantaneous receive bandwidth */
			if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 &&
			    TSTMP_GT(tcp_now, tp->rfbuf_ts)) {
				rcv_bw = tp->rfbuf_cnt /
				    (int)(tcp_now - tp->rfbuf_ts);
				if (tp->t_bwmeas->bw_rcvbw_max == 0) {
					tp->t_bwmeas->bw_rcvbw_max = rcv_bw;
					tp->t_bwmeas->bw_rcvbw_max = max(
					    tp->t_bwmeas->bw_rcvbw_max, rcv_bw);
			tp->rfbuf_cnt += pktlen;

	/* Restart the measurement */
	tp->rfbuf_ts = tcp_now;
/* This function will trim the excess space added to the socket buffer
 * to help a slow-reading app. The ideal-size of a socket buffer depends
 * on the link bandwidth or it is set by an application and we aim to
 */
tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv)

	if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
	    sbrcv->sb_hiwat > sbrcv->sb_idealsize) {

		/* compute the difference between ideal and current sizes */
		u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;

		/* Compute the maximum advertised window for
		 */
		u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;

		/* How much can we trim the receive socket buffer?
		 * 1. it can not be trimmed beyond the max rcv win advertised
		 * 2. if possible, leave 1/16 of bandwidth*delay to
		 * avoid closing the win completely
		 */
		u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));

		/* Sometimes leave can be zero, in that case leave at least
		 * a few segments worth of space.
		 */
			leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;

		trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
		trim = imin(trim, (int32_t)diff);

		sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
/* We may need to trim the send socket buffer size for two reasons:
 * 1. if the rtt seen on the connection is climbing up, we do not
 *    want to fill the buffers any more.
 * 2. if the congestion win on the socket backed off, there is no need
 *    to hold more mbufs for that connection than what the cwnd will allow.
 */
tcp_sbsnd_trim(struct sockbuf *sbsnd)

	if (((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
	    (SB_AUTOSIZE | SB_TRIM)) &&
	    (sbsnd->sb_idealsize > 0) &&
	    (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {

		if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
			trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
			trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
		sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
	if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) {
		sbsnd->sb_flags &= ~(SB_TRIM);
/*
 * If timestamp option was not negotiated on this connection
 * and this connection is on the receiving side of a stream
 * then we can not measure the delay on the link accurately.
 * Instead of enabling automatic receive socket buffer
 * resizing, just give more space to the receive socket buffer.
 */
tcp_sbrcv_tstmp_check(struct tcpcb *tp)

	struct socket *so = tp->t_inpcb->inp_socket;
	u_int32_t newsize = 2 * tcp_recvspace;
	struct sockbuf *sbrcv = &so->so_rcv;

	if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
	    (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
	    (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
		tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize);
/* A receiver will evaluate the flow of packets on a connection
 * to see if it can reduce ack traffic. The receiver will start
 * stretching acks if all of the following conditions are met:
 * 1. tcp_delack_enabled is set to 3
 * 2. If the bytes received in the last 100ms is greater than a threshold
 *    defined by maxseg_unacked
 * 3. If the connection has not been idle for tcp_maxrcvidle period.
 * 4. If the connection has seen enough packets to let the slow-start
 *    finish after connection establishment or after some packet loss.
 *
 * The receiver will stop stretching acks if there is congestion/reordering
 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
 * timer fires while stretching acks, it means that the packet flow has gone
 * below the threshold defined by maxseg_unacked and the receiver will stop
 * stretching acks. The receiver gets no indication when slow-start is completed
 * or when the connection reaches an idle state. That is why we use
 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
 */
tcp_stretch_ack_enable(struct tcpcb *tp, int thflags)

	if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
	    TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) {
		tp->t_flags |= TF_STREAMING_ON;
		tp->t_flags &= ~TF_STREAMING_ON;

	/* If there has been an idle time, reset streaming detection */
	if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) {
		tp->t_flags &= ~TF_STREAMING_ON;

	/*
	 * If there are flags other than TH_ACK set, reset streaming
	 */
	if (thflags & ~TH_ACK) {
		tp->t_flags &= ~TF_STREAMING_ON;

	if (tp->t_flagsext & TF_DISABLE_STRETCHACK) {
		if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) {
			tp->t_flagsext &= ~TF_DISABLE_STRETCHACK;
			tp->rcv_nostrack_pkts = 0;
			tp->rcv_nostrack_ts = 0;
			tp->rcv_nostrack_pkts++;

	if (!(tp->t_flagsext & (TF_NOSTRETCHACK | TF_DISABLE_STRETCHACK)) &&
	    (tp->t_flags & TF_STREAMING_ON) &&
	    (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
	    (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
/*
 * Reset the state related to stretch-ack algorithm. This will make
 * the receiver generate an ack every other packet. The receiver
 * will start re-evaluating the rate at which packets come to decide
 * if it can benefit by lowering the ack traffic.
 */
tcp_reset_stretch_ack(struct tcpcb *tp)

	tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON);
	tp->rcv_by_unackwin = 0;
	tp->rcv_by_unackhalfwin = 0;
	tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;

	/*
	 * When there is packet loss or packet re-ordering or CWR due to
	 * ECN, the sender's congestion window is reduced. In these states,
	 * generate an ack for every other packet for some time to allow
	 * the sender's congestion window to grow.
	 */
	tp->t_flagsext |= TF_RCVUNACK_WAITSS;
	tp->rcv_waitforss = 0;
/*
 * The last packet was a retransmission, check if this ack
 * indicates that the retransmission was spurious.
 *
 * If the connection supports timestamps, we could use it to
 * detect if the last retransmit was not needed. Otherwise,
 * we check if the ACK arrived within RTT/2 window, then it
 * was a mistake to do the retransmit in the first place.
 *
 * This function will return 1 if it is a spurious retransmit,
 */
tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, u_int32_t rxtime)

	int32_t tdiff, bad_rexmt_win;
	bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
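	/*
	 * Illustrative note (not part of the original file): t_srtt is kept
	 * scaled by 2^TCP_RTT_SHIFT, so shifting right by TCP_RTT_SHIFT + 1
	 * yields roughly half the smoothed RTT -- the "RTT/2 window"
	 * mentioned in the header comment.
	 */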
	/* If the ack has ECN CE bit, then cwnd has to be adjusted */
	if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE)) {
	if (TSTMP_SUPPORTED(tp)) {
		if (rxtime > 0 && (to->to_flags & TOF_TS) && to->to_tsecr != 0 &&
		    TSTMP_LT(to->to_tsecr, rxtime)) {
		if ((tp->t_rxtshift == 1 || (tp->t_flagsext & TF_SENT_TLPROBE)) &&
			tdiff = (int32_t)(tcp_now - rxtime);
			if (tdiff < bad_rexmt_win) {
/*
 * Restore congestion window state if a spurious timeout
 */
tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)

	if (TSTMP_SUPPORTED(tp)) {
		u_int32_t fsize, acked;
		fsize = tp->snd_max - th->th_ack;
		acked = BYTES_ACKED(th, tp);

		/*
		 * Implement bad retransmit recovery as
		 * described in RFC 4015.
		 */
		tp->snd_ssthresh = tp->snd_ssthresh_prev;

		/* Initialize cwnd to the initial window */
		if (CC_ALGO(tp)->cwnd_init != NULL) {
			CC_ALGO(tp)->cwnd_init(tp);

		tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		if (tp->t_flags & TF_WASFRECOVERY) {
			ENTER_FASTRECOVERY(tp);

		/* Do not use the loss flight size in this case */
		tp->t_lossflightsize = 0;

	tp->snd_cwnd = max(tp->snd_cwnd, tcp_initial_cwnd(tp));
	tp->snd_recover = tp->snd_recover_prev;
	tp->snd_nxt = tp->snd_max;

	/* Fix send socket buffer to reflect the change in cwnd */
	tcp_bad_rexmt_fix_sndbuf(tp);

	/*
	 * This RTT might reflect the extra delay induced
	 * by the network. Skip using this sample for RTO
	 * calculation and mark the connection so we can
	 * recompute RTT when the next eligible sample is
	 */
	tp->t_flagsext |= TF_RECOMPUTE_RTT;
	tp->t_badrexmt_time = tcp_now;
/*
 * If the previous packet was sent in retransmission timer, and it was
 * not needed, then restore the congestion window to the state before that
 *
 * If the last packet was sent in tail loss probe timeout, check if that
 * recovered the last packet. If so, that will indicate a real loss and
 * the congestion window needs to be lowered.
 */
tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)

	if (tp->t_rxtshift > 0 &&
	    tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
		++tcpstat.tcps_sndrexmitbad;
		tcp_bad_rexmt_restore_state(tp, th);
		tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
	} else if ((tp->t_flagsext & TF_SENT_TLPROBE) && tp->t_tlphighrxt > 0 &&
	    SEQ_GEQ(th->th_ack, tp->t_tlphighrxt) &&
	    !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
		/*
		 * check DSACK information also to make sure that
		 * the TLP was indeed needed
		 */
		if (tcp_rxtseg_dsack_for_tlp(tp)) {
			/*
			 * received a DSACK to indicate that TLP was
			 */
			tcp_rxtseg_clean(tp);

		/*
		 * The tail loss probe recovered the last packet and
		 * we need to adjust the congestion window to take
		 * this loss into account.
		 */
		++tcpstat.tcps_tlp_recoverlastpkt;
		if (!IN_FASTRECOVERY(tp)) {
			tcp_reduce_congestion_window(tp);
			EXIT_FASTRECOVERY(tp);
		tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
	} else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
		/*
		 * All of the retransmitted segments were duplicated, this
		 * can be an indication of bad fast retransmit.
		 */
		tcpstat.tcps_dsack_badrexmt++;
		tcp_bad_rexmt_restore_state(tp, th);
		tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
		tcp_rxtseg_clean(tp);

	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
	tp->t_tlphighrxt = 0;

	/*
	 * check if the latest ack was for a segment sent during PMTU
	 * blackhole detection. If the timestamp on the ack is before
	 * PMTU blackhole detection, then revert the size of the max
	 * segment to previous size.
	 */
	if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
	    tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
		if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
		    && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
			tcp_pmtud_revert_segment_size(tp);
	if (tp->t_pmtud_start_ts > 0) {
		tp->t_pmtud_start_ts = 0;

	tp->t_pmtud_lastseg_size = 0;
/*
 * Check if early retransmit can be attempted according to RFC 5827.
 *
 * If packet reordering is detected on a connection, fast recovery will
 * be delayed until it is clear that the packet was lost and not reordered.
 * But reordering detection is done only when SACK is enabled.
 *
 * On connections that do not support SACK, there is a limit on the number
 * of early retransmits that can be done per minute. This limit is needed
 * to make sure that too many packets are not retransmitted when there is
 * packet reordering.
 */
tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th)

	u_int32_t obytes, snd_off;
	struct socket *so = tp->t_inpcb->inp_socket;

	if ((SACK_ENABLED(tp) || tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
	    SEQ_GT(tp->snd_max, tp->snd_una) &&
	    (tp->t_dupacks == 1 || (SACK_ENABLED(tp) && !TAILQ_EMPTY(&tp->snd_holes)))) {
		/*
		 * If there are only a few outstanding
		 * segments on the connection, we might need
		 * to lower the retransmit threshold. This
		 * will allow us to do Early Retransmit as
		 * described in RFC 5827.
		 */
		if (SACK_ENABLED(tp) &&
		    !TAILQ_EMPTY(&tp->snd_holes)) {
			obytes = (tp->snd_max - tp->snd_fack) +
			    tp->sackhint.sack_bytes_rexmit;
			obytes = (tp->snd_max - tp->snd_una);

		/*
		 * In order to lower retransmit threshold the
		 * following two conditions must be met.
		 * 1. the amount of outstanding data is less
		 * 2. there is no unsent data ready for
		 *    transmission or the advertised window
		 *    will limit sending new segments.
		 */
		snd_off = tp->snd_max - tp->snd_una;
		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
		if (obytes < (tp->t_maxseg << 2) &&
			osegs = obytes / tp->t_maxseg;
			if ((osegs * tp->t_maxseg) < obytes) {

			/*
			 * Since the connection might have already
			 * received some dupacks, we add them to
			 * the outstanding segments count to get
			 * the correct retransmit threshold.
			 *
			 * By checking for early retransmit after
			 * receiving some duplicate acks when SACK
			 * is supported, the connection will
			 * enter fast recovery even if multiple
			 * segments are lost in the same window.
			 */
			osegs += tp->t_dupacks;
			    ((osegs - 1) > 1) ? (osegs - 1) : 1;
			    min(tp->t_rexmtthresh, tcprexmtthresh);
			    max(tp->t_rexmtthresh, tp->t_dupacks);

			if (tp->t_early_rexmt_count == 0) {
				tp->t_early_rexmt_win = tcp_now;

			if (tp->t_flagsext & TF_SENT_TLPROBE) {
				tcpstat.tcps_tlp_recovery++;
				tcp_ccdbg_trace(tp, th,
				    TCP_CC_TLP_RECOVERY);
				tcpstat.tcps_early_rexmt++;
				tp->t_early_rexmt_count++;
				tcp_ccdbg_trace(tp, th,
				    TCP_CC_EARLY_RETRANSMIT);

	/*
	 * If we ever sent a TLP probe, the acknowledgement will trigger
	 * early retransmit because the value of snd_fack will be close
	 * to snd_max. This will take care of adjustments to the
	 * congestion window. So we can reset TF_SENT_PROBE flag.
	 */
	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
	tp->t_tlphighrxt = 0;
tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to)

	u_char out[CCAES_BLOCK_SIZE];

	if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
	    !(tcp_fastopen & TCP_FASTOPEN_SERVER)) {

	if ((to->to_flags & TOF_TFOREQ)) {
		tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;

		tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
		tcpstat.tcps_tfo_cookie_req_rcv++;

	/* Ok, then it must be an offered cookie. We need to check that ... */
	tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));

	len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;

	if (memcmp(out, to->to_tfo, len)) {
		/* Cookies are different! Let's return and offer a new cookie */
		tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;

		tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
		tcpstat.tcps_tfo_cookie_invalid++;

	if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
		/* Need to decrement again as we just increased it... */
		OSDecrementAtomic(&tcp_tfo_halfcnt);

	tp->t_tfo_flags |= TFO_F_COOKIE_VALID;

	tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
	tcpstat.tcps_tfo_syn_data_rcv++;
tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to)

	if (to->to_flags & TOF_TFO) {
		unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;

		/*
		 * If this happens, things have gone terribly wrong. len should
		 * have been checked in tcp_dooptions.
		 */
		VERIFY(len <= TFO_COOKIE_LEN_MAX);

		tcp_cache_set_cookie(tp, to->to_tfo, len);
		tcp_heuristic_tfo_success(tp);

		tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
		tcpstat.tcps_tfo_cookie_rcv++;
		if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) {
			tcpstat.tcps_tfo_cookie_wrong++;
			tp->t_tfo_stats |= TFO_S_COOKIE_WRONG;

		/*
		 * Thus, no cookie in the response, but we either asked for one
		 * or sent SYN+DATA. Now, we need to check whether we had to
		 * rexmit the SYN. If that's the case, it's better to start
		 * backing off TFO-cookie requests.
		 */
		if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
		    tp->t_tfo_flags & TFO_F_SYN_LOSS) {
			tp->t_tfo_stats |= TFO_S_SYN_LOSS;
			tcpstat.tcps_tfo_syn_loss++;

			tcp_heuristic_tfo_loss(tp);
			if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) {
				tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV;
				tcpstat.tcps_tfo_no_cookie_rcv++;

			tcp_heuristic_tfo_success(tp);
tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)

	tp->t_tfo_probe_state = TFO_PROBE_PROBING;

	/*
	 * We send the probe out rather quickly (after one RTO). It does not
	 * really hurt that much, it's only one additional segment on the wire.
	 */
	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));

tcp_tfo_rcv_data(struct tcpcb *tp)

	/* Transition from PROBING to NONE as data has been received */
	if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
		tp->t_tfo_probe_state = TFO_PROBE_NONE;

tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)

	if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
	    tp->t_tfo_probes > 0) {
		if (th->th_seq == tp->rcv_nxt) {
			/* No hole, so stop probing */
			tp->t_tfo_probe_state = TFO_PROBE_NONE;
		} else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
			/* There is a hole! Wait a bit for data... */
			tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
/*
 * Update snd_wnd information.
 */
tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
    u_int32_t tiwin, int tlen)

	/* Don't look at the window if there is no ACK flag */
	if ((thflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd) {
			tp->max_sndwnd = tp->snd_wnd;

		if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) {
			mptcp_update_window_wakeup(tp);
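/*
 * Illustrative note (not part of the original file): snd_wl1 and snd_wl2
 * record the segment sequence and ack numbers of the last window update, so
 * the check above is the classic RFC 793 rule -- accept a window only from a
 * segment that is newer (first by seq, then by ack) than the one that last
 * updated it, which keeps stale segments from shrinking or inflating snd_wnd.
 */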
tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup)

	if (read_wakeup != 0) {
	if (write_wakeup != 0) {

tcp_update_snd_una(struct tcpcb *tp, uint32_t ack)

	if (SACK_ENABLED(tp) && SEQ_LT(tp->send_highest_sack, tp->snd_una)) {
		tp->send_highest_sack = tp->snd_una;

		/* If we move our marker, we need to start fresh */
		tp->t_new_dupacks = 0;

tcp_syn_data_valid(struct tcpcb *tp, struct tcphdr *tcp_hdr, int tlen)

	/* Not the right sequence-number? */
	if (tcp_hdr->th_seq != tp->irs) {

	/* We could have wrapped around, check that */
	if (tp->t_inpcb->inp_stat->rxbytes > INT32_MAX) {
tcp_input(struct mbuf *m, int off0)

	struct ip *ip = NULL;
	u_char *optp = NULL;
	struct tcpcb *tp = 0;
	struct socket *so = 0;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int read_wakeup = 0;
	int write_wakeup = 0;
	struct in_addr laddr;
	struct in6_addr laddr6;
	int iss = 0, nosock = 0;
	u_int32_t tiwin, sack_bytes_acked = 0, sack_bytes_newly_acked = 0;
	struct tcpopt to;               /* options in this segment */
	u_char ip_ecn = IPTOS_ECN_NOTECT;
	unsigned int ifscope;
	uint8_t isconnected, isdisconnected;
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	int segment_count = m->m_pkthdr.seg_cnt ? : 1;
	u_int16_t pf_tag = 0;
	struct mptcb *mp_tp = NULL;
	boolean_t cell = IFNET_IS_CELLULAR(ifp);
	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
	boolean_t recvd_dsack = FALSE;
	struct tcp_respond_args tra;
	boolean_t check_cfil = cfil_filter_present();
	bool findpcb_iterated = false;
	/*
	 * The mbuf may be freed after it has been added to the receive socket
	 * buffer or the reassembly queue, so we reinitialize th to point to a
	 * safe copy of the TCP header
	 */
	struct tcphdr saved_tcphdr = {};
	/*
	 * Save copy of the IPv4/IPv6 header.
	 * Note: use array of uint32_t to silence compiler warning when casting
	 * to a struct ip6_hdr pointer.
	 */
#define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t))
	uint32_t saved_hdr[MAX_IPWORDS];
#define TCP_INC_VAR(stat, npkts) do { \

	if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
	TCP_INC_VAR(tcpstat.tcps_rcvtotal, segment_count);

	struct ip6_hdr *ip6 = NULL;
	struct proc *kernel_proc = current_proc();

	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
	bzero((char *)&to, sizeof(to));

	if (m->m_flags & M_PKTHDR) {
		pf_tag = m_pftag(m)->pftag_tag;

		/*
		 * Expect 32-bit aligned data pointer on
		 * strict-align platforms
		 */
		MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

		/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
		ip6 = mtod(m, struct ip6_hdr *);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);

		if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) {
			TCP_LOG_DROP_PKT(ip6, th, ifp, "IPv6 bad tcp checksum");

		KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
		    (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
		    th->th_seq, th->th_ack, th->th_win);

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			IF_TCP_STATINC(ifp, unspecv6);
			TCP_LOG_DROP_PKT(ip6, th, ifp, "src IPv6 address unspecified");

		DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct tcpcb *, NULL,
		    struct tcphdr *, th);

		ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;

		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof(struct ip)) {
			off0 = sizeof(struct ip);
		if (m->m_len < sizeof(struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == 0) {
				tcpstat.tcps_rcvshort++;

		/* Expect 32-bit aligned data pointer on strict-align platforms */
		MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(void *)((caddr_t)ip + off0);

		if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) {
			TCP_LOG_DROP_PKT(ip, th, ifp, "IPv4 bad tcp checksum");

		/* Re-initialization for later version check */
		ip->ip_v = IPVERSION;
		ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);

		DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);

		KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
		    (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
		    th->th_seq, th->th_ack, th->th_win);

#define TCP_LOG_HDR (isipv6 ? (void *)ip6 : (void *)ip)
1961 * Check that TCP offset makes sense,
1962 * pull out TCP options and adjust length.
1964 off
= th
->th_off
<< 2;
1965 if (off
< sizeof(struct tcphdr
) || off
> tlen
) {
1966 tcpstat
.tcps_rcvbadoff
++;
1967 IF_TCP_STATINC(ifp
, badformat
);
1968 TCP_LOG_DROP_PKT(TCP_LOG_HDR
, th
, ifp
, "bad tcp offset");
1971 tlen
-= off
; /* tlen is used instead of ti->ti_len */
1972 if (off
> sizeof(struct tcphdr
)) {
1974 IP6_EXTHDR_CHECK(m
, off0
, off
, return );
1975 ip6
= mtod(m
, struct ip6_hdr
*);
1976 th
= (struct tcphdr
*)(void *)((caddr_t
)ip6
+ off0
);
1978 if (m
->m_len
< sizeof(struct ip
) + off
) {
1979 if ((m
= m_pullup(m
, sizeof(struct ip
) + off
)) == 0) {
1980 tcpstat
.tcps_rcvshort
++;
1983 ip
= mtod(m
, struct ip
*);
1984 th
= (struct tcphdr
*)(void *)((caddr_t
)ip
+ off0
);
1987 optlen
= off
- sizeof(struct tcphdr
);
1988 optp
= (u_char
*)(th
+ 1);
1990 * Do quick retrieval of timestamp options ("options
1991 * prediction?"). If timestamp is the only option and it's
1992 * formatted as recommended in RFC 1323 appendix A, we
1993 * quickly get the values now and not bother calling
1994 * tcp_dooptions(), etc.
1996 if ((optlen
== TCPOLEN_TSTAMP_APPA
||
1997 (optlen
> TCPOLEN_TSTAMP_APPA
&&
1998 optp
[TCPOLEN_TSTAMP_APPA
] == TCPOPT_EOL
)) &&
1999 *(u_int32_t
*)(void *)optp
== htonl(TCPOPT_TSTAMP_HDR
) &&
2000 (th
->th_flags
& TH_SYN
) == 0) {
2001 to
.to_flags
|= TOF_TS
;
2002 to
.to_tsval
= ntohl(*(u_int32_t
*)(void *)(optp
+ 4));
2003 to
.to_tsecr
= ntohl(*(u_int32_t
*)(void *)(optp
+ 8));
2004 optp
= NULL
; /* we've parsed the options */
2007 thflags
= th
->th_flags
;
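	/*
	 * The quick timestamp retrieval above relies on the RFC 1323
	 * Appendix A layout, where the timestamp option is preceded by two
	 * NOPs so the whole option area is one aligned 12-byte block
	 * (TCPOLEN_TSTAMP_APPA):
	 *
	 *	+-------+-------+---------+--------+
	 *	|  NOP  |  NOP  | kind=8  | len=10 |
	 *	+-------+-------+---------+--------+
	 *	|        TS value (TSval)          |
	 *	+-------+-------+---------+--------+
	 *	|      TS echo reply (TSecr)       |
	 *	+-------+-------+---------+--------+
	 *
	 * TCPOPT_TSTAMP_HDR is exactly that first 32-bit word, so a single
	 * word compare recognizes the common case without walking the
	 * option list.
	 */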
	/*
	 * Drop all packets with both the SYN and FIN bits set.
	 * This prevents e.g. nmap from identifying the TCP/IP stack.
	 *
	 * This is a violation of the TCP specification.
	 */
	if ((thflags & (TH_SYN | TH_FIN)) == (TH_SYN | TH_FIN)) {
		IF_TCP_STATINC(ifp, synfin);
		TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "drop SYN FIN");
		goto dropnosock;
	}

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
	 * until after ip6_savecontrol() is called and before other functions
	 * which don't want those proto headers.
	 * Because ip6_savecontrol() is going to parse the mbuf to
	 * search for data to be passed up to user-land, it wants mbuf
	 * parameters to be unchanged.
	 */
	drop_hdrlen = off0 + off;

	/* Since this is an entry point for input processing of tcp packets, we
	 * can update the tcp clock here.
	 */
	calculate_tcp_clock();

	/*
	 * Record the interface where this segment arrived on; this does not
	 * affect normal data output (for non-detached TCP) as it provides a
	 * hint about which route and interface to use for sending in the
	 * absence of a PCB, when scoped routing (and thus source interface
	 * selection) are enabled.
	 */
	if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL) {
		ifscope = IFSCOPE_NONE;
	} else {
		ifscope = m->m_pkthdr.rcvif->if_index;
	}
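	/*
	 * Looped-back segments (PKTF_LOOP) and segments with no receive
	 * interface are treated as unscoped here, presumably so that a reply
	 * generated without a PCB is not pinned to the loopback interface.
	 */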
	/*
	 * Convert TCP protocol specific fields to host format.
	 */
#if BYTE_ORDER != BIG_ENDIAN
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);
#endif

	/*
	 * Locate pcb for segment.
	 */
findpcb:

	isconnected = FALSE;
	isdisconnected = FALSE;

	if (isipv6) {
		inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
		    &ip6->ip6_dst, th->th_dport, 1,
		    m->m_pkthdr.rcvif);
	} else {
		inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
	}

	/*
	 * Use the interface scope information from the PCB for outbound
	 * segments.  If the PCB isn't present and if scoped routing is
	 * enabled, tcp_respond will use the scope of the interface where
	 * the segment arrived on.
	 */
	if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) {
		ifscope = inp->inp_boundifp->if_index;
	}
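	/*
	 * The wildcard argument (1) in the lookups above allows falling back
	 * to a listening PCB bound to the unspecified address when no
	 * fully-specified 4-tuple match exists; this is how an incoming SYN
	 * finds a socket listening on INADDR_ANY / in6addr_any.
	 */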
	/*
	 * If the state is CLOSED (i.e., TCB does not exist) then
	 * all data in the incoming segment is discarded.
	 * If the TCB exists but is in CLOSED state, it is embryonic,
	 * but should either do a listen or a connect soon.
	 */
	if (inp == NULL) {
		if (log_in_vain) {
			char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];

			if (isipv6) {
				inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
				inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
			} else {
				inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
				inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
			}
			switch (log_in_vain) {
			case 1:
				if (thflags & TH_SYN) {
					log(LOG_INFO,
					    "Connection attempt to TCP %s:%d from %s:%d\n",
					    dbuf, ntohs(th->th_dport),
					    sbuf,
					    ntohs(th->th_sport));
				}
				break;
			case 2:
				log(LOG_INFO,
				    "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
				    dbuf, ntohs(th->th_dport), sbuf,
				    ntohs(th->th_sport), thflags);
				break;
			case 3:
			case 4:
				if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
				    !(m->m_flags & (M_BCAST | M_MCAST)) &&
				    ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
				    (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))) {
					log_in_vain_log((LOG_INFO,
					    "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
					    dbuf, ntohs(th->th_dport),
					    sbuf,
					    ntohs(th->th_sport)));
				}
				break;
			}
		}
		if (blackhole) {
			if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) {
				switch (blackhole) {
				case 1:
					if (thflags & TH_SYN) {
						TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port");
						goto dropnosock;
					}
					break;
				case 2:
					TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port");
					goto dropnosock;
				default:
					TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port");
					goto dropnosock;
				}
			}
		}
		IF_TCP_STATINC(ifp, noconnnolist);
		TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port");
		goto dropwithresetnosock;
	}
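	/*
	 * Net effect of the closed-port handling above: with the blackhole
	 * knob set, segments that match no PCB on a non-loopback interface
	 * are dropped silently (a value of 1 restricts this to SYNs, any
	 * other non-zero value drops every such segment); otherwise the
	 * segment is answered with a RST via dropwithresetnosock.
	 */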
	so = inp->inp_socket;
	if (so == NULL) {
		/* This case shouldn't happen, as the socket shouldn't be null
		 * if inp_state isn't set to INPCB_STATE_DEAD.
		 * But just in case, we pretend we didn't find the socket if we hit this case
		 * as this isn't cause for a panic (the socket might be leaked however)...
		 */
		inp = NULL;
#if TEMPDEBUG
		printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
#endif
		TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL");
		goto dropnosock;
	}

	socket_lock(so, 1);
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		inp = NULL;	// pretend we didn't find it
		TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING");
		goto dropnosock;
	}
	if (!isipv6 && inp->inp_faddr.s_addr != INADDR_ANY) {
		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr ||
		    inp->inp_laddr.s_addr != ip->ip_dst.s_addr ||
		    inp->inp_fport != th->th_sport ||
		    inp->inp_lport != th->th_dport) {
			os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
			    __func__,
			    ntohs(inp->inp_fport), ntohs(th->th_sport),
			    ntohs(inp->inp_lport), ntohs(th->th_dport));
			if (findpcb_iterated) {
				goto drop;
			}
			findpcb_iterated = true;
			socket_unlock(so, 1);
			goto findpcb;
		}
	} else if (isipv6 && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
		if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) ||
		    !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst) ||
		    inp->inp_fport != th->th_sport ||
		    inp->inp_lport != th->th_dport) {
			os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
			    __func__,
			    ntohs(inp->inp_fport), ntohs(th->th_sport),
			    ntohs(inp->inp_lport), ntohs(th->th_dport));
			if (findpcb_iterated) {
				goto drop;
			}
			findpcb_iterated = true;
			socket_unlock(so, 1);
			goto findpcb;
		}
	}
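	/*
	 * The PCB hash lookup runs before the socket lock is taken, so the PCB
	 * may have been disconnected or reused for another flow in the
	 * meantime.  The checks above re-validate the 5-tuple under the lock
	 * and retry the lookup once (findpcb_iterated) before giving up on the
	 * segment.
	 */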
	tp = intotcpcb(inp);
	if (tp == NULL) {
		IF_TCP_STATINC(ifp, noconnlist);
		TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL");
		goto dropwithreset;
	}

	TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp);

	if (tp->t_state == TCPS_CLOSED) {
		TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED");
		goto drop;
	}
#if NECP
	if (so->so_state & SS_ISCONNECTED) {
		// Connected TCP sockets have a fully-bound local and remote,
		// so the policy check doesn't need to override addresses
		if (!necp_socket_is_allowed_to_send_recv(inp, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
			TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
			IF_TCP_STATINC(ifp, badformat);
			goto drop;
		}
	} else {
		/*
		 * If the proc_uuid_policy table has been updated since the last use
		 * of the listening socket (i.e., the proc_uuid_policy_table_gencount
		 * has been updated), the flags in the socket may be out of date.
		 * If INP2_WANT_APP_POLICY is stale, inbound packets may
		 * be dropped by NECP if the socket should now match a per-app
		 * exception policy.
		 * In order to avoid this refresh the proc_uuid_policy state to
		 * potentially recalculate the socket's flags before checking
		 * with NECP.
		 */
		(void) inp_update_policy(inp);

		if (isipv6) {
			if (!necp_socket_is_allowed_to_send_recv_v6(inp,
			    th->th_dport, th->th_sport, &ip6->ip6_dst,
			    &ip6->ip6_src, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
				TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
				IF_TCP_STATINC(ifp, badformat);
				goto drop;
			}
		} else {
			if (!necp_socket_is_allowed_to_send_recv_v4(inp,
			    th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
			    ifp, pf_tag, NULL, NULL, NULL, NULL)) {
				TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
				IF_TCP_STATINC(ifp, badformat);
				goto drop;
			}
		}
	}
#endif /* NECP */
	prev_t_state = tp->t_state;

	/* If none of the FIN|SYN|RST|ACK flags is set, drop */
	if ((thflags & TH_ACCEPT) == 0) {
		TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0");
		goto drop;
	}

	/* Unscale the window into a 32-bit value. */
	if ((thflags & TH_SYN) == 0) {
		tiwin = th->th_win << tp->snd_scale;
	} else {
		tiwin = th->th_win;
	}

	/* Avoid processing packets while closing a listen socket */
	if (tp->t_state == TCPS_LISTEN &&
	    (so->so_options & SO_ACCEPTCONN) == 0) {
		TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket");
		goto drop;
	}
2297 if (so
->so_options
& (SO_DEBUG
| SO_ACCEPTCONN
)) {
2299 if (so
->so_options
& SO_DEBUG
) {
2300 ostate
= tp
->t_state
;
2302 bcopy((char *)ip6
, (char *)tcp_saveipgen
,
2305 bcopy((char *)ip
, (char *)tcp_saveipgen
, sizeof(*ip
));
2310 if (so
->so_options
& SO_ACCEPTCONN
) {
2311 struct tcpcb
*tp0
= tp
;
2314 struct sockaddr_storage from
;
2315 struct sockaddr_storage to2
;
2316 struct inpcb
*oinp
= sotoinpcb(so
);
2317 struct ifnet
*head_ifscope
;
2318 unsigned int head_nocell
, head_recvanyif
,
2319 head_noexpensive
, head_awdl_unrestricted
,
2320 head_intcoproc_allowed
, head_external_port
,
2323 /* Get listener's bound-to-interface, if any */
2324 head_ifscope
= (inp
->inp_flags
& INP_BOUND_IF
) ?
2325 inp
->inp_boundifp
: NULL
;
2326 /* Get listener's no-cellular information, if any */
2327 head_nocell
= INP_NO_CELLULAR(inp
);
2328 /* Get listener's recv-any-interface, if any */
2329 head_recvanyif
= (inp
->inp_flags
& INP_RECV_ANYIF
);
2330 /* Get listener's no-expensive information, if any */
2331 head_noexpensive
= INP_NO_EXPENSIVE(inp
);
2332 head_noconstrained
= INP_NO_CONSTRAINED(inp
);
2333 head_awdl_unrestricted
= INP_AWDL_UNRESTRICTED(inp
);
2334 head_intcoproc_allowed
= INP_INTCOPROC_ALLOWED(inp
);
2335 head_external_port
= (inp
->inp_flags2
& INP2_EXTERNAL_PORT
);
2338 * If the state is LISTEN then ignore segment if it contains an RST.
2339 * If the segment contains an ACK then it is bad and send a RST.
2340 * If it does not contain a SYN then it is not interesting; drop it.
2341 * If it is from this socket, drop it, it must be forged.
2343 if ((thflags
& (TH_RST
| TH_ACK
| TH_SYN
)) != TH_SYN
) {
2344 IF_TCP_STATINC(ifp
, listbadsyn
);
2346 if (thflags
& TH_RST
) {
2347 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN with RST");
2350 if (thflags
& TH_ACK
) {
2351 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN with ACK");
2353 tcpstat
.tcps_badsyn
++;
2357 /* We come here if there is no SYN set */
2358 tcpstat
.tcps_badsyn
++;
2359 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "bad SYN");
2362 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN
| DBG_FUNC_START
, 0, 0, 0, 0, 0);
2363 if (th
->th_dport
== th
->th_sport
) {
2365 if (IN6_ARE_ADDR_EQUAL(&ip6
->ip6_dst
,
2367 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "bad tuple same port");
2370 } else if (ip
->ip_dst
.s_addr
== ip
->ip_src
.s_addr
) {
2371 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "bad tuple same IPv4 address");
2376 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2377 * in_broadcast() should never return true on a received
2378 * packet with M_BCAST not set.
2380 * Packets with a multicast source address should also
2383 if (m
->m_flags
& (M_BCAST
| M_MCAST
)) {
2384 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "mbuf M_BCAST | M_MCAST");
2388 if (IN6_IS_ADDR_MULTICAST(&ip6
->ip6_dst
) ||
2389 IN6_IS_ADDR_MULTICAST(&ip6
->ip6_src
)) {
2390 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "IN6_IS_ADDR_MULTICAST");
2393 } else if (IN_MULTICAST(ntohl(ip
->ip_dst
.s_addr
)) ||
2394 IN_MULTICAST(ntohl(ip
->ip_src
.s_addr
)) ||
2395 ip
->ip_src
.s_addr
== htonl(INADDR_BROADCAST
) ||
2396 in_broadcast(ip
->ip_dst
, m
->m_pkthdr
.rcvif
)) {
2397 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "multicast or broadcast address");
2403 * If deprecated address is forbidden,
2404 * we do not accept SYN to deprecated interface
2405 * address to prevent any new inbound connection from
2406 * getting established.
2407 * When we do not accept SYN, we send a TCP RST,
2408 * with deprecated source address (instead of dropping
2409 * it). We compromise it as it is much better for peer
2410 * to send a RST, and RST will be the final packet
2413 * If we do not forbid deprecated addresses, we accept
2414 * the SYN packet. RFC 4862 forbids dropping SYN in
2417 if (isipv6
&& !ip6_use_deprecated
) {
2420 if (ip6_getdstifaddr_info(m
, NULL
,
2422 if (ia6_flags
& IN6_IFF_DEPRECATED
) {
2424 IF_TCP_STATINC(ifp
, deprecate6
);
2425 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "deprecated IPv6 address");
2430 if (so
->so_filt
|| check_cfil
) {
2432 struct sockaddr_in6
*sin6
= (struct sockaddr_in6
*)&from
;
2434 sin6
->sin6_len
= sizeof(*sin6
);
2435 sin6
->sin6_family
= AF_INET6
;
2436 sin6
->sin6_port
= th
->th_sport
;
2437 sin6
->sin6_flowinfo
= 0;
2438 sin6
->sin6_addr
= ip6
->ip6_src
;
2439 sin6
->sin6_scope_id
= 0;
2441 sin6
= (struct sockaddr_in6
*)&to2
;
2443 sin6
->sin6_len
= sizeof(struct sockaddr_in6
);
2444 sin6
->sin6_family
= AF_INET6
;
2445 sin6
->sin6_port
= th
->th_dport
;
2446 sin6
->sin6_flowinfo
= 0;
2447 sin6
->sin6_addr
= ip6
->ip6_dst
;
2448 sin6
->sin6_scope_id
= 0;
2450 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&from
;
2452 sin
->sin_len
= sizeof(*sin
);
2453 sin
->sin_family
= AF_INET
;
2454 sin
->sin_port
= th
->th_sport
;
2455 sin
->sin_addr
= ip
->ip_src
;
2457 sin
= (struct sockaddr_in
*)&to2
;
2459 sin
->sin_len
= sizeof(struct sockaddr_in
);
2460 sin
->sin_family
= AF_INET
;
2461 sin
->sin_port
= th
->th_dport
;
2462 sin
->sin_addr
= ip
->ip_dst
;
2467 so2
= sonewconn(so
, 0, (struct sockaddr
*)&from
);
2469 so2
= sonewconn(so
, 0, NULL
);
2472 tcpstat
.tcps_listendrop
++;
2473 if (tcp_dropdropablreq(so
)) {
2475 so2
= sonewconn(so
, 0, (struct sockaddr
*)&from
);
2477 so2
= sonewconn(so
, 0, NULL
);
2481 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, " listen drop");
2486 /* Point "inp" and "tp" in tandem to new socket */
2487 inp
= (struct inpcb
*)so2
->so_pcb
;
2488 tp
= intotcpcb(inp
);
2491 socket_unlock(so
, 0); /* Unlock but keep a reference on listener for now */
2496 * Mark socket as temporary until we're
2497 * committed to keeping it. The code at
2498 * ``drop'' and ``dropwithreset'' check the
2499 * flag dropsocket to see if the temporary
2500 * socket created here should be discarded.
2501 * We mark the socket as discardable until
2502 * we're committed to it below in TCPS_LISTEN.
2503 * There are some error conditions in which we
2504 * have to drop the temporary socket.
2508 * Inherit INP_BOUND_IF from listener; testing if
2509 * head_ifscope is non-NULL is sufficient, since it
2510 * can only be set to a non-zero value earlier if
2511 * the listener has such a flag set.
2513 if (head_ifscope
!= NULL
) {
2514 inp
->inp_flags
|= INP_BOUND_IF
;
2515 inp
->inp_boundifp
= head_ifscope
;
2517 inp
->inp_flags
&= ~INP_BOUND_IF
;
2520 * Inherit restrictions from listener.
2523 inp_set_nocellular(inp
);
2525 if (head_noexpensive
) {
2526 inp_set_noexpensive(inp
);
2528 if (head_noconstrained
) {
2529 inp_set_noconstrained(inp
);
2531 if (head_awdl_unrestricted
) {
2532 inp_set_awdl_unrestricted(inp
);
2534 if (head_intcoproc_allowed
) {
2535 inp_set_intcoproc_allowed(inp
);
2538 * Inherit {IN,IN6}_RECV_ANYIF from listener.
2540 if (head_recvanyif
) {
2541 inp
->inp_flags
|= INP_RECV_ANYIF
;
2543 inp
->inp_flags
&= ~INP_RECV_ANYIF
;
2546 if (head_external_port
) {
2547 inp
->inp_flags2
|= INP2_EXTERNAL_PORT
;
2550 inp
->in6p_laddr
= ip6
->ip6_dst
;
2552 inp
->inp_vflag
&= ~INP_IPV6
;
2553 inp
->inp_vflag
|= INP_IPV4
;
2554 inp
->inp_laddr
= ip
->ip_dst
;
2556 inp
->inp_lport
= th
->th_dport
;
2557 if (in_pcbinshash(inp
, 0) != 0) {
2559 * Undo the assignments above if we failed to
2560 * put the PCB on the hash lists.
2563 inp
->in6p_laddr
= in6addr_any
;
2565 inp
->inp_laddr
.s_addr
= INADDR_ANY
;
2568 socket_lock(oso
, 0); /* release ref on parent */
2569 socket_unlock(oso
, 1);
2570 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, " in_pcbinshash failed");
2573 socket_lock(oso
, 0);
2576 * Inherit socket options from the listening
2578 * Note that in6p_inputopts are not (even
2579 * should not be) copied, since it stores
2580 * previously received options and is used to
2581 * detect if each new option is different than
2582 * the previous one and hence should be passed
2584 * If we copied in6p_inputopts, a user would
2585 * not be able to receive options just after
2586 * calling the accept system call.
2589 oinp
->inp_flags
& INP_CONTROLOPTS
;
2590 if (oinp
->in6p_outputopts
) {
2591 inp
->in6p_outputopts
=
2592 ip6_copypktopts(oinp
->in6p_outputopts
,
2596 inp
->inp_options
= ip_srcroute();
2597 inp
->inp_ip_tos
= oinp
->inp_ip_tos
;
2600 /* copy old policy into new socket's */
2601 if (sotoinpcb(oso
)->inp_sp
) {
2603 /* Is it a security hole here to silently fail to copy the policy? */
2604 if (inp
->inp_sp
!= NULL
) {
2605 error
= ipsec_init_policy(so
, &inp
->inp_sp
);
2607 if (error
!= 0 || ipsec_copy_policy(sotoinpcb(oso
)->inp_sp
, inp
->inp_sp
)) {
2608 printf("tcp_input: could not copy policy\n");
2612 /* inherit states from the listener */
2613 DTRACE_TCP4(state__change
, void, NULL
, struct inpcb
*, inp
,
2614 struct tcpcb
*, tp
, int32_t, TCPS_LISTEN
);
2615 tp
->t_state
= TCPS_LISTEN
;
2616 tp
->t_flags
|= tp0
->t_flags
& (TF_NOPUSH
| TF_NOOPT
| TF_NODELAY
);
2617 tp
->t_flagsext
|= (tp0
->t_flagsext
& (TF_RXTFINDROP
| TF_NOTIMEWAIT
| TF_FASTOPEN
));
2618 tp
->t_keepinit
= tp0
->t_keepinit
;
2619 tp
->t_keepcnt
= tp0
->t_keepcnt
;
2620 tp
->t_keepintvl
= tp0
->t_keepintvl
;
2621 tp
->t_adaptive_wtimo
= tp0
->t_adaptive_wtimo
;
2622 tp
->t_adaptive_rtimo
= tp0
->t_adaptive_rtimo
;
2623 tp
->t_inpcb
->inp_ip_ttl
= tp0
->t_inpcb
->inp_ip_ttl
;
2624 if ((so
->so_flags
& SOF_NOTSENT_LOWAT
) != 0) {
2625 tp
->t_notsent_lowat
= tp0
->t_notsent_lowat
;
2627 tp
->t_inpcb
->inp_flags2
|=
2628 tp0
->t_inpcb
->inp_flags2
& INP2_KEEPALIVE_OFFLOAD
;
2630 /* now drop the reference on the listener */
2631 socket_unlock(oso
, 1);
2633 tcp_set_max_rwinscale(tp
, so
);
2637 int error
= cfil_sock_attach(so2
, (struct sockaddr
*)&to2
, (struct sockaddr
*)&from
,
2638 CFS_CONNECTION_DIR_IN
);
2640 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, " cfil_sock_attach failed");
2644 #endif /* CONTENT_FILTER */
2646 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
2649 socket_lock_assert_owned(so
);
2651 if (net_mpklog_enabled
&& (m
->m_pkthdr
.rcvif
->if_xflags
& IFXF_MPK_LOG
)) {
2652 MPKL_TCP_INPUT(tcp_mpkl_log_object
,
2653 ntohs(tp
->t_inpcb
->inp_lport
), ntohs(tp
->t_inpcb
->inp_fport
),
2654 th
->th_seq
, th
->th_ack
, tlen
, thflags
,
2655 so
->last_pid
, so
->so_log_seqn
++);
2658 if (tp
->t_state
== TCPS_ESTABLISHED
&& tlen
> 0) {
2660 * Evaluate the rate of arrival of packets to see if the
2661 * receiver can reduce the ack traffic. The algorithm to
2662 * stretch acks will be enabled if the connection meets
2663 * certain criteria defined in tcp_stretch_ack_enable function.
2665 if ((tp
->t_flagsext
& TF_RCVUNACK_WAITSS
) != 0) {
2666 TCP_INC_VAR(tp
->rcv_waitforss
, segment_count
);
2668 if (tcp_stretch_ack_enable(tp
, thflags
)) {
2669 tp
->t_flags
|= TF_STRETCHACK
;
2670 tp
->t_flagsext
&= ~(TF_RCVUNACK_WAITSS
);
2671 tp
->rcv_waitforss
= 0;
2673 tp
->t_flags
&= ~(TF_STRETCHACK
);
2675 if (TSTMP_GT(tp
->rcv_unackwin
- (tcp_rcvunackwin
>> 1), tcp_now
)) {
2676 tp
->rcv_by_unackhalfwin
+= (tlen
+ off
);
2677 tp
->rcv_by_unackwin
+= (tlen
+ off
);
2679 tp
->rcv_unackwin
= tcp_now
+ tcp_rcvunackwin
;
2680 tp
->rcv_by_unackwin
= tp
->rcv_by_unackhalfwin
+ tlen
+ off
;
2681 tp
->rcv_by_unackhalfwin
= tlen
+ off
;
2686 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2687 * bother doing extensive checks for state and whatnot.
2689 if (thflags
& TH_CWR
) {
2690 tp
->ecn_flags
&= ~TE_SENDECE
;
2691 tp
->t_ecn_recv_cwr
++;
2695 * Explicit Congestion Notification - Flag that we need to send ECT if
2696 * + The IP Congestion experienced flag was set.
2697 * + Socket is in established state
2698 * + We negotiated ECN in the TCP setup
2699 * + This isn't a pure ack (tlen > 0)
2700 * + The data is in the valid window
2702 * TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2704 if (ip_ecn
== IPTOS_ECN_CE
&& tp
->t_state
== TCPS_ESTABLISHED
&&
2705 TCP_ECN_ENABLED(tp
) && tlen
> 0 &&
2706 SEQ_GEQ(th
->th_seq
, tp
->last_ack_sent
) &&
2707 SEQ_LT(th
->th_seq
, tp
->last_ack_sent
+ tp
->rcv_wnd
)) {
2708 tp
->t_ecn_recv_ce
++;
2709 tcpstat
.tcps_ecn_recv_ce
++;
2710 INP_INC_IFNET_STAT(inp
, ecn_recv_ce
);
2711 /* Mark this connection as it received CE from network */
2712 tp
->ecn_flags
|= TE_RECV_ECN_CE
;
2713 tp
->ecn_flags
|= TE_SENDECE
;
2717 * If we received an explicit notification of congestion in
2718 * ip tos ecn bits or by the CWR bit in TCP header flags, reset
2719 * the ack-stretching state. We need to handle ECN notification if
2720 * an ECN setup SYN was sent even once.
2722 if (tp
->t_state
== TCPS_ESTABLISHED
&&
2723 (tp
->ecn_flags
& TE_SETUPSENT
) &&
2724 (ip_ecn
== IPTOS_ECN_CE
|| (thflags
& TH_CWR
))) {
2725 tcp_reset_stretch_ack(tp
);
2726 tp
->t_forced_acks
= TCP_FORCED_ACKS_COUNT
;
2727 CLEAR_IAJ_STATE(tp
);
2730 if (ip_ecn
== IPTOS_ECN_CE
&& tp
->t_state
== TCPS_ESTABLISHED
&&
2731 !TCP_ECN_ENABLED(tp
) && !(tp
->ecn_flags
& TE_CEHEURI_SET
)) {
2732 tcpstat
.tcps_ecn_fallback_ce
++;
2733 tcp_heuristic_ecn_aggressive(tp
);
2734 tp
->ecn_flags
|= TE_CEHEURI_SET
;
2737 if (tp
->t_state
== TCPS_ESTABLISHED
&& TCP_ECN_ENABLED(tp
) &&
2738 ip_ecn
== IPTOS_ECN_CE
&& !(tp
->ecn_flags
& TE_CEHEURI_SET
)) {
2739 if (inp
->inp_stat
->rxpackets
< ECN_MIN_CE_PROBES
) {
2740 tp
->t_ecn_recv_ce_pkt
++;
2741 } else if (tp
->t_ecn_recv_ce_pkt
> ECN_MAX_CE_RATIO
) {
2742 tcpstat
.tcps_ecn_fallback_ce
++;
2743 tcp_heuristic_ecn_aggressive(tp
);
2744 tp
->ecn_flags
|= TE_CEHEURI_SET
;
2745 INP_INC_IFNET_STAT(inp
, ecn_fallback_ce
);
2747 /* We tracked the first ECN_MIN_CE_PROBES segments, we
2748 * now know that the path is good.
2750 tp
->ecn_flags
|= TE_CEHEURI_SET
;
2754 /* Update rcvtime as a new segment was received on the connection */
2755 tp
->t_rcvtime
= tcp_now
;
2758 * Segment received on connection.
2759 * Reset idle time and keep-alive timer.
2761 if (TCPS_HAVEESTABLISHED(tp
->t_state
)) {
2762 tcp_keepalive_reset(tp
);
2765 mptcp_reset_keepalive(tp
);
2770 * Process options if not in LISTEN state,
2771 * else do it below (after getting remote address).
2773 if (tp
->t_state
!= TCPS_LISTEN
&& optp
) {
2774 tcp_dooptions(tp
, optp
, optlen
, th
, &to
);
2777 if (tp
->t_state
!= TCPS_LISTEN
&& (so
->so_flags
& SOF_MP_SUBFLOW
) &&
2778 mptcp_input_preproc(tp
, m
, th
, drop_hdrlen
) != 0) {
2779 tp
->t_flags
|= TF_ACKNOW
;
2780 (void) tcp_output(tp
);
2781 tcp_check_timer_state(tp
);
2782 socket_unlock(so
, 1);
2786 if (tp
->t_state
== TCPS_SYN_SENT
&& (thflags
& TH_SYN
)) {
2787 if (!(thflags
& TH_ACK
) ||
2788 (SEQ_GT(th
->th_ack
, tp
->iss
) &&
2789 SEQ_LEQ(th
->th_ack
, tp
->snd_max
))) {
2790 tcp_finalize_options(tp
, &to
, ifscope
);
2796 * Compute inter-packet arrival jitter. According to RFC 3550,
2797 * inter-packet arrival jitter is defined as the difference in
2798 * packet spacing at the receiver compared to the sender for a
2799 * pair of packets. When two packets of maximum segment size come
2800 * one after the other with consecutive sequence numbers, we
2801 * consider them as packets sent together at the sender and use
2802 * them as a pair to compute inter-packet arrival jitter. This
2803 * metric indicates the delay induced by the network components due
2804 * to queuing in edge/access routers.
2806 if (tp
->t_state
== TCPS_ESTABLISHED
&&
2807 (thflags
& (TH_SYN
| TH_FIN
| TH_RST
| TH_URG
| TH_ACK
| TH_ECE
| TH_PUSH
)) == TH_ACK
&&
2808 ((tp
->t_flags
& TF_NEEDFIN
) == 0) &&
2809 ((to
.to_flags
& TOF_TS
) == 0 ||
2810 TSTMP_GEQ(to
.to_tsval
, tp
->ts_recent
)) &&
2811 th
->th_seq
== tp
->rcv_nxt
&& LIST_EMPTY(&tp
->t_segq
)) {
2812 int seg_size
= tlen
;
2813 if (tp
->iaj_pktcnt
<= IAJ_IGNORE_PKTCNT
) {
2814 TCP_INC_VAR(tp
->iaj_pktcnt
, segment_count
);
2817 if (tp
->iaj_size
== 0 || seg_size
> tp
->iaj_size
||
2818 (seg_size
== tp
->iaj_size
&& tp
->iaj_rcv_ts
== 0)) {
2820 * State related to inter-arrival jitter is
2821 * uninitialized or we are trying to find a good
2822 * first packet to start computing the metric
2824 update_iaj_state(tp
, seg_size
, 0);
2826 if (seg_size
== tp
->iaj_size
) {
2828 * Compute inter-arrival jitter taking
2829 * this packet as the second packet
2833 if (seg_size
< tp
->iaj_size
) {
2835 * There is a smaller packet in the stream.
2836 * Some times the maximum size supported
2837 * on a path can change if there is a new
2838 * link with smaller MTU. The receiver will
2839 * not know about this change. If there
2840 * are too many packets smaller than
2841 * iaj_size, we try to learn the iaj_size
2844 TCP_INC_VAR(tp
->iaj_small_pkt
, segment_count
);
2845 if (tp
->iaj_small_pkt
> RESET_IAJ_SIZE_THRESH
) {
2846 update_iaj_state(tp
, seg_size
, 1);
2848 CLEAR_IAJ_STATE(tp
);
2851 update_iaj_state(tp
, seg_size
, 0);
2855 CLEAR_IAJ_STATE(tp
);
2857 #endif /* TRAFFIC_MGT */
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED above, it can only
	 * be TH_NEEDSYN.
	 */
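	/*
	 * The predicate below additionally requires that the advertised window
	 * is unchanged (tiwin == snd_wnd) and that nothing is being
	 * retransmitted (snd_nxt == snd_max); window updates and
	 * retransmission recovery both need the full slow path, so such
	 * segments are not allowed to short-circuit here.
	 */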
2876 if (tp
->t_state
== TCPS_ESTABLISHED
&&
2877 (thflags
& (TH_SYN
| TH_FIN
| TH_RST
| TH_URG
| TH_ACK
| TH_ECE
| TH_CWR
)) == TH_ACK
&&
2878 ((tp
->t_flags
& TF_NEEDFIN
) == 0) &&
2879 ((to
.to_flags
& TOF_TS
) == 0 ||
2880 TSTMP_GEQ(to
.to_tsval
, tp
->ts_recent
)) &&
2881 th
->th_seq
== tp
->rcv_nxt
&&
2882 tiwin
&& tiwin
== tp
->snd_wnd
&&
2883 tp
->snd_nxt
== tp
->snd_max
) {
2885 * If last ACK falls within this segment's sequence numbers,
2886 * record the timestamp.
2887 * NOTE that the test is modified according to the latest
2888 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2890 if ((to
.to_flags
& TOF_TS
) != 0 &&
2891 SEQ_LEQ(th
->th_seq
, tp
->last_ack_sent
)) {
2892 tp
->ts_recent_age
= tcp_now
;
2893 tp
->ts_recent
= to
.to_tsval
;
2897 if (SEQ_GT(th
->th_ack
, tp
->snd_una
) &&
2898 SEQ_LEQ(th
->th_ack
, tp
->snd_max
) &&
2899 tp
->snd_cwnd
>= tp
->snd_ssthresh
&&
2900 (!IN_FASTRECOVERY(tp
) &&
2901 ((!(SACK_ENABLED(tp
)) &&
2902 tp
->t_dupacks
< tp
->t_rexmtthresh
) ||
2903 (SACK_ENABLED(tp
) && to
.to_nsacks
== 0 &&
2904 TAILQ_EMPTY(&tp
->snd_holes
))))) {
2906 * this is a pure ack for outstanding data.
2908 ++tcpstat
.tcps_predack
;
2910 tcp_bad_rexmt_check(tp
, th
, &to
);
2912 /* Recalculate the RTT */
2913 tcp_compute_rtt(tp
, &to
, th
);
2915 VERIFY(SEQ_GEQ(th
->th_ack
, tp
->snd_una
));
2916 acked
= BYTES_ACKED(th
, tp
);
2917 tcpstat
.tcps_rcvackpack
++;
2918 tcpstat
.tcps_rcvackbyte
+= acked
;
2921 * Handle an ack that is in sequence during
2922 * congestion avoidance phase. The
2923 * calculations in this function
2924 * assume that snd_una is not updated yet.
2926 if (CC_ALGO(tp
)->congestion_avd
!= NULL
) {
2927 CC_ALGO(tp
)->congestion_avd(tp
, th
);
2929 tcp_ccdbg_trace(tp
, th
, TCP_CC_INSEQ_ACK_RCVD
);
2930 sbdrop(&so
->so_snd
, acked
);
2931 tcp_sbsnd_trim(&so
->so_snd
);
2933 if (SEQ_GT(tp
->snd_una
, tp
->snd_recover
) &&
2934 SEQ_LEQ(th
->th_ack
, tp
->snd_recover
)) {
2935 tp
->snd_recover
= th
->th_ack
- 1;
2938 tcp_update_snd_una(tp
, th
->th_ack
);
2940 TCP_RESET_REXMT_STATE(tp
);
2943 * pull snd_wl2 up to prevent seq wrap relative
2946 tp
->snd_wl2
= th
->th_ack
;
2948 if (tp
->t_dupacks
> 0) {
2950 tp
->t_rexmtthresh
= tcprexmtthresh
;
2951 tp
->t_new_dupacks
= 0;
2954 tp
->sackhint
.sack_bytes_acked
= 0;
2957 * If all outstanding data are acked, stop
2958 * retransmit timer, otherwise restart timer
2959 * using current (possibly backed-off) value.
2960 * If process is waiting for space,
2961 * wakeup/selwakeup/signal. If data
2962 * are ready to send, let tcp_output
2963 * decide between more output or persist.
2965 if (tp
->snd_una
== tp
->snd_max
) {
2966 tp
->t_timer
[TCPT_REXMT
] = 0;
2967 tp
->t_timer
[TCPT_PTO
] = 0;
2968 } else if (tp
->t_timer
[TCPT_PERSIST
] == 0) {
2969 tp
->t_timer
[TCPT_REXMT
] = OFFSET_FROM_START(tp
, tp
->t_rxtcur
);
2971 if (!SLIST_EMPTY(&tp
->t_rxt_segments
) &&
2972 !TCP_DSACK_SEQ_IN_WINDOW(tp
,
2973 tp
->t_dsack_lastuna
, tp
->snd_una
)) {
2974 tcp_rxtseg_clean(tp
);
2977 if ((tp
->t_flagsext
& TF_MEASURESNDBW
) != 0 &&
2978 tp
->t_bwmeas
!= NULL
) {
2979 tcp_bwmeas_check(tp
);
2983 if (!SLIST_EMPTY(&tp
->t_notify_ack
)) {
2984 tcp_notify_acknowledgement(tp
, so
);
2987 if ((so
->so_snd
.sb_cc
) || (tp
->t_flags
& TF_ACKNOW
)) {
2988 (void) tcp_output(tp
);
2991 tcp_tfo_rcv_ack(tp
, th
);
2995 tcp_check_timer_state(tp
);
2997 tcp_handle_wakeup(so
, read_wakeup
, write_wakeup
);
2999 socket_unlock(so
, 1);
3000 KERNEL_DEBUG(DBG_FNC_TCP_INPUT
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
3003 } else if (th
->th_ack
== tp
->snd_una
&& LIST_EMPTY(&tp
->t_segq
) &&
3004 tlen
<= tcp_sbspace(tp
)) {
3006 * this is a pure, in-sequence data packet
3007 * with nothing on the reassembly queue and
3008 * we have enough buffer space to take it.
3011 /* Clean receiver SACK report if present */
3012 if (SACK_ENABLED(tp
) && tp
->rcv_numsacks
) {
3013 tcp_clean_sackreport(tp
);
3015 ++tcpstat
.tcps_preddat
;
3016 tp
->rcv_nxt
+= tlen
;
3018 * Pull snd_wl1 up to prevent seq wrap relative to
3021 tp
->snd_wl1
= th
->th_seq
;
3023 * Pull rcv_up up to prevent seq wrap relative to
3026 tp
->rcv_up
= tp
->rcv_nxt
;
3027 TCP_INC_VAR(tcpstat
.tcps_rcvpack
, segment_count
);
3028 tcpstat
.tcps_rcvbyte
+= tlen
;
3029 if (nstat_collect
) {
3030 INP_ADD_STAT(inp
, cell
, wifi
, wired
,
3032 INP_ADD_STAT(inp
, cell
, wifi
, wired
, rxbytes
,
3034 inp_set_activity_bitmap(inp
);
3038 * Calculate the RTT on the receiver only if the
3039 * connection is in streaming mode and the last
3040 * packet was not an end-of-write
3042 if (tp
->t_flags
& TF_STREAMING_ON
) {
3043 tcp_compute_rtt(tp
, &to
, th
);
3046 tcp_sbrcv_grow(tp
, &so
->so_rcv
, &to
, tlen
);
3049 * Add data to socket buffer.
3051 so_recv_data_stat(so
, m
, 0);
3052 m_adj(m
, drop_hdrlen
); /* delayed header drop */
3055 * If message delivery (SOF_ENABLE_MSGS) is enabled on
3056 * this socket, deliver the packet received as an
3057 * in-order message with sequence number attached to it.
3060 memcpy(&saved_hdr
, ip6
, sizeof(struct ip6_hdr
));
3061 ip6
= (struct ip6_hdr
*)&saved_hdr
[0];
3063 memcpy(&saved_hdr
, ip
, ip
->ip_hl
<< 2);
3064 ip
= (struct ip
*)&saved_hdr
[0];
3066 memcpy(&saved_tcphdr
, th
, sizeof(struct tcphdr
));
3068 if (th
->th_flags
& TH_PUSH
) {
3069 tp
->t_flagsext
|= TF_LAST_IS_PSH
;
3071 tp
->t_flagsext
&= ~TF_LAST_IS_PSH
;
3074 if (sbappendstream_rcvdemux(so
, m
)) {
3075 mptcp_handle_input(so
);
3081 KERNEL_DEBUG(DBG_LAYER_END
, ((th
->th_dport
<< 16) | th
->th_sport
),
3082 (((ip6
->ip6_src
.s6_addr16
[0]) << 16) | (ip6
->ip6_dst
.s6_addr16
[0])),
3083 th
->th_seq
, th
->th_ack
, th
->th_win
);
3085 KERNEL_DEBUG(DBG_LAYER_END
, ((th
->th_dport
<< 16) | th
->th_sport
),
3086 (((ip
->ip_src
.s_addr
& 0xffff) << 16) | (ip
->ip_dst
.s_addr
& 0xffff)),
3087 th
->th_seq
, th
->th_ack
, th
->th_win
);
3089 TCP_INC_VAR(tp
->t_unacksegs
, segment_count
);
3090 if (DELAY_ACK(tp
, th
)) {
3091 if ((tp
->t_flags
& TF_DELACK
) == 0) {
3092 tp
->t_flags
|= TF_DELACK
;
3093 tp
->t_timer
[TCPT_DELACK
] = OFFSET_FROM_START(tp
, tcp_delack
);
3096 tp
->t_flags
|= TF_ACKNOW
;
3100 tcp_adaptive_rwtimo_check(tp
, tlen
);
3103 tcp_tfo_rcv_data(tp
);
3106 tcp_check_timer_state(tp
);
3108 tcp_handle_wakeup(so
, read_wakeup
, write_wakeup
);
3110 socket_unlock(so
, 1);
3111 KERNEL_DEBUG(DBG_FNC_TCP_INPUT
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
3117 * Calculate amount of space in receive window,
3118 * and then do TCP input processing.
3119 * Receive window is amount of space in rcv queue,
3120 * but not less than advertised window.
3122 socket_lock_assert_owned(so
);
3123 win
= tcp_sbspace(tp
);
3126 } else { /* clip rcv window to 4K for modems */
3127 if (tp
->t_flags
& TF_SLOWLINK
&& slowlink_wsize
> 0) {
3128 win
= min(win
, slowlink_wsize
);
3131 tp
->rcv_wnd
= imax(win
, (int)(tp
->rcv_adv
- tp
->rcv_nxt
));
3134 * Ensure that the subflow receive window isn't greater
3135 * than the connection level receive window.
3137 if ((tp
->t_mpflags
& TMPF_MPTCP_TRUE
) && (mp_tp
= tptomptp(tp
))) {
3138 socket_lock_assert_owned(mptetoso(mp_tp
->mpt_mpte
));
3139 int64_t recwin_conn
= (int64_t)(mp_tp
->mpt_rcvadv
- mp_tp
->mpt_rcvnxt
);
3141 VERIFY(recwin_conn
< INT32_MAX
&& recwin_conn
> INT32_MIN
);
3142 if (recwin_conn
> 0 && tp
->rcv_wnd
> (uint32_t)recwin_conn
) {
3143 tp
->rcv_wnd
= (uint32_t)recwin_conn
;
3144 tcpstat
.tcps_mp_reducedwin
++;
3149 switch (tp
->t_state
) {
3151 * Initialize tp->rcv_nxt, and tp->irs, select an initial
3152 * tp->iss, and send a segment:
3153 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3154 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
3155 * Fill in remote peer address fields if not previously specified.
3156 * Enter SYN_RECEIVED state, and process any other fields of this
3157 * segment in this state.
3160 struct sockaddr_in
*sin
;
3161 struct sockaddr_in6
*sin6
;
3163 socket_lock_assert_owned(so
);
3165 /* Clear the logging flags inherited from the listening socket */
3166 tp
->t_log_flags
= 0;
3167 tp
->t_flagsext
&= ~TF_LOGGED_CONN_SUMMARY
;
3170 MALLOC(sin6
, struct sockaddr_in6
*, sizeof *sin6
,
3171 M_SONAME
, M_NOWAIT
);
3173 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "LISTEN malloc M_SONAME failed");
3176 bzero(sin6
, sizeof(*sin6
));
3177 sin6
->sin6_family
= AF_INET6
;
3178 sin6
->sin6_len
= sizeof(*sin6
);
3179 sin6
->sin6_addr
= ip6
->ip6_src
;
3180 sin6
->sin6_port
= th
->th_sport
;
3181 laddr6
= inp
->in6p_laddr
;
3182 if (IN6_IS_ADDR_UNSPECIFIED(&inp
->in6p_laddr
)) {
3183 inp
->in6p_laddr
= ip6
->ip6_dst
;
3185 if (in6_pcbconnect(inp
, (struct sockaddr
*)sin6
,
3187 inp
->in6p_laddr
= laddr6
;
3188 FREE(sin6
, M_SONAME
);
3189 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, " LISTEN in6_pcbconnect failed");
3192 FREE(sin6
, M_SONAME
);
3194 socket_lock_assert_owned(so
);
3195 MALLOC(sin
, struct sockaddr_in
*, sizeof *sin
, M_SONAME
,
3198 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "LISTEN malloc M_SONAME failed");
3201 sin
->sin_family
= AF_INET
;
3202 sin
->sin_len
= sizeof(*sin
);
3203 sin
->sin_addr
= ip
->ip_src
;
3204 sin
->sin_port
= th
->th_sport
;
3205 bzero((caddr_t
)sin
->sin_zero
, sizeof(sin
->sin_zero
));
3206 laddr
= inp
->inp_laddr
;
3207 if (inp
->inp_laddr
.s_addr
== INADDR_ANY
) {
3208 inp
->inp_laddr
= ip
->ip_dst
;
3210 if (in_pcbconnect(inp
, (struct sockaddr
*)sin
, kernel_proc
,
3211 IFSCOPE_NONE
, NULL
)) {
3212 inp
->inp_laddr
= laddr
;
3213 FREE(sin
, M_SONAME
);
3214 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, " LISTEN in_pcbconnect failed");
3217 FREE(sin
, M_SONAME
);
3220 tcp_dooptions(tp
, optp
, optlen
, th
, &to
);
3221 tcp_finalize_options(tp
, &to
, ifscope
);
3223 if (tfo_enabled(tp
) && tcp_tfo_syn(tp
, &to
)) {
3230 tp
->iss
= tcp_new_isn(tp
);
3232 tp
->irs
= th
->th_seq
;
3233 tcp_sendseqinit(tp
);
3235 tp
->snd_recover
= tp
->snd_una
;
3237 * Initialization of the tcpcb for transaction;
3238 * set SND.WND = SEG.WND,
3239 * initialize CCsend and CCrecv.
3241 tp
->snd_wnd
= tiwin
; /* initial send-window */
3242 tp
->max_sndwnd
= tp
->snd_wnd
;
3243 tp
->t_flags
|= TF_ACKNOW
;
3244 tp
->t_unacksegs
= 0;
3245 DTRACE_TCP4(state__change
, void, NULL
, struct inpcb
*, inp
,
3246 struct tcpcb
*, tp
, int32_t, TCPS_SYN_RECEIVED
);
3247 tp
->t_state
= TCPS_SYN_RECEIVED
;
3248 tp
->t_timer
[TCPT_KEEP
] = OFFSET_FROM_START(tp
,
3249 TCP_CONN_KEEPINIT(tp
));
3250 tp
->t_connect_time
= tcp_now
;
3251 dropsocket
= 0; /* committed to socket */
3253 if (inp
->inp_flowhash
== 0) {
3254 inp
->inp_flowhash
= inp_calc_flowhash(inp
);
3256 /* update flowinfo - RFC 6437 */
3257 if (inp
->inp_flow
== 0 &&
3258 inp
->in6p_flags
& IN6P_AUTOFLOWLABEL
) {
3259 inp
->inp_flow
&= ~IPV6_FLOWLABEL_MASK
;
3261 (htonl(inp
->inp_flowhash
) & IPV6_FLOWLABEL_MASK
);
3264 /* reset the incomp processing flag */
3265 so
->so_flags
&= ~(SOF_INCOMP_INPROGRESS
);
3266 tcpstat
.tcps_accepts
++;
3267 if ((thflags
& (TH_ECE
| TH_CWR
)) == (TH_ECE
| TH_CWR
)) {
3269 tp
->ecn_flags
|= (TE_SETUPRECEIVED
| TE_SENDIPECT
);
3273 * The address and connection state are finalized
3275 TCP_LOG_CONNECT(tp
, false, 0);
3277 tcp_add_fsw_flow(tp
, ifp
);
3283 * If the state is SYN_RECEIVED and the seg contains an ACK,
3284 * but not for our SYN/ACK, send a RST.
3286 case TCPS_SYN_RECEIVED
:
3287 if ((thflags
& TH_ACK
) &&
3288 (SEQ_LEQ(th
->th_ack
, tp
->snd_una
) ||
3289 SEQ_GT(th
->th_ack
, tp
->snd_max
))) {
3290 IF_TCP_STATINC(ifp
, ooopacket
);
3291 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN_RECEIVED bad ACK");
3296 * In SYN_RECEIVED state, if we recv some SYNS with
3297 * window scale and others without, window scaling should
3298 * be disabled. Otherwise the window advertised will be
3299 * lower if we assume scaling and the other end does not.
3301 if ((thflags
& TH_SYN
) &&
3302 (tp
->irs
== th
->th_seq
) &&
3303 !(to
.to_flags
& TOF_SCALE
)) {
3304 tp
->t_flags
&= ~TF_RCVD_SCALE
;
3309 * If the state is SYN_SENT:
3310 * if seg contains an ACK, but not for our SYN, drop the input.
3311 * if seg contains a RST, then drop the connection.
3312 * if seg does not contain SYN, then drop it.
3313 * Otherwise this is an acceptable SYN segment
3314 * initialize tp->rcv_nxt and tp->irs
3315 * if seg contains ack then advance tp->snd_una
3316 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
3317 * arrange for segment to be acked (eventually)
3318 * continue processing rest of data/controls, beginning with URG
3321 if ((thflags
& TH_ACK
) &&
3322 (SEQ_LEQ(th
->th_ack
, tp
->iss
) ||
3323 SEQ_GT(th
->th_ack
, tp
->snd_max
))) {
3324 IF_TCP_STATINC(ifp
, ooopacket
);
3325 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN_SENT bad ACK");
3328 if (thflags
& TH_RST
) {
3329 if ((thflags
& TH_ACK
) != 0) {
3330 if (tfo_enabled(tp
) &&
3331 !(tp
->t_flagsext
& TF_FASTOPEN_FORCE_ENABLE
)) {
3332 tcp_heuristic_tfo_rst(tp
);
3334 if ((tp
->ecn_flags
& (TE_SETUPSENT
| TE_RCVD_SYN_RST
)) == TE_SETUPSENT
) {
3336 * On local connections, send
3337 * non-ECN syn one time before
3338 * dropping the connection
3340 if (tp
->t_flags
& TF_LOCAL
) {
3341 tp
->ecn_flags
|= TE_RCVD_SYN_RST
;
3344 tcp_heuristic_ecn_synrst(tp
);
3348 (SO_FILT_HINT_LOCKED
|
3349 SO_FILT_HINT_CONNRESET
));
3350 tp
= tcp_drop(tp
, ECONNREFUSED
);
3352 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN_SENT got RST");
3355 if ((thflags
& TH_SYN
) == 0) {
3356 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN_SENT no SYN");
3359 tp
->snd_wnd
= th
->th_win
; /* initial send window */
3360 tp
->max_sndwnd
= tp
->snd_wnd
;
3362 tp
->irs
= th
->th_seq
;
3364 if (thflags
& TH_ACK
) {
3365 tcpstat
.tcps_connects
++;
3367 if ((thflags
& (TH_ECE
| TH_CWR
)) == (TH_ECE
)) {
3368 /* ECN-setup SYN-ACK */
3369 tp
->ecn_flags
|= TE_SETUPRECEIVED
;
3370 if (TCP_ECN_ENABLED(tp
)) {
3371 tcp_heuristic_ecn_success(tp
);
3372 tcpstat
.tcps_ecn_client_success
++;
3375 if (tp
->ecn_flags
& TE_SETUPSENT
&&
3376 tp
->t_rxtshift
== 0) {
3377 tcp_heuristic_ecn_success(tp
);
3378 tcpstat
.tcps_ecn_not_supported
++;
3380 if (tp
->ecn_flags
& TE_SETUPSENT
&&
3381 tp
->t_rxtshift
> 0) {
3382 tcp_heuristic_ecn_loss(tp
);
3385 /* non-ECN-setup SYN-ACK */
3386 tp
->ecn_flags
&= ~TE_SENDIPECT
;
3389 /* Do window scaling on this connection? */
3390 if (TCP_WINDOW_SCALE_ENABLED(tp
)) {
3391 tp
->snd_scale
= tp
->requested_s_scale
;
3392 tp
->rcv_scale
= tp
->request_r_scale
;
3395 tp
->rcv_adv
+= min(tp
->rcv_wnd
, TCP_MAXWIN
<< tp
->rcv_scale
);
3396 tp
->snd_una
++; /* SYN is acked */
3397 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
)) {
3398 tp
->snd_nxt
= tp
->snd_una
;
3402 * We have sent more in the SYN than what is being
3403 * acked. (e.g., TFO)
3404 * We should restart the sending from what the receiver
3405 * has acknowledged immediately.
3407 if (SEQ_GT(tp
->snd_nxt
, th
->th_ack
)) {
3409 * rdar://problem/33214601
3410 * There is a middlebox that acks all but one
3411 * byte and still drops the data.
3413 if (!(tp
->t_flagsext
& TF_FASTOPEN_FORCE_ENABLE
) &&
3414 (tp
->t_tfo_stats
& TFO_S_SYN_DATA_SENT
) &&
3415 tp
->snd_max
== th
->th_ack
+ 1 &&
3416 tp
->snd_max
> tp
->snd_una
+ 1) {
3417 tcp_heuristic_tfo_middlebox(tp
);
3419 so
->so_error
= ENODATA
;
3421 (SO_FILT_HINT_LOCKED
| SO_FILT_HINT_MP_SUB_ERROR
));
3423 tp
->t_tfo_stats
|= TFO_S_ONE_BYTE_PROXY
;
3426 tp
->snd_max
= tp
->snd_nxt
= th
->th_ack
;
3430 * If there's data, delay ACK; if there's also a FIN
3431 * ACKNOW will be turned on later.
3433 TCP_INC_VAR(tp
->t_unacksegs
, segment_count
);
3434 if (DELAY_ACK(tp
, th
) && tlen
!= 0) {
3435 if ((tp
->t_flags
& TF_DELACK
) == 0) {
3436 tp
->t_flags
|= TF_DELACK
;
3437 tp
->t_timer
[TCPT_DELACK
] = OFFSET_FROM_START(tp
, tcp_delack
);
3440 tp
->t_flags
|= TF_ACKNOW
;
3443 * Received <SYN,ACK> in SYN_SENT[*] state.
3445 * SYN_SENT --> ESTABLISHED
3446 * SYN_SENT* --> FIN_WAIT_1
3448 tp
->t_starttime
= tcp_now
;
3449 tcp_sbrcv_tstmp_check(tp
);
3450 if (tp
->t_flags
& TF_NEEDFIN
) {
3451 DTRACE_TCP4(state__change
, void, NULL
,
3452 struct inpcb
*, inp
,
3453 struct tcpcb
*, tp
, int32_t,
3455 tp
->t_state
= TCPS_FIN_WAIT_1
;
3456 tp
->t_flags
&= ~TF_NEEDFIN
;
3459 TCP_LOG_CONNECTION_SUMMARY(tp
);
3461 DTRACE_TCP4(state__change
, void, NULL
,
3462 struct inpcb
*, inp
, struct tcpcb
*,
3463 tp
, int32_t, TCPS_ESTABLISHED
);
3464 tp
->t_state
= TCPS_ESTABLISHED
;
3465 tp
->t_timer
[TCPT_KEEP
] =
3466 OFFSET_FROM_START(tp
,
3467 TCP_CONN_KEEPIDLE(tp
));
3468 if (nstat_collect
) {
3469 nstat_route_connect_success(
3470 inp
->inp_route
.ro_rt
);
3473 * The SYN is acknowledged but una is not
3474 * updated yet. So pass the value of
3475 * ack to compute sndbytes correctly
3477 inp_count_sndbytes(inp
, th
->th_ack
);
3479 tp
->t_forced_acks
= TCP_FORCED_ACKS_COUNT
;
3482 * Do not send the connect notification for additional
3483 * subflows until ACK for 3-way handshake arrives.
3485 if ((!(tp
->t_mpflags
& TMPF_MPTCP_TRUE
)) &&
3486 (tp
->t_mpflags
& TMPF_SENT_JOIN
)) {
3487 isconnected
= FALSE
;
3492 if ((tp
->t_tfo_flags
& (TFO_F_COOKIE_REQ
| TFO_F_COOKIE_SENT
)) ||
3493 (tp
->t_tfo_stats
& TFO_S_SYN_DATA_SENT
)) {
3494 tcp_tfo_synack(tp
, &to
);
3496 if ((tp
->t_tfo_stats
& TFO_S_SYN_DATA_SENT
) &&
3497 SEQ_LT(tp
->snd_una
, th
->th_ack
)) {
3498 tp
->t_tfo_stats
|= TFO_S_SYN_DATA_ACKED
;
3499 tcpstat
.tcps_tfo_syn_data_acked
++;
3501 if (so
->so_flags
& SOF_MP_SUBFLOW
) {
3502 so
->so_flags1
|= SOF1_TFO_REWIND
;
3505 tcp_tfo_rcv_probe(tp
, tlen
);
3510 * Received initial SYN in SYN-SENT[*] state => simul-
3512 * Do 3-way handshake:
3513 * SYN-SENT -> SYN-RECEIVED
3514 * SYN-SENT* -> SYN-RECEIVED*
3516 tp
->t_flags
|= TF_ACKNOW
;
3517 tp
->t_timer
[TCPT_REXMT
] = 0;
3518 DTRACE_TCP4(state__change
, void, NULL
, struct inpcb
*, inp
,
3519 struct tcpcb
*, tp
, int32_t, TCPS_SYN_RECEIVED
);
3520 tp
->t_state
= TCPS_SYN_RECEIVED
;
3523 * During simultaneous open, TFO should not be used.
3524 * So, we disable it here, to prevent that data gets
3525 * sent on the SYN/ACK.
3527 tcp_disable_tfo(tp
);
3532 * Advance th->th_seq to correspond to first data byte.
3533 * If data, trim to stay within window,
3534 * dropping FIN if necessary.
3537 if (tlen
> tp
->rcv_wnd
) {
3538 todrop
= tlen
- tp
->rcv_wnd
;
3542 tcpstat
.tcps_rcvpackafterwin
++;
3543 tcpstat
.tcps_rcvbyteafterwin
+= todrop
;
3545 tp
->snd_wl1
= th
->th_seq
- 1;
3546 tp
->rcv_up
= th
->th_seq
;
3548 * Client side of transaction: already sent SYN and data.
3549 * If the remote host used T/TCP to validate the SYN,
3550 * our data will be ACK'd; if so, enter normal data segment
3551 * processing in the middle of step 5, ack processing.
3552 * Otherwise, goto step 6.
3554 if (thflags
& TH_ACK
) {
3559 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3560 * do normal processing.
3562 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
3566 case TCPS_TIME_WAIT
:
3567 break; /* continue normal processing */
3569 /* Received a SYN while connection is already established.
3570 * This is a "half open connection and other anomalies" described
3571 * in RFC793 page 34, send an ACK so the remote reset the connection
3572 * or recovers by adjusting its sequence numbering. Sending an ACK is
3573 * in accordance with RFC 5961 Section 4.2
3575 case TCPS_ESTABLISHED
:
3576 if (thflags
& TH_SYN
&& tlen
<= 0) {
3577 /* Drop the packet silently if we have reached the limit */
3578 if (tcp_is_ack_ratelimited(tp
)) {
3579 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "ESTABLISHED rfc5961 rate limited");
3582 /* Send challenge ACK */
3583 tcpstat
.tcps_synchallenge
++;
3584 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "ESTABLISHED rfc5961 challenge ACK");
	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check the RST flag and sequence number since reset segments
	 * are exempt from the timestamp and connection count tests.  This
	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
	 * below which allowed reset segments in half the sequence space
	 * to fall through and be processed (which gives forged reset
	 * segments with a random sequence number a 50 percent chance of
	 * killing a connection).
	 * Then check timestamp, if present.
	 * Then check the connection count, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 */

	/*
	 * If the RST bit is set, check the sequence number to see
	 * if this is a valid reset segment.
	 * RFC 793 page 37:
	 *   In all states except SYN-SENT, all reset (RST) segments
	 *   are validated by checking their SEQ-fields.  A reset is
	 *   valid if its sequence number is in the window.
	 * Note: this does not take into account delayed ACKs, so
	 *   we should test against last_ack_sent instead of rcv_nxt.
	 *   The sequence number in the reset segment is normally an
	 *   echo of our outgoing acknowledgement numbers, but some hosts
	 *   send a reset with the sequence number at the rightmost edge
	 *   of our receive window, and we have to handle this case.
	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
	 *   that brute force RST attacks are possible.  To combat this,
	 *   we use a much stricter check while in the ESTABLISHED state,
	 *   only accepting RSTs where the sequence number is equal to
	 *   last_ack_sent.  In all other states (the states in which a
	 *   RST is more likely), the more permissive check is used.
	 * RFC 5961 Section 3.2: if the RST bit is set, sequence # is
	 *   within the receive window and last_ack_sent == seq,
	 *   then reset the connection.  Otherwise if the seq doesn't
	 *   match last_ack_sent, TCP must send challenge ACK.  Perform
	 *   rate limitation when sending the challenge ACK.
	 * If we have multiple segments in flight, the initial reset
	 * segment sequence numbers will be to the left of last_ack_sent,
	 * but they will eventually catch up.
	 * In any case, it never made sense to trim reset segments to
	 * fit the receive window since RFC 1122 says:
	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
	 *
	 *     A TCP SHOULD allow a received RST segment to include data.
	 *
	 *     DISCUSSION
	 *          It has been suggested that a RST segment could contain
	 *          ASCII text that encoded and explained the cause of the
	 *          RST.  No standard has yet been established for such
	 *          data.
	 *
	 * If the reset segment passes the sequence number test examine
	 * the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK STATES:
	 *	Close the tcb.
	 *    TIME_WAIT STATE:
	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
	 *	RFC 1337.
	 *
	 *      Radar 4803931: Allows for the case where we ACKed the FIN but
	 *                     there is already a RST in flight from the peer.
	 *                     In that case, accept the RST for non-established
	 *                     state if it's one off from last_ack_sent.
	 */
3664 if (thflags
& TH_RST
) {
3665 if ((SEQ_GEQ(th
->th_seq
, tp
->last_ack_sent
) &&
3666 SEQ_LT(th
->th_seq
, tp
->last_ack_sent
+ tp
->rcv_wnd
)) ||
3667 (tp
->rcv_wnd
== 0 &&
3668 ((tp
->last_ack_sent
== th
->th_seq
) ||
3669 ((tp
->last_ack_sent
- 1) == th
->th_seq
)))) {
3670 if (tp
->last_ack_sent
== th
->th_seq
) {
3671 switch (tp
->t_state
) {
3672 case TCPS_SYN_RECEIVED
:
3673 IF_TCP_STATINC(ifp
, rstinsynrcv
);
3674 so
->so_error
= ECONNREFUSED
;
3677 case TCPS_ESTABLISHED
:
3678 if (TCP_ECN_ENABLED(tp
) &&
3679 tp
->snd_una
== tp
->iss
+ 1 &&
3680 SEQ_GT(tp
->snd_max
, tp
->snd_una
)) {
3682 * If the first data packet on an
3683 * ECN connection, receives a RST
3684 * increment the heuristic
3686 tcp_heuristic_ecn_droprst(tp
);
3689 case TCPS_FIN_WAIT_1
:
3690 case TCPS_CLOSE_WAIT
:
3691 case TCPS_FIN_WAIT_2
:
3692 so
->so_error
= ECONNRESET
;
3695 (SO_FILT_HINT_LOCKED
|
3696 SO_FILT_HINT_CONNRESET
));
3698 tcpstat
.tcps_drops
++;
3707 case TCPS_TIME_WAIT
:
3711 tcpstat
.tcps_badrst
++;
3712 /* Drop if we have reached the ACK limit */
3713 if (tcp_is_ack_ratelimited(tp
)) {
3714 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "ESTABLISHED rfc5961 rate limited");
3717 /* Send challenge ACK */
3718 tcpstat
.tcps_rstchallenge
++;
3719 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "ESTABLISHED rfc5961 challenge ACK");
3728 * RFC 1323 PAWS: If we have a timestamp reply on this segment
3729 * and it's less than ts_recent, drop it.
3731 if ((to
.to_flags
& TOF_TS
) != 0 && tp
->ts_recent
&&
3732 TSTMP_LT(to
.to_tsval
, tp
->ts_recent
)) {
3733 /* Check to see if ts_recent is over 24 days old. */
3734 if ((int)(tcp_now
- tp
->ts_recent_age
) > TCP_PAWS_IDLE
) {
3736 * Invalidate ts_recent. If this segment updates
3737 * ts_recent, the age will be reset later and ts_recent
3738 * will get a valid value. If it does not, setting
3739 * ts_recent to zero will at least satisfy the
3740 * requirement that zero be placed in the timestamp
3741 * echo reply when ts_recent isn't valid. The
3742 * age isn't reset until we get a valid ts_recent
3743 * because we don't want out-of-order segments to be
3744 * dropped when ts_recent is old.
3748 tcpstat
.tcps_rcvduppack
++;
3749 tcpstat
.tcps_rcvdupbyte
+= tlen
;
3751 tcpstat
.tcps_pawsdrop
++;
3754 * PAWS-drop when ECN is being used? That indicates
3755 * that ECT-marked packets take a different path, with
3756 * different congestion-characteristics.
3758 * Only fallback when we did send less than 2GB as PAWS
3759 * really has no reason to kick in earlier.
3761 if (TCP_ECN_ENABLED(tp
) &&
3762 inp
->inp_stat
->rxbytes
< 2147483648) {
3763 INP_INC_IFNET_STAT(inp
, ecn_fallback_reorder
);
3764 tcpstat
.tcps_ecn_fallback_reorder
++;
3765 tcp_heuristic_ecn_aggressive(tp
);
3768 if (nstat_collect
) {
3769 nstat_route_rx(tp
->t_inpcb
->inp_route
.ro_rt
,
3770 1, tlen
, NSTAT_RX_FLAG_DUPLICATE
);
3771 INP_ADD_STAT(inp
, cell
, wifi
, wired
,
3773 INP_ADD_STAT(inp
, cell
, wifi
, wired
,
3775 tp
->t_stat
.rxduplicatebytes
+= tlen
;
3776 inp_set_activity_bitmap(inp
);
3786 * In the SYN-RECEIVED state, validate that the packet belongs to
3787 * this connection before trimming the data to fit the receive
3788 * window. Check the sequence number versus IRS since we know
3789 * the sequence numbers haven't wrapped. This is a partial fix
3790 * for the "LAND" DoS attack.
3792 if (tp
->t_state
== TCPS_SYN_RECEIVED
&& SEQ_LT(th
->th_seq
, tp
->irs
)) {
3793 IF_TCP_STATINC(ifp
, dospacket
);
3794 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SYN_RECEIVED bad SEQ");
3799 * Check if there is old data at the beginning of the window
3800 * i.e. the sequence number is before rcv_nxt
3802 todrop
= tp
->rcv_nxt
- th
->th_seq
;
3804 boolean_t is_syn_set
= FALSE
;
3806 if (thflags
& TH_SYN
) {
3810 if (th
->th_urp
> 1) {
3818 * Following if statement from Stevens, vol. 2, p. 960.
3819 * The amount of duplicate data is greater than or equal
3820 * to the size of the segment - entire segment is duplicate
3823 || (todrop
== tlen
&& (thflags
& TH_FIN
) == 0)) {
3825 * Any valid FIN must be to the left of the window.
3826 * At this point the FIN must be a duplicate or out
3827 * of sequence; drop it.
3832 * Send an ACK to resynchronize and drop any data.
3833 * But keep on processing for RST or ACK.
3835 * If the SYN bit was originally set, then only send
3836 * an ACK if we are not rate-limiting this connection.
3839 if (!tcp_is_ack_ratelimited(tp
)) {
3840 tcpstat
.tcps_synchallenge
++;
3841 tp
->t_flags
|= TF_ACKNOW
;
3844 tp
->t_flags
|= TF_ACKNOW
;
3848 /* This could be a keepalive */
3849 soevent(so
, SO_FILT_HINT_LOCKED
|
3850 SO_FILT_HINT_KEEPALIVE
);
3853 tcpstat
.tcps_rcvduppack
++;
3854 tcpstat
.tcps_rcvdupbyte
+= todrop
;
3856 tcpstat
.tcps_rcvpartduppack
++;
3857 tcpstat
.tcps_rcvpartdupbyte
+= todrop
;
3862 * Note the duplicate data sequence space so that
3863 * it can be reported in DSACK option.
3865 tp
->t_dsack_lseq
= th
->th_seq
;
3866 tp
->t_dsack_rseq
= th
->th_seq
+ todrop
;
3867 tp
->t_flags
|= TF_ACKNOW
;
3869 if (nstat_collect
) {
3870 nstat_route_rx(tp
->t_inpcb
->inp_route
.ro_rt
, 1,
3871 todrop
, NSTAT_RX_FLAG_DUPLICATE
);
3872 INP_ADD_STAT(inp
, cell
, wifi
, wired
, rxpackets
, 1);
3873 INP_ADD_STAT(inp
, cell
, wifi
, wired
, rxbytes
, todrop
);
3874 tp
->t_stat
.rxduplicatebytes
+= todrop
;
3875 inp_set_activity_bitmap(inp
);
3877 drop_hdrlen
+= todrop
; /* drop from the top afterwards */
3878 th
->th_seq
+= todrop
;
3880 if (th
->th_urp
> todrop
) {
3881 th
->th_urp
-= todrop
;
3889 * If new data are received on a connection after the user
3890 * processes are gone, then RST the other end.
3891 * Send also a RST when we received a data segment after we've
3892 * sent our FIN when the socket is defunct.
3893 * Note that an MPTCP subflow socket would have SS_NOFDREF set
3894 * by default. So, if it's an MPTCP-subflow we rather check the
3895 * MPTCP-level's socket state for SS_NOFDREF.
3898 boolean_t close_it
= FALSE
;
3900 if (!(so
->so_flags
& SOF_MP_SUBFLOW
) && (so
->so_state
& SS_NOFDREF
) &&
3901 tp
->t_state
> TCPS_CLOSE_WAIT
) {
3902 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SS_NOFDREF");
3906 if ((so
->so_flags
& SOF_MP_SUBFLOW
) && (mptetoso(tptomptp(tp
)->mpt_mpte
)->so_state
& SS_NOFDREF
) &&
3907 tp
->t_state
> TCPS_CLOSE_WAIT
) {
3908 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SOF_MP_SUBFLOW SS_NOFDREF");
3912 if ((so
->so_flags
& SOF_DEFUNCT
) && tp
->t_state
> TCPS_FIN_WAIT_1
) {
3913 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "SOF_DEFUNCT");
3919 tcpstat
.tcps_rcvafterclose
++;
3920 IF_TCP_STATINC(ifp
, cleanup
);
3926 * If segment ends after window, drop trailing data
3927 * (and PUSH and FIN); if nothing left, just ACK.
3929 todrop
= (th
->th_seq
+ tlen
) - (tp
->rcv_nxt
+ tp
->rcv_wnd
);
3931 tcpstat
.tcps_rcvpackafterwin
++;
3932 if (todrop
>= tlen
) {
3933 tcpstat
.tcps_rcvbyteafterwin
+= tlen
;
3935 * If a new connection request is received
3936 * while in TIME_WAIT, drop the old connection
3937 * and start over if the sequence numbers
3938 * are above the previous ones.
3940 if (thflags
& TH_SYN
&&
3941 tp
->t_state
== TCPS_TIME_WAIT
&&
3942 SEQ_GT(th
->th_seq
, tp
->rcv_nxt
)) {
3943 iss
= tcp_new_isn(tp
);
3945 socket_unlock(so
, 1);
3949 * If window is closed can only take segments at
3950 * window edge, and have to drop data and PUSH from
3951 * incoming segments. Continue processing, but
3952 * remember to ack. Otherwise, drop segment
3955 if (tp
->rcv_wnd
== 0 && th
->th_seq
== tp
->rcv_nxt
) {
3956 tp
->t_flags
|= TF_ACKNOW
;
3957 tcpstat
.tcps_rcvwinprobe
++;
3962 tcpstat
.tcps_rcvbyteafterwin
+= todrop
;
3966 thflags
&= ~(TH_PUSH
| TH_FIN
);
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record its timestamp.
	 * NOTE:
	 * 1) That the test incorporates suggestions from the latest
	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 * 2) That updating only on newer timestamps interferes with
	 *    our earlier PAWS tests, so this check should be solely
	 *    predicated on the sequence space of this segment.
	 * 3) That we modify the segment boundary check to be
	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
	 *    instead of RFC1323's
	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
	 *    This modified check allows us to overcome RFC1323's
	 *    limitations as described in Stevens TCP/IP Illustrated
	 *    Vol. 2 p.869. In such cases, we can still calculate the
	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
	 */
	if ((to.to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = to.to_tsval;
	}
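	/*
	 * For example, a pure ACK (tlen == 0, no SYN/FIN) only refreshes
	 * ts_recent when th_seq == last_ack_sent, i.e. when it sits exactly
	 * at the left edge of the window; the "<=" boundary above is what
	 * admits that case, which RFC 1323's strict "<" check would reject.
	 */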
	/*
	 * Stevens: If a SYN is in the window, then this is an
	 * error and we send an RST and drop the connection.
	 *
	 * RFC 5961 Section 4.2
	 * Send challenge ACK for any SYN in synchronized state
	 * Perform rate limitation in doing so.
	 */
	if (thflags & TH_SYN) {
		if (!tcp_syn_data_valid(tp, th, tlen)) {
			tcpstat.tcps_badsyn++;
			/* Drop if we have reached ACK limit */
			if (tcp_is_ack_ratelimited(tp)) {
				TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN rate limited");
				goto drop;
			} else {
				/* Send challenge ACK */
				tcpstat.tcps_synchallenge++;
				TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN challenge ack");
				goto dropafterack;
			}
		} else {
			/*
			 * Received SYN (/ACK) with data.
			 * Move sequence number along to process the data.
			 */
			th->th_seq++;
			thflags &= ~TH_SYN;
		}
	}

	/*
	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
	 * flag is on (half-synchronized state), then queue data for
	 * later processing; else drop segment and return.
	 */
	if ((thflags & TH_ACK) == 0) {
		if (tp->t_state == TCPS_SYN_RECEIVED) {
			if ((tfo_enabled(tp))) {
				/*
				 * So, we received a valid segment while in
				 * SYN-RECEIVED.
				 * As this cannot be an RST (see that if a bit
				 * higher), and it does not have the ACK-flag
				 * set, we want to retransmit the SYN/ACK.
				 * Thus, we have to reset snd_nxt to snd_una to
				 * trigger the going back to sending of the
				 * SYN/ACK. This is more consistent with the
				 * behavior of tcp_output(), which expects
				 * to send the segment that is pointed to by
				 * snd_nxt.
				 */
				tp->snd_nxt = tp->snd_una;

				/*
				 * We need to make absolutely sure that we are
				 * going to reply upon a duplicate SYN-segment.
				 */
				if (th->th_flags & TH_SYN) {
					needoutput = 1;
				}
			}

			goto step6;
		} else if (tp->t_flags & TF_ACKNOW) {
			TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
			goto dropafterack;
		} else {
			TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
			goto drop;
		}
	}
4072 switch (tp
->t_state
) {
4074 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
4075 * ESTABLISHED state and continue processing.
4076 * The ACK was checked above.
4078 case TCPS_SYN_RECEIVED
:
4080 tcpstat
.tcps_connects
++;
4082 /* Do window scaling? */
4083 if (TCP_WINDOW_SCALE_ENABLED(tp
)) {
4084 tp
->snd_scale
= tp
->requested_s_scale
;
4085 tp
->rcv_scale
= tp
->request_r_scale
;
4086 tp
->snd_wnd
= th
->th_win
<< tp
->snd_scale
;
4087 tp
->max_sndwnd
= tp
->snd_wnd
;
4088 tiwin
= tp
->snd_wnd
;
4092 * SYN-RECEIVED -> ESTABLISHED
4093 * SYN-RECEIVED* -> FIN-WAIT-1
4095 tp
->t_starttime
= tcp_now
;
4096 tcp_sbrcv_tstmp_check(tp
);
4097 if (tp
->t_flags
& TF_NEEDFIN
) {
4098 DTRACE_TCP4(state__change
, void, NULL
,
4099 struct inpcb
*, inp
,
4100 struct tcpcb
*, tp
, int32_t, TCPS_FIN_WAIT_1
);
4101 tp
->t_state
= TCPS_FIN_WAIT_1
;
4102 tp
->t_flags
&= ~TF_NEEDFIN
;
4104 TCP_LOG_CONNECTION_SUMMARY(tp
);
4106 DTRACE_TCP4(state__change
, void, NULL
,
4107 struct inpcb
*, inp
,
4108 struct tcpcb
*, tp
, int32_t, TCPS_ESTABLISHED
);
4109 tp
->t_state
= TCPS_ESTABLISHED
;
4110 tp
->t_timer
[TCPT_KEEP
] = OFFSET_FROM_START(tp
,
4111 TCP_CONN_KEEPIDLE(tp
));
4112 if (nstat_collect
) {
4113 nstat_route_connect_success(
4114 tp
->t_inpcb
->inp_route
.ro_rt
);
4118 * The SYN is acknowledged but una is not updated
4119 * yet. So pass the value of ack to compute
4120 * sndbytes correctly
4122 inp_count_sndbytes(inp
, th
->th_ack
);
4124 tp
->t_forced_acks
= TCP_FORCED_ACKS_COUNT
;
4126 * If segment contains data or ACK, will call tcp_reass()
4127 * later; if not, do so now to pass queued data to user.
4129 if (tlen
== 0 && (thflags
& TH_FIN
) == 0) {
4131 memcpy(&saved_hdr
, ip6
, sizeof(struct ip6_hdr
));
4132 ip6
= (struct ip6_hdr
*)&saved_hdr
[0];
4134 memcpy(&saved_hdr
, ip
, ip
->ip_hl
<< 2);
4135 ip
= (struct ip
*)&saved_hdr
[0];
4137 memcpy(&saved_tcphdr
, th
, sizeof(struct tcphdr
));
4138 (void) tcp_reass(tp
, (struct tcphdr
*)0, &tlen
,
4139 NULL
, ifp
, &read_wakeup
);
4142 tp
->snd_wl1
= th
->th_seq
- 1;
4146 * Do not send the connect notification for additional subflows
4147 * until ACK for 3-way handshake arrives.
4149 if ((!(tp
->t_mpflags
& TMPF_MPTCP_TRUE
)) &&
4150 (tp
->t_mpflags
& TMPF_SENT_JOIN
)) {
4151 isconnected
= FALSE
;
4155 if ((tp
->t_tfo_flags
& TFO_F_COOKIE_VALID
)) {
4156 /* Done this when receiving the SYN */
4157 isconnected
= FALSE
;
4159 OSDecrementAtomic(&tcp_tfo_halfcnt
);
4161 /* Panic if something has gone terribly wrong. */
4162 VERIFY(tcp_tfo_halfcnt
>= 0);
4164 tp
->t_tfo_flags
&= ~TFO_F_COOKIE_VALID
;
4168 * In case there is data in the send-queue (e.g., TFO is being
4169 * used, or connectx+data has been done), then if we would
4170 * "FALLTHROUGH", we would handle this ACK as if data has been
4171 * acknowledged. But, we have to prevent this. And this
4172 * can be prevented by increasing snd_una by 1, so that the
4173 * SYN is not considered as data (snd_una++ is actually also
4174 * done in SYN_SENT-state as part of the regular TCP stack).
4176 * In case there is data on this ack as well, the data will be
4177 * handled by the label "dodata" right after step6.
4179 if (so
->so_snd
.sb_cc
) {
4180 tp
->snd_una
++; /* SYN is acked */
4181 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
)) {
4182 tp
->snd_nxt
= tp
->snd_una
;
4186 * No duplicate-ACK handling is needed. So, we
4187 * directly advance to processing the ACK (aka,
4188 * updating the RTT estimation,...)
4190 * But, we first need to handle eventual SACKs,
4191 * because TFO will start sending data with the
4192 * SYN/ACK, so it might be that the client
4193 * includes a SACK with its ACK.
4195 if (SACK_ENABLED(tp
) &&
4196 (to
.to_nsacks
> 0 || !TAILQ_EMPTY(&tp
->snd_holes
))) {
4197 tcp_sack_doack(tp
, &to
, th
, &sack_bytes_acked
, &sack_bytes_newly_acked
);
4206 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
4207 * ACKs. If the ack is in the range
4208 * tp->snd_una < th->th_ack <= tp->snd_max
4209 * then advance tp->snd_una to th->th_ack and drop
4210 * data from the retransmission queue. If this ACK reflects
4211 * more up to date window information we update our window information.
4213 case TCPS_ESTABLISHED
:
4214 case TCPS_FIN_WAIT_1
:
4215 case TCPS_FIN_WAIT_2
:
4216 case TCPS_CLOSE_WAIT
:
4219 case TCPS_TIME_WAIT
:
4220 if (SEQ_GT(th
->th_ack
, tp
->snd_max
)) {
4221 tcpstat
.tcps_rcvacktoomuch
++;
4222 if (tcp_is_ack_ratelimited(tp
)) {
4223 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "rfc5961 rcvacktoomuch");
4229 if (SEQ_LT(th
->th_ack
, tp
->snd_una
- tp
->max_sndwnd
)) {
4230 if (tcp_is_ack_ratelimited(tp
)) {
4231 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "rfc5961 bad ACK");
4237 if (SACK_ENABLED(tp
) && to
.to_nsacks
> 0) {
4238 recvd_dsack
= tcp_sack_process_dsack(tp
, &to
, th
);
4240 * If DSACK is received and this packet has no
4241 * other SACK information, it can be dropped.
4242 * We do not want to treat it as a duplicate ack.
4245 SEQ_LEQ(th
->th_ack
, tp
->snd_una
) &&
4246 to
.to_nsacks
== 0) {
4247 tcp_bad_rexmt_check(tp
, th
, &to
);
4252 if (SACK_ENABLED(tp
) &&
4253 (to
.to_nsacks
> 0 || !TAILQ_EMPTY(&tp
->snd_holes
))) {
4254 tcp_sack_doack(tp
, &to
, th
, &sack_bytes_acked
, &sack_bytes_newly_acked
);
4258 if (tp
->t_mpuna
&& SEQ_GEQ(th
->th_ack
, tp
->t_mpuna
)) {
4259 if (tp
->t_mpflags
& TMPF_PREESTABLISHED
) {
4260 /* MP TCP establishment succeeded */
4262 if (tp
->t_mpflags
& TMPF_JOINED_FLOW
) {
4263 if (tp
->t_mpflags
& TMPF_SENT_JOIN
) {
4265 ~TMPF_PREESTABLISHED
;
4269 tp
->t_timer
[TCPT_JACK_RXMT
] = 0;
4270 tp
->t_mprxtshift
= 0;
4273 isconnected
= FALSE
;
4282 tcp_tfo_rcv_ack(tp
, th
);
4285 * If we have outstanding data (other than
4286 * a window probe), this is a completely
4287 * duplicate ack and the ack is the biggest we've seen.
4289 * Need to accommodate a change in window on duplicate acks
4290 * to allow operating systems that update window during
4291 * recovery with SACK
4293 if (SEQ_LEQ(th
->th_ack
, tp
->snd_una
)) {
4294 if (tlen
== 0 && (tiwin
== tp
->snd_wnd
||
4295 (to
.to_nsacks
> 0 && sack_bytes_acked
> 0))) {
4296 uint32_t old_dupacks
;
4298 * If both ends send FIN at the same time,
4299 * then the ack will be a duplicate ack
4300 * but we have to process the FIN. Check
4301 * for this condition and process the FIN
4302 * instead of the dupack
4304 if ((thflags
& TH_FIN
) &&
4305 !TCPS_HAVERCVDFIN(tp
->t_state
)) {
4309 old_dupacks
= tp
->t_dupacks
;
4312 * MPTCP options that are ignored must
4313 * not be treated as duplicate ACKs.
4315 if (to
.to_flags
& TOF_MPTCP
) {
4319 if ((isconnected
) && (tp
->t_mpflags
& TMPF_JOINED_FLOW
)) {
4320 mptcplog((LOG_DEBUG
, "MPTCP "
4321 "Sockets: bypass ack recovery\n"),
4323 MPTCP_LOGLVL_VERBOSE
);
4328 * If a duplicate acknowledgement was seen
4329 * after ECN, it indicates packet loss in
4330 * addition to ECN. Reset INRECOVERY flag
4331 * so that we can process partial acks
4334 if (tp
->ecn_flags
& TE_INRECOVERY
) {
4335 tp
->ecn_flags
&= ~TE_INRECOVERY
;
4338 tcpstat
.tcps_rcvdupack
++;
4339 if (SACK_ENABLED(tp
) && tcp_do_better_lr
) {
4340 tp
->t_dupacks
+= max(1, sack_bytes_acked
/ tp
->t_maxseg
);
4345 tp
->sackhint
.sack_bytes_acked
+= sack_bytes_acked
;
4347 if (SACK_ENABLED(tp
) && tcp_do_better_lr
) {
4348 tp
->t_new_dupacks
+= (sack_bytes_newly_acked
/ tp
->t_maxseg
);
4350 if (tp
->t_new_dupacks
>= tp
->t_rexmtthresh
&& IN_FASTRECOVERY(tp
)) {
4351 /* Let's restart the retransmission */
4352 tcp_sack_lost_rexmit(tp
);
4355 * If the current tcp cc module has
4356 * defined a hook for tasks to run
4357 * before entering FR, call it
4359 if (CC_ALGO(tp
)->pre_fr
!= NULL
) {
4360 CC_ALGO(tp
)->pre_fr(tp
);
4363 ENTER_FASTRECOVERY(tp
);
4365 if (tp
->t_flags
& TF_SENTFIN
) {
4366 tp
->snd_recover
= tp
->snd_max
- 1;
4368 tp
->snd_recover
= tp
->snd_max
;
4372 if (TCP_ECN_ENABLED(tp
)) {
4373 tp
->ecn_flags
|= TE_SENDCWR
;
4376 if (tp
->t_flagsext
& TF_CWND_NONVALIDATED
) {
4377 tcp_cc_adjust_nonvalidated_cwnd(tp
);
4379 tp
->snd_cwnd
= tp
->snd_ssthresh
;
4385 * Check if we need to reset the limit on
4388 if (tp
->t_early_rexmt_count
> 0 &&
4390 (tp
->t_early_rexmt_win
+
4391 TCP_EARLY_REXMT_WIN
))) {
4392 tp
->t_early_rexmt_count
= 0;
4396 * Is early retransmit needed? We check for
4397 * this when the connection is waiting for
4398 * duplicate acks to enter fast recovery.
4400 if (!IN_FASTRECOVERY(tp
)) {
4401 tcp_early_rexmt_check(tp
, th
);
4405 * If we've seen exactly rexmt threshold
4406 * of duplicate acks, assume a packet
4407 * has been dropped and retransmit it.
4408 * Kludge snd_nxt & the congestion
4409 * window so we send only this one
4412 * We know we're losing at the current
4413 * window size so do congestion avoidance
4414 * (set ssthresh to half the current window
4415 * and pull our congestion window back to
4416 * the new ssthresh).
4418 * Dup acks mean that packets have left the
4419 * network (they're now cached at the receiver)
4420 * so bump cwnd by the amount in the receiver
4421 * to keep a constant cwnd packets in the
4424 if (tp
->t_timer
[TCPT_REXMT
] == 0 ||
4425 (th
->th_ack
!= tp
->snd_una
&& sack_bytes_acked
== 0)) {
4427 tp
->t_rexmtthresh
= tcprexmtthresh
;
4428 tp
->t_new_dupacks
= 0;
4429 } else if ((tp
->t_dupacks
> tp
->t_rexmtthresh
&& (!tcp_do_better_lr
|| old_dupacks
>= tp
->t_rexmtthresh
)) ||
4430 IN_FASTRECOVERY(tp
)) {
4432 * If this connection was seeing packet
4433 * reordering, then recovery might be
4434 * delayed to disambiguate between
4435 * reordering and loss
4437 if (SACK_ENABLED(tp
) && !IN_FASTRECOVERY(tp
) &&
4439 (TF_PKTS_REORDERED
| TF_DELAY_RECOVERY
)) ==
4440 (TF_PKTS_REORDERED
| TF_DELAY_RECOVERY
)) {
4442 * Since the SACK information is already
4443 * updated, this ACK will be dropped
4449 * Dup acks mean that packets have left the
4450 * network (they're now cached at the receiver)
4451 * so bump cwnd by the amount in the receiver
4452 * to keep a constant cwnd packets in the
4455 if (SACK_ENABLED(tp
) && IN_FASTRECOVERY(tp
)) {
4459 * Compute the amount of data in flight first.
4460 * We can inject new data into the pipe iff
4461 * we have less than snd_ssthres worth of data in
4464 awnd
= (tp
->snd_nxt
- tp
->snd_fack
) + tp
->sackhint
.sack_bytes_rexmit
;
4465 if (awnd
< tp
->snd_ssthresh
) {
4466 tp
->snd_cwnd
+= tp
->t_maxseg
;
4467 if (tp
->snd_cwnd
> tp
->snd_ssthresh
) {
4468 tp
->snd_cwnd
= tp
->snd_ssthresh
;
4472 tp
->snd_cwnd
+= tp
->t_maxseg
;
4475 /* Process any window updates */
4476 if (tiwin
> tp
->snd_wnd
) {
4477 tcp_update_window(tp
, thflags
,
4480 tcp_ccdbg_trace(tp
, th
,
4481 TCP_CC_IN_FASTRECOVERY
);
4483 (void) tcp_output(tp
);
4486 } else if ((!tcp_do_better_lr
&& tp
->t_dupacks
== tp
->t_rexmtthresh
) ||
4487 (tcp_do_better_lr
&& tp
->t_dupacks
>= tp
->t_rexmtthresh
)) {
4488 tcp_seq onxt
= tp
->snd_nxt
;
4491 * If we're doing sack, check to
4492 * see if we're already in sack
4493 * recovery. If we're not doing sack,
4494 * check to see if we're in newreno
4497 if (SACK_ENABLED(tp
)) {
4498 if (IN_FASTRECOVERY(tp
)) {
4501 } else if (tp
->t_flagsext
& TF_DELAY_RECOVERY
) {
4505 if (SEQ_LEQ(th
->th_ack
, tp
->snd_recover
)) {
4510 if (tp
->t_flags
& TF_SENTFIN
) {
4511 tp
->snd_recover
= tp
->snd_max
- 1;
4513 tp
->snd_recover
= tp
->snd_max
;
4515 tp
->t_timer
[TCPT_PTO
] = 0;
4519 * If the connection has seen pkt
4520 * reordering, delay recovery until
4521 * it is clear that the packet
4524 if (SACK_ENABLED(tp
) &&
4526 (TF_PKTS_REORDERED
| TF_DELAY_RECOVERY
))
4527 == TF_PKTS_REORDERED
&&
4528 !IN_FASTRECOVERY(tp
) &&
4529 tp
->t_reorderwin
> 0 &&
4530 (tp
->t_state
== TCPS_ESTABLISHED
||
4531 tp
->t_state
== TCPS_FIN_WAIT_1
)) {
4532 tp
->t_timer
[TCPT_DELAYFR
] =
4533 OFFSET_FROM_START(tp
,
4535 tp
->t_flagsext
|= TF_DELAY_RECOVERY
;
4536 tcpstat
.tcps_delay_recovery
++;
4537 tcp_ccdbg_trace(tp
, th
,
4538 TCP_CC_DELAY_FASTRECOVERY
);
4542 tcp_rexmt_save_state(tp
);
4544 * If the current tcp cc module has
4545 * defined a hook for tasks to run
4546 * before entering FR, call it
4548 if (CC_ALGO(tp
)->pre_fr
!= NULL
) {
4549 CC_ALGO(tp
)->pre_fr(tp
);
4551 ENTER_FASTRECOVERY(tp
);
4552 tp
->t_timer
[TCPT_REXMT
] = 0;
4553 if (TCP_ECN_ENABLED(tp
)) {
4554 tp
->ecn_flags
|= TE_SENDCWR
;
4557 if (SACK_ENABLED(tp
)) {
4558 tcpstat
.tcps_sack_recovery_episode
++;
4559 tp
->t_sack_recovery_episode
++;
4560 tp
->sack_newdata
= tp
->snd_nxt
;
4561 if (tcp_do_better_lr
) {
4562 tp
->snd_cwnd
= tp
->snd_ssthresh
;
4564 tp
->snd_cwnd
= tp
->t_maxseg
;
4566 tp
->t_flagsext
&= ~TF_CWND_NONVALIDATED
;
4568 /* Process any window updates */
4569 if (tiwin
> tp
->snd_wnd
) {
4570 tcp_update_window(tp
, thflags
, th
, tiwin
, tlen
);
4573 tcp_ccdbg_trace(tp
, th
, TCP_CC_ENTER_FASTRECOVERY
);
4574 (void) tcp_output(tp
);
4577 tp
->snd_nxt
= th
->th_ack
;
4578 tp
->snd_cwnd
= tp
->t_maxseg
;
4580 /* Process any window updates */
4581 if (tiwin
> tp
->snd_wnd
) {
4582 tcp_update_window(tp
, thflags
, th
, tiwin
, tlen
);
4585 (void) tcp_output(tp
);
4586 if (tp
->t_flagsext
& TF_CWND_NONVALIDATED
) {
4587 tcp_cc_adjust_nonvalidated_cwnd(tp
);
4589 tp
->snd_cwnd
= tp
->snd_ssthresh
+ tp
->t_maxseg
* tp
->t_dupacks
;
4591 if (SEQ_GT(onxt
, tp
->snd_nxt
)) {
4595 tcp_ccdbg_trace(tp
, th
, TCP_CC_ENTER_FASTRECOVERY
);
4597 } else if (ALLOW_LIMITED_TRANSMIT(tp
) &&
4598 (!(SACK_ENABLED(tp
)) || sack_bytes_acked
> 0) &&
4599 (so
->so_snd
.sb_cc
- (tp
->snd_max
- tp
->snd_una
)) > 0) {
4600 u_int32_t incr
= (tp
->t_maxseg
* tp
->t_dupacks
);
4602 /* Use Limited Transmit algorithm on the first two
4603 * duplicate acks when there is new data to transmit
4605 tp
->snd_cwnd
+= incr
;
4606 tcpstat
.tcps_limited_txt
++;
4607 (void) tcp_output(tp
);
4609 tcp_ccdbg_trace(tp
, th
, TCP_CC_LIMITED_TRANSMIT
);
4611 /* Reset snd_cwnd back to normal */
4612 tp
->snd_cwnd
-= incr
;
4618 * If the congestion window was inflated to account
4619 * for the other side's cached packets, retract it.
4621 if (IN_FASTRECOVERY(tp
)) {
4622 if (SEQ_LT(th
->th_ack
, tp
->snd_recover
)) {
4624 * If we received an ECE and entered
4625 * recovery, the subsequent ACKs should
4626 * not be treated as partial acks.
4628 if (tp
->ecn_flags
& TE_INRECOVERY
) {
4632 if (SACK_ENABLED(tp
)) {
4633 tcp_sack_partialack(tp
, th
);
4635 tcp_newreno_partial_ack(tp
, th
);
4637 tcp_ccdbg_trace(tp
, th
, TCP_CC_PARTIAL_ACK
);
4639 if (tcp_cubic_minor_fixes
) {
4642 EXIT_FASTRECOVERY(tp
);
4643 if (CC_ALGO(tp
)->post_fr
!= NULL
) {
4644 CC_ALGO(tp
)->post_fr(tp
, th
);
4647 tcp_clear_pipeack_state(tp
);
4648 tcp_ccdbg_trace(tp
, th
,
4649 TCP_CC_EXIT_FASTRECOVERY
);
4651 } else if ((tp
->t_flagsext
&
4652 (TF_PKTS_REORDERED
| TF_DELAY_RECOVERY
))
4653 == (TF_PKTS_REORDERED
| TF_DELAY_RECOVERY
)) {
4655 * If the ack acknowledges upto snd_recover or if
4656 * it acknowledges all the snd holes, exit
4657 * recovery and cancel the timer. Otherwise,
4658 * this is a partial ack. Wait for recovery timer
4659 * to enter recovery. The snd_holes have already
4662 if (SEQ_GEQ(th
->th_ack
, tp
->snd_recover
) ||
4663 TAILQ_EMPTY(&tp
->snd_holes
)) {
4664 tp
->t_timer
[TCPT_DELAYFR
] = 0;
4665 tp
->t_flagsext
&= ~TF_DELAY_RECOVERY
;
4666 EXIT_FASTRECOVERY(tp
);
4667 tcp_ccdbg_trace(tp
, th
,
4668 TCP_CC_EXIT_FASTRECOVERY
);
4672 * We were not in fast recovery. Reset the
4673 * duplicate ack counter.
4676 tp
->t_rexmtthresh
= tcprexmtthresh
;
4677 tp
->t_new_dupacks
= 0;
4681 VERIFY(SEQ_GEQ(th
->th_ack
, tp
->snd_una
));
4682 acked
= BYTES_ACKED(th
, tp
);
4683 tcpstat
.tcps_rcvackpack
++;
4684 tcpstat
.tcps_rcvackbyte
+= acked
;
4687 * If the last packet was a retransmit, make sure
4688 * it was not spurious.
4690 * This will also take care of congestion window
4691 * adjustment if a last packet was recovered due to a
4694 tcp_bad_rexmt_check(tp
, th
, &to
);
4696 /* Recalculate the RTT */
4697 tcp_compute_rtt(tp
, &to
, th
);
4700 * If all outstanding data is acked, stop retransmit
4701 * timer and remember to restart (more output or persist).
4702 * If there is more data to be acked, restart retransmit
4703 * timer, using current (possibly backed-off) value.
4705 TCP_RESET_REXMT_STATE(tp
);
4706 TCPT_RANGESET(tp
->t_rxtcur
, TCP_REXMTVAL(tp
),
4707 tp
->t_rttmin
, TCPTV_REXMTMAX
,
4708 TCP_ADD_REXMTSLOP(tp
));
4709 if (th
->th_ack
== tp
->snd_max
) {
4710 tp
->t_timer
[TCPT_REXMT
] = 0;
4711 tp
->t_timer
[TCPT_PTO
] = 0;
4713 } else if (tp
->t_timer
[TCPT_PERSIST
] == 0) {
4714 tp
->t_timer
[TCPT_REXMT
] = OFFSET_FROM_START(tp
,
4718 if ((prev_t_state
== TCPS_SYN_SENT
||
4719 prev_t_state
== TCPS_SYN_RECEIVED
) &&
4720 tp
->t_state
== TCPS_ESTABLISHED
) {
4721 TCP_LOG_RTT_INFO(tp
);
4725 * If no data (only SYN) was ACK'd, skip rest of ACK
4733 * When outgoing data has been acked (except the SYN+data), we
4734 * mark this connection as "sending good" for TFO.
4736 if ((tp
->t_tfo_stats
& TFO_S_SYN_DATA_SENT
) &&
4737 !(tp
->t_tfo_flags
& TFO_F_NO_SNDPROBING
) &&
4738 !(th
->th_flags
& TH_SYN
)) {
4739 tp
->t_tfo_flags
|= TFO_F_NO_SNDPROBING
;
4743 * If TH_ECE is received, make sure that ECN is enabled
4744 * on that connection and we have sent ECT on data packets.
4746 if ((thflags
& TH_ECE
) != 0 && TCP_ECN_ENABLED(tp
) &&
4747 (tp
->ecn_flags
& TE_SENDIPECT
)) {
4749 * Reduce the congestion window if we haven't
4752 if (!IN_FASTRECOVERY(tp
)) {
4753 tcp_reduce_congestion_window(tp
);
4754 tp
->ecn_flags
|= (TE_INRECOVERY
| TE_SENDCWR
);
4756 * Also note that the connection received
4759 tp
->ecn_flags
|= TE_RECV_ECN_ECE
;
4760 INP_INC_IFNET_STAT(inp
, ecn_recv_ece
);
4761 tcpstat
.tcps_ecn_recv_ece
++;
4762 tcp_ccdbg_trace(tp
, th
, TCP_CC_ECN_RCVD
);
4767 * When new data is acked, open the congestion window.
4768 * The specifics of how this is achieved are up to the
4769 * congestion control algorithm in use for this connection.
4771 * The calculations in this function assume that snd_una is
4774 if (!IN_FASTRECOVERY(tp
) && !exiting_fr
) {
4775 if (CC_ALGO(tp
)->ack_rcvd
!= NULL
) {
4776 CC_ALGO(tp
)->ack_rcvd(tp
, th
);
4778 tcp_ccdbg_trace(tp
, th
, TCP_CC_ACK_RCVD
);
4780 if (acked
> so
->so_snd
.sb_cc
) {
4781 tp
->snd_wnd
-= so
->so_snd
.sb_cc
;
4782 sbdrop(&so
->so_snd
, (int)so
->so_snd
.sb_cc
);
4785 sbdrop(&so
->so_snd
, acked
);
4786 tcp_sbsnd_trim(&so
->so_snd
);
4787 tp
->snd_wnd
-= acked
;
4790 /* detect una wraparound */
4791 if (!IN_FASTRECOVERY(tp
) &&
4792 SEQ_GT(tp
->snd_una
, tp
->snd_recover
) &&
4793 SEQ_LEQ(th
->th_ack
, tp
->snd_recover
)) {
4794 tp
->snd_recover
= th
->th_ack
- 1;
4797 if (IN_FASTRECOVERY(tp
) &&
4798 SEQ_GEQ(th
->th_ack
, tp
->snd_recover
)) {
4799 EXIT_FASTRECOVERY(tp
);
4802 tcp_update_snd_una(tp
, th
->th_ack
);
4804 if (SACK_ENABLED(tp
)) {
4805 if (SEQ_GT(tp
->snd_una
, tp
->snd_recover
)) {
4806 tp
->snd_recover
= tp
->snd_una
;
4809 if (SEQ_LT(tp
->snd_nxt
, tp
->snd_una
)) {
4810 tp
->snd_nxt
= tp
->snd_una
;
4812 if (!SLIST_EMPTY(&tp
->t_rxt_segments
) &&
4813 !TCP_DSACK_SEQ_IN_WINDOW(tp
, tp
->t_dsack_lastuna
,
4815 tcp_rxtseg_clean(tp
);
4817 if ((tp
->t_flagsext
& TF_MEASURESNDBW
) != 0 &&
4818 tp
->t_bwmeas
!= NULL
) {
4819 tcp_bwmeas_check(tp
);
4824 if (!SLIST_EMPTY(&tp
->t_notify_ack
)) {
4825 tcp_notify_acknowledgement(tp
, so
);
4828 switch (tp
->t_state
) {
4830 * In FIN_WAIT_1 STATE in addition to the processing
4831 * for the ESTABLISHED state if our FIN is now acknowledged
4832 * then enter FIN_WAIT_2.
4834 case TCPS_FIN_WAIT_1
:
4835 if (ourfinisacked
) {
4837 * If we can't receive any more
4838 * data, then closing user can proceed.
4839 * Starting the TCPT_2MSL timer is contrary to the
4840 * specification, but if we don't get a FIN
4841 * we'll hang forever.
4843 if (so
->so_state
& SS_CANTRCVMORE
) {
4844 tp
->t_timer
[TCPT_2MSL
] = OFFSET_FROM_START(tp
,
4845 TCP_CONN_MAXIDLE(tp
));
4846 isconnected
= FALSE
;
4847 isdisconnected
= TRUE
;
4849 DTRACE_TCP4(state__change
, void, NULL
,
4850 struct inpcb
*, inp
,
4852 int32_t, TCPS_FIN_WAIT_2
);
4853 tp
->t_state
= TCPS_FIN_WAIT_2
;
4854 /* fall through and make sure we also recognize
4855 * data ACKed with the FIN
4861 * In CLOSING STATE in addition to the processing for
4862 * the ESTABLISHED state if the ACK acknowledges our FIN
4863 * then enter the TIME-WAIT state, otherwise ignore
4867 if (ourfinisacked
) {
4868 DTRACE_TCP4(state__change
, void, NULL
,
4869 struct inpcb
*, inp
,
4871 int32_t, TCPS_TIME_WAIT
);
4872 tp
->t_state
= TCPS_TIME_WAIT
;
4873 tcp_canceltimers(tp
);
4874 if (tp
->t_flagsext
& TF_NOTIMEWAIT
) {
4875 tp
->t_flags
|= TF_CLOSING
;
4877 add_to_time_wait(tp
, 2 * tcp_msl
);
4879 isconnected
= FALSE
;
4880 isdisconnected
= TRUE
;
4885 * In LAST_ACK, we may still be waiting for data to drain
4886 * and/or to be acked, as well as for the ack of our FIN.
4887 * If our FIN is now acknowledged, delete the TCB,
4888 * enter the closed state and return.
4891 if (ourfinisacked
) {
4898 * In TIME_WAIT state the only thing that should arrive
4899 * is a retransmission of the remote FIN. Acknowledge
4900 * it and restart the finack timer.
4902 case TCPS_TIME_WAIT
:
4903 add_to_time_wait(tp
, 2 * tcp_msl
);
4908 * If there is a SACK option on the ACK and we
4909 * haven't seen any duplicate acks before, count
4910 * it as a duplicate ack even if the cumulative
4911 * ack is advanced. If the receiver delayed an
4912 * ack and detected loss afterwards, then the ack
4913 * will advance cumulative ack and will also have
4914 * a SACK option. So counting it as one duplicate
4917 if (tp
->t_state
== TCPS_ESTABLISHED
&&
4918 SACK_ENABLED(tp
) && sack_bytes_acked
> 0 &&
4919 to
.to_nsacks
> 0 && tp
->t_dupacks
== 0 &&
4920 SEQ_LEQ(th
->th_ack
, tp
->snd_una
) && tlen
== 0 &&
4921 !(tp
->t_flagsext
& TF_PKTS_REORDERED
)) {
4922 tcpstat
.tcps_sack_ackadv
++;
4923 goto process_dupack
;
4929 * Update window information.
4931 if (tcp_update_window(tp
, thflags
, th
, tiwin
, tlen
)) {
4936 * Process segments with URG.
4938 if ((thflags
& TH_URG
) && th
->th_urp
&&
4939 TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
4941 * This is a kludge, but if we receive and accept
4942 * random urgent pointers, we'll crash in
4943 * soreceive. It's hard to imagine someone
4944 * actually wanting to send this much urgent data.
4946 if (th
->th_urp
+ so
->so_rcv
.sb_cc
> sb_max
) {
4947 th
->th_urp
= 0; /* XXX */
4948 thflags
&= ~TH_URG
; /* XXX */
4949 goto dodata
; /* XXX */
4952 * If this segment advances the known urgent pointer,
4953 * then mark the data stream. This should not happen
4954 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4955 * a FIN has been received from the remote side.
4956 * In these states we ignore the URG.
4958 * According to RFC961 (Assigned Protocols),
4959 * the urgent pointer points to the last octet
4960 * of urgent data. We continue, however,
4961 * to consider it to indicate the first octet
4962 * of data past the urgent section as the original
4963 * spec states (in one of two places).
4965 if (SEQ_GT(th
->th_seq
+ th
->th_urp
, tp
->rcv_up
)) {
4966 tp
->rcv_up
= th
->th_seq
+ th
->th_urp
;
4967 so
->so_oobmark
= so
->so_rcv
.sb_cc
+
4968 (tp
->rcv_up
- tp
->rcv_nxt
) - 1;
4969 if (so
->so_oobmark
== 0) {
4970 so
->so_state
|= SS_RCVATMARK
;
4973 tp
->t_oobflags
&= ~(TCPOOB_HAVEDATA
| TCPOOB_HADDATA
);
4976 * Remove out of band data so doesn't get presented to user.
4977 * This can happen independent of advancing the URG pointer,
4978 * but if two URG's are pending at once, some out-of-band
4979 * data may creep in... ick.
4981 if (th
->th_urp
<= (u_int32_t
)tlen
4983 && (so
->so_options
& SO_OOBINLINE
) == 0
4986 tcp_pulloutofband(so
, th
, m
,
4987 drop_hdrlen
); /* hdr drop is delayed */
4991 * If no out of band data is expected,
4992 * pull receive urgent pointer along
4993 * with the receive window.
4995 if (SEQ_GT(tp
->rcv_nxt
, tp
->rcv_up
)) {
4996 tp
->rcv_up
= tp
->rcv_nxt
;
5001 /* Set socket's connect or disconnect state correcly before doing data.
5002 * The following might unlock the socket if there is an upcall or a socket
5007 } else if (isdisconnected
) {
5008 soisdisconnected(so
);
5011 /* Let's check the state of pcb just to make sure that it did not get closed
5012 * when we unlocked above
5014 if (inp
->inp_state
== INPCB_STATE_DEAD
) {
5015 /* Just drop the packet that we are processing and return */
5016 TCP_LOG_DROP_PCB(TCP_LOG_HDR
, th
, tp
, false, "INPCB_STATE_DEAD");
5021 * Process the segment text, merging it into the TCP sequencing queue,
5022 * and arranging for acknowledgment of receipt if necessary.
5023 * This process logically involves adjusting tp->rcv_wnd as data
5024 * is presented to the user (this happens in tcp_usrreq.c,
5025 * case PRU_RCVD). If a FIN has already been received on this
5026 * connection then we just ignore the text.
5028 * If we are in SYN-received state and got a valid TFO cookie, we want
5029 * to process the data.
5031 if ((tlen
|| (thflags
& TH_FIN
)) &&
5032 TCPS_HAVERCVDFIN(tp
->t_state
) == 0 &&
5033 (TCPS_HAVEESTABLISHED(tp
->t_state
) ||
5034 (tp
->t_state
== TCPS_SYN_RECEIVED
&&
5035 (tp
->t_tfo_flags
& TFO_F_COOKIE_VALID
)))) {
5036 tcp_seq save_start
= th
->th_seq
;
5037 tcp_seq save_end
= th
->th_seq
+ tlen
;
5038 m_adj(m
, drop_hdrlen
); /* delayed header drop */
5040 * Insert segment which includes th into TCP reassembly queue
5041 * with control block tp. Set thflags to whether reassembly now
5042 * includes a segment with FIN. This handles the common case
5043 * inline (segment is the next to be received on an established
5044 * connection, and the queue is empty), avoiding linkage into
5045 * and removal from the queue and repetition of various
5047 * Set DELACK for segments received in order, but ack
5048 * immediately when segments are out of order (so
5049 * fast retransmit can work).
5051 if (th
->th_seq
== tp
->rcv_nxt
&& LIST_EMPTY(&tp
->t_segq
)) {
5052 TCP_INC_VAR(tp
->t_unacksegs
, segment_count
);
5054 * Calculate the RTT on the receiver only if the
5055 * connection is in streaming mode and the last
5056 * packet was not an end-of-write
5058 if (tp
->t_flags
& TF_STREAMING_ON
) {
5059 tcp_compute_rtt(tp
, &to
, th
);
5062 if (DELAY_ACK(tp
, th
) &&
5063 ((tp
->t_flags
& TF_ACKNOW
) == 0)) {
5064 if ((tp
->t_flags
& TF_DELACK
) == 0) {
5065 tp
->t_flags
|= TF_DELACK
;
5066 tp
->t_timer
[TCPT_DELACK
] =
5067 OFFSET_FROM_START(tp
, tcp_delack
);
5070 tp
->t_flags
|= TF_ACKNOW
;
5072 tp
->rcv_nxt
+= tlen
;
5073 thflags
= th
->th_flags
& TH_FIN
;
5074 TCP_INC_VAR(tcpstat
.tcps_rcvpack
, segment_count
);
5075 tcpstat
.tcps_rcvbyte
+= tlen
;
5076 if (nstat_collect
) {
5077 INP_ADD_STAT(inp
, cell
, wifi
, wired
,
5079 INP_ADD_STAT(inp
, cell
, wifi
, wired
,
5081 inp_set_activity_bitmap(inp
);
5083 tcp_sbrcv_grow(tp
, &so
->so_rcv
, &to
, tlen
);
5084 so_recv_data_stat(so
, m
, drop_hdrlen
);
5087 memcpy(&saved_hdr
, ip6
, sizeof(struct ip6_hdr
));
5088 ip6
= (struct ip6_hdr
*)&saved_hdr
[0];
5090 memcpy(&saved_hdr
, ip
, ip
->ip_hl
<< 2);
5091 ip
= (struct ip
*)&saved_hdr
[0];
5093 memcpy(&saved_tcphdr
, th
, sizeof(struct tcphdr
));
5095 if (th
->th_flags
& TH_PUSH
) {
5096 tp
->t_flagsext
|= TF_LAST_IS_PSH
;
5098 tp
->t_flagsext
&= ~TF_LAST_IS_PSH
;
5101 if (sbappendstream_rcvdemux(so
, m
)) {
5107 memcpy(&saved_hdr
, ip6
, sizeof(struct ip6_hdr
));
5108 ip6
= (struct ip6_hdr
*)&saved_hdr
[0];
5110 memcpy(&saved_hdr
, ip
, ip
->ip_hl
<< 2);
5111 ip
= (struct ip
*)&saved_hdr
[0];
5114 if (tcp_autotune_reorder
) {
5115 tcp_sbrcv_grow(tp
, &so
->so_rcv
, &to
, tlen
);
5118 memcpy(&saved_tcphdr
, th
, sizeof(struct tcphdr
));
5119 thflags
= tcp_reass(tp
, th
, &tlen
, m
, ifp
, &read_wakeup
);
5121 tp
->t_flags
|= TF_ACKNOW
;
5124 if ((tlen
> 0 || (th
->th_flags
& TH_FIN
)) && SACK_ENABLED(tp
)) {
5125 if (th
->th_flags
& TH_FIN
) {
5128 tcp_update_sack_list(tp
, save_start
, save_end
);
5131 tcp_adaptive_rwtimo_check(tp
, tlen
);
5134 tcp_tfo_rcv_data(tp
);
5137 if (tp
->t_flags
& TF_DELACK
) {
5139 KERNEL_DEBUG(DBG_LAYER_END
, ((th
->th_dport
<< 16) | th
->th_sport
),
5140 (((ip6
->ip6_src
.s6_addr16
[0]) << 16) | (ip6
->ip6_dst
.s6_addr16
[0])),
5141 th
->th_seq
, th
->th_ack
, th
->th_win
);
5143 KERNEL_DEBUG(DBG_LAYER_END
, ((th
->th_dport
<< 16) | th
->th_sport
),
5144 (((ip
->ip_src
.s_addr
& 0xffff) << 16) | (ip
->ip_dst
.s_addr
& 0xffff)),
5145 th
->th_seq
, th
->th_ack
, th
->th_win
);
5149 if ((so
->so_flags
& SOF_MP_SUBFLOW
) && tlen
== 0 &&
5150 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
) &&
5151 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
)) {
5152 m_adj(m
, drop_hdrlen
); /* delayed header drop */
5153 mptcp_input(tptomptp(tp
)->mpt_mpte
, m
);
5154 tp
->t_flags
|= TF_ACKNOW
;
5162 * If FIN is received ACK the FIN and let the user know
5163 * that the connection is closing.
5165 if (thflags
& TH_FIN
) {
5166 if (TCPS_HAVERCVDFIN(tp
->t_state
) == 0) {
5169 * If connection is half-synchronized
5170 * (ie NEEDSYN flag on) then delay ACK,
5171 * so it may be piggybacked when SYN is sent.
5172 * Otherwise, since we received a FIN then no
5173 * more input can be expected, send ACK now.
5175 TCP_INC_VAR(tp
->t_unacksegs
, segment_count
);
5176 tp
->t_flags
|= TF_ACKNOW
;
5179 switch (tp
->t_state
) {
5181 * In SYN_RECEIVED and ESTABLISHED STATES
5182 * enter the CLOSE_WAIT state.
5184 case TCPS_SYN_RECEIVED
:
5185 tp
->t_starttime
= tcp_now
;
5187 case TCPS_ESTABLISHED
:
5188 DTRACE_TCP4(state__change
, void, NULL
, struct inpcb
*, inp
,
5189 struct tcpcb
*, tp
, int32_t, TCPS_CLOSE_WAIT
);
5190 tp
->t_state
= TCPS_CLOSE_WAIT
;
5194 * If still in FIN_WAIT_1 STATE FIN has not been acked so
5195 * enter the CLOSING state.
5197 case TCPS_FIN_WAIT_1
:
5198 DTRACE_TCP4(state__change
, void, NULL
, struct inpcb
*, inp
,
5199 struct tcpcb
*, tp
, int32_t, TCPS_CLOSING
);
5200 tp
->t_state
= TCPS_CLOSING
;
5204 * In FIN_WAIT_2 state enter the TIME_WAIT state,
5205 * starting the time-wait timer, turning off the other
5208 case TCPS_FIN_WAIT_2
:
5209 DTRACE_TCP4(state__change
, void, NULL
,
5210 struct inpcb
*, inp
,
5212 int32_t, TCPS_TIME_WAIT
);
5213 tp
->t_state
= TCPS_TIME_WAIT
;
5214 tcp_canceltimers(tp
);
5215 tp
->t_flags
|= TF_ACKNOW
;
5216 if (tp
->t_flagsext
& TF_NOTIMEWAIT
) {
5217 tp
->t_flags
|= TF_CLOSING
;
5219 add_to_time_wait(tp
, 2 * tcp_msl
);
5221 soisdisconnected(so
);
5225 * In TIME_WAIT state restart the 2 MSL time_wait timer.
5227 case TCPS_TIME_WAIT
:
5228 add_to_time_wait(tp
, 2 * tcp_msl
);
5233 if (so
->so_options
& SO_DEBUG
) {
5234 tcp_trace(TA_INPUT
, ostate
, tp
, (void *)tcp_saveipgen
,
5240 mptcp_handle_input(so
);
5244 * Return any desired output.
5246 if (needoutput
|| (tp
->t_flags
& TF_ACKNOW
)) {
5247 (void) tcp_output(tp
);
5250 tcp_check_timer_state(tp
);
5252 tcp_handle_wakeup(so
, read_wakeup
, write_wakeup
);
5254 socket_unlock(so
, 1);
5255 KERNEL_DEBUG(DBG_FNC_TCP_INPUT
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
5260 * Generate an ACK dropping incoming segment if it occupies
5261 * sequence space, where the ACK reflects our state.
5263 * We can now skip the test for the RST flag since all
5264 * paths to this code happen after packets containing
5265 * RST have been dropped.
5267 * In the SYN-RECEIVED state, don't send an ACK unless the
5268 * segment we received passes the SYN-RECEIVED ACK test.
5269 * If it fails send a RST. This breaks the loop in the
5270 * "LAND" DoS attack, and also prevents an ACK storm
5271 * between two listening ports that have been sent forged
5272 * SYN segments, each with the source address of the other.
5274 if (tp
->t_state
== TCPS_SYN_RECEIVED
&& (thflags
& TH_ACK
) &&
5275 (SEQ_GT(tp
->snd_una
, th
->th_ack
) ||
5276 SEQ_GT(th
->th_ack
, tp
->snd_max
))) {
5277 IF_TCP_STATINC(ifp
, dospacket
);
5281 if (so
->so_options
& SO_DEBUG
) {
5282 tcp_trace(TA_DROP
, ostate
, tp
, (void *)tcp_saveipgen
,
5287 tp
->t_flags
|= TF_ACKNOW
;
5289 (void) tcp_output(tp
);
5291 tcp_handle_wakeup(so
, read_wakeup
, write_wakeup
);
5293 /* Don't need to check timer state as we should have done it during tcp_output */
5294 socket_unlock(so
, 1);
5295 KERNEL_DEBUG(DBG_FNC_TCP_INPUT
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
5297 dropwithresetnosock
:
5301 * Generate a RST, dropping incoming segment.
5302 * Make ACK acceptable to originator of segment.
5303 * Don't bother to respond if destination was broadcast/multicast.
5305 if ((thflags
& TH_RST
) || m
->m_flags
& (M_BCAST
| M_MCAST
)) {
5309 if (IN6_IS_ADDR_MULTICAST(&ip6
->ip6_dst
) ||
5310 IN6_IS_ADDR_MULTICAST(&ip6
->ip6_src
)) {
5313 } else if (IN_MULTICAST(ntohl(ip
->ip_dst
.s_addr
)) ||
5314 IN_MULTICAST(ntohl(ip
->ip_src
.s_addr
)) ||
5315 ip
->ip_src
.s_addr
== htonl(INADDR_BROADCAST
) ||
5316 in_broadcast(ip
->ip_dst
, m
->m_pkthdr
.rcvif
)) {
5319 /* IPv6 anycast check is done at tcp6_input() */
5322 if (tp
== 0 || (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
)) {
5323 tcp_trace(TA_DROP
, ostate
, tp
, (void *)tcp_saveipgen
,
5327 bzero(&tra
, sizeof(tra
));
5328 tra
.ifscope
= ifscope
;
5329 tra
.awdl_unrestricted
= 1;
5330 tra
.intcoproc_allowed
= 1;
5331 if (thflags
& TH_ACK
) {
5332 /* mtod() below is safe as long as hdr dropping is delayed */
5333 tcp_respond(tp
, mtod(m
, void *), th
, m
, (tcp_seq
)0, th
->th_ack
,
5336 if (thflags
& TH_SYN
) {
5339 /* mtod() below is safe as long as hdr dropping is delayed */
5340 tcp_respond(tp
, mtod(m
, void *), th
, m
, th
->th_seq
+ tlen
,
5341 (tcp_seq
)0, TH_RST
| TH_ACK
, &tra
);
5343 /* destroy temporarily created socket */
5346 socket_unlock(so
, 1);
5347 } else if ((inp
!= NULL
) && (nosock
== 0)) {
5348 tcp_handle_wakeup(so
, read_wakeup
, write_wakeup
);
5350 socket_unlock(so
, 1);
5352 KERNEL_DEBUG(DBG_FNC_TCP_INPUT
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
5358 * Drop space held by incoming segment and return.
5361 if (tp
== 0 || (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
)) {
5362 tcp_trace(TA_DROP
, ostate
, tp
, (void *)tcp_saveipgen
,
5367 /* destroy temporarily created socket */
5370 socket_unlock(so
, 1);
5371 } else if (nosock
== 0) {
5372 tcp_handle_wakeup(so
, read_wakeup
, write_wakeup
);
5374 socket_unlock(so
, 1);
5376 KERNEL_DEBUG(DBG_FNC_TCP_INPUT
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
/*
 * Parse TCP options and place in tcpopt.
 */
static void
tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
    struct tcpopt *to)
{
	u_short mss = 0;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL) {
			break;
		}
		if (opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			if (cnt < 2) {
				break;
			}
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt) {
				break;
			}
		}
		switch (opt) {
		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG) {
				continue;
			}
			if (!(th->th_flags & TH_SYN)) {
				continue;
			}
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			to->to_mss = mss;
			to->to_flags |= TOF_MSS;
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW) {
				continue;
			}
			if (!(th->th_flags & TH_SYN)) {
				continue;
			}
			to->to_flags |= TOF_SCALE;
			to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP) {
				continue;
			}
			to->to_flags |= TOF_TS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_tsval, sizeof(to->to_tsval));
			NTOHL(to->to_tsval);
			bcopy((char *)cp + 6,
			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
			NTOHL(to->to_tsecr);
			/* Re-enable sending Timestamps if we received them */
			if (!(tp->t_flags & TF_REQ_TSTMP)) {
				tp->t_flags |= TF_REQ_TSTMP;
			}
			break;

		case TCPOPT_SACK_PERMITTED:
			if (optlen != TCPOLEN_SACK_PERMITTED) {
				continue;
			}
			if (th->th_flags & TH_SYN) {
				to->to_flags |= TOF_SACK;
			}
			break;

		case TCPOPT_SACK:
			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) {
				continue;
			}
			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
			to->to_sacks = cp + 2;
			tcpstat.tcps_sack_rcv_blocks++;
			break;

		case TCPOPT_FASTOPEN:
			if (optlen == TCPOLEN_FASTOPEN_REQ) {
				if (tp->t_state != TCPS_LISTEN) {
					continue;
				}
				to->to_flags |= TOF_TFOREQ;
			} else {
				if (optlen < TCPOLEN_FASTOPEN_REQ ||
				    (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX ||
				    (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN) {
					continue;
				}
				if (tp->t_state != TCPS_LISTEN &&
				    tp->t_state != TCPS_SYN_SENT) {
					continue;
				}
				to->to_flags |= TOF_TFO;
				to->to_tfo = cp + 1;
			}
			break;

		case TCPOPT_MULTIPATH:
			tcp_do_mptcp_options(tp, cp, th, to, optlen);
			break;
		}
	}
}
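/*
 * Illustrative note on tcp_dooptions(): a received SACK option (kind 5)
 * with an option length of 10 yields to_nsacks == (10 - 2) / TCPOLEN_SACK
 * == 1, i.e. one pair of 32-bit left/right sequence edges starting at
 * cp + 2; malformed lengths are skipped by the modulo check above.
 */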
static void
tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope)
{
	if (to->to_flags & TOF_TS) {
		tp->t_flags |= TF_RCVD_TSTMP;
		tp->ts_recent = to->to_tsval;
		tp->ts_recent_age = tcp_now;
	}
	if (to->to_flags & TOF_MSS) {
		tcp_mss(tp, to->to_mss, ifscope);
	}
	if (SACK_ENABLED(tp)) {
		if (!(to->to_flags & TOF_SACK)) {
			tp->t_flagsext &= ~(TF_SACK_ENABLE);
		} else {
			tp->t_flags |= TF_SACK_PERMIT;
		}
	}
	if (to->to_flags & TOF_SCALE) {
		tp->t_flags |= TF_RCVD_SCALE;
		tp->requested_s_scale = to->to_requested_s_scale;

		/* Re-enable window scaling, if the option is received */
		if (tp->request_r_scale > 0) {
			tp->t_flags |= TF_REQ_SCALE;
		}
	}
}
/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * @param off delayed-to-be-dropped hdrlen
 */
static void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off)
{
	int cnt = off + th->th_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_oobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			bcopy(cp + 1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			if (m->m_flags & M_PKTHDR) {
				m->m_pkthdr.len--;
			}
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == NULL) {
			break;
		}
	}
	panic("tcp_pulloutofband");
}

u_int32_t
get_base_rtt(struct tcpcb *tp)
{
	struct rtentry *rt = tp->t_inpcb->inp_route.ro_rt;

	return (rt == NULL) ? 0 : rt->rtt_min;
}
/* Each value of RTT base represents the minimum RTT seen in a minute.
 * We keep up to N_RTT_BASE minutes worth of history.
 */
void
update_base_rtt(struct tcpcb *tp, uint32_t rtt)
{
	u_int32_t base_rtt, i;
	struct rtentry *rt;

	if ((rt = tp->t_inpcb->inp_route.ro_rt) == NULL) {
		return;
	}
	if (rt->rtt_expire_ts == 0) {
		RT_LOCK_SPIN(rt);
		if (rt->rtt_expire_ts != 0) {
			RT_UNLOCK(rt);
			goto update;
		}
		rt->rtt_expire_ts = tcp_now;
		rt->rtt_index = 0;
		rt->rtt_hist[0] = rtt;
		rt->rtt_min = rtt;
		RT_UNLOCK(rt);
		return;
	}
update:
#if TRAFFIC_MGT
	/*
	 * If the recv side is being throttled, check if the
	 * current RTT is closer to the base RTT seen in
	 * first (recent) two slots. If so, unthrottle the stream.
	 */
	if ((tp->t_flagsext & TF_RECV_THROTTLE) &&
	    (int)(tcp_now - tp->t_recv_throttle_ts) >= TCP_RECV_THROTTLE_WIN) {
		base_rtt = rt->rtt_min;
		if (tp->t_rttcur <= (base_rtt + target_qdelay)) {
			tp->t_flagsext &= ~TF_RECV_THROTTLE;
			tp->t_recv_throttle_ts = 0;
		}
	}
#endif /* TRAFFIC_MGT */
	if ((int)(tcp_now - rt->rtt_expire_ts) >=
	    TCP_RTT_HISTORY_EXPIRE_TIME) {
		RT_LOCK_SPIN(rt);
		/* check the condition again to avoid race */
		if ((int)(tcp_now - rt->rtt_expire_ts) >=
		    TCP_RTT_HISTORY_EXPIRE_TIME) {
			rt->rtt_index++;
			if (rt->rtt_index >= NRTT_HIST) {
				rt->rtt_index = 0;
			}
			rt->rtt_hist[rt->rtt_index] = rtt;
			rt->rtt_expire_ts = tcp_now;
		} else {
			rt->rtt_hist[rt->rtt_index] =
			    min(rt->rtt_hist[rt->rtt_index], rtt);
		}
		/* forget the old value and update minimum */
		rt->rtt_min = 0;
		for (i = 0; i < NRTT_HIST; ++i) {
			if (rt->rtt_hist[i] != 0 &&
			    (rt->rtt_min == 0 ||
			    rt->rtt_hist[i] < rt->rtt_min)) {
				rt->rtt_min = rt->rtt_hist[i];
			}
		}
		RT_UNLOCK(rt);
	} else {
		rt->rtt_hist[rt->rtt_index] =
		    min(rt->rtt_hist[rt->rtt_index], rtt);
		if (rt->rtt_min == 0) {
			rt->rtt_min = rtt;
		} else {
			rt->rtt_min = min(rt->rtt_min, rtt);
		}
	}
}
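/*
 * Illustrative behaviour of the history above: rt->rtt_min is recomputed
 * from the per-interval minima, so a transient RTT spike only raises the
 * slot for the current interval and ages out of rtt_min once that slot is
 * overwritten, roughly NRTT_HIST history intervals later.
 */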
/*
 * If we have a timestamp reply, update smoothed RTT. If no timestamp is
 * present but transmit timer is running and timed sequence number was
 * acked, update smoothed RTT.
 *
 * If timestamps are supported, a receiver can update RTT even if
 * there is no outstanding data.
 *
 * Some boxes send broken timestamp replies during the SYN+ACK phase,
 * ignore timestamps of 0 or we could calculate a huge RTT and blow up
 * the retransmit timer.
 */
static void
tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
{
	int rtt = 0;

	VERIFY(to != NULL && th != NULL);
	if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
		u_int32_t pipe_ack_val;
		rtt = tcp_now - tp->t_rtttime;
		/*
		 * Compute pipe ack -- the amount of data acknowledged
		 * in the last RTT
		 */
		if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
			pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
			/* Update the sample */
			tp->t_pipeack_sample[tp->t_pipeack_ind++] =
			    pipe_ack_val;
			tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;

			/* Compute the max of the pipeack samples */
			pipe_ack_val = tcp_get_max_pipeack(tp);
			tp->t_pipeack = (pipe_ack_val >
			    tcp_initial_cwnd(tp)) ?
			    pipe_ack_val : 0;
		}
		/* start another measurement */
		tp->t_rtttime = 0;
	}
	if (((to->to_flags & TOF_TS) != 0) &&
	    (to->to_tsecr != 0) &&
	    TSTMP_GEQ(tcp_now, to->to_tsecr)) {
		tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
		    to->to_tsecr, th->th_ack);
	} else if (rtt > 0) {
		tcp_xmit_timer(tp, rtt, 0, th->th_ack);
	}
}
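/*
 * For example, if tcp_now is 12345 ticks and the peer echoed a timestamp of
 * 12245 in TSecr, the sample handed to tcp_xmit_timer() above is
 * 12345 - 12245 = 100 ticks; the TSTMP_GEQ() guard rejects echoes that would
 * make this difference negative.
 */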
/*
 * Collect new round-trip time estimate and update averages and
 * current timeout.
 */
static void
tcp_xmit_timer(struct tcpcb *tp, int rtt,
    u_int32_t tsecr, tcp_seq th_ack)
{
	int delta;
	int old_srtt = tp->t_srtt;
	int old_rttvar = tp->t_rttvar;
	bool log_rtt = false;

	/*
	 * On AWDL interface, the initial RTT measurement on SYN
	 * can be wrong due to peer caching. Avoid the first RTT
	 * measurement as it might skew up the RTO.
	 * <rdar://problem/28739046>
	 */
	if (tp->t_inpcb->inp_last_outifp != NULL &&
	    (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_AWDL) &&
	    th_ack == tp->iss + 1) {
		goto compute_rto;
	}

	if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
		if (SEQ_GT(th_ack, tp->snd_una) &&
		    SEQ_LEQ(th_ack, tp->snd_max) &&
		    (tsecr == 0 ||
		    TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
			/*
			 * We received a new ACK after a
			 * spurious timeout. Adapt retransmission
			 * timer as described in rfc 4015.
			 */
			tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
			tp->t_badrexmt_time = 0;
			tp->t_srtt = max(tp->t_srtt_prev, rtt);
			tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
			tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
			tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;

			if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) {
				tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
			}

			goto compute_rto;
		} else {
			return;
		}
	}

	tcpstat.tcps_rttupdated++;

	if (rtt > 0) {
		tp->t_rttcur = rtt;
		update_base_rtt(tp, rtt);
	}

	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).
		 *
		 * Freebsd adjusts rtt to origin 0 by subtracting 1
		 * from the provided rtt value. This was required because
		 * of the way t_rtttime was initialised to 1 before.
		 * Since we changed t_rtttime to be based on
		 * tcp_now, this extra adjustment is not needed.
		 */
		delta = (rtt << TCP_DELTA_SHIFT)
		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		if ((tp->t_srtt += delta) <= 0) {
			tp->t_srtt = 1;
		}

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0) {
			delta = -delta;
		}
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0) {
			tp->t_rttvar = 1;
		}
		if (tp->t_rttbest == 0 ||
		    tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) {
			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
		}
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	}

compute_rto:
	nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
	    tp->t_rttvar);

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	    max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
	    TCP_ADD_REXMTSLOP(tp));
5833 * it is probably safe to discard any error indications we've
5834 * received recently. This isn't quite right, but close enough
5835 * for now (a route might have failed after we sent a segment,
5836 * and the return path might not be symmetrical).
5838 tp
->t_softerror
= 0;
5841 TCP_LOG_RTT_INFO(tp
);
5844 TCP_LOG_RTT_CHANGE(tp
, old_srtt
, old_rttvar
);
static inline unsigned int
tcp_maxmtu(struct rtentry *rt)
{
	unsigned int maxmtu;
	int interface_mtu = 0;

	RT_LOCK_ASSERT_HELD(rt);
	interface_mtu = rt->rt_ifp->if_mtu;

	if (rt_key(rt)->sa_family == AF_INET &&
	    INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
		interface_mtu = IN6_LINKMTU(rt->rt_ifp);
		/* Further adjust the size for CLAT46 expansion */
		interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
	}

	if (rt->rt_rmx.rmx_mtu == 0) {
		maxmtu = interface_mtu;
	} else {
		maxmtu = MIN(rt->rt_rmx.rmx_mtu, interface_mtu);
	}

	return maxmtu;
}

static inline unsigned int
tcp_maxmtu6(struct rtentry *rt)
{
	unsigned int maxmtu;
	struct nd_ifinfo *ndi = NULL;

	RT_LOCK_ASSERT_HELD(rt);
	if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized) {
		ndi = NULL;
	}
	if (ndi != NULL) {
		lck_mtx_lock(&ndi->lock);
	}
	if (rt->rt_rmx.rmx_mtu == 0) {
		maxmtu = IN6_LINKMTU(rt->rt_ifp);
	} else {
		maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
	}
	if (ndi != NULL) {
		lck_mtx_unlock(&ndi->lock);
	}

	return maxmtu;
}

unsigned int
get_maxmtu(struct rtentry *rt)
{
	unsigned int maxmtu = 0;

	RT_LOCK_ASSERT_NOTHELD(rt);

	RT_LOCK(rt);

	if (rt_key(rt)->sa_family == AF_INET6) {
		maxmtu = tcp_maxmtu6(rt);
	} else {
		maxmtu = tcp_maxmtu(rt);
	}

	RT_UNLOCK(rt);

	return maxmtu;
}
5918 * Determine a reasonable value for maxseg size.
5919 * If the route is known, check route for mtu.
5920 * If none, use an mss that can be handled on the outgoing
5921 * interface without forcing IP to fragment; if bigger than
5922 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
5923 * to utilize large mbufs. If no route is found, route has no mtu,
5924 * or the destination isn't local, use a default, hopefully conservative
5925 * size (usually 512 or the default IP max size, but no more than the mtu
5926 * of the interface), as we can't discover anything about intervening
5927 * gateways or networks. We also initialize the congestion/slow start
5928 * window. While looking at the routing entry, we also initialize
5929 * other path-dependent parameters from pre-set or cached values
5930 * in the routing entry.
5932 * Also take into account the space needed for options that we
5933 * send regularly. Make maxseg shorter by that amount to assure
5934 * that we can send maxseg amount of data even when the options
5935 * are present. Store the upper limit of the length of options plus
5938 * NOTE that this routine is only called when we process an incoming
5939 * segment, for outgoing segments only tcp_mssopt is called.
5943 tcp_mss(struct tcpcb
*tp
, int offer
, unsigned int input_ifscope
)
5951 int origoffer
= offer
;
5952 u_int32_t sb_max_corrected
;
5959 so
= inp
->inp_socket
;
5961 * Nothing left to send after the socket is defunct or TCP is in the closed state
5963 if ((so
->so_state
& SS_DEFUNCT
) || tp
->t_state
== TCPS_CLOSED
) {
5967 isipv6
= ((inp
->inp_vflag
& INP_IPV6
) != 0) ? 1 : 0;
5968 min_protoh
= isipv6
? sizeof(struct ip6_hdr
) + sizeof(struct tcphdr
)
5969 : sizeof(struct tcpiphdr
);
5972 rt
= tcp_rtlookup6(inp
, input_ifscope
);
5974 rt
= tcp_rtlookup(inp
, input_ifscope
);
5976 isnetlocal
= (tp
->t_flags
& TF_LOCAL
);
5979 tp
->t_maxopd
= tp
->t_maxseg
= isipv6
? tcp_v6mssdflt
: tcp_mssdflt
;
5984 * Slower link window correction:
5985 * If a value is specificied for slowlink_wsize use it for
5986 * PPP links believed to be on a serial modem (speed <128Kbps).
5987 * Excludes 9600bps as it is the default value adversized
5988 * by pseudo-devices over ppp.
5990 if (ifp
->if_type
== IFT_PPP
&& slowlink_wsize
> 0 &&
5991 ifp
->if_baudrate
> 9600 && ifp
->if_baudrate
<= 128000) {
5992 tp
->t_flags
|= TF_SLOWLINK
;
5996 * Offer == -1 means that we didn't receive SYN yet. Use 0 then.
5999 offer
= rt
->rt_rmx
.rmx_filler
[0];
6002 * Offer == 0 means that there was no MSS on the SYN segment,
6003 * in this case we use tcp_mssdflt.
6006 offer
= isipv6
? tcp_v6mssdflt
: tcp_mssdflt
;
6009 * Prevent DoS attack with too small MSS. Round up
6010 * to at least minmss.
6012 offer
= max(offer
, tcp_minmss
);
6014 * Sanity check: make sure that maxopd will be large
6015 * enough to allow some data on segments even is the
6016 * all the option space is used (40bytes). Otherwise
6017 * funny things may happen in tcp_output.
6019 offer
= max(offer
, 64);
6021 rt
->rt_rmx
.rmx_filler
[0] = offer
;
6024 * While we're here, check if there's an initial rtt
6025 * or rttvar. Convert from the route-table units
6026 * to scaled multiples of the slow timeout timer.
6028 if (tp
->t_srtt
== 0 && (rtt
= rt
->rt_rmx
.rmx_rtt
) != 0) {
6029 tcp_getrt_rtt(tp
, rt
);
6031 tp
->t_rttmin
= isnetlocal
? tcp_TCPTV_MIN
: TCPTV_REXMTMIN
;
6034 mss
= (isipv6
? tcp_maxmtu6(rt
) : tcp_maxmtu(rt
));
6037 // At this point, the mss is just the MTU. Adjust if necessary.
6038 mss
= necp_socket_get_effective_mtu(inp
, mss
);
6043 if (rt
->rt_rmx
.rmx_mtu
== 0) {
6046 mss
= min(mss
, tcp_v6mssdflt
);
6048 } else if (!isnetlocal
) {
6049 mss
= min(mss
, tcp_mssdflt
);
6053 mss
= min(mss
, offer
);
6055 * maxopd stores the maximum length of data AND options
6056 * in a segment; maxseg is the amount of data in a normal
6057 * segment. We need to store this value (maxopd) apart
6058 * from maxseg, because now every segment carries options
6059 * and thus we normally have somewhat less data in segments.
6064 * origoffer==-1 indicates, that no segments were received yet.
6065 * In this case we just guess.
6067 if ((tp
->t_flags
& (TF_REQ_TSTMP
| TF_NOOPT
)) == TF_REQ_TSTMP
&&
6069 (tp
->t_flags
& TF_RCVD_TSTMP
) == TF_RCVD_TSTMP
)) {
6070 mss
-= TCPOLEN_TSTAMP_APPA
;
6074 mss
-= mptcp_adj_mss(tp
, FALSE
);
6079 * Calculate corrected value for sb_max; ensure to upgrade the
6080 * numerator for large sb_max values else it will overflow.
6082 sb_max_corrected
= (sb_max
* (u_int64_t
)MCLBYTES
) / (MSIZE
+ MCLBYTES
);
6085 * If there's a pipesize (ie loopback), change the socket
6086 * buffer to that size only if it's bigger than the current
6087 * sockbuf size. Make the socket buffers an integral
6088 * number of mss units; if the mss is larger than
6089 * the socket buffer, decrease the mss.
6092 bufsize
= rt
->rt_rmx
.rmx_sendpipe
;
6093 if (bufsize
< so
->so_snd
.sb_hiwat
)
6095 bufsize
= so
->so_snd
.sb_hiwat
;
6096 if (bufsize
< mss
) {
6099 bufsize
= (((bufsize
+ (u_int64_t
)mss
- 1) / (u_int64_t
)mss
) * (u_int64_t
)mss
);
6100 if (bufsize
> sb_max_corrected
) {
6101 bufsize
= sb_max_corrected
;
6103 (void)sbreserve(&so
->so_snd
, bufsize
);
6107 ASSERT(tp
->t_maxseg
);
6110 * Update MSS using recommendation from link status report. This is
6113 tcp_update_mss_locked(so
, ifp
);
6116 bufsize
= rt
->rt_rmx
.rmx_recvpipe
;
6117 if (bufsize
< so
->so_rcv
.sb_hiwat
)
6119 bufsize
= so
->so_rcv
.sb_hiwat
;
6120 if (bufsize
> mss
) {
6121 bufsize
= (((bufsize
+ (u_int64_t
)mss
- 1) / (u_int64_t
)mss
) * (u_int64_t
)mss
);
6122 if (bufsize
> sb_max_corrected
) {
6123 bufsize
= sb_max_corrected
;
6125 (void)sbreserve(&so
->so_rcv
, bufsize
);
6128 set_tcp_stream_priority(so
);
6130 if (rt
->rt_rmx
.rmx_ssthresh
) {
6132 * There's some sort of gateway or interface
6133 * buffer limit on the path. Use this to set
6134 * slow-start threshold, but set the threshold to
6135 * no less than 2*mss.
6137 tp
->snd_ssthresh
= max(2 * mss
, rt
->rt_rmx
.rmx_ssthresh
);
6138 tcpstat
.tcps_usedssthresh
++;
6140 tp
->snd_ssthresh
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
6144 * Set the slow-start flight size depending on whether this
6145 * is a local network or not.
6147 if (CC_ALGO(tp
)->cwnd_init
!= NULL
) {
6148 CC_ALGO(tp
)->cwnd_init(tp
);
6151 tcp_ccdbg_trace(tp
, NULL
, TCP_CC_CWND_INIT
);
6153 /* Route locked during lookup above */
/*
 * Determine the MSS option to send on an outgoing SYN.
 */
int
tcp_mssopt(struct tcpcb *tp)
{
	struct rtentry *rt;
	int mss;
	int isipv6;
	int min_protoh;

	isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr)
	    : sizeof(struct tcpiphdr);

	if (isipv6) {
		rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
	} else {
		rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
	}
	if (rt == NULL) {
		return isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
	}
	/*
	 * Slower link window correction:
	 * If a value is specified for slowlink_wsize use it for PPP links
	 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
	 * it is the default value advertised by pseudo-devices over ppp.
	 */
	if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	    rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
		tp->t_flags |= TF_SLOWLINK;
	}

	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
	/* Route locked during lookup above */
	RT_UNLOCK(rt);

	// At this point, the mss is just the MTU. Adjust if necessary.
	mss = necp_socket_get_effective_mtu(tp->t_inpcb, mss);

	return mss - min_protoh;
}
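/*
 * Illustrative values: over a standard 1500-byte Ethernet MTU the return
 * above advertises 1500 - 40 = 1460 for IPv4 (20-byte IP plus 20-byte TCP
 * header) and 1500 - 60 = 1440 for IPv6.
 */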
/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces the retransmission timer
 * to be started again.
 */
static void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
	tcp_seq onxt = tp->snd_nxt;
	u_int32_t ocwnd = tp->snd_cwnd;

	tp->t_timer[TCPT_REXMT] = 0;
	tp->t_timer[TCPT_PTO] = 0;
	tp->snd_nxt = th->th_ack;
	/*
	 * Set snd_cwnd to one segment beyond acknowledged offset
	 * (tp->snd_una has not yet been updated when this function
	 * is called)
	 */
	tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
	(void) tcp_output(tp);
	tp->snd_cwnd = ocwnd;
	if (SEQ_GT(onxt, tp->snd_nxt)) {
		tp->snd_nxt = onxt;
	}
	/*
	 * Partial window deflation.  Relies on fact that tp->snd_una
	 * not updated yet.
	 */
	if (tp->snd_cwnd > BYTES_ACKED(th, tp)) {
		tp->snd_cwnd -= BYTES_ACKED(th, tp);
	} else {
		tp->snd_cwnd = 0;
	}
	tp->snd_cwnd += tp->t_maxseg;
}
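
/*
 * Illustrative sketch (not part of the build): the partial-window
 * deflation above reduces the congestion window by the amount of data
 * the partial ACK covered and then grows it by one segment, so roughly
 * one new segment can leave for every segment the ACK accounted for.
 * The helper name newreno_deflate_cwnd is local to this sketch.
 */
#if 0
static uint32_t
newreno_deflate_cwnd(uint32_t cwnd, uint32_t bytes_acked, uint32_t maxseg)
{
	if (cwnd > bytes_acked) {
		cwnd -= bytes_acked;	/* deflate by what was just acknowledged */
	} else {
		cwnd = 0;
	}
	return cwnd + maxseg;		/* allow one more segment out */
}
#endif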
/*
 * Drop a random TCP connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no dropable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This insures fairness.
 *
 * The listening TCP socket "head" must be locked
 */
static int
tcp_dropdropablreq(struct socket *head)
{
	struct socket *so, *sonext;
	unsigned int i, j, qlen;
	static u_int32_t rnd = 0;
	static u_int64_t old_runtime;
	static unsigned int cur_cnt, old_cnt;
	u_int64_t now_sec;
	struct inpcb *inp = NULL;
	struct tcpcb *tp;

	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		return 0;
	}

	if (TAILQ_EMPTY(&head->so_incomp)) {
		return 0;
	}

	so_acquire_accept_list(head, NULL);
	socket_unlock(head, 0);

	/*
	 * Check if there is any socket in the incomp queue
	 * that is closed because of a reset from the peer and is
	 * waiting to be garbage collected. If so, pick that as
	 * a victim.
	 */
	TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
		inp = sotoinpcb(so);
		tp = intotcpcb(inp);
		if (tp != NULL && tp->t_state == TCPS_CLOSED &&
		    so->so_head != NULL &&
		    (so->so_state & (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) ==
		    (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) {
			/*
			 * The listen socket is already locked but we
			 * can lock this socket here without lock ordering
			 * issues because it is in the incomp queue and
			 * is not visible to others.
			 */
			if (socket_try_lock(so)) {
				so->so_usecount++;
				goto found_victim;
			}
		}
	}

	so = TAILQ_FIRST(&head->so_incomp);

	now_sec = net_uptime();
	if ((i = (now_sec - old_runtime)) != 0) {
		old_runtime = now_sec;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	qlen = head->so_incqlen;
	if (rnd == 0) {
		rnd = RandomULong();
	}

	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
			so = TAILQ_NEXT(so, so_list);
		}
	}
	/* Find a connection that is not already closing (or being served) */
	while (so) {
		inp = (struct inpcb *)so->so_pcb;

		sonext = TAILQ_NEXT(so, so_list);

		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
			/*
			 * Avoid the issue of a socket being accepted
			 * by one input thread and being dropped by
			 * another input thread. If we can't get a hold
			 * on this mutex, then grab the next socket in
			 * line.
			 */
			if (socket_try_lock(so)) {
				so->so_usecount++;
				if ((so->so_usecount == 2) &&
				    (so->so_state & SS_INCOMP) &&
				    !(so->so_flags & SOF_INCOMP_INPROGRESS)) {
					break;
				} else {
					/*
					 * don't use if being accepted or
					 * used in any other way
					 */
					in_pcb_checkstate(inp, WNT_RELEASE, 1);
					socket_unlock(so, 1);
				}
			} else {
				/*
				 * do not try to lock the inp in
				 * in_pcb_checkstate because the lock
				 * is already held in some other thread.
				 * Only drop the inp_wntcnt reference.
				 */
				in_pcb_checkstate(inp, WNT_RELEASE, 1);
			}
		}
		so = sonext;
	}
	if (so == NULL) {
		socket_lock(head, 0);
		so_release_accept_list(head);
		return 0;
	}

	/* Makes sure socket is still in the right state to be discarded */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		socket_lock(head, 0);
		so_release_accept_list(head);
		return 0;
	}

found_victim:
	if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
		/* do not discard: that socket is being accepted */
		socket_unlock(so, 1);
		socket_lock(head, 0);
		so_release_accept_list(head);
		return 0;
	}

	socket_lock(head, 0);
	TAILQ_REMOVE(&head->so_incomp, so, so_list);

	so->so_state &= ~SS_INCOMP;
	so->so_flags |= SOF_OVERFLOW;

	so_release_accept_list(head);
	socket_unlock(head, 0);

	socket_lock_assert_owned(so);
	tp = sototcpcb(so);

	tcp_close(tp);
	if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
		/*
		 * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
		 * doesn't require a lock, it could have happened while
		 * we are holding the lock. This pcb will have to
		 * be garbage collected later.
		 * Release the reference held for the so_incomp queue.
		 */
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		socket_unlock(so, 1);
	} else {
		/*
		 * Unlock this socket and leave the reference on.
		 * We need to acquire the pcbinfo lock in order to
		 * fully dispose it off.
		 */
		socket_unlock(so, 0);

		lck_rw_lock_exclusive(tcbinfo.ipi_lock);

		socket_lock(so, 0);
		/* Release the reference held for the so_incomp queue */
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;

		if (so->so_usecount != 1 ||
		    (inp->inp_wantcnt > 0 &&
		    inp->inp_wantcnt != WNT_STOPUSING)) {
			/*
			 * There is an extra wantcount or usecount
			 * that must have been added when the socket
			 * was unlocked. This socket will have to be
			 * garbage collected later.
			 */
			socket_unlock(so, 1);
		} else {
			/* Drop the reference held for this function */
			VERIFY(so->so_usecount > 0);
			so->so_usecount--;

			in_pcbdispose(inp);
		}
		lck_rw_done(tcbinfo.ipi_lock);
	}
	tcpstat.tcps_drops++;

	socket_lock(head, 0);
	return 1;
}
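
/*
 * Illustrative sketch (not part of the build): the victim above is chosen
 * by stepping a small linear-congruential generator and scaling its
 * 16-bit output into the range [0, qlen].  Each pending request gets
 * roughly equal odds, and the "index == qlen" outcome selects nothing,
 * in which case the caller drops the new request instead.  The helper
 * name pick_victim_index is local to this sketch.
 */
#if 0
static unsigned int
pick_victim_index(uint32_t *rnd, unsigned int qlen)
{
	*rnd = (314159 * (*rnd) + 66329) & 0xffff;	/* LCG step, 16-bit state */
	return ((qlen + 1) * (*rnd)) >> 16;		/* index in [0, qlen] */
}
#endif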
/* Set background congestion control on a socket */
void
tcp_set_background_cc(struct socket *so)
{
	tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
}

/* Set foreground congestion control on a socket */
void
tcp_set_foreground_cc(struct socket *so)
{
	if (tcp_use_newreno) {
		tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
	} else {
		tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
	}
}

static void
tcp_set_new_cc(struct socket *so, uint16_t cc_index)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	u_char old_cc_index = 0;

	if (tp->tcp_cc_index != cc_index) {
		old_cc_index = tp->tcp_cc_index;

		if (CC_ALGO(tp)->cleanup != NULL) {
			CC_ALGO(tp)->cleanup(tp);
		}
		tp->tcp_cc_index = cc_index;

		tcp_cc_allocate_state(tp);

		if (CC_ALGO(tp)->switch_to != NULL) {
			CC_ALGO(tp)->switch_to(tp, old_cc_index);
		}

		tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
	}
}

void
tcp_set_recv_bg(struct socket *so)
{
	if (!IS_TCP_RECV_BG(so)) {
		so->so_flags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG;
	}
}

void
tcp_clear_recv_bg(struct socket *so)
{
	if (IS_TCP_RECV_BG(so)) {
		so->so_flags1 &= ~(SOF1_TRAFFIC_MGT_TCP_RECVBG);
	}
}
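
/*
 * Illustrative sketch (not part of the build): switching congestion
 * control as tcp_set_new_cc() does above amounts to tearing down the old
 * algorithm's per-connection state, swapping the index that CC_ALGO()
 * uses, and letting the new algorithm take over.  The struct and names
 * below are local to this sketch, not the kernel's cc_algo layout.
 */
#if 0
struct sketch_cc_algo {
	void (*cleanup)(void *state);
	void (*switch_to)(void *state, int old_index);
};

static void
sketch_switch_cc(struct sketch_cc_algo *algos, int *cur_index, void *state,
    int new_index)
{
	int old_index = *cur_index;

	if (old_index == new_index) {
		return;					/* nothing to do */
	}
	if (algos[old_index].cleanup != NULL) {
		algos[old_index].cleanup(state);	/* drop old per-connection state */
	}
	*cur_index = new_index;
	if (algos[new_index].switch_to != NULL) {
		algos[new_index].switch_to(state, old_index);
	}
}
#endif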
void
inp_fc_throttle_tcp(struct inpcb *inp)
{
	struct tcpcb *tp = inp->inp_ppcb;

	if (!tcp_flow_control_response) {
		return;
	}

	/*
	 * Back off the slow-start threshold and enter
	 * congestion avoidance phase
	 */
	if (CC_ALGO(tp)->pre_fr != NULL) {
		CC_ALGO(tp)->pre_fr(tp);
	}
}

void
inp_fc_unthrottle_tcp(struct inpcb *inp)
{
	struct tcpcb *tp = inp->inp_ppcb;

	if (tcp_flow_control_response) {
		if (CC_ALGO(tp)->post_fr != NULL) {
			CC_ALGO(tp)->post_fr(tp, NULL);
		}
		tp->t_bytes_acked = 0;

		/*
		 * Reset retransmit shift as we know that the reason
		 * for delay in sending a packet is due to flow
		 * control on the outgoing interface. There is no need
		 * to back off the retransmit timer.
		 */
		TCP_RESET_REXMT_STATE(tp);

		tp->t_flagsext &= ~TF_CWND_NONVALIDATED;

		/*
		 * Start the output stream again. Since we are
		 * not retransmitting data, do not reset the
		 * retransmit timer or rtt calculation.
		 */
		tcp_output(tp);
		return;
	}

	/*
	 * Back off the slow-start threshold and enter
	 * congestion avoidance phase
	 */
	if (CC_ALGO(tp)->pre_fr != NULL) {
		CC_ALGO(tp)->pre_fr(tp);
	}

	tp->snd_cwnd = tp->snd_ssthresh;
	tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
	/*
	 * Restart counting for ABC as we changed the
	 * congestion window just now.
	 */
	tp->t_bytes_acked = 0;

	/*
	 * Reset retransmit shift as we know that the reason
	 * for delay in sending a packet is due to flow
	 * control on the outgoing interface. There is no need
	 * to back off the retransmit timer.
	 */
	TCP_RESET_REXMT_STATE(tp);

	/*
	 * Start the output stream again. Since we are
	 * not retransmitting data, do not reset the
	 * retransmit timer or rtt calculation.
	 */
	tcp_output(tp);
}
static int
tcp_getstat SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error;
	struct tcpstat *stat;

	stat = &tcpstat;
#if XNU_TARGET_OS_OSX
	struct tcpstat zero_stat;

	if (tcp_disable_access_to_stats &&
	    !kauth_cred_issuser(kauth_cred_get())) {
		bzero(&zero_stat, sizeof(zero_stat));
		stat = &zero_stat;
	}
#endif /* XNU_TARGET_OS_OSX */

	if (req->oldptr == 0) {
		req->oldlen = (size_t)sizeof(struct tcpstat);
	}

	error = SYSCTL_OUT(req, stat, MIN(sizeof(tcpstat), req->oldlen));

	return error;
}
/*
 * Checksum extended TCP header and data.
 */
static int
tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
{
	struct ifnet *ifp = m->m_pkthdr.rcvif;

	switch (af) {
	case AF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		struct ipovly *ipov = (struct ipovly *)ip;

		/* ip_stripoptions() must have been called before we get here */
		ASSERT((ip->ip_hl << 2) == sizeof(*ip));

		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				th->th_sum = m->m_pkthdr.csum_rx_val;
			} else {
				uint32_t sum = m->m_pkthdr.csum_rx_val;
				uint32_t start = m->m_pkthdr.csum_rx_start;
				int32_t trailer = (m_pktlen(m) - (off + tlen));

				/*
				 * Perform 1's complement adjustment of octets
				 * that got included/excluded in the hardware-
				 * calculated checksum value.  Ignore cases
				 * where the value already includes the entire
				 * IP header span, as the sum for those octets
				 * would already be 0 by the time we get here;
				 * IP has already performed its header checksum
				 * checks.  If we do need to adjust, restore
				 * the original fields in the IP header when
				 * computing the adjustment value.  Also take
				 * care of any trailing bytes and subtract out
				 * their partial sum.
				 */
				ASSERT(trailer >= 0);
				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
				    ((start != 0 && start != off) || trailer)) {
					uint32_t swbytes = (uint32_t)trailer;

					if (start < off) {
						ip->ip_len += sizeof(*ip);
#if BYTE_ORDER != BIG_ENDIAN
						HTONS(ip->ip_len);
						HTONS(ip->ip_off);
#endif /* BYTE_ORDER != BIG_ENDIAN */
					}
					/* callee folds in sum */
					sum = m_adj_sum16(m, start, off,
					    tlen, sum);
					if (off > start) {
						swbytes += (off - start);
					} else {
						swbytes += (start - off);
					}

					if (start < off) {
#if BYTE_ORDER != BIG_ENDIAN
						NTOHS(ip->ip_off);
						NTOHS(ip->ip_len);
#endif /* BYTE_ORDER != BIG_ENDIAN */
						ip->ip_len -= sizeof(*ip);
					}

					if (swbytes != 0) {
						tcp_in_cksum_stats(swbytes);
					}
				}

				/* callee folds in sum */
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    sum + htonl(tlen + IPPROTO_TCP));
			}
			th->th_sum ^= 0xffff;
		} else {
			uint16_t ip_sum;
			int len;
			char b[9];

			bcopy(ipov->ih_x1, b, sizeof(ipov->ih_x1));
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ip_sum = ipov->ih_len;
			ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
			HTONS(ipov->ih_len);
#endif
			len = sizeof(struct ip) + tlen;
			th->th_sum = in_cksum(m, len);
			bcopy(b, ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = ip_sum;

			tcp_in_cksum_stats(len);
		}
		break;
	}
	case AF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				th->th_sum = m->m_pkthdr.csum_rx_val;
			} else {
				uint32_t sum = m->m_pkthdr.csum_rx_val;
				uint32_t start = m->m_pkthdr.csum_rx_start;
				int32_t trailer = (m_pktlen(m) - (off + tlen));

				/*
				 * Perform 1's complement adjustment of octets
				 * that got included/excluded in the hardware-
				 * calculated checksum value.  Also take care
				 * of any trailing bytes and subtract out their
				 * partial sum.
				 */
				ASSERT(trailer >= 0);
				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
				    (start != off || trailer != 0)) {
					uint16_t s = 0, d = 0;
					uint32_t swbytes = (uint32_t)trailer;

					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
						s = ip6->ip6_src.s6_addr16[1];
						ip6->ip6_src.s6_addr16[1] = 0;
					}
					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
						d = ip6->ip6_dst.s6_addr16[1];
						ip6->ip6_dst.s6_addr16[1] = 0;
					}

					/* callee folds in sum */
					sum = m_adj_sum16(m, start, off,
					    tlen, sum);
					if (off > start) {
						swbytes += (off - start);
					} else {
						swbytes += (start - off);
					}

					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
						ip6->ip6_src.s6_addr16[1] = s;
					}
					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
						ip6->ip6_dst.s6_addr16[1] = d;
					}

					if (swbytes != 0) {
						tcp_in6_cksum_stats(swbytes);
					}
				}

				th->th_sum = in6_pseudo(
					&ip6->ip6_src, &ip6->ip6_dst,
					sum + htonl(tlen + IPPROTO_TCP));
			}
			th->th_sum ^= 0xffff;
		} else {
			tcp_in6_cksum_stats(tlen);
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
		}
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	if (th->th_sum != 0) {
		tcpstat.tcps_rcvbadsum++;
		IF_TCP_STATINC(ifp, badformat);
		return -1;
	}

	return 0;
}
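
/*
 * Illustrative sketch (not part of the build): the hardware-assisted path
 * above works with 1's complement partial sums.  After adjusting for bytes
 * the hardware wrongly included or excluded, the 32-bit accumulator is
 * folded back into 16 bits; a segment verifies when the folded sum of the
 * data plus the pseudo-header complements to zero (th_sum ^ 0xffff == 0).
 * The helper name fold_cksum32 is local to this sketch.
 */
#if 0
static uint16_t
fold_cksum32(uint32_t sum)
{
	/* fold carries back in until the value fits in 16 bits */
	while (sum >> 16) {
		sum = (sum & 0xffff) + (sum >> 16);
	}
	return (uint16_t)sum;
}
#endif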
SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
    "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

static int
sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, val = tcprexmtthresh;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	/*
	 * Constrain the number of duplicate ACKs
	 * to consider for TCP fast retransmit
	 * to either 2 or 3
	 */
	if (val < 2 || val > 3) {
		return EINVAL;
	}

	tcprexmtthresh = val;

	return 0;
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
    "Duplicate ACK Threshold for Fast Retransmit");